In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import fetch_openml

# Fetch the wine quality dataset from OpenML by its dataset ID (ID=1464)
wine_data = fetch_openml(data_id=1464)

# Convert to a pandas DataFrame
data = pd.DataFrame(wine_data.data, columns=wine_data.feature_names)
data['quality'] = wine_data.target

# Check data types of the columns
print("Data types of columns:")
print(data.dtypes)

# Convert all columns to numeric (in case there are non-numeric values stored as strings)
# We can use `pd.to_numeric()` with errors='coerce' to force invalid parsing to NaN
data = data.apply(pd.to_numeric, errors='coerce')

# Show basic information about the dataset
print("\nDataset information:")
print(data.info())

# Check if any columns have NaN values (due to invalid string conversion)
print("\nCheck for missing values after conversion:")
print(data.isnull().sum())

# Drop rows with missing values (optional, or you could fill NaNs with a value like 0 or mean)
data = data.dropna()

# Show the first few rows of the data
print("\nFirst few rows of the dataset:")
print(data.head())

# Split the data into features (X) and target (y)
X = data.drop('quality', axis=1)
y = data['quality']

# Split into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize the features using StandardScaler (mean=0, std=1)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize and train a Random Forest Regressor model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test_scaled)

# Evaluate the model using Mean Squared Error (MSE) and Root Mean Squared Error (RMSE)
mse = mean_squared_error(y_test, y_pred)
rmse = mse**0.5

# Print the performance metrics
print(f"\nModel Evaluation:")
print(f"Mean Squared Error: {mse:.4f}")
print(f"Root Mean Squared Error: {rmse:.4f}")

# Optionally, show the feature importances (which features are most important in predicting wine quality)
importances = model.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': importances})
print("\nFeature Importances:")
print(feature_importance_df.sort_values(by='Importance', ascending=False))



Data types of columns:
V1            int64
V2            int64
V3            int64
V4            int64
quality    category
dtype: object

Dataset information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 748 entries, 0 to 747
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   V1       748 non-null    int64
 1   V2       748 non-null    int64
 2   V3       748 non-null    int64
 3   V4       748 non-null    int64
 4   quality  748 non-null    int64
dtypes: int64(5)
memory usage: 29.3 KB
None

Check for missing values after conversion:
V1         0
V2         0
V3         0
V4         0
quality    0
dtype: int64

First few rows of the dataset:
   V1  V2     V3  V4  quality
0   2  50  12500  98        2
1   0  13   3250  28        2
2   1  16   4000  35        2
3   2  20   5000  45        2
4   1  24   6000  77        1

Model Evaluation:
Mean Squared Error: 0.1995
Root Mean Squared Error: 0.4466

Feature Importances:
  Featu