In [10]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import Ridge
from sklearn.ensemble import StackingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import ExtraTreesRegressor
from xgboost import XGBRegressor
import pandas as pd
import numpy as np


In [11]:
file_path = 'winequality-red.csv'
dataset = pd.read_csv(file_path, delimiter=';')

dataset.head()


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [12]:
missing_values = dataset.isnull().sum()
missing_values

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

In [13]:
data_types = dataset.dtypes
data_types

fixed acidity           float64
volatile acidity        float64
citric acid             float64
residual sugar          float64
chlorides               float64
free sulfur dioxide     float64
total sulfur dioxide    float64
density                 float64
pH                      float64
sulphates               float64
alcohol                 float64
quality                   int64
dtype: object

In [14]:
quality_distribution = dataset['quality'].value_counts().sort_index()
quality_distribution

quality
3     10
4     53
5    681
6    638
7    199
8     18
Name: count, dtype: int64

In [15]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

X = dataset.drop('quality', axis=1)
y = dataset['quality']

X_resampled, y_resampled = SMOTE(random_state=42).fit_resample(X, y)

balanced_dataset = pd.concat([pd.DataFrame(X_resampled, columns=X.columns), pd.DataFrame(y_resampled, columns=['quality'])], axis=1)
balanced_dataset['quality'].value_counts()

quality
5    681
6    681
7    681
4    681
8    681
3    681
Name: count, dtype: int64

In [16]:
outliers = dataset[(np.abs((dataset - dataset.mean()) / dataset.std()) > 3).any(axis=1)]
outliers

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
13,7.8,0.610,0.29,1.6,0.114,9.0,29.0,0.99740,3.26,1.56,9.10,5
14,8.9,0.620,0.18,3.8,0.176,52.0,145.0,0.99860,3.16,0.88,9.20,5
15,8.9,0.620,0.19,3.9,0.170,51.0,148.0,0.99860,3.17,0.93,9.20,5
17,8.1,0.560,0.28,1.7,0.368,16.0,56.0,0.99680,3.11,1.28,9.30,5
19,7.9,0.320,0.51,1.8,0.341,17.0,56.0,0.99690,3.04,1.08,9.20,6
...,...,...,...,...,...,...,...,...,...,...,...,...
1505,6.7,0.760,0.02,1.8,0.078,6.0,12.0,0.99600,3.55,0.63,9.95,3
1558,6.9,0.630,0.33,6.7,0.235,66.0,115.0,0.99787,3.22,0.56,9.50,5
1570,6.4,0.360,0.53,2.2,0.230,19.0,35.0,0.99340,3.37,0.93,12.40,6
1574,5.6,0.310,0.78,13.9,0.074,23.0,92.0,0.99677,3.39,0.48,10.50,6


In [17]:
correlation_matrix = dataset.corr()
correlation_matrix['quality'].sort_values(ascending=False)

quality                 1.000000
alcohol                 0.476166
sulphates               0.251397
citric acid             0.226373
fixed acidity           0.124052
residual sugar          0.013732
free sulfur dioxide    -0.050656
pH                     -0.057731
chlorides              -0.128907
density                -0.174919
total sulfur dioxide   -0.185100
volatile acidity       -0.390558
Name: quality, dtype: float64

In [18]:
outliers = dataset[(np.abs((dataset - dataset.mean()) / dataset.std()) > 3).any(axis=1)]

duplicates = dataset.duplicated().sum()

outliers_count = len(outliers)
outliers_count, duplicates


(148, 240)

In [19]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaled_features = scaler.fit_transform(dataset.drop('quality', axis=1))

scaled_dataset = pd.DataFrame(scaled_features, columns=dataset.columns[:-1])
scaled_dataset['quality'] = dataset['quality']

unscaled_dataset = dataset.copy()

scaled_dataset.head(), unscaled_dataset.head()

(   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
 0      -0.528360          0.961877    -1.391472       -0.453218  -0.243707   
 1      -0.298547          1.967442    -1.391472        0.043416   0.223875   
 2      -0.298547          1.297065    -1.186070       -0.169427   0.096353   
 3       1.654856         -1.384443     1.484154       -0.453218  -0.264960   
 4      -0.528360          0.961877    -1.391472       -0.453218  -0.243707   
 
    free sulfur dioxide  total sulfur dioxide   density        pH  sulphates  \
 0            -0.466193             -0.379133  0.558274  1.288643  -0.579207   
 1             0.872638              0.624363  0.028261 -0.719933   0.128950   
 2            -0.083669              0.229047  0.134264 -0.331177  -0.048089   
 3             0.107592              0.411500  0.664277 -0.979104  -0.461180   
 4            -0.466193             -0.379133  0.558274  1.288643  -0.579207   
 
     alcohol  quality  
 0 -0.960246      

In [20]:
from sklearn.model_selection import train_test_split

X = scaled_dataset.drop('quality', axis=1)
y = scaled_dataset['quality']

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

len(X_train), len(X_val), len(X_test)


(959, 320, 320)

In [21]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

model = LinearRegression()
model.fit(X_train, y_train)

y_val_pred = model.predict(X_val)

mae = mean_absolute_error(y_val, y_val_pred)
mse = mean_squared_error(y_val, y_val_pred)
r2 = r2_score(y_val, y_val_pred)

mae, mse, r2

(0.49406662960072334, 0.3922958416597431, 0.3383226402800532)

In [22]:
tolerance = 0.5
accuracy = (abs(y_val - y_val_pred) <= tolerance).mean()
accuracy

0.584375

In [23]:
from sklearn.ensemble import RandomForestRegressor

rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, y_train)

y_val_pred_rf = rf_model.predict(X_val)

mae_rf = mean_absolute_error(y_val, y_val_pred_rf)
mse_rf = mean_squared_error(y_val, y_val_pred_rf)
r2_rf = r2_score(y_val, y_val_pred_rf)

mae_rf, mse_rf, r2_rf

(0.41265625000000006, 0.3028653125, 0.48916328177760215)

In [24]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
    verbose=1,
    n_jobs=-1
)

grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
best_params

Fitting 3 folds for each of 108 candidates, totalling 324 fits


{'max_depth': None,
 'min_samples_leaf': 2,
 'min_samples_split': 2,
 'n_estimators': 100}

In [25]:
rf_model_optimized = RandomForestRegressor(
    max_depth=None,
    min_samples_leaf=2,
    min_samples_split=2,
    n_estimators=100,
    random_state=42
)

rf_model_optimized.fit(X_train, y_train)
y_val_pred_rf_optimized = rf_model_optimized.predict(X_val)

mae_rf_optimized = mean_absolute_error(y_val, y_val_pred_rf_optimized)
mse_rf_optimized = mean_squared_error(y_val, y_val_pred_rf_optimized)
r2_rf_optimized = r2_score(y_val, y_val_pred_rf_optimized)

mae_rf_optimized, mse_rf_optimized, r2_rf_optimized


(0.42428437500000005, 0.31183402616252526, 0.474035936172315)

In [26]:
tolerance = 0.5
accuracy_rf_optimized = (abs(y_val - y_val_pred_rf_optimized) <= tolerance).mean()
accuracy_rf_optimized


0.66875

In [27]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor

param_grid_xgb = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 6, 10],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

xgb_grid_search = GridSearchCV(
    estimator=XGBRegressor(random_state=42),
    param_grid=param_grid_xgb,
    scoring='neg_mean_squared_error',
    cv=3,
    verbose=1,
    n_jobs=-1
)

xgb_grid_search.fit(X_train, y_train)
best_params_xgb = xgb_grid_search.best_params_
best_params_xgb


Fitting 3 folds for each of 108 candidates, totalling 324 fits


{'colsample_bytree': 0.8,
 'learning_rate': 0.01,
 'max_depth': 10,
 'n_estimators': 300,
 'subsample': 0.8}

In [28]:
xgb_model_optimized = XGBRegressor(
    colsample_bytree=0.8,
    learning_rate=0.01,
    max_depth=10,
    n_estimators=300,
    subsample=0.8,
    random_state=42
)

xgb_model_optimized.fit(X_train, y_train)
y_val_pred_xgb_optimized = xgb_model_optimized.predict(X_val)

mae_xgb_optimized = mean_absolute_error(y_val, y_val_pred_xgb_optimized)
mse_xgb_optimized = mean_squared_error(y_val, y_val_pred_xgb_optimized)
r2_xgb_optimized = r2_score(y_val, y_val_pred_xgb_optimized)

mae_xgb_optimized, mse_xgb_optimized, r2_xgb_optimized


(0.41658047437667844, 0.3051340305822322, 0.4853366814643051)

In [29]:
tolerance = 0.5
accuracy_xgb_optimized = (abs(y_val - y_val_pred_xgb_optimized) <= tolerance).mean()
accuracy_xgb_optimized

0.690625

In [30]:
param_grid_xgb_fine = {
    'n_estimators': [300, 400, 500],
    'learning_rate': [0.005, 0.01, 0.02],
    'max_depth': [8, 10, 12],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.7, 0.8, 0.9],
    'reg_alpha': [0, 0.1, 0.5],
    'reg_lambda': [1, 1.5, 2]
}

xgb_grid_search_fine = GridSearchCV(
    estimator=XGBRegressor(random_state=42),
    param_grid=param_grid_xgb_fine,
    scoring='neg_mean_squared_error',
    cv=3,
    verbose=1,
    n_jobs=-1
)

xgb_grid_search_fine.fit(X_train, y_train)
best_params_xgb_fine = xgb_grid_search_fine.best_params_
best_params_xgb_fine


Fitting 3 folds for each of 2187 candidates, totalling 6561 fits


{'colsample_bytree': 0.9,
 'learning_rate': 0.02,
 'max_depth': 8,
 'n_estimators': 300,
 'reg_alpha': 0,
 'reg_lambda': 1.5,
 'subsample': 0.7}

In [31]:
xgb_model_fine_tuned = XGBRegressor(
    colsample_bytree=0.9,
    learning_rate=0.02,
    max_depth=8,
    n_estimators=300,
    reg_alpha=0,
    reg_lambda=1.5,
    subsample=0.7,
    random_state=42
)

xgb_model_fine_tuned.fit(X_train, y_train)
y_val_pred_xgb_fine_tuned = xgb_model_fine_tuned.predict(X_val)

mae_xgb_fine_tuned = mean_absolute_error(y_val, y_val_pred_xgb_fine_tuned)
mse_xgb_fine_tuned = mean_squared_error(y_val, y_val_pred_xgb_fine_tuned)
r2_xgb_fine_tuned = r2_score(y_val, y_val_pred_xgb_fine_tuned)

mae_xgb_fine_tuned, mse_xgb_fine_tuned, r2_xgb_fine_tuned


(0.40771944373846053, 0.30567300769881667, 0.4844275998030204)

In [32]:
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression

base_models = [
    ('random_forest', RandomForestRegressor(
        max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100, random_state=42
    )),
    ('xgboost', XGBRegressor(
        colsample_bytree=0.9, learning_rate=0.02, max_depth=8, n_estimators=300, reg_alpha=0, reg_lambda=1.5, subsample=0.7, random_state=42
    ))
]

stacking_model = StackingRegressor(
    estimators=base_models,
    final_estimator=LinearRegression(),
    n_jobs=-1
)

stacking_model.fit(X_train, y_train)
y_val_pred_stacking = stacking_model.predict(X_val)

mae_stacking = mean_absolute_error(y_val, y_val_pred_stacking)
mse_stacking = mean_squared_error(y_val, y_val_pred_stacking)
r2_stacking = r2_score(y_val, y_val_pred_stacking)

mae_stacking, mse_stacking, r2_stacking


(0.41743675500290667, 0.30463568562445703, 0.4861772296956993)

In [33]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import StackingRegressor

base_models = [
    ('random_forest', RandomForestRegressor(
        max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100, random_state=42
    )),
    ('xgboost', XGBRegressor(
        colsample_bytree=0.9, learning_rate=0.02, max_depth=8, n_estimators=300, reg_alpha=0, reg_lambda=1.5, subsample=0.7, random_state=42
    ))
]

stacking_model = StackingRegressor(
    estimators=base_models,
    final_estimator=Ridge(),
    n_jobs=-1
)

param_grid_meta = {
    'final_estimator__alpha': [0.1, 1.0, 10.0],
    'final_estimator__solver': ['auto', 'svd', 'cholesky', 'lsqr']
}

grid_search_meta = GridSearchCV(
    estimator=stacking_model,
    param_grid=param_grid_meta,
    scoring='neg_mean_squared_error',
    cv=3,
    verbose=1,
    n_jobs=-1
)

grid_search_meta.fit(X_train, y_train)
best_params_meta = grid_search_meta.best_params_
best_params_meta


Fitting 3 folds for each of 12 candidates, totalling 36 fits


{'final_estimator__alpha': 10.0, 'final_estimator__solver': 'auto'}

In [34]:
stacking_model_optimized = StackingRegressor(
    estimators=base_models,
    final_estimator=Ridge(alpha=10.0, solver='auto'),
    n_jobs=-1
)

stacking_model_optimized.fit(X_train, y_train)
y_val_pred_stacking_optimized = stacking_model_optimized.predict(X_val)

mae_stacking_optimized = mean_absolute_error(y_val, y_val_pred_stacking_optimized)
mse_stacking_optimized = mean_squared_error(y_val, y_val_pred_stacking_optimized)
r2_stacking_optimized = r2_score(y_val, y_val_pred_stacking_optimized)

mae_stacking_optimized, mse_stacking_optimized, r2_stacking_optimized

(0.4202781190698513, 0.305672205308001, 0.4844289531791718)

In [35]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import ExtraTreesRegressor

base_models_updated = [
    ('random_forest', RandomForestRegressor(
        max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100, random_state=42
    )),
    ('xgboost', XGBRegressor(
        colsample_bytree=0.9, learning_rate=0.02, max_depth=8, n_estimators=300, reg_alpha=0, reg_lambda=1.5, subsample=0.7, random_state=42
    )),
    ('extra_trees', ExtraTreesRegressor(n_estimators=100, random_state=42)),
    ('knn', KNeighborsRegressor(n_neighbors=5))
]

stacking_model_updated = StackingRegressor(
    estimators=base_models_updated,
    final_estimator=Ridge(alpha=10.0, solver='auto'),
    n_jobs=-1
)

stacking_model_updated.fit(X_train, y_train)
y_val_pred_stacking_updated = stacking_model_updated.predict(X_val)

mae_stacking_updated = mean_absolute_error(y_val, y_val_pred_stacking_updated)
mse_stacking_updated = mean_squared_error(y_val, y_val_pred_stacking_updated)
r2_stacking_updated = r2_score(y_val, y_val_pred_stacking_updated)

mae_stacking_updated, mse_stacking_updated, r2_stacking_updated


(0.3961372738091586, 0.29006083583327325, 0.5107603302642489)

In [36]:
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import GridSearchCV

param_grid_extra_trees = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search_extra_trees = GridSearchCV(
    estimator=ExtraTreesRegressor(random_state=42),
    param_grid=param_grid_extra_trees,
    scoring='neg_mean_squared_error',
    cv=3,
    verbose=1,
    n_jobs=-1
)

grid_search_extra_trees.fit(X_train, y_train)
best_params_extra_trees = grid_search_extra_trees.best_params_
best_params_extra_trees


Fitting 3 folds for each of 108 candidates, totalling 324 fits


{'max_depth': 20,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 100}

In [37]:
from sklearn.neighbors import KNeighborsRegressor

param_grid_knn = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'p': [1, 2] 
}

grid_search_knn = GridSearchCV(
    estimator=KNeighborsRegressor(),
    param_grid=param_grid_knn,
    scoring='neg_mean_squared_error',
    cv=3,
    verbose=1,
    n_jobs=-1
)

grid_search_knn.fit(X_train, y_train)
best_params_knn = grid_search_knn.best_params_
best_params_knn


Fitting 3 folds for each of 16 candidates, totalling 48 fits


{'n_neighbors': 9, 'p': 1, 'weights': 'distance'}

In [38]:
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import Ridge

base_models_updated = [
    ('random_forest', RandomForestRegressor(
        max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100, random_state=42
    )),
    ('xgboost', XGBRegressor(
        colsample_bytree=0.9, learning_rate=0.02, max_depth=8, n_estimators=300, reg_alpha=0, reg_lambda=1.5, subsample=0.7, random_state=42
    )),
    ('extra_trees', ExtraTreesRegressor(
        n_estimators=300, max_depth=None, min_samples_split=2, min_samples_leaf=1, random_state=42
    )),
    ('knn', KNeighborsRegressor(
        n_neighbors=9, p=1, weights='distance'
    ))
]

stacking_model_updated = StackingRegressor(
    estimators=base_models_updated,
    final_estimator=Ridge(alpha=10.0, solver='auto'),
    n_jobs=-1
)

stacking_model_updated.fit(X_train, y_train)
y_val_pred_stacking_updated = stacking_model_updated.predict(X_val)

mae_stacking_updated = mean_absolute_error(y_val, y_val_pred_stacking_updated)
mse_stacking_updated = mean_squared_error(y_val, y_val_pred_stacking_updated)
r2_stacking_updated = r2_score(y_val, y_val_pred_stacking_updated)

mae_stacking_updated, mse_stacking_updated, r2_stacking_updated


(0.39435420486176354, 0.28846783845527135, 0.5134472063082509)

In [39]:
y_test_pred_stacking_updated = stacking_model_updated.predict(X_test)

mae_stacking_test = mean_absolute_error(y_test, y_test_pred_stacking_updated)
mse_stacking_test = mean_squared_error(y_test, y_test_pred_stacking_updated)
r2_stacking_test = r2_score(y_test, y_test_pred_stacking_updated)

tolerance = 0.5
accuracy_stacking_test = (abs(y_test - y_test_pred_stacking_updated) <= tolerance).mean()

mae_stacking_test, mse_stacking_test, r2_stacking_test, accuracy_stacking_test


(0.45842162470522235, 0.398865523282976, 0.43570282420313977, 0.634375)