# Homework_03_Wine_quality

## Multi-Class Classification with Boosting Algorithms - White Wine Quality

Afet Ibadova 453818

# Import libraries

In [20]:
pip install lightgbm

Note: you may need to restart the kernel to use updated packages.


In [21]:
pip install --upgrade scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [22]:
from ucimlrepo import fetch_ucirepo

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, chi2
import xgboost as xgb
from xgboost import XGBClassifier
import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.feature_selection import SelectFromModel
from catboost import CatBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score, log_loss, confusion_matrix
from sklearn.metrics import classification_report

# Step 1: Data Loading

In [23]:
# URL of the White Wine Quality dataset on the UCI Repository
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv"

# Read the dataset into a DataFrame
wine_quality = pd.read_csv(url, sep=';')

# Assuming 'quality' is target variable and the other columns are features
X = wine_quality.drop(columns=['quality'])
y = wine_quality['quality']


In [24]:
X.head(5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9


# Step 2: Data Preprocessing and Splitting

**2. Create a target variable for multi-class classification. You can categorize wine quality scores into classes (e.g., low, medium, high) or keep them as is (but please use at least 3 classes).**

In [25]:
# Create a function to categorize wine quality scores into classes
def categorize_quality(quality):
    if quality <= 4:
        return "low"
    elif quality <= 6:
        return "medium"
    else:
        return "high"

# Apply the categorize_quality function to create the 'class' target variable
y_class = y.apply(categorize_quality)

# Add the 'class' target variable to the DataFrame
wine_quality['class'] = y_class


**4. Split the data into training and testing sets using an 80-20 or similar ratio.**

In [26]:
# Split the data into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Check the shapes of the resulting sets
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)


X_train shape: (3918, 11)
X_test shape: (980, 11)
y_train shape: (3918,)
y_test shape: (980,)


**5. Perform any necessary SEDA, data preprocessing and feature engineering.**

In [27]:
# Handling Missing Values:
imputer = SimpleImputer(strategy='mean')
X_train = imputer.fit_transform(X_train)


In [28]:
#Data Scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)

In [29]:
data_types = wine_quality.dtypes
print(data_types)


fixed acidity           float64
volatile acidity        float64
citric acid             float64
residual sugar          float64
chlorides               float64
free sulfur dioxide     float64
total sulfur dioxide    float64
density                 float64
pH                      float64
sulphates               float64
alcohol                 float64
quality                   int64
class                    object
dtype: object


In [30]:
wine_quality = pd.get_dummies(wine_quality, columns=['class'], drop_first=True)

In [31]:
#Label Encoding
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)

# Step 3: Feature Selection

**6. Apply initial feature selection techniques.**

In [None]:
# Select the top K features (e.g., K=5)
k_best = SelectKBest(score_func=chi2, k=5)
X_new = k_best.fit_transform(X, y)


In [None]:
corr_matrix = X.corrwith(y)
selected_features = corr_matrix[corr_matrix.abs().nlargest(K).index]


# Step 4: Model Training and Hyperparameter Tuning

7. Train and fine-tune the following models separately using only your training data (is the data well balanced? if not maybe we can take this into account using some hyperparameters or we should rebalance our dataset using techniques from ML1 course?):

- XGBoost
- LightGBM
- CatBoost
- Classical Gradient Boosting Model
- extra task: please use sklearn Histogram-Based Gradient Boosting model (alternative for LightGBM) Please consider which evaluation metrics (https://scikit-learn.org/stable/modules/model_evaluation.html#from-binary-to-multiclass-and-multilabel) and cost functions to choose!!! Your are solving multiclass classification problem!

In [33]:
# Define the models
models = {
    'XGBoost': XGBClassifier(),
    'LightGBM': LGBMClassifier(),
    'CatBoost': CatBoostClassifier(),
    'GradientBoosting': GradientBoostingClassifier(),
    'HistGradientBoosting': HistGradientBoostingClassifier()
}

In [34]:
# Iterate through models and perform hyperparameter tuning
best_models = {}
for name, model in models.items():
    param_grid = {
        # Define hyperparameters for tuning
    }

    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    
    best_models[name] = grid_search.best_estimator_

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001995 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1329
[LightGBM] [Info] Number of data points in the train set: 3918, number of used features: 11
[LightGBM] [Info] Start training from score -5.565286
[LightGBM] [Info] Start training from score -3.346083
[LightGBM] [Info] Start training from score -1.212002
[LightGBM] [Info] Start training from score -0.796864
[LightGBM] [Info] Start training from score -1.739548
[LightGBM] [Info] Start training from score -3.331694
[LightGBM] [Info] Start training from score -6.663899
Learning rate set to 0.084834
0:	learn: 1.8122467	total: 242ms	remaining: 4m 1s
1:	learn: 1.7173513	total: 283ms	remaining: 2m 21s
2:	learn: 1.6325040	total: 322ms	remaining: 1m 47s
3:	learn: 1.5697865	total: 367ms	remaining: 1m 31s
4:	learn: 1.5119885	total: 411ms	remaining: 1m 21s
5:	learn: 1.4633242	total: 455ms	remaining: 1m 15s

70:	learn: 0.9344303	total: 3.17s	remaining: 41.5s
71:	learn: 0.9325436	total: 3.21s	remaining: 41.4s
72:	learn: 0.9308881	total: 3.24s	remaining: 41.2s
73:	learn: 0.9294142	total: 3.28s	remaining: 41s
74:	learn: 0.9282249	total: 3.32s	remaining: 40.9s
75:	learn: 0.9271785	total: 3.35s	remaining: 40.7s
76:	learn: 0.9257612	total: 3.38s	remaining: 40.6s
77:	learn: 0.9239326	total: 3.42s	remaining: 40.4s
78:	learn: 0.9218695	total: 3.46s	remaining: 40.3s
79:	learn: 0.9197734	total: 3.49s	remaining: 40.1s
80:	learn: 0.9186100	total: 3.52s	remaining: 40s
81:	learn: 0.9166959	total: 3.56s	remaining: 39.8s
82:	learn: 0.9134310	total: 3.59s	remaining: 39.6s
83:	learn: 0.9114014	total: 3.62s	remaining: 39.5s
84:	learn: 0.9086045	total: 3.66s	remaining: 39.4s
85:	learn: 0.9065463	total: 3.7s	remaining: 39.3s
86:	learn: 0.9048438	total: 3.76s	remaining: 39.5s
87:	learn: 0.9039736	total: 3.8s	remaining: 39.4s
88:	learn: 0.9018256	total: 3.85s	remaining: 39.4s
89:	learn: 0.9007066	total: 3.89s	rem

234:	learn: 0.7075857	total: 8.66s	remaining: 28.2s
235:	learn: 0.7064658	total: 8.69s	remaining: 28.1s
236:	learn: 0.7054224	total: 8.72s	remaining: 28.1s
237:	learn: 0.7044906	total: 8.75s	remaining: 28s
238:	learn: 0.7038377	total: 8.79s	remaining: 28s
239:	learn: 0.7030280	total: 8.82s	remaining: 27.9s
240:	learn: 0.7013176	total: 8.85s	remaining: 27.9s
241:	learn: 0.6996992	total: 8.87s	remaining: 27.8s
242:	learn: 0.6986484	total: 8.9s	remaining: 27.7s
243:	learn: 0.6978140	total: 8.93s	remaining: 27.7s
244:	learn: 0.6972372	total: 8.95s	remaining: 27.6s
245:	learn: 0.6964374	total: 8.98s	remaining: 27.5s
246:	learn: 0.6956102	total: 9.01s	remaining: 27.5s
247:	learn: 0.6945056	total: 9.04s	remaining: 27.4s
248:	learn: 0.6935688	total: 9.07s	remaining: 27.4s
249:	learn: 0.6927192	total: 9.11s	remaining: 27.3s
250:	learn: 0.6922307	total: 9.14s	remaining: 27.3s
251:	learn: 0.6906628	total: 9.18s	remaining: 27.3s
252:	learn: 0.6899550	total: 9.22s	remaining: 27.2s
253:	learn: 0.688

396:	learn: 0.5604169	total: 12.4s	remaining: 18.8s
397:	learn: 0.5594327	total: 12.4s	remaining: 18.7s
398:	learn: 0.5587499	total: 12.4s	remaining: 18.7s
399:	learn: 0.5579668	total: 12.4s	remaining: 18.6s
400:	learn: 0.5570090	total: 12.4s	remaining: 18.6s
401:	learn: 0.5564711	total: 12.4s	remaining: 18.5s
402:	learn: 0.5557161	total: 12.5s	remaining: 18.5s
403:	learn: 0.5551364	total: 12.5s	remaining: 18.4s
404:	learn: 0.5543512	total: 12.5s	remaining: 18.4s
405:	learn: 0.5538032	total: 12.5s	remaining: 18.3s
406:	learn: 0.5528663	total: 12.5s	remaining: 18.3s
407:	learn: 0.5521014	total: 12.6s	remaining: 18.2s
408:	learn: 0.5514846	total: 12.6s	remaining: 18.2s
409:	learn: 0.5505610	total: 12.6s	remaining: 18.1s
410:	learn: 0.5496779	total: 12.6s	remaining: 18.1s
411:	learn: 0.5486150	total: 12.7s	remaining: 18.1s
412:	learn: 0.5479078	total: 12.7s	remaining: 18s
413:	learn: 0.5472654	total: 12.7s	remaining: 18s
414:	learn: 0.5457790	total: 12.7s	remaining: 17.9s
415:	learn: 0.54

560:	learn: 0.4538710	total: 15.6s	remaining: 12.2s
561:	learn: 0.4534382	total: 15.7s	remaining: 12.2s
562:	learn: 0.4529870	total: 15.7s	remaining: 12.2s
563:	learn: 0.4524978	total: 15.7s	remaining: 12.2s
564:	learn: 0.4522177	total: 15.8s	remaining: 12.1s
565:	learn: 0.4516481	total: 15.8s	remaining: 12.1s
566:	learn: 0.4510649	total: 15.8s	remaining: 12.1s
567:	learn: 0.4507336	total: 15.8s	remaining: 12s
568:	learn: 0.4499831	total: 15.8s	remaining: 12s
569:	learn: 0.4495902	total: 15.9s	remaining: 12s
570:	learn: 0.4490226	total: 15.9s	remaining: 11.9s
571:	learn: 0.4486629	total: 15.9s	remaining: 11.9s
572:	learn: 0.4482044	total: 15.9s	remaining: 11.9s
573:	learn: 0.4478122	total: 16s	remaining: 11.9s
574:	learn: 0.4475775	total: 16s	remaining: 11.8s
575:	learn: 0.4469484	total: 16s	remaining: 11.8s
576:	learn: 0.4464608	total: 16.1s	remaining: 11.8s
577:	learn: 0.4457107	total: 16.1s	remaining: 11.7s
578:	learn: 0.4452871	total: 16.1s	remaining: 11.7s
579:	learn: 0.4448488	to

727:	learn: 0.3733959	total: 19.6s	remaining: 7.31s
728:	learn: 0.3731224	total: 19.6s	remaining: 7.28s
729:	learn: 0.3727391	total: 19.6s	remaining: 7.25s
730:	learn: 0.3725163	total: 19.6s	remaining: 7.22s
731:	learn: 0.3719914	total: 19.7s	remaining: 7.2s
732:	learn: 0.3716462	total: 19.7s	remaining: 7.17s
733:	learn: 0.3713640	total: 19.7s	remaining: 7.14s
734:	learn: 0.3709563	total: 19.7s	remaining: 7.11s
735:	learn: 0.3707384	total: 19.8s	remaining: 7.08s
736:	learn: 0.3703594	total: 19.8s	remaining: 7.06s
737:	learn: 0.3701958	total: 19.8s	remaining: 7.03s
738:	learn: 0.3699442	total: 19.8s	remaining: 7s
739:	learn: 0.3693862	total: 19.9s	remaining: 6.98s
740:	learn: 0.3691072	total: 19.9s	remaining: 6.95s
741:	learn: 0.3687259	total: 19.9s	remaining: 6.92s
742:	learn: 0.3683979	total: 19.9s	remaining: 6.89s
743:	learn: 0.3677524	total: 19.9s	remaining: 6.86s
744:	learn: 0.3672357	total: 20s	remaining: 6.83s
745:	learn: 0.3668626	total: 20s	remaining: 6.8s
746:	learn: 0.3666137

895:	learn: 0.3118397	total: 23.3s	remaining: 2.71s
896:	learn: 0.3113163	total: 23.3s	remaining: 2.68s
897:	learn: 0.3106485	total: 23.4s	remaining: 2.65s
898:	learn: 0.3104059	total: 23.4s	remaining: 2.63s
899:	learn: 0.3100841	total: 23.4s	remaining: 2.6s
900:	learn: 0.3097301	total: 23.5s	remaining: 2.58s
901:	learn: 0.3090303	total: 23.5s	remaining: 2.55s
902:	learn: 0.3087242	total: 23.5s	remaining: 2.53s
903:	learn: 0.3083792	total: 23.6s	remaining: 2.5s
904:	learn: 0.3080096	total: 23.6s	remaining: 2.48s
905:	learn: 0.3076268	total: 23.6s	remaining: 2.45s
906:	learn: 0.3071608	total: 23.6s	remaining: 2.42s
907:	learn: 0.3069325	total: 23.6s	remaining: 2.4s
908:	learn: 0.3066250	total: 23.7s	remaining: 2.37s
909:	learn: 0.3064519	total: 23.7s	remaining: 2.34s
910:	learn: 0.3061531	total: 23.7s	remaining: 2.32s
911:	learn: 0.3059722	total: 23.7s	remaining: 2.29s
912:	learn: 0.3055798	total: 23.8s	remaining: 2.26s
913:	learn: 0.3053429	total: 23.8s	remaining: 2.24s
914:	learn: 0.3

**XGBoost**

In [35]:
# Define the parameter grid for hyperparameter tuning
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2]
}

# Create the XGBoost classifier
model = xgb.XGBClassifier()

# Initialize GridSearchCV
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get the best estimator and its hyperparameters
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_


**LightGBM**

In [36]:
# Define the parameter grid for hyperparameter tuning
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2]
}

# Create the LightGBM classifier
model = LGBMClassifier()

# Initialize GridSearchCV
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get the best estimator and its hyperparameters
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001142 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1329
[LightGBM] [Info] Number of data points in the train set: 3918, number of used features: 11
[LightGBM] [Info] Start training from score -5.565286
[LightGBM] [Info] Start training from score -3.346083
[LightGBM] [Info] Start training from score -1.212002
[LightGBM] [Info] Start training from score -0.796864
[LightGBM] [Info] Start training from score -1.739548
[LightGBM] [Info] Start training from score -3.331694
[LightGBM] [Info] Start training from score -6.663899






















**CatBoost**

In [37]:
# Define the parameter grid for hyperparameter tuning
param_grid = {
    'n_estimators': [100, 200, 300],
    'depth': [6, 8, 10],
    'learning_rate': [0.01, 0.1, 0.2]
}

# Create the CatBoost classifier
model = CatBoostClassifier()

# Initialize GridSearchCV
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get the best estimator and its hyperparameters
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_


0:	learn: 1.8097556	total: 638ms	remaining: 3m 10s
1:	learn: 1.7014596	total: 1.2s	remaining: 2m 59s
2:	learn: 1.6289488	total: 1.82s	remaining: 3m
3:	learn: 1.5570684	total: 2.4s	remaining: 2m 57s
4:	learn: 1.4964472	total: 2.94s	remaining: 2m 53s
5:	learn: 1.4371724	total: 3.49s	remaining: 2m 50s
6:	learn: 1.3912826	total: 4.05s	remaining: 2m 49s
7:	learn: 1.3502812	total: 4.55s	remaining: 2m 46s
8:	learn: 1.3104955	total: 5.09s	remaining: 2m 44s
9:	learn: 1.2741005	total: 5.69s	remaining: 2m 45s
10:	learn: 1.2409983	total: 6.33s	remaining: 2m 46s
11:	learn: 1.2120920	total: 6.98s	remaining: 2m 47s
12:	learn: 1.1863633	total: 7.65s	remaining: 2m 48s
13:	learn: 1.1620525	total: 8.24s	remaining: 2m 48s
14:	learn: 1.1398881	total: 8.83s	remaining: 2m 47s
15:	learn: 1.1186831	total: 9.47s	remaining: 2m 48s
16:	learn: 1.0994962	total: 10.1s	remaining: 2m 48s
17:	learn: 1.0811254	total: 10.8s	remaining: 2m 48s
18:	learn: 1.0645642	total: 11.4s	remaining: 2m 48s
19:	learn: 1.0477817	total: 

158:	learn: 0.4246049	total: 1m 25s	remaining: 1m 15s
159:	learn: 0.4219480	total: 1m 25s	remaining: 1m 15s
160:	learn: 0.4199389	total: 1m 26s	remaining: 1m 14s
161:	learn: 0.4171840	total: 1m 27s	remaining: 1m 14s
162:	learn: 0.4144714	total: 1m 27s	remaining: 1m 13s
163:	learn: 0.4121777	total: 1m 28s	remaining: 1m 13s
164:	learn: 0.4104878	total: 1m 28s	remaining: 1m 12s
165:	learn: 0.4095472	total: 1m 29s	remaining: 1m 12s
166:	learn: 0.4066831	total: 1m 30s	remaining: 1m 11s
167:	learn: 0.4042889	total: 1m 30s	remaining: 1m 11s
168:	learn: 0.4029391	total: 1m 31s	remaining: 1m 10s
169:	learn: 0.4017918	total: 1m 31s	remaining: 1m 10s
170:	learn: 0.3992991	total: 1m 32s	remaining: 1m 9s
171:	learn: 0.3968355	total: 1m 32s	remaining: 1m 9s
172:	learn: 0.3950765	total: 1m 33s	remaining: 1m 8s
173:	learn: 0.3932563	total: 1m 34s	remaining: 1m 8s
174:	learn: 0.3914985	total: 1m 34s	remaining: 1m 7s
175:	learn: 0.3892526	total: 1m 35s	remaining: 1m 7s
176:	learn: 0.3872826	total: 1m 35

**Classical Gradient Boosting**

In [38]:
# Define the parameter grid for hyperparameter tuning
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2]
}

# Create the GradientBoostingClassifier
model = GradientBoostingClassifier()

# Initialize GridSearchCV
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get the best estimator and its hyperparameters
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_


**8.** Try to conduct the feature selection process based on the measures built into the models. Create a variable that is pure noise and does not add any value to the model (draw it from some distribution, e.g. a uniform distribution) and add it to the training set. If any variable is less important than the created noise, please remove it from the model. Of course, at the end, remove this noise from the model as well.

In [45]:
# Calculate feature importances for each model
import numpy as np
# XGBoost Feature Importance
xgb_model = xgb.XGBClassifier()
xgb_model.fit(X_train, y_train)
xgb_feature_importances = xgb_model.feature_importances_

# LightGBM Feature Importance
lgb_model = lgb.LGBMClassifier()
lgb_model.fit(X_train, y_train)
lgb_feature_importances = lgb_model.feature_importances_

# CatBoost Feature Importance
catboost_model = CatBoostClassifier()
catboost_model.fit(X_train, y_train)
catboost_feature_importances = catboost_model.get_feature_importance()

# Classical Gradient Boosting Feature Importance
gb_model = GradientBoostingClassifier()
gb_model.fit(X_train, y_train)
gb_feature_importances = gb_model.feature_importances_

# Identify the noise feature (assuming it's the last feature)
noise_feature_importance = xgb_feature_importances[-1]





[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000657 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1329
[LightGBM] [Info] Number of data points in the train set: 3918, number of used features: 11
[LightGBM] [Info] Start training from score -5.565286
[LightGBM] [Info] Start training from score -3.346083
[LightGBM] [Info] Start training from score -1.212002
[LightGBM] [Info] Start training from score -0.796864
[LightGBM] [Info] Start training from score -1.739548
[LightGBM] [Info] Start training from score -3.331694
[LightGBM] [Info] Start training from score -6.663899
Learning rate set to 0.084834
0:	learn: 1.8122467	total: 28.2ms	remaining: 28.2s
1:	learn: 1.7173513	total: 50.5ms	remaining: 25.2s
2:	learn: 1.6325040	total: 73.8ms	remaining: 24.5s
3:	learn: 1.5697865	total: 97.9ms	remaining: 24.4s
4:	learn: 1.5119885	total: 120ms	remaining: 24s
5:	learn: 1.4633242	total: 144ms	remaining: 23.8s
6:

74:	learn: 0.9282249	total: 2.26s	remaining: 27.9s
75:	learn: 0.9271785	total: 2.29s	remaining: 27.8s
76:	learn: 0.9257612	total: 2.31s	remaining: 27.8s
77:	learn: 0.9239326	total: 2.34s	remaining: 27.7s
78:	learn: 0.9218695	total: 2.37s	remaining: 27.6s
79:	learn: 0.9197734	total: 2.39s	remaining: 27.5s
80:	learn: 0.9186100	total: 2.42s	remaining: 27.5s
81:	learn: 0.9166959	total: 2.45s	remaining: 27.4s
82:	learn: 0.9134310	total: 2.47s	remaining: 27.3s
83:	learn: 0.9114014	total: 2.48s	remaining: 27s
84:	learn: 0.9086045	total: 2.49s	remaining: 26.8s
85:	learn: 0.9065463	total: 2.51s	remaining: 26.7s
86:	learn: 0.9048438	total: 2.55s	remaining: 26.7s
87:	learn: 0.9039736	total: 2.57s	remaining: 26.7s
88:	learn: 0.9018256	total: 2.6s	remaining: 26.6s
89:	learn: 0.9007066	total: 2.63s	remaining: 26.6s
90:	learn: 0.8983618	total: 2.66s	remaining: 26.6s
91:	learn: 0.8953441	total: 2.7s	remaining: 26.6s
92:	learn: 0.8934044	total: 2.73s	remaining: 26.6s
93:	learn: 0.8925935	total: 2.76s	r

238:	learn: 0.7038377	total: 7.17s	remaining: 22.8s
239:	learn: 0.7030280	total: 7.19s	remaining: 22.8s
240:	learn: 0.7013176	total: 7.21s	remaining: 22.7s
241:	learn: 0.6996992	total: 7.24s	remaining: 22.7s
242:	learn: 0.6986484	total: 7.26s	remaining: 22.6s
243:	learn: 0.6978140	total: 7.29s	remaining: 22.6s
244:	learn: 0.6972372	total: 7.31s	remaining: 22.5s
245:	learn: 0.6964374	total: 7.34s	remaining: 22.5s
246:	learn: 0.6956102	total: 7.37s	remaining: 22.5s
247:	learn: 0.6945056	total: 7.4s	remaining: 22.4s
248:	learn: 0.6935688	total: 7.42s	remaining: 22.4s
249:	learn: 0.6927192	total: 7.44s	remaining: 22.3s
250:	learn: 0.6922307	total: 7.46s	remaining: 22.3s
251:	learn: 0.6906628	total: 7.48s	remaining: 22.2s
252:	learn: 0.6899550	total: 7.5s	remaining: 22.2s
253:	learn: 0.6886133	total: 7.53s	remaining: 22.1s
254:	learn: 0.6878234	total: 7.55s	remaining: 22.1s
255:	learn: 0.6869549	total: 7.57s	remaining: 22s
256:	learn: 0.6856127	total: 7.6s	remaining: 22s
257:	learn: 0.68510

401:	learn: 0.5564711	total: 10.8s	remaining: 16.1s
402:	learn: 0.5557161	total: 10.9s	remaining: 16.1s
403:	learn: 0.5551364	total: 10.9s	remaining: 16s
404:	learn: 0.5543512	total: 10.9s	remaining: 16s
405:	learn: 0.5538032	total: 10.9s	remaining: 16s
406:	learn: 0.5528663	total: 10.9s	remaining: 15.9s
407:	learn: 0.5521014	total: 11s	remaining: 15.9s
408:	learn: 0.5514846	total: 11s	remaining: 15.9s
409:	learn: 0.5505610	total: 11s	remaining: 15.9s
410:	learn: 0.5496779	total: 11s	remaining: 15.8s
411:	learn: 0.5486150	total: 11.1s	remaining: 15.8s
412:	learn: 0.5479078	total: 11.1s	remaining: 15.8s
413:	learn: 0.5472654	total: 11.1s	remaining: 15.7s
414:	learn: 0.5457790	total: 11.1s	remaining: 15.7s
415:	learn: 0.5450813	total: 11.2s	remaining: 15.7s
416:	learn: 0.5439338	total: 11.2s	remaining: 15.7s
417:	learn: 0.5426834	total: 11.2s	remaining: 15.6s
418:	learn: 0.5421322	total: 11.2s	remaining: 15.6s
419:	learn: 0.5414938	total: 11.3s	remaining: 15.6s
420:	learn: 0.5406255	tota

567:	learn: 0.4507336	total: 15.5s	remaining: 11.8s
568:	learn: 0.4499831	total: 15.5s	remaining: 11.8s
569:	learn: 0.4495902	total: 15.6s	remaining: 11.7s
570:	learn: 0.4490226	total: 15.6s	remaining: 11.7s
571:	learn: 0.4486629	total: 15.6s	remaining: 11.7s
572:	learn: 0.4482044	total: 15.6s	remaining: 11.7s
573:	learn: 0.4478122	total: 15.7s	remaining: 11.6s
574:	learn: 0.4475775	total: 15.7s	remaining: 11.6s
575:	learn: 0.4469484	total: 15.7s	remaining: 11.6s
576:	learn: 0.4464608	total: 15.7s	remaining: 11.5s
577:	learn: 0.4457107	total: 15.7s	remaining: 11.5s
578:	learn: 0.4452871	total: 15.8s	remaining: 11.5s
579:	learn: 0.4448488	total: 15.8s	remaining: 11.4s
580:	learn: 0.4446320	total: 15.8s	remaining: 11.4s
581:	learn: 0.4440730	total: 15.8s	remaining: 11.4s
582:	learn: 0.4431877	total: 15.9s	remaining: 11.3s
583:	learn: 0.4428354	total: 15.9s	remaining: 11.3s
584:	learn: 0.4426787	total: 15.9s	remaining: 11.3s
585:	learn: 0.4419834	total: 15.9s	remaining: 11.3s
586:	learn: 

726:	learn: 0.3739118	total: 19.1s	remaining: 7.19s
727:	learn: 0.3733959	total: 19.2s	remaining: 7.16s
728:	learn: 0.3731224	total: 19.2s	remaining: 7.13s
729:	learn: 0.3727391	total: 19.2s	remaining: 7.1s
730:	learn: 0.3725163	total: 19.2s	remaining: 7.07s
731:	learn: 0.3719914	total: 19.2s	remaining: 7.05s
732:	learn: 0.3716462	total: 19.3s	remaining: 7.02s
733:	learn: 0.3713640	total: 19.3s	remaining: 6.99s
734:	learn: 0.3709563	total: 19.3s	remaining: 6.96s
735:	learn: 0.3707384	total: 19.3s	remaining: 6.93s
736:	learn: 0.3703594	total: 19.3s	remaining: 6.91s
737:	learn: 0.3701958	total: 19.4s	remaining: 6.88s
738:	learn: 0.3699442	total: 19.4s	remaining: 6.85s
739:	learn: 0.3693862	total: 19.4s	remaining: 6.82s
740:	learn: 0.3691072	total: 19.4s	remaining: 6.79s
741:	learn: 0.3687259	total: 19.5s	remaining: 6.76s
742:	learn: 0.3683979	total: 19.5s	remaining: 6.74s
743:	learn: 0.3677524	total: 19.5s	remaining: 6.71s
744:	learn: 0.3672357	total: 19.5s	remaining: 6.68s
745:	learn: 0

892:	learn: 0.3129066	total: 22.9s	remaining: 2.75s
893:	learn: 0.3127868	total: 23s	remaining: 2.72s
894:	learn: 0.3123127	total: 23s	remaining: 2.69s
895:	learn: 0.3118397	total: 23s	remaining: 2.67s
896:	learn: 0.3113163	total: 23s	remaining: 2.64s
897:	learn: 0.3106485	total: 23s	remaining: 2.62s
898:	learn: 0.3104059	total: 23.1s	remaining: 2.59s
899:	learn: 0.3100841	total: 23.1s	remaining: 2.57s
900:	learn: 0.3097301	total: 23.1s	remaining: 2.54s
901:	learn: 0.3090303	total: 23.1s	remaining: 2.51s
902:	learn: 0.3087242	total: 23.2s	remaining: 2.49s
903:	learn: 0.3083792	total: 23.2s	remaining: 2.46s
904:	learn: 0.3080096	total: 23.2s	remaining: 2.44s
905:	learn: 0.3076268	total: 23.2s	remaining: 2.41s
906:	learn: 0.3071608	total: 23.3s	remaining: 2.38s
907:	learn: 0.3069325	total: 23.3s	remaining: 2.36s
908:	learn: 0.3066250	total: 23.3s	remaining: 2.33s
909:	learn: 0.3064519	total: 23.3s	remaining: 2.31s
910:	learn: 0.3061531	total: 23.3s	remaining: 2.28s
911:	learn: 0.3059722	

In [65]:
# Create a list of selected features
selected_features = []

# Iterate through the features and add those with importance greater than noise
for i, importance in enumerate(xgb_feature_importances):
    if importance >= noise_feature_importance:
        selected_features.append(i)

X_test_array = np.array(X_test)

# Filter training and testing data with the selected features
X_train_selected = X_train[:, selected_features]
X_test_selected = X_test_array[:, selected_features]


**9.** For each model, perform hyperparameter tuning using techniques like grid search or random search with cross-validation (e.g., GridSearchCV or RandomizedSearchCV). Tune hyperparameters such as learning rate, maximum depth, number of estimators, and any model-specific hyperparameters - first of all, focus on HP that are related to overfitting.

In [69]:
#XGB

# Define a parameter grid for hyperparameter tuning
param_grid = {
    "learning_rate": [0.01, 0.1, 0.2, 0.3, 0.5],
    "max_depth": [3, 4, 5, 6, 7],
    "subsample": [0.7, 0.8, 0.9, 1.0],
    "colsample_bytree": [0.7, 0.8, 0.9, 1.0],
    "min_child_weight": [1, 2, 3, 4, 5],
}

# Initialize and fit the GridSearchCV for XGBoost
xgb_tuned_model = xgb.XGBClassifier()
xgb_grid_search = GridSearchCV(xgb_tuned_model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
xgb_grid_search.fit(X_train_selected, y_train)

# Get the best XGBoost model and its hyperparameters
best_xgb_model = xgb_grid_search.best_estimator_
best_xgb_params = xgb_grid_search.best_params_

# Evaluate the tuned models on the validation set
xgb_val_accuracy = best_xgb_model.score(X_test_selected, y_test)


In [70]:
#LGBM

# Define a parameter grid for LightGBM hyperparameter tuning
lgb_param_grid = {
    "learning_rate": [0.01, 0.1, 0.2, 0.3, 0.5],
    "max_depth": [3, 4, 5, 6, 7],
    "subsample": [0.7, 0.8, 0.9, 1.0],
    "colsample_bytree": [0.7, 0.8, 0.9, 1.0],
    "min_child_weight": [1, 2, 3, 4, 5],
}

# Initialize and fit the GridSearchCV for LightGBM
lgb_tuned_model = LGBMClassifier()
lgb_grid_search = GridSearchCV(lgb_tuned_model, lgb_param_grid, cv=5, scoring='accuracy', n_jobs=-1)
lgb_grid_search.fit(X_train_selected, y_train)

# Get the best LightGBM model and its hyperparameters
best_lgb_model = lgb_grid_search.best_estimator_
best_lgb_params = lgb_grid_search.best_params_

# Evaluate the tuned LightGBM model on the validation set
lgb_val_accuracy = best_lgb_model.score(X_test_selected, y_test)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000322 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 65
[LightGBM] [Info] Number of data points in the train set: 3918, number of used features: 1
[LightGBM] [Info] Start training from score -5.565286
[LightGBM] [Info] Start training from score -3.346083
[LightGBM] [Info] Start training from score -1.212002
[LightGBM] [Info] Start training from score -0.796864
[LightGBM] [Info] Start training from score -1.739548
[LightGBM] [Info] Start training from score -3.331694
[LightGBM] [Info] Start training from score -6.663899












In [None]:
# CatBoost

# Define a parameter grid for CatBoost hyperparameter tuning
catboost_param_grid = {
    "learning_rate": [0.01, 0.1, 0.2, 0.3, 0.5],
    "depth": [3, 4, 5, 6, 7, 8, 9, 10],
    'bootstrap_type': ['Bayesian', 'Bernoulli', 'MVS', 'No'],  
    'subsample': [0.6, 0.8],
}

# Initialize and fit the GridSearchCV for CatBoost
catboost_tuned_model = CatBoostClassifier()
catboost_grid_search = GridSearchCV(catboost_tuned_model, catboost_param_grid, cv=5, scoring='accuracy', n_jobs=-1)
catboost_grid_search.fit(X_train_selected, y_train)

# Get the best CatBoost model and its hyperparameters
best_catboost_model = catboost_grid_search.best_estimator_
best_catboost_params = catboost_grid_search.best_params_

# Evaluate the tuned CatBoost model on the validation set
catboost_val_accuracy = best_catboost_model.score(X_test_selected, y_test)


In [79]:
# Hyperparameter Tuning for Classical Gradient Boosting

# Define a parameter grid for Classical Gradient Boosting hyperparameter tuning
gb_param_grid = {
    "learning_rate": [0.01, 0.1, 0.2, 0.3, 0.5],
    "max_depth": [3, 4, 5, 6, 7, 8, 9, 10],
    "subsample": [0.7, 0.8, 0.9, 1.0],
}

# Initialize and fit the GridSearchCV for Classical Gradient Boosting
gb_tuned_model = GradientBoostingClassifier()
gb_grid_search = GridSearchCV(gb_tuned_model, gb_param_grid, cv=5, scoring='accuracy', n_jobs=-1)
gb_grid_search.fit(X_train_selected, y_train)

# Get the best Classical Gradient Boosting model and its hyperparameters
best_gb_model = gb_grid_search.best_estimator_
best_gb_params = gb_grid_search.best_params_

# Evaluate the tuned Classical Gradient Boosting model on the validation set
gb_val_accuracy = best_gb_model.score(X_test_selected, y_test)


# **Step 5: Model Comparison**

10. Evaluate the tuned models on the  testing datasets using appropriate classification metrics (e.g., accuracy, precision, recall, F1-score) - please remember that you are solving multi-class classification problem so you have choose proper evaluation strategy - sometimes confusion matrix is the best!!!
11. Compare the performance of XGBoost, LightGBM, CatBoost, and Classical Gradient Boosting. Discuss which model performed the best for our dataset and why

In [81]:
# Make predictions on the test set for each model
xgb_predictions = best_xgb_model.predict(X_test_selected)
lgb_predictions = best_lgb_model.predict(X_test_selected)
#catboost_predictions = best_catboost_model.predict(X_test_selected)
gb_predictions = best_gb_model.predict(X_test_selected)

# Evaluate each model's performance
models = {
    'XGBoost': xgb_predictions,
    'LightGBM': lgb_predictions,
    #'CatBoost': catboost_predictions,
    'Classical Gradient Boosting': gb_predictions
}

metrics = {}

for model_name, predictions in models.items():
    accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions, average='weighted')
    recall = recall_score(y_test, predictions, average='weighted')
    f1 = f1_score(y_test, predictions, average='weighted')
    confusion = confusion_matrix(y_test, predictions)
    
    metrics[model_name] = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-score': f1,
        'Confusion Matrix': confusion
    }

# Print and compare the metrics
for model_name, model_metrics in metrics.items():
    print(f"Model: {model_name}")
    print("Metrics:")
    for metric, value in model_metrics.items():
        print(f"{metric}: {value}")
    print("\n")



Model: XGBoost
Metrics:
Accuracy: 0.025510204081632654
Precision: 0.0006507705122865472
Recall: 0.025510204081632654
F1-score: 0.0012691643821707789
Confusion Matrix: [[  0   5   0   0   0   0]
 [  0  25   0   0   0   0]
 [  0 291   0   0   0   0]
 [  0 432   0   0   0   0]
 [  0 192   0   0   0   0]
 [  0  35   0   0   0   0]]


Model: LightGBM
Metrics:
Accuracy: 0.025510204081632654
Precision: 0.0006507705122865472
Recall: 0.025510204081632654
F1-score: 0.0012691643821707789
Confusion Matrix: [[  0   5   0   0   0   0]
 [  0  25   0   0   0   0]
 [  0 291   0   0   0   0]
 [  0 432   0   0   0   0]
 [  0 192   0   0   0   0]
 [  0  35   0   0   0   0]]


Model: Classical Gradient Boosting
Metrics:
Accuracy: 0.025510204081632654
Precision: 0.0006507705122865472
Recall: 0.025510204081632654
F1-score: 0.0012691643821707789
Confusion Matrix: [[  0   5   0   0   0   0]
 [  0  25   0   0   0   0]
 [  0 291   0   0   0   0]
 [  0 432   0   0   0   0]
 [  0 192   0   0   0   0]
 [  0  35   0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
