# **Modelling and Tuning**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression, LassoCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold
from lightgbm import LGBMClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV




In [2]:
import warnings
from sklearn.exceptions import ConvergenceWarning

# Ignore ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)

In [3]:
import pickle

# Load train/val/test split data from notebook3
with open(r'Nata_Files\\train_test_split.pkl', 'rb') as f:
    notebook3_data = pickle.load(f)

# Core datasets
X = notebook3_data.get('X')
y = notebook3_data.get('y')

# Splits and processed feature sets
X_train = notebook3_data.get('X_train')
y_train = notebook3_data.get('y_train')
X_val = notebook3_data.get('X_val')
y_val = notebook3_data.get('y_val')
X_test = notebook3_data.get('X_test')
y_test = notebook3_data.get('y_test')
X_train_val = notebook3_data.get('X_train_val')
y_train_val = notebook3_data.get('y_train_val')

numeric_cols = notebook3_data.get('numeric_cols')
kf = notebook3_data.get('kf')
rkf = notebook3_data.get('rkf')
skf = notebook3_data.get('skf')

print("Train/Val/Test split data loaded successfully!")
if X is not None and y is not None:
    print(f"Full dataset X shape: {X.shape} | y shape: {y.shape}")
if X_train is not None:
    print(f"X_train shape: {X_train.shape}")
if X_val is not None:
    print(f"X_val shape: {X_val.shape}")
if X_test is not None:
    print(f"X_test shape: {X_test.shape}")
if kf is not None:
    try:
        print(f"kf splits: {kf.get_n_splits()}")
    except Exception:
        print("kf loaded (object), get_n_splits() unavailable for this object")
if rkf is not None:
    try:
        print(f"rkf splits: {rkf.get_n_splits()}")
    except Exception:
        print("rkf loaded (object), get_n_splits() unavailable for this object")

Train/Val/Test split data loaded successfully!
Full dataset X shape: (5196, 14) | y shape: (5196,)
X_train shape: (3117, 14)
X_val shape: (1039, 14)
X_test shape: (1040, 14)
kf splits: 10
rkf splits: 14


In [4]:

def fit(model, X, y):
    model.fit(X, y)

def predict_proba(model, X_val):
    return model.predict_proba(X_val)
    
def predict(model, X_val):
    return model.predict(X_val)

## **Model Selection**

In [5]:
logr = LogisticRegression()
logr.fit(X_train, y_train)
logr_proba = logr.predict_proba(X_val)[:,1]
logr_pred = logr.predict(X_val)
logr_proba_tr = logr.predict_proba(X_train)[:,1]
logr_pred_tr = logr.predict(X_train)

dtc = DecisionTreeClassifier(max_depth= 5)
dtc.fit(X_train, y_train)
dtc_proba = dtc.predict_proba(X_val)[:,1]
dtc_pred = dtc.predict(X_val)
dtc_proba_tr = dtc.predict_proba(X_train)[:,1]
dtc_pred_tr = dtc.predict(X_train)


rf = RandomForestClassifier()
rf.fit(X_train, y_train)
rf_proba = rf.predict_proba(X_val)[:,1]
rf_pred = rf.predict(X_val)
rf_proba_tr = rf.predict_proba(X_train)[:,1]
rf_pred_tr = rf.predict(X_train)

knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
knn_proba = knn.predict_proba(X_val)[:,1]
knn_pred = knn.predict(X_val)
knn_proba_tr = knn.predict_proba(X_train)[:,1]
knn_pred_tr = knn.predict(X_train)

lgb = LGBMClassifier(n_estimators=100, random_state=42)
lgb.fit(X_train, y_train)
lgb_proba = lgb.predict_proba(X_val)[:,1]
lgb_pred = lgb.predict(X_val)
lgb_proba_tr = lgb.predict_proba(X_train)[:,1]
lgb_pred_tr = lgb.predict(X_train)

[LightGBM] [Info] Number of positive: 1980, number of negative: 1137
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000709 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1624
[LightGBM] [Info] Number of data points in the train set: 3117, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.635226 -> initscore=0.554704
[LightGBM] [Info] Start training from score 0.554704


In [6]:
def get_metrics(y_val, y_proba, y_pred, model, dataset):
    return {
        "Model" : model,
        "Set" : dataset,
        "AUC": roc_auc_score(y_val, y_proba),
        "Accuracy": accuracy_score(y_val, y_pred),
    }

In [7]:
models_metrics = []

models_metrics.append(get_metrics(y_train, logr_proba_tr, logr_pred_tr, "Logistic Regression", "Train"))
models_metrics.append(get_metrics(y_train, dtc_proba_tr, dtc_pred_tr, "DTClassifier", "Train"))
models_metrics.append(get_metrics(y_train, rf_proba_tr, rf_pred_tr, "Random Forest", "Train"))
models_metrics.append(get_metrics(y_train, knn_proba_tr, knn_pred_tr, "KNClassifier", "Train"))
models_metrics.append(get_metrics(y_train, lgb_proba_tr, lgb_pred_tr, "LGBM", "Train"))

models_metrics.append(get_metrics(y_val, logr_proba, logr_pred, "Logistic Regression", "Validation"))
models_metrics.append(get_metrics(y_val, dtc_proba, dtc_pred, "DTClassifier", "Validation"))
models_metrics.append(get_metrics(y_val, rf_proba, rf_pred, "Random Forest", "Validation"))
models_metrics.append(get_metrics(y_val, knn_proba, knn_pred, "KNClassifier", "Validation"))
models_metrics.append(get_metrics(y_val, lgb_proba, lgb_pred, "LGBM", "Validation"))

In [8]:
df_models_metrics = pd.DataFrame(models_metrics)
df_models_metrics = df_models_metrics.pivot_table(
    index=["Model", "Set"],
    values=["AUC", "Accuracy"]
)

df_models_metrics

Unnamed: 0_level_0,Unnamed: 1_level_0,AUC,Accuracy
Model,Set,Unnamed: 2_level_1,Unnamed: 3_level_1
DTClassifier,Train,0.831928,0.766121
DTClassifier,Validation,0.76821,0.728585
KNClassifier,Train,0.897784,0.814244
KNClassifier,Validation,0.774752,0.72666
LGBM,Train,0.995953,0.967276
LGBM,Validation,0.830563,0.762271
Logistic Regression,Train,0.79732,0.741739
Logistic Regression,Validation,0.809067,0.750722
Random Forest,Train,1.0,1.0
Random Forest,Validation,0.84005,0.770934


#### So, we are going to create a one-level stacking that as LGBM and Random Forest as baseline models and Logistic Regression as metamodel

### **LightGBM**

In [9]:
lgb_clf = LGBMClassifier(random_state=42)

### Hyperparameter Tuning

In [11]:
param_set = {
    'num_leaves': [15, 31, 63],
    'max_depth': [5, 10, -1], # -1 means no limit
    'learning_rate': [0.1, 0.03, 0.01],
    'n_estimators': [100, 200, 500],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'min_child_samples': [10, 20, 50, 100],
    'min_split_gain': [0.0, 0.1, 0.2],
}

lgb_clf_rs = RandomizedSearchCV(lgb_clf, param_set, n_iter= 50, cv = 5, scoring = 'accuracy')
lgb_clf_rs.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 1584, number of negative: 909
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000251 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1611
[LightGBM] [Info] Number of data points in the train set: 2493, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.635379 -> initscore=0.555363
[LightGBM] [Info] Start training from score 0.555363
[LightGBM] [Info] Number of positive: 1584, number of negative: 909
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000302 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1614
[LightGBM] [Info] Number of data points in the train set: 2493, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.635379 -> initscore=0.555363
[LightGBM] [I

0,1,2
,estimator,LGBMClassifie...ndom_state=42)
,param_distributions,"{'colsample_bytree': [0.8, 1.0], 'learning_rate': [0.1, 0.03, ...], 'max_depth': [5, 10, ...], 'min_child_samples': [10, 20, ...], ...}"
,n_iter,50
,scoring,'accuracy'
,n_jobs,
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,random_state,

0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,10
,learning_rate,0.01
,n_estimators,500
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.2
,min_child_weight,0.001


In [12]:
print("Best params:", lgb_clf_rs.best_params_)

Best params: {'subsample': 1.0, 'num_leaves': 31, 'n_estimators': 500, 'min_split_gain': 0.2, 'min_child_samples': 50, 'max_depth': 10, 'learning_rate': 0.01, 'colsample_bytree': 0.8}


In [13]:
best_lgb = lgb_clf_rs.best_estimator_

best_lgb_proba = best_lgb.predict_proba(X_val)[:,1]
best_lgb_pred = best_lgb.predict(X_val)

print(f"AUC: {roc_auc_score(y_val, best_lgb_proba)}, Accuracy: {accuracy_score(y_val, best_lgb_pred)}")

AUC: 0.8353881826177341, Accuracy: 0.7680461982675649


### **Random Forest**

In [14]:
rf_clf = RandomForestClassifier(random_state=42)

In [15]:
X.columns

Index(['ambient_humidity', 'baking_duration', 'cooling_period',
       'cream_fat_content', 'egg_temperature', 'egg_yolk_count',
       'final_temperature', 'lemon_zest_ph', 'oven_temperature',
       'preheating_time', 'salt_ratio', 'sugar_content', 'vanilla_extract',
       'is_lisboa'],
      dtype='object')

In [16]:
importance = rf.feature_importances_

indices = list(np.argsort(importance)[::-1])

for element in indices:
    print(f"Feature: {X.columns[element]} - Importance: {importance[element]}")

Feature: egg_yolk_count - Importance: 0.10840812721067075
Feature: salt_ratio - Importance: 0.10289073973622166
Feature: baking_duration - Importance: 0.09959498972190452
Feature: egg_temperature - Importance: 0.08711457204738061
Feature: sugar_content - Importance: 0.07674973579902677
Feature: final_temperature - Importance: 0.07622579113228839
Feature: vanilla_extract - Importance: 0.07542481317687365
Feature: oven_temperature - Importance: 0.07478334430249484
Feature: cooling_period - Importance: 0.0727805802682693
Feature: preheating_time - Importance: 0.0710808774993299
Feature: cream_fat_content - Importance: 0.059380564738362666
Feature: ambient_humidity - Importance: 0.05594002094390092
Feature: lemon_zest_ph - Importance: 0.03381916201833646
Feature: is_lisboa - Importance: 0.005806681404939649


So we know that tree based models tolerate irrelevant features, so we're going only to drop is_lisboa feature as that has extremely low importance (<0.01)

In [17]:
X_train_sel = X_train.drop(columns = ['is_lisboa'])
X_val_sel = X_val.drop(columns = ['is_lisboa'])

### Hyperparameter Tuning

In [19]:
param_range = { 
    'n_estimators': [100, 200, 400, 600],
    'max_depth': [None, 5, 10, 15],
    'criterion': ['gini', 'entropy'],
    'max_features': ['sqrt', 'log2', 0.7],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

rf_clf_rs = RandomizedSearchCV(rf_clf, param_range, n_iter= 50, cv = 5, scoring = 'accuracy', error_score= 'raise')
rf_clf_rs.fit(X_train_sel,y_train)  

0,1,2
,estimator,RandomForestC...ndom_state=42)
,param_distributions,"{'bootstrap': [True, False], 'criterion': ['gini', 'entropy'], 'max_depth': [None, 5, ...], 'max_features': ['sqrt', 'log2', ...], ...}"
,n_iter,50
,scoring,'accuracy'
,n_jobs,
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,random_state,

0,1,2
,n_estimators,400
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,2
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,False


In [20]:
print("Best params:", rf_clf_rs.best_params_)

Best params: {'n_estimators': 400, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': None, 'criterion': 'gini', 'bootstrap': False}


In [21]:
best_rfc = rf_clf_rs.best_estimator_

best_rfc_proba = best_rfc.predict_proba(X_val_sel)[:,1]
best_rfc_pred = best_rfc.predict(X_val_sel)

print(f"AUC: {roc_auc_score(y_val, best_rfc_proba)}, Accuracy: {accuracy_score(y_val, best_rfc_pred)}")

AUC: 0.8377568561605501, Accuracy: 0.7680461982675649


### **KNCLassifier**

In [22]:
knn = KNeighborsClassifier()

### Hyperparameter Tuning

In [24]:
param_space = { 
    'n_neighbors': [1,2,3,4,5,7,9,11,15,21,31],
    'weights': ['uniform', 'distance'],
    'p': [1,2],  #1 - Manhattan, 2 - Euclidean
    'leaf_size': [10,20,30,40,50],
    'metric': ['minkowski', 'cosine', 'manhattan'],
}

knn_clf_rs= RandomizedSearchCV(knn, param_space, n_iter= 50, cv = 5, scoring = 'accuracy', error_score= 'raise')
knn_clf_rs.fit(X_train,y_train)  

0,1,2
,estimator,KNeighborsClassifier()
,param_distributions,"{'leaf_size': [10, 20, ...], 'metric': ['minkowski', 'cosine', ...], 'n_neighbors': [1, 2, ...], 'p': [1, 2], ...}"
,n_iter,50
,scoring,'accuracy'
,n_jobs,
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,random_state,

0,1,2
,n_neighbors,9
,weights,'distance'
,algorithm,'auto'
,leaf_size,40
,p,2
,metric,'manhattan'
,metric_params,
,n_jobs,


In [25]:
print("Best params:", knn_clf_rs.best_params_)

Best params: {'weights': 'distance', 'p': 2, 'n_neighbors': 9, 'metric': 'manhattan', 'leaf_size': 40}


In [26]:
best_knn = knn_clf_rs.best_estimator_

best_knn_proba = best_knn.predict_proba(X_val)[:,1]
best_knn_pred = best_knn.predict(X_val)

print(f"AUC: {roc_auc_score(y_val, best_knn_proba)}, Accuracy: {accuracy_score(y_val, best_knn_pred)}")

AUC: 0.8181518349724155, Accuracy: 0.7516843118383061


## Stacking

So now comes the part where we will combine our tuned models, and set them as baseline models to our Logisti Regression metamodel

In [27]:
estimators = [('rfc', best_rfc),
              ('lgb', best_lgb),
              ('knn', best_knn)]

In [28]:
stc = StackingClassifier(estimators= estimators, final_estimator=LogisticRegression()).fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 1980, number of negative: 1137
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000384 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1624
[LightGBM] [Info] Number of data points in the train set: 3117, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.635226 -> initscore=0.554704
[LightGBM] [Info] Start training from score 0.554704
[LightGBM] [Info] Number of positive: 1584, number of negative: 909
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000525 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1611
[LightGBM] [Info] Number of data points in the train set: 2493, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.635379 -> initscore=0.555363
[LightGBM] [Info] Start training from score 0.555363
[LightGBM] [Info] Numbe

In [29]:
stc_proba = stc.predict_proba(X_val)[:,1]
stc_pred = stc.predict(X_val)

print(f"AUC: {roc_auc_score(y_val, stc_proba)}, Accuracy: {accuracy_score(y_val, stc_pred)}")

AUC: 0.8439793715519308, Accuracy: 0.7709335899903753
