# **Modelling and Tuning**

In [35]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LassoCV
import matplotlib.pyplot as plt


In [5]:
import warnings
from sklearn.exceptions import ConvergenceWarning

# Ignore ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)

In [6]:
import pickle

# Load train/val/test split data from notebook3
with open(r'Nata_Files\\train_test_split.pkl', 'rb') as f:
    notebook3_data = pickle.load(f)

# Core datasets
X = notebook3_data.get('X')
y = notebook3_data.get('y')

# Splits and processed feature sets
X_train = notebook3_data.get('X_train')
y_train = notebook3_data.get('y_train')
X_val = notebook3_data.get('X_val')
y_val = notebook3_data.get('y_val')
X_test = notebook3_data.get('X_test')
y_test = notebook3_data.get('y_test')
X_train_val = notebook3_data.get('X_train_val')
y_train_val = notebook3_data.get('y_train_val')

numeric_cols = notebook3_data.get('numeric_cols')
kf = notebook3_data.get('kf')
rkf = notebook3_data.get('rkf')
skf = notebook3_data.get('skf')

print("Train/Val/Test split data loaded successfully!")
if X is not None and y is not None:
    print(f"Full dataset X shape: {X.shape} | y shape: {y.shape}")
if X_train is not None:
    print(f"X_train shape: {X_train.shape}")
if X_val is not None:
    print(f"X_val shape: {X_val.shape}")
if X_test is not None:
    print(f"X_test shape: {X_test.shape}")
if kf is not None:
    try:
        print(f"kf splits: {kf.get_n_splits()}")
    except Exception:
        print("kf loaded (object), get_n_splits() unavailable for this object")
if rkf is not None:
    try:
        print(f"rkf splits: {rkf.get_n_splits()}")
    except Exception:
        print("rkf loaded (object), get_n_splits() unavailable for this object")

Train/Val/Test split data loaded successfully!
Full dataset X shape: (5196, 14) | y shape: (5196,)
X_train shape: (3117, 14)
X_val shape: (1039, 14)
X_test shape: (1040, 14)
kf splits: 10
rkf splits: 14


In [8]:
class Predictor:
    def __init__(self, model):
        self.model = model

    def fit(self, X, y):
        self.model.fit(X, y)

    def predict_proba(self, X_val):
        return self.model.predict_proba(X_val)
    
    def predict(self, X_val):
        return self.model.predict(X_val)

In [9]:
def fit_model(X,y,model):
    return model.fit(X,y)

In [10]:
def eval_clf_model(X,y, model):
    return model.score(X,y)

In [11]:
def avg_score_clf(method,X,y,model):
    score_train = []
    score_val = []
    for train_index, val_index in method.split(X):
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]
        trained_model = fit_model(model, X_train, y_train)
        value_train = eval_clf_model(trained_model, X_train, y_train)
        value_val = eval_clf_model(trained_model, X_val, y_val)
        score_train.append(value_train)
        score_val.append(value_val)

    print('Train:', np.mean(score_train))
    print('Validation:', np.mean(score_val))

In [12]:
#def avg_score_clf_skf(method, X, y, model):
 #   score_train = []
  #  score_val = []
   # for train_index, val_index in method.split(X, y):
    #    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
     #   y_train, y_val = y.iloc[train_index], y.iloc[val_index]
      #  trained_model = fit_model(model, X_train, y_train)
       # value_train = eval_clf_model(trained_model, X_train, y_train)
        #value_val = eval_clf_model(trained_model, X_val, y_val)
    #    score_train.append(value_train)
     #   score_val.append(value_val)

    #print('Train:', np.mean(score_train))
    #print('Validation:', np.mean(score_val))

## **Model Selection**

Que modelos podemos usar?
Isto é um classification problem então podemos descartar logo alguns.
Podemos testar :
- Logistic Regression
- Decision Tree Classifier
- Random Forest
- K nearest Classifier
- stacking /ensemble

## **Model Evaluation**

In [39]:
logr = Predictor(LogisticRegression())
logr.fit(X_train, y_train)
logr_proba = logr.predict_proba(X_val)[:,1]
logr_pred = logr.predict(X_val)
logr_proba_tr = logr.predict_proba(X_train)[:,1]
logr_pred_tr = logr.predict(X_train)

dtc = Predictor(DecisionTreeClassifier(max_depth= 5))
dtc.fit(X_train, y_train)
dtc_proba = dtc.predict_proba(X_val)[:,1]
dtc_pred = dtc.predict(X_val)
dtc_proba_tr = logr.predict_proba(X_train)[:,1]
dtc_pred_tr = logr.predict(X_train)


rf = Predictor(RandomForestClassifier())
rf.fit(X_train, y_train)
rf_proba = rf.predict_proba(X_val)[:,1]
rf_pred = rf.predict(X_val)
rf_proba_tr = logr.predict_proba(X_train)[:,1]
rf_pred_tr = logr.predict(X_train)

knn_clf = Predictor(KNeighborsClassifier())
knn_clf.fit(X_train, y_train)
knn_clf_proba = knn_clf.predict_proba(X_val)[:,1]
knn_clf_pred = knn_clf.predict(X_val)
knn_clf_proba_tr = logr.predict_proba(X_train)[:,1]
knn_clf_pred_tr = logr.predict(X_train)

#### **K-Neighbors Classifier**

In [18]:
knn_clf = KNeighborsClassifier()

##### Stratified K-Fold CV

In [19]:
#avg_score_clf_skf(skf, X, y, knn_clf)

In [44]:
def get_metrics(y_val, y_proba, y_pred, model, dataset):
    return {
        "Model" : model,
        "Set": dataset,
        "AUC": roc_auc_score(y_val, y_proba),
        "Accuracy": accuracy_score(y_val, y_pred),
    }

In [45]:
models_metrics = []

models_metrics.append(get_metrics(y_train, logr_proba_tr, logr_pred_tr, "Logistic Regression", "Train"))
models_metrics.append(get_metrics(y_train, dtc_proba_tr, dtc_pred_tr, "DTClassifier", "Train"))
models_metrics.append(get_metrics(y_train, rf_proba_tr, rf_pred_tr, "Random Forest", "Train"))
models_metrics.append(get_metrics(y_train, knn_clf_proba_tr, knn_clf_pred_tr, "KNClassifier", "Train"))

models_metrics.append(get_metrics(y_val, logr_proba, logr_pred, "Logistic Regression", "Validation"))
models_metrics.append(get_metrics(y_val, dtc_proba, dtc_pred, "DTClassifier", "Validation"))
models_metrics.append(get_metrics(y_val, rf_proba, rf_pred, "Random Forest", "Validation"))
models_metrics.append(get_metrics(y_val, knn_clf_proba, knn_clf_pred, "KNClassifier", "Validation"))


In [48]:
#df_models_metrics = pd.DataFrame(models_metrics)
#df_models_metrics.set_index(["Model", "Set"], inplace=True)

#df_models_metrics.T

df_models_metrics = df_models_metrics.pivot_table(
    index=["Model", "Set"],
    values=["AUC", "Accuracy"]
)

df_models_metrics

Unnamed: 0_level_0,Unnamed: 1_level_0,AUC,Accuracy
Model,Set,Unnamed: 2_level_1,Unnamed: 3_level_1
DTClassifier,Train,0.79732,0.741739
DTClassifier,Validation,0.76821,0.728585
KNClassifier,Train,0.79732,0.741739
KNClassifier,Validation,0.774752,0.72666
Logistic Regression,Train,0.79732,0.741739
Logistic Regression,Validation,0.809067,0.750722
Random Forest,Train,0.79732,0.741739
Random Forest,Validation,0.839058,0.770934


Fazer ensemble (bagging)?
Perguntar ao professor se podemos usar só RF? Vale a pena testar ensemble dps? ir testando
A data partitioning aqui está a ser hold-out method, gostava de testar Stratified CV da mesma maneira

FAZER:
- Stacking :

        - Baseline models: Logistic Regression and RF
        - Metamodel: LightGBM or Logistic Regression or RF
- Feature Selection (antes de hyperparameter tuning)


##### **Feature Selection on Logistic Regression**

In [50]:
lasso = LassoCV()

In [51]:
lasso.fit(X_train, y_train)

0,1,2
,eps,0.001
,n_alphas,'deprecated'
,alphas,'warn'
,fit_intercept,True
,precompute,'auto'
,max_iter,1000
,tol,0.0001
,copy_X,True
,cv,
,verbose,False


In [52]:
coef = pd.Series(lasso.coef_, index = X_train.columns)
coef

ambient_humidity     0.000000
baking_duration     -0.135690
cooling_period      -0.017361
cream_fat_content   -0.000000
egg_temperature      0.044335
egg_yolk_count       0.183838
final_temperature   -0.061456
lemon_zest_ph        0.001352
oven_temperature    -0.000000
preheating_time     -0.000000
salt_ratio          -0.003262
sugar_content        0.049816
vanilla_extract      0.057433
is_lisboa           -0.000000
dtype: float64

In [53]:
coef[coef==0]

ambient_humidity     0.0
cream_fat_content   -0.0
oven_temperature    -0.0
preheating_time     -0.0
is_lisboa           -0.0
dtype: float64

In [54]:
print("Lasso picked " + str(sum(coef != 0)) + " variables and eliminated the other " +  str(sum(coef == 0)) + " variables")
coef.sort_values(ascending=False)


Lasso picked 9 variables and eliminated the other 5 variables


egg_yolk_count       0.183838
vanilla_extract      0.057433
sugar_content        0.049816
egg_temperature      0.044335
lemon_zest_ph        0.001352
cream_fat_content   -0.000000
is_lisboa           -0.000000
preheating_time     -0.000000
ambient_humidity     0.000000
oven_temperature    -0.000000
salt_ratio          -0.003262
cooling_period      -0.017361
final_temperature   -0.061456
baking_duration     -0.135690
dtype: float64

In [64]:
selected_features = X_train.columns[lasso.coef_ != 0]

X_train_lr = X_train[selected_features]
X_val_lr = X_val[selected_features]

logr = LogisticRegression()
logr.fit(X_train_lr, y_train)
logr_proba = logr.predict_proba(X_val_lr)[:,1]
logr_pred = logr.predict(X_val_lr)

print(f" AUC: {roc_auc_score(y_val, logr_proba)}, Accuracy: {accuracy_score(y_val, logr_pred)}")

 AUC: 0.8087830814743744, Accuracy: 0.7526467757459095


##### **Apply GridSearch CV to our Logistic Regression**

##### **Apply GridSearch CV to our Random Forest**

In [None]:
from sklearn.model_selection import GridSearchCV

rfc = RandomForestClassifier()

parameter_range = { #dictionary... for each parameter u set possible values
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 7, 8, 10],
    'criterion': ['gini', 'entropy'],
    'max_features': ['sqrt', 'log2'],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

clf = GridSearchCV(rfc, parameter_range, scoring = 'roc_auc', error_score= 'raise')
clf.fit(X_train,y_train)  

KeyboardInterrupt: 

In [None]:
print('------------------------------------------------------------------------------------------------------------------------')
print('Best parameters found:\n', clf.best_params_)
print('------------------------------------------------------------------------------------------------------------------------')


------------------------------------------------------------------------------------------------------------------------
Best parameters found:
 {'criterion': 'entropy', 'max_depth': None, 'max_features': 'sqrt', 'min_samples_split': 2, 'n_estimators': 100}
------------------------------------------------------------------------------------------------------------------------


In [None]:
rfc_best = RandomForestClassifier(criterion= 'entropy', max_depth= None, max_features='sqrt', min_samples_split= 2, n_estimators= 100).fit(X_train,y_train)


In [None]:
rfc_best_proba = rfc_best.predict_proba(X_val)[:,1]
rfc_best_pred = rfc_best.predict(X_val)


In [None]:
rfc_best_evaluation = []

rfc_best_evaluation.append(get_metrics(y_val, rfc_best_proba, rfc_best_pred, "RFGridSearchCV"))

In [None]:
df_rfc_best_evaluation = pd.DataFrame(rfc_best_evaluation)
df_rfc_best_evaluation.set_index("Model", inplace=True)

df_rfc_best_evaluation

Unnamed: 0_level_0,AUC,Accuracy
Model,Unnamed: 1_level_1,Unnamed: 2_level_1
RFGridSearchCV,0.838165,0.780558


Fazer:
- os scores no training para checkar overfitting
- STACKING com logistic regression e Random forest OU RF e Neural Networks
- feature selection na RandomForestClassifier