# **Modelling and Tuning**

In [1]:
import pandas as pd
import numpy as np

In [2]:
import warnings
from sklearn.exceptions import ConvergenceWarning

# Silence scikit-learn's ConvergenceWarning for every cell that runs after this one.
# NOTE(review): this hides the signal that an iterative solver stopped before
# converging — confirm the affected models are acceptable, or raise their
# iteration limit instead of relying on this filter.
warnings.filterwarnings("ignore", category=ConvergenceWarning)

In [3]:
import pickle
from pathlib import Path

# Load train/val/test split data from notebook3.
# The path is built with pathlib so the separator is correct on every OS
# (a hard-coded backslash path only resolves on Windows).
# NOTE(review): pickle.load can execute arbitrary code embedded in the file —
# only load artifacts produced by your own notebook3 run.
SPLIT_PATH = Path("Nata_Files") / "train_test_split.pkl"
with SPLIT_PATH.open('rb') as f:
    notebook3_data = pickle.load(f)

# Every lookup below uses .get(), so a key missing from the pickle yields None
# instead of raising; the summary prints further down skip None entries.

# Core datasets
X = notebook3_data.get('X')
y = notebook3_data.get('y')

# Splits and processed feature sets
X_train = notebook3_data.get('X_train')
y_train = notebook3_data.get('y_train')
X_val = notebook3_data.get('X_val')
y_val = notebook3_data.get('y_val')
X_test = notebook3_data.get('X_test')
y_test = notebook3_data.get('y_test')
X_train_val = notebook3_data.get('X_train_val')
y_train_val = notebook3_data.get('y_train_val')

# Column list and cross-validation splitter objects saved by notebook3
numeric_cols = notebook3_data.get('numeric_cols')
kf = notebook3_data.get('kf')
rkf = notebook3_data.get('rkf')
skf = notebook3_data.get('skf')

# Sanity summary: report the shape of whatever was actually present in the pickle.
print("Train/Val/Test split data loaded successfully!")
if X is not None and y is not None:
    print(f"Full dataset X shape: {X.shape} | y shape: {y.shape}")
if X_train is not None:
    print(f"X_train shape: {X_train.shape}")
if X_val is not None:
    print(f"X_val shape: {X_val.shape}")
if X_test is not None:
    print(f"X_test shape: {X_test.shape}")
if kf is not None:
    try:
        print(f"kf splits: {kf.get_n_splits()}")
    except Exception:
        # Best-effort report only: a loaded object that cannot answer
        # get_n_splits() without arguments must not abort the cell.
        print("kf loaded (object), get_n_splits() unavailable for this object")
if rkf is not None:
    try:
        print(f"rkf splits: {rkf.get_n_splits()}")
    except Exception:
        # Same best-effort handling as for kf above.
        print("rkf loaded (object), get_n_splits() unavailable for this object")

Train/Val/Test split data loaded successfully!
Full dataset X shape: (5196, 14) | y shape: (5196,)
X_train shape: (3117, 14)
X_val shape: (1039, 14)
X_test shape: (1040, 14)
kf splits: 10
rkf splits: 14


In [4]:
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold


In [None]:
def fit(model, X, y):
    """Train ``model`` in place by delegating to its own ``fit`` method.

    Args:
        model: Any object exposing ``fit(X, y)``.
        X: Training features, forwarded unchanged.
        y: Training targets, forwarded unchanged.

    Returns:
        None. Whatever ``model.fit`` returns is discarded; the caller keeps
        using the ``model`` reference it passed in.
    """
    model.fit(X, y)
    return None

def predict_proba(model, X_val):
    """Return the probability estimates ``model`` produces for ``X_val``.

    Args:
        model: Any object exposing ``predict_proba(X)``.
        X_val: Samples to score, forwarded unchanged.

    Returns:
        The result of ``model.predict_proba(X_val)``, unmodified.
        NOTE(review): for scikit-learn classifiers this is presumably one
        column per class — confirm before passing it to a metric that
        expects 1-D scores.
    """
    probabilities = model.predict_proba(X_val)
    return probabilities
    
def predict(model, X_val):
    """Return the predictions ``model`` produces for ``X_val``.

    Args:
        model: Any object exposing ``predict(X)``.
        X_val: Samples to predict on, forwarded unchanged.

    Returns:
        The result of ``model.predict(X_val)``, unmodified.
    """
    predictions = model.predict(X_val)
    return predictions

## **Model Selection**

In [None]:
def get_metrics(y_val, y_proba, y_pred, model):
    """Build one row of the model-comparison table.

    Args:
        y_val: Ground-truth labels the scores are computed against.
        y_proba: Scores handed to ``roc_auc_score`` together with ``y_val``.
        y_pred: Predicted labels handed to ``accuracy_score`` together with
            ``y_val``.
        model: Value stored under the ``"Model"`` key; callers in this
            notebook pass a display name such as ``"Random Forest"``.

    Returns:
        dict with the keys ``"Model"``, ``"AUC"`` and ``"Accuracy"``, in that
        order.
    """
    auc = roc_auc_score(y_val, y_proba)
    accuracy = accuracy_score(y_val, y_pred)
    return {"Model": model, "AUC": auc, "Accuracy": accuracy}

In [None]:
# One metrics row per candidate model, in the order they are compared.
# Each spec is (validation scores, validation predictions, display label).
models_metrics = [
    get_metrics(y_val, proba, pred, label)
    for proba, pred, label in (
        (logr_proba, logr_pred, "Logistic Regression"),
        (dtc_proba, dtc_pred, "DTClassifier"),
        (rf_proba, rf_pred, "Random Forest"),
        (knn_clf_proba, knn_clf_pred, "KNClassifier"),
    )
]

In [None]:
# Tabulate the per-model metric dicts, one row per model, indexed by the
# "Model" label so the table reads as model -> scores.
df_models_metrics = pd.DataFrame(models_metrics).set_index("Model")

df_models_metrics

Unnamed: 0_level_0,AUC,Accuracy
Model,Unnamed: 1_level_1,Unnamed: 2_level_1
Logistic Regression,0.809067,0.750722
DTClassifier,0.76821,0.728585
Random Forest,0.840843,0.767084
KNClassifier,0.774752,0.72666
