# **Modelling and Tuning**

In [2]:
import pandas as pd
import numpy as np
import pickle, os

In [3]:
import warnings
from sklearn.exceptions import ConvergenceWarning

# Install a process-wide "ignore" filter for scikit-learn's ConvergenceWarning
# so solver non-convergence messages do not clutter the cell outputs below.
warnings.simplefilter("ignore", category=ConvergenceWarning)

In [4]:
import pickle

# Load the train/val/test split produced by notebook3.
# The path is assembled with os.path.join so the platform's own separator is
# used; a hard-coded backslash is not a directory separator on POSIX systems.
SPLIT_PATH = os.path.join('Nata_Files', 'train_test_split_fixed.pkl')

# NOTE(security): pickle.load can execute arbitrary code embedded in the file.
# Only load pickles that were produced by this project.
with open(SPLIT_PATH, 'rb') as f:
    notebook3_data = pickle.load(f)

# Fail early, with a readable message, if an expected entry is absent.
_expected_keys = ('X_train', 'X_val', 'X_test', 'y_train', 'y_val', 'y_test')
_missing_keys = [key for key in _expected_keys if key not in notebook3_data]
if _missing_keys:
    raise KeyError(f"{SPLIT_PATH} is missing expected entries: {_missing_keys}")

X_train = notebook3_data['X_train']
X_val = notebook3_data['X_val']
X_test = notebook3_data['X_test']
y_train = notebook3_data['y_train']
y_val = notebook3_data['y_val']
y_test = notebook3_data['y_test']


In [5]:
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold
from lightgbm import LGBMClassifier



In [6]:
def fit(model, X, y):
    """Fit ``model`` on features ``X`` and targets ``y`` and return it.

    Parameters
    ----------
    model : object exposing a ``fit(X, y)`` method
    X : training features, passed through to ``model.fit`` unchanged
    y : training targets, passed through to ``model.fit`` unchanged

    Returns
    -------
    The same ``model`` object, so calls can be chained, e.g.
    ``fit(model, X, y).predict(X_new)``.
    """
    model.fit(X, y)
    # Return the estimator itself rather than whatever ``model.fit`` returns,
    # so chaining works even for models whose ``fit`` returns None.
    return model

def predict_proba(model, X_val):
    """Return the output of ``model.predict_proba`` for ``X_val``.

    Thin wrapper: the model's result is handed back unchanged.
    """
    probabilities = model.predict_proba(X_val)
    return probabilities
    
def predict(model, X_val):
    """Return the output of ``model.predict`` for ``X_val``.

    Thin wrapper: the model's result is handed back unchanged.
    """
    predictions = model.predict(X_val)
    return predictions

## **Model Selection**

In [7]:
# Seed shared by every estimator that accepts ``random_state`` so the metrics
# computed from these models are reproducible under Restart Kernel -> Run All.
RANDOM_STATE = 42


def _fit_and_score(model):
    """Fit ``model`` on the training split and score both splits.

    Returns a 4-tuple ``(val_proba, val_pred, train_proba, train_pred)``.
    The ``*_proba`` entries are column 1 of ``predict_proba``; the ``*_pred``
    entries are the outputs of ``predict``.
    """
    model.fit(X_train, y_train)
    return (
        model.predict_proba(X_val)[:, 1],
        model.predict(X_val),
        model.predict_proba(X_train)[:, 1],
        model.predict(X_train),
    )


# Baseline candidates. Each one is fitted on the training split and scored on
# both the training and validation splits for the comparison further down.
logr = LogisticRegression()
logr_proba, logr_pred, logr_proba_tr, logr_pred_tr = _fit_and_score(logr)

# Seeded: tree construction is randomised, so unseeded runs can differ.
dtc = DecisionTreeClassifier(max_depth=5, random_state=RANDOM_STATE)
dtc_proba, dtc_pred, dtc_proba_tr, dtc_pred_tr = _fit_and_score(dtc)

# Seeded: bootstrap sampling and feature sub-sampling are random.
rf = RandomForestClassifier(random_state=RANDOM_STATE)
rf_proba, rf_pred, rf_proba_tr, rf_pred_tr = _fit_and_score(rf)

knn = KNeighborsClassifier()
knn_proba, knn_pred, knn_proba_tr, knn_pred_tr = _fit_and_score(knn)

lgb = LGBMClassifier(n_estimators=100, random_state=RANDOM_STATE)
lgb_proba, lgb_pred, lgb_proba_tr, lgb_pred_tr = _fit_and_score(lgb)

[LightGBM] [Info] Number of positive: 1981, number of negative: 1138
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000623 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1722
[LightGBM] [Info] Number of data points in the train set: 3119, number of used features: 16
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.635139 -> initscore=0.554329
[LightGBM] [Info] Start training from score 0.554329


In [8]:
def get_metrics(y_val, y_proba, y_pred, model, dataset):
    """Build one metrics record for a model evaluated on one data split.

    Parameters
    ----------
    y_val : labels passed as the first argument to both scorers (despite the
        name, callers in this notebook also pass the training labels here)
    y_proba : scores passed to ``roc_auc_score``
    y_pred : predictions passed to ``accuracy_score``
    model : value stored under the "Model" key
    dataset : value stored under the "Set" key

    Returns
    -------
    dict with keys "Model", "Set", "AUC" and "Accuracy", in that order.
    """
    record = {"Model": model, "Set": dataset}
    record["AUC"] = roc_auc_score(y_val, y_proba)
    record["Accuracy"] = accuracy_score(y_val, y_pred)
    return record

In [11]:
# One metrics record per (model, split): all Train rows first, then all
# Validation rows, with the models in the same order inside each split.
models_metrics = [
    get_metrics(y_true, proba, pred, label, split)
    for split, y_true, scored in (
        (
            "Train",
            y_train,
            (
                ("Logistic Regression", logr_proba_tr, logr_pred_tr),
                ("DTClassifier", dtc_proba_tr, dtc_pred_tr),
                ("Random Forest", rf_proba_tr, rf_pred_tr),
                ("KNClassifier", knn_proba_tr, knn_pred_tr),
                ("LightGBM", lgb_proba_tr, lgb_pred_tr),
            ),
        ),
        (
            "Validation",
            y_val,
            (
                ("Logistic Regression", logr_proba, logr_pred),
                ("DTClassifier", dtc_proba, dtc_pred),
                ("Random Forest", rf_proba, rf_pred),
                ("KNClassifier", knn_proba, knn_pred),
                ("LightGBM", lgb_proba, lgb_pred),
            ),
        ),
    )
    for label, proba, pred in scored
]

In [12]:
# Tabulate the collected records, indexed by (Model, Set) with the AUC and
# Accuracy metrics as columns; displayed as the cell's output.
df_models_metrics = pd.DataFrame(models_metrics).pivot_table(
    index=["Model", "Set"],
    values=["AUC", "Accuracy"],
)

df_models_metrics

Unnamed: 0_level_0,Unnamed: 1_level_0,AUC,Accuracy
Model,Set,Unnamed: 2_level_1,Unnamed: 3_level_1
DTClassifier,Train,0.832089,0.764027
DTClassifier,Validation,0.776802,0.728846
KNClassifier,Train,0.894342,0.819493
KNClassifier,Validation,0.757484,0.701923
LightGBM,Train,0.997112,0.972748
LightGBM,Validation,0.799442,0.742308
Logistic Regression,Train,0.80007,0.741263
Logistic Regression,Validation,0.790423,0.7375
Random Forest,Train,1.0,1.0
Random Forest,Validation,0.81629,0.765385
