# Model Training and Evaluation

In [5]:
import warnings
warnings.filterwarnings("ignore")
import pickle
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, recall_score, precision_score, f1_score,auc
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_score

import pandas as pd

# Load the data
X = pd.read_csv('datasets/X.csv')
y = pd.read_csv('datasets/y.csv')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)


In [6]:
models = {
    # "Logistic Regression": LogisticRegression(),
    # "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    # "SVM": SVC(),
    # "KNN": KNeighborsClassifier(),
    "XGBoost": XGBClassifier(),
    "LightGBM": LGBMClassifier(),
    "CatBoost": CatBoostClassifier(),
    # "AdaBoost": AdaBoostClassifier(),
    # "Gradient Boosting": GradientBoostingClassifier(),
}

In [7]:
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    #create a dataframe that stores all the metrics not just the last one
    # Initialize a list to store metrics if it doesn't exist
    if 'metrics_list' not in locals():
        metrics_list = []

    acc = accuracy_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred).tolist()  # convert to list for DataFrame compatibility

    metrics_list.append({
        "Model": name,
        "Accuracy": acc,
        "Recall": rec,
        "Precision": prec,
        "F1 Score": f1,
        "Confusion Matrix": cm
    })

# After the loop, create the DataFrame
metrics_df = pd.DataFrame(metrics_list)


[LightGBM] [Info] Number of positive: 7977, number of negative: 28023
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007697 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1155
[LightGBM] [Info] Number of data points in the train set: 36000, number of used features: 21
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.221583 -> initscore=-1.256463
[LightGBM] [Info] Start training from score -1.256463
Learning rate set to 0.047586
0:	learn: 0.5864932	total: 21.8ms	remaining: 21.8s
1:	learn: 0.5128768	total: 50.6ms	remaining: 25.2s
2:	learn: 0.4529205	total: 84.2ms	remaining: 28s
3:	learn: 0.4031770	total: 109ms	remaining: 27.2s
4:	learn: 0.3651944	total: 133ms	remaining: 26.5s
5:	learn: 0.3338812	total: 161ms	remaining: 26.7s
6:	learn: 0.3124075	total: 180ms	remaining: 25.5s
7:	learn: 0.2965033	total: 205ms	remaining: 25.4s
8:	learn: 0.

In [15]:
metrics_df


Unnamed: 0,Model,Accuracy,Recall,Precision,F1 Score,Confusion Matrix
4,Random Forest,0.927556,0.759763,0.902525,0.825013,"[[6811, 166], [486, 1537]]"
5,XGBoost,0.934333,0.801285,0.89558,0.845813,"[[6788, 189], [402, 1621]]"
6,LightGBM,0.931333,0.778547,0.902579,0.835987,"[[6807, 170], [448, 1575]]"
7,CatBoost,0.935444,0.790905,0.910125,0.846337,"[[6819, 158], [423, 1600]]"


In [19]:
import pickle
from xgboost import XGBClassifier

xgb_model = XGBClassifier()
xgb_model.fit(X_train, y_train)

pickle.dump(xgb_model, open('xgb_model.pkl', 'wb'))




In [None]:
# #hypertuning top 3 models

# top_models = {
#     'XGBoost': (
#         XGBClassifier(),
#         {
#             'n_estimators': [100, 200],
#             'learning_rate': [0.01, 0.1],
#             'max_depth': [3, 6, 8]
#         }
#     ),
#     'CatBoost': (
#         CatBoostClassifier(verbose=0),
#         {
#             'iterations': [100, 200],
#             'learning_rate': [0.01, 0.1],
#             'depth': [4, 6, 8]
#         }
#     ),
#     'LightGBM': (
#         LGBMClassifier(),
#         {
#             'n_estimators': [100, 200],
#             'learning_rate': [0.01, 0.1],
#             'num_leaves': [31, 50]
#         }
#     )
# }

# grid_list = []
# for name, (model, params) in top_models.items():

#     grid_model = GridSearchCV(model, params, cv=5)
#     grid_model.fit(X_train, y_train)
#     y_pred = grid_model.predict(X_test)

#     acc = accuracy_score(y_test, y_pred)
#     rec = recall_score(y_test, y_pred)
#     prec = precision_score(y_test, y_pred)
#     f1 = f1_score(y_test, y_pred)
#     cm = confusion_matrix(y_test, y_pred).tolist()  # convert to list for DataFrame compatibility

#     grid_list.append({
#         "Model": name,
#         "Accuracy": acc,
#         "Recall": rec,
#         "Precision": prec,
#         "F1 Score": f1,
#         "Confusion Matrix": cm
#     })

# # After the loop, create the DataFrame
# grid_df = pd.DataFrame(grid_list)



In [None]:
# grid_df

Unnamed: 0,Model,Accuracy,Recall,Precision,F1 Score,Confusion Matrix
0,XGBoost,0.934556,0.800995,0.894942,0.845366,"[[6801, 189], [400, 1610]]"
1,CatBoost,0.929444,0.780597,0.88996,0.831699,"[[6796, 194], [441, 1569]]"
2,LightGBM,0.933222,0.804478,0.886027,0.843286,"[[6782, 208], [393, 1617]]"


In [None]:
# metrics_df

Unnamed: 0,Model,Accuracy,Recall,Precision,F1 Score,Confusion Matrix
0,Logistic Regression,0.893222,0.744776,0.769666,0.757016,"[[6542, 448], [513, 1497]]"
1,Decision Tree,0.896889,0.79005,0.758357,0.773879,"[[6484, 506], [422, 1588]]"
2,Random Forest,0.926444,0.761692,0.893232,0.822234,"[[6807, 183], [479, 1531]]"
3,SVM,0.892889,0.697512,0.797497,0.744161,"[[6634, 356], [608, 1402]]"
4,KNN,0.890889,0.638308,0.8342,0.723224,"[[6735, 255], [727, 1283]]"
5,XGBoost,0.934111,0.808955,0.886104,0.845774,"[[6781, 209], [384, 1626]]"
6,LightGBM,0.929556,0.786567,0.885218,0.832982,"[[6785, 205], [429, 1581]]"
7,CatBoost,0.934333,0.795025,0.899268,0.84394,"[[6811, 179], [412, 1598]]"
8,AdaBoost,0.910444,0.764179,0.82227,0.792161,"[[6658, 332], [474, 1536]]"
9,Gradient Boosting,0.920778,0.762189,0.867006,0.811226,"[[6755, 235], [478, 1532]]"
