In [1]:
import sys
sys.path.append('../src')
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from main import evaluate_model_performance
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import joblib

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [2]:
df = pd.read_csv('../data/engineered_labeled_applications.csv')
df.drop(columns='Unnamed: 0', axis=1, inplace=True)
df['label'] = pd.to_numeric(df['label'], errors='coerce')
len(df)

438557

In [3]:
df = df.dropna(subset=['label'])

In [4]:
train_cols = ['gender', 'own_car', 'own_estate', 'mobile_flag', 'work_phone',
       'label', 'num_children_1', 'num_children_2More',
       'income_type_Commercial associate', 'income_type_State servant',
       'education_type_Academic degree', 'education_type_Higher education',
       'education_type_Incomplete higher', 'education_type_Lower secondary',
       'family_status_Civil marriage', 'family_status_Separated',
       'family_status_Single / not married', 'family_status_Widow',
       'housing_type_Co-op apartment', 'housing_type_Municipal apartment',
       'housing_type_Office apartment', 'housing_type_Rented apartment',
       'housing_type_With parents', 'occupation_High Tech Work',
       'occupation_Labor Work', 'occupation_Office Work',
       'num_family_members_1.0', 'num_family_members_3.0',
       'num_family_members_4More', 'gp_income_total_low',
       'gp_income_total_medium', 'gp_income_total_high',
       'gp_income_total_highest', 'gp_age_y_young', 'gp_age_y_middle',
       'gp_age_y_old', 'gp_age_y_oldest', 'gp_employed_y_low',
       'gp_employed_y_medium', 'gp_employed_y_high', 'gp_employed_y_highest']
df_train = df[train_cols]
Y = df_train['label']
X = df_train.drop(columns=['label'])

In [5]:
Y = Y.astype('int')

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, 
                                                    stratify=Y, test_size=0.2, 
                                                    random_state=1332)

In [7]:
smote = SMOTE(random_state=1332)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [8]:
# Initialize the logistic regression model
log_reg = LogisticRegression(random_state=1332)

# Fit the model to the training data
log_reg.fit(X_train_smote, y_train_smote)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [9]:
evaluate_model_performance(model=log_reg, X_test=X_test, y_test=y_test)

Model: LogisticRegression
Accuracy: 0.7389
Precision: 0.0184
Recall: 0.2764
F1 Score: 0.0345
AUC Score: 0.5430

Confusion Matrix:
[[5354 1815]
 [  89   34]]

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.75      0.85      7169
           1       0.02      0.28      0.03       123

    accuracy                           0.74      7292
   macro avg       0.50      0.51      0.44      7292
weighted avg       0.97      0.74      0.84      7292



In [10]:
tree = DecisionTreeClassifier(max_depth=12,
                               min_samples_split=8,
                               random_state=1332)
tree.fit(X_train_smote, y_train_smote)

In [11]:
evaluate_model_performance(model=log_reg, X_test=X_test, y_test=y_test)

Model: LogisticRegression
Accuracy: 0.7389
Precision: 0.0184
Recall: 0.2764
F1 Score: 0.0345
AUC Score: 0.5430

Confusion Matrix:
[[5354 1815]
 [  89   34]]

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.75      0.85      7169
           1       0.02      0.28      0.03       123

    accuracy                           0.74      7292
   macro avg       0.50      0.51      0.44      7292
weighted avg       0.97      0.74      0.84      7292



In [12]:
rand_forest = RandomForestClassifier(n_estimators=250,
                              max_depth=12,
                              min_samples_leaf=16
                              )
rand_forest.fit(X_train_smote, y_train_smote)

In [13]:
evaluate_model_performance(model=rand_forest, X_test=X_test, y_test=y_test)

Model: RandomForestClassifier
Accuracy: 0.8201
Precision: 0.0388
Recall: 0.4065
F1 Score: 0.0708
AUC Score: 0.6653

Confusion Matrix:
[[5930 1239]
 [  73   50]]

Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.83      0.90      7169
           1       0.04      0.41      0.07       123

    accuracy                           0.82      7292
   macro avg       0.51      0.62      0.49      7292
weighted avg       0.97      0.82      0.89      7292



In [14]:
# svm = svm.SVC(C=0.8, kernel='linear', probability=True)
# svm.fit(X_train_smote, y_train_smote)

In [15]:
# evaluate_model_performance(model=svm, X_test=X_test, y_test=y_test)

In [16]:
lgbm = LGBMClassifier(num_leaves=31,
                       max_depth=8, 
                       learning_rate=0.02,
                       n_estimators=250,
                       subsample = 0.8,
                       colsample_bytree =0.8
                      )
lgbm.fit(X_train_smote, y_train_smote)

[LightGBM] [Info] Number of positive: 28672, number of negative: 28672
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005154 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 78
[LightGBM] [Info] Number of data points in the train set: 57344, number of used features: 39
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


In [17]:
evaluate_model_performance(model=lgbm, X_test=X_test, y_test=y_test)

Model: LGBMClassifier
Accuracy: 0.8079
Precision: 0.0363
Recall: 0.4065
F1 Score: 0.0666
AUC Score: 0.6562

Confusion Matrix:
[[5841 1328]
 [  73   50]]

Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.81      0.89      7169
           1       0.04      0.41      0.07       123

    accuracy                           0.81      7292
   macro avg       0.51      0.61      0.48      7292
weighted avg       0.97      0.81      0.88      7292



In [18]:
xgbc = XGBClassifier(max_depth=12,
                      n_estimators=250,
                      min_child_weight=8, 
                      subsample=0.8, 
                      learning_rate =0.02,    
                      seed=42)

xgbc.fit(X_train_smote, y_train_smote)

In [19]:
evaluate_model_performance(model=xgbc, X_test=X_test, y_test=y_test)

Model: XGBClassifier
Accuracy: 0.8778
Precision: 0.0524
Recall: 0.3659
F1 Score: 0.0917
AUC Score: 0.7043

Confusion Matrix:
[[6356  813]
 [  78   45]]

Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.89      0.93      7169
           1       0.05      0.37      0.09       123

    accuracy                           0.88      7292
   macro avg       0.52      0.63      0.51      7292
weighted avg       0.97      0.88      0.92      7292



In [20]:
catboost = CatBoostClassifier(iterations=250,
                           learning_rate=0.2,
                           od_type='Iter',
                           verbose=25,
                           depth=16,
                           random_seed=42)

catboost.fit(X_train_smote, y_train_smote)

0:	learn: 0.5384995	total: 2.72s	remaining: 11m 16s
25:	learn: 0.1420017	total: 54s	remaining: 7m 44s
50:	learn: 0.1219887	total: 1m 30s	remaining: 5m 54s
75:	learn: 0.1141317	total: 2m 21s	remaining: 5m 24s
100:	learn: 0.1116182	total: 3m 15s	remaining: 4m 48s
125:	learn: 0.1102344	total: 4m 9s	remaining: 4m 5s
150:	learn: 0.1093273	total: 5m 4s	remaining: 3m 19s
175:	learn: 0.1089343	total: 5m 59s	remaining: 2m 31s
200:	learn: 0.1087216	total: 6m 43s	remaining: 1m 38s
225:	learn: 0.1086058	total: 7m 16s	remaining: 46.4s
249:	learn: 0.1084918	total: 7m 50s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x22a79eced50>

In [21]:
evaluate_model_performance(model=catboost, X_test=X_test, y_test=y_test)

Model: CatBoostClassifier
Accuracy: 0.9169
Precision: 0.0741
Recall: 0.3415
F1 Score: 0.1217
AUC Score: 0.7371

Confusion Matrix:
[[6644  525]
 [  81   42]]

Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.93      0.96      7169
           1       0.07      0.34      0.12       123

    accuracy                           0.92      7292
   macro avg       0.53      0.63      0.54      7292
weighted avg       0.97      0.92      0.94      7292



In [22]:
# param_grid = {
#     'n_estimators': [100, 300, 500],
#     'learning_rate': [0.01, 0.1, 0.2],
#     'max_depth': [3, 5, 7],
#     'min_child_weight': [1, 3, 5],
#     'subsample': [0.6, 0.8, 1.0],
#     'colsample_bytree': [0.6, 0.8, 1.0],
#     'gamma': [0, 0.1, 0.2],
# }
# grid_xgb = GridSearchCV(xgbc, param_grid, scoring='roc_auc', cv=5, n_jobs=-1)
# grid_xgb.fit(X_train, y_train)
# best_xgb = grid_xgb.best_estimator_

In [23]:
# evaluate_model_performance(model=best_xgb, X_test=X_test, y_test=y_test)

In [24]:
# scores = cross_val_score(catboost, X, Y, cv=5, scoring='accuracy')  # Non-SMOTE data (X, Y)
# print(f"Cross-validation scores: {scores}")
# print(f"Mean accuracy: {scores.mean()}")

In [25]:
catboost.save_model('../models/catboost_model.cbm')

In [26]:
joblib.dump(catboost, '../models/catboost_model.pkl')

['../models/catboost_model.pkl']

In [27]:
X_train.columns

Index(['gender', 'own_car', 'own_estate', 'mobile_flag', 'work_phone',
       'num_children_1', 'num_children_2More',
       'income_type_Commercial associate', 'income_type_State servant',
       'education_type_Academic degree', 'education_type_Higher education',
       'education_type_Incomplete higher', 'education_type_Lower secondary',
       'family_status_Civil marriage', 'family_status_Separated',
       'family_status_Single / not married', 'family_status_Widow',
       'housing_type_Co-op apartment', 'housing_type_Municipal apartment',
       'housing_type_Office apartment', 'housing_type_Rented apartment',
       'housing_type_With parents', 'occupation_High Tech Work',
       'occupation_Labor Work', 'occupation_Office Work',
       'num_family_members_1.0', 'num_family_members_3.0',
       'num_family_members_4More', 'gp_income_total_low',
       'gp_income_total_medium', 'gp_income_total_high',
       'gp_income_total_highest', 'gp_age_y_young', 'gp_age_y_middle',
       