In [92]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from xgboost import XGBClassifier
from imblearn.pipeline import Pipeline 
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, RobustScaler, FunctionTransformer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, precision_recall_curve
import xgboost as xgb
from scipy.stats import randint, uniform
import numpy as np


In [17]:
data  = pd.read_csv(r"../data/gd.csv")

In [18]:
data.head()

Unnamed: 0,Account_status,Duration_months,Credit_history,loan_Purpose,Credit_amount,Savings/Bonds_AC,Present_Employment_since,loan_wage_ratio,Sex,Marital_status,...,Assets/Physical_property,Age,Other_loans,Housing,Existing Credit,Job_status,Dependents,Telephone,Foreign_worker,Credit_risk
0,A11,6,A34,A43,1169,A65,A75,4,Male,single,...,A121,67,A143,A152,2,A173,1,A192,A201,1
1,A12,48,A32,A43,5951,A61,A73,2,Female,divorced/separated/married,...,A121,22,A143,A152,1,A173,1,A191,A201,2
2,A14,12,A34,A46,2096,A61,A74,2,Male,single,...,A121,49,A143,A152,1,A172,2,A191,A201,1
3,A11,42,A32,A42,7882,A61,A74,2,Male,single,...,A122,45,A143,A153,1,A173,2,A191,A201,1
4,A11,24,A33,A40,4870,A61,A73,3,Male,single,...,A124,53,A143,A153,2,A173,2,A191,A201,2


In [19]:
data.columns

Index(['Account_status', 'Duration_months', 'Credit_history', 'loan_Purpose',
       'Credit_amount', 'Savings/Bonds_AC', 'Present_Employment_since',
       'loan_wage_ratio', 'Sex', 'Marital_status', 'co-debtors', 'Tenure',
       'Assets/Physical_property', 'Age', 'Other_loans', 'Housing',
       'Existing Credit', 'Job_status', 'Dependents', 'Telephone',
       'Foreign_worker', 'Credit_risk'],
      dtype='object')

In [20]:
data['Credit_risk'] = data['Credit_risk'].map({2:1, 1:0})

In [21]:
data.head()

Unnamed: 0,Account_status,Duration_months,Credit_history,loan_Purpose,Credit_amount,Savings/Bonds_AC,Present_Employment_since,loan_wage_ratio,Sex,Marital_status,...,Assets/Physical_property,Age,Other_loans,Housing,Existing Credit,Job_status,Dependents,Telephone,Foreign_worker,Credit_risk
0,A11,6,A34,A43,1169,A65,A75,4,Male,single,...,A121,67,A143,A152,2,A173,1,A192,A201,0
1,A12,48,A32,A43,5951,A61,A73,2,Female,divorced/separated/married,...,A121,22,A143,A152,1,A173,1,A191,A201,1
2,A14,12,A34,A46,2096,A61,A74,2,Male,single,...,A121,49,A143,A152,1,A172,2,A191,A201,0
3,A11,42,A32,A42,7882,A61,A74,2,Male,single,...,A122,45,A143,A153,1,A173,2,A191,A201,0
4,A11,24,A33,A40,4870,A61,A73,3,Male,single,...,A124,53,A143,A153,2,A173,2,A191,A201,1


<pre>

>> For XGBoostClassifier, we requires a binary format targer values, so that's why we are changing Credit Risk to 0 and 1.



In [22]:
X, y = data.iloc[:, :-1], data.iloc[:, -1]

In [48]:
X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [49]:
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.2, random_state=42, stratify=y_train_full)

In [99]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 22 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Account_status            1000 non-null   object
 1   Duration_months           1000 non-null   int64 
 2   Credit_history            1000 non-null   object
 3   loan_Purpose              1000 non-null   object
 4   Credit_amount             1000 non-null   int64 
 5   Savings/Bonds_AC          1000 non-null   object
 6   Present_Employment_since  1000 non-null   object
 7   loan_wage_ratio           1000 non-null   int64 
 8   Sex                       1000 non-null   object
 9   Marital_status            1000 non-null   object
 10  co-debtors                1000 non-null   object
 11  Tenure                    1000 non-null   int64 
 12  Assets/Physical_property  1000 non-null   object
 13  Age                       1000 non-null   int64 
 14  Other_loans              

In [100]:
ordinal_categories = [
    ["A61", "A62", "A63", "A64", "A65"], 
    ["A71", "A72", "A73", "A74", "A75"], 
    ["A124", "A123", "A122", "A121"], 
    ["A171", "A172", "A173", "A174"]
]


ohe_columns = ["Account_status", "Credit_history", "loan_Purpose", "Sex", 
               "Marital_status", "co-debtors", "Other_loans", "Housing", 
               "Dependents", "Telephone", "Foreign_worker"]
rob_scaling = ["Duration_months", "Credit_amount", "Age"]
ord_enc = ["Savings/Bonds_AC", "Present_Employment_since", "Assets/Physical_property", 
           "Job_status"]




In [193]:
ct = ColumnTransformer(
    transformers = [
        ("cat", OneHotEncoder(drop="if_binary", sparse_output=False, handle_unknown="ignore"), ohe_columns), 
        ("scale", RobustScaler(), rob_scaling), 
        ("enc", OrdinalEncoder(categories=ordinal_categories), ord_enc)
    ], 
    remainder="passthrough"
)

In [26]:
y_train.value_counts()

Credit_risk
0    560
1    240
Name: count, dtype: int64

<pre>

We will do SMOTE technique to balance the dataset.



In [194]:
base_pipeline = Pipeline(
    [
        ("preprocess", ct), 
        # ("to_dense", FunctionTransformer(lambda x: x.toarray(), accept_sparse=True)), 
        ("smote", SMOTE(random_state=42)), 
        ("model", XGBClassifier(
            tree_method = "hist",
            # eval_metric = "logloss",
            # early_stopping_rounds = 50
        ))
    ]
)

In [28]:
param_grid_xgb = {
    "model__learning_rate": [0.01, 0.05, 0.1], 
    "model__max_depth": [3, 5, 7, 10], 
    "model__n_estimators": [100, 300, 500, 1000]

}

In [29]:
XGB_gs = GridSearchCV(
    estimator=base_pipeline, 
    param_grid = param_grid_xgb, 
    scoring = "recall", 
    cv = 5, 
    n_jobs = 4, 
    verbose = 2
)

In [30]:
XGB_gs.fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


0,1,2
,estimator,"Pipeline(step...=None, ...))])"
,param_grid,"{'model__learning_rate': [0.01, 0.05, ...], 'model__max_depth': [3, 5, ...], 'model__n_estimators': [100, 300, ...]}"
,scoring,'recall'
,n_jobs,4
,refit,True
,cv,5
,verbose,2
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('cat', ...), ('scale', ...), ...]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,'if_binary'
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,with_centering,True
,with_scaling,True
,quantile_range,"(25.0, ...)"
,copy,True
,unit_variance,False

0,1,2
,categories,"[['A61', 'A62', ...], ['A71', 'A72', ...], ...]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,sampling_strategy,'auto'
,random_state,
,k_neighbors,5

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [31]:
XGB_gs.best_params_

{'model__learning_rate': 0.01,
 'model__max_depth': 3,
 'model__n_estimators': 100}

In [32]:
XGB_gs.best_score_

np.float64(0.7291666666666667)

In [33]:
test_pred_xgb =  XGB_gs.predict(X_test)
train_pred_xgb = XGB_gs.predict(X_train)

In [34]:
accuracy_score(y_test, test_pred_xgb)

0.715

In [35]:
accuracy_score(y_train, train_pred_xgb)

0.73125

In [36]:
print(classification_report(y_test, test_pred_xgb))

              precision    recall  f1-score   support

           0       0.90      0.67      0.77       140
           1       0.52      0.82      0.63        60

    accuracy                           0.71       200
   macro avg       0.71      0.74      0.70       200
weighted avg       0.78      0.71      0.73       200



<pre>

- My Recall for Bad Credit is 0.82, which is decent, 
- As I don't want to miss any Bad Credit, but I want to achienve it more, 
- So, let's tune our model again, even at the cost of Flase Positive, which is what a Bank Manager would desire for.

>> Also, this GridSearchCV is so expensive, that we may not want that, 
   instead we will go with randomizedsearchcv.



In [195]:
new_param_xgb = {
    "model__n_estimators": randint(100, 1000), 
    "model__learning_rate": uniform(0.01, 0.1), 
    "model__max_depth": randint(7, 10), 
    "model__min_child_weight": randint(1, 10), 
    "model__gamma": uniform(0, 0.5), 
    "model__subsample": uniform(0.6, 0.4), 
    "model__colsample_bytree": uniform(0.7, 0.3), 
    "model__scale_pos_weight": uniform(2, 1)
}

In [None]:
# new_param_xgb = {
#     "model__scale_pos_weight": [1, 1.5, 1.2, 2, 1.8], 
#     "model__max_depth": [4, 5, 6, 7, 8], 
#     "model__min_child_weight": [1, 2, 3], 
#     "model__gamma": [0, 0.1, 0.5], 
#     "model__learning_rate": [0.01, 0.1, 0.05], 
#     "model__n_estimators": [300, 500, 800], 
#     "model__subsample": [0.8, 1.0], 
#     "model__colsample_bytree": [0.8, 1.0]
}

In [None]:
# XGB_gs = GridSearchCV(
#     estimator=base_pipeline, 
#     param_grid = new_param_xgb, 
#     scoring = "recall", 
#     cv = 5, 
#     n_jobs = -1, 
#     verbose = 2
# )

In [39]:
# XGB_gs.fit(X_train, y_train)

In [104]:
fit_params = {
    "model__eval_set": [(X_val, y_val)], 
    # "model__eval_metric": "logloss", 
    # "model__early_stopping_rounds": 50, 
    "model__verbose": False
}

In [196]:
random_xgb = RandomizedSearchCV(
    estimator = base_pipeline, 
    param_distributions  = new_param_xgb, 
    scoring = "recall", 
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42), 
    n_jobs=-1, 
    random_state=42, 
    verbose=3, 
    n_iter = 300, 
)

In [197]:
random_xgb.fit(X_train, y_train)

Fitting 5 folds for each of 300 candidates, totalling 1500 fits


0,1,2
,estimator,"Pipeline(step...=None, ...))])"
,param_distributions,"{'model__colsample_bytree': <scipy.stats....0021CFF98B9D0>, 'model__gamma': <scipy.stats....0021CFF988D00>, 'model__learning_rate': <scipy.stats....0021CFF98A680>, 'model__max_depth': <scipy.stats....0021CFF98BC40>, ...}"
,n_iter,300
,scoring,'recall'
,n_jobs,-1
,refit,True
,cv,StratifiedKFo... shuffle=True)
,verbose,3
,pre_dispatch,'2*n_jobs'
,random_state,42

0,1,2
,transformers,"[('cat', ...), ('scale', ...), ...]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,'if_binary'
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,with_centering,True
,with_scaling,True
,quantile_range,"(25.0, ...)"
,copy,True
,unit_variance,False

0,1,2
,categories,"[['A61', 'A62', ...], ['A71', 'A72', ...], ...]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,sampling_strategy,'auto'
,random_state,42
,k_neighbors,5

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,np.float64(0.7007018171877334)
,device,
,early_stopping_rounds,
,enable_categorical,False


In [198]:
random_xgb.best_score_

np.float64(0.8485829959514171)

In [199]:
xgb_test2_pred = random_xgb.predict(X_test)

In [200]:
accuracy_score(y_test, xgb_test2_pred)

0.635

In [201]:
random_xgb.best_params_

{'model__colsample_bytree': np.float64(0.7007018171877334),
 'model__gamma': np.float64(0.4838496941035432),
 'model__learning_rate': np.float64(0.010525381855838492),
 'model__max_depth': 9,
 'model__min_child_weight': 5,
 'model__n_estimators': 143,
 'model__scale_pos_weight': np.float64(2.8074302518167613),
 'model__subsample': np.float64(0.9848395143056471)}

In [202]:
print(classification_report(y_test, xgb_test2_pred))

              precision    recall  f1-score   support

           0       0.89      0.54      0.68       140
           1       0.44      0.85      0.58        60

    accuracy                           0.64       200
   macro avg       0.67      0.70      0.63       200
weighted avg       0.76      0.64      0.65       200



<pre>

>> We can clearly see an improvement in the Recall of Bad Credit Risk Customers: 0.85, 

>> Now, we will try lightgbm as well, then will deploy it!



In [203]:
from lightgbm import LGBMClassifier

In [176]:
lgbm_pipeline = Pipeline([
    ("preprocess", ct), 
    ("smote", SMOTE(random_state=47)), 
    ("model", LGBMClassifier(
        n_estimators = 500, 
        learning_rate = 0.05, 
        max_depth = -1, 
        random_state = 47
    ))
])

In [177]:
param_lgb = {
    "model__n_estimators": randint(100, 1000),
    "model__learning_rate": uniform(0.01, 0.1),
    "model__max_depth": randint(3, 12),
    "model__num_leaves": randint(20, 100),
    "model__min_child_samples": randint(5, 30),
    "model__subsample": uniform(0.6, 0.4),
    "model__colsample_bytree": uniform(0.6, 0.4),
    "model__reg_alpha": uniform(0, 1),
    "model__reg_lambda": uniform(0, 1),
    "model__class_weight": ["balanced", None]
}

In [178]:
from sklearn.metrics import make_scorer, recall_score

recall_scorer = make_scorer(recall_score, pos_label=1)


In [180]:
random_lgb = RandomizedSearchCV(
    estimator=lgbm_pipeline,
    param_distributions=param_lgb,
    scoring=recall_scorer,
    n_iter=100,
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    n_jobs=-1,
    verbose=1,
    random_state=42
)

In [181]:
random_lgb.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


0,1,2
,estimator,Pipeline(step...m_state=47))])
,param_distributions,"{'model__class_weight': ['balanced', None], 'model__colsample_bytree': <scipy.stats....0021CFDCE99C0>, 'model__learning_rate': <scipy.stats....0021CFF0DC100>, 'model__max_depth': <scipy.stats....0021CFDCEB8E0>, ...}"
,n_iter,100
,scoring,"make_scorer(r..., pos_label=1)"
,n_jobs,-1
,refit,True
,cv,StratifiedKFo... shuffle=True)
,verbose,1
,pre_dispatch,'2*n_jobs'
,random_state,42

0,1,2
,transformers,"[('cat', ...), ('scale', ...), ...]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,'if_binary'
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,with_centering,True
,with_scaling,True
,quantile_range,"(25.0, ...)"
,copy,True
,unit_variance,False

0,1,2
,categories,"[['A61', 'A62', ...], ['A71', 'A72', ...], ...]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,sampling_strategy,'auto'
,random_state,47
,k_neighbors,5

0,1,2
,boosting_type,'gbdt'
,num_leaves,84
,max_depth,6
,learning_rate,np.float64(0....8486277398677)
,n_estimators,119
,subsample_for_bin,200000
,objective,
,class_weight,'balanced'
,min_split_gain,0.0
,min_child_weight,0.001


In [182]:
random_lgb.best_params_

{'model__class_weight': 'balanced',
 'model__colsample_bytree': np.float64(0.6326376721600961),
 'model__learning_rate': np.float64(0.010518486277398677),
 'model__max_depth': 6,
 'model__min_child_samples': 17,
 'model__n_estimators': 119,
 'model__num_leaves': 84,
 'model__reg_alpha': np.float64(0.07094091699992766),
 'model__reg_lambda': np.float64(0.3967838272138884),
 'model__subsample': np.float64(0.6203074124157587)}

In [183]:
random_lgb.best_score_

np.float64(0.6044534412955466)

In [184]:
lgbtest_pred = random_lgb.predict(X_test)



In [185]:
accuracy_score(y_test, lgbtest_pred)

0.72

In [186]:
print(classification_report(y_test, lgbtest_pred))

              precision    recall  f1-score   support

           0       0.81      0.79      0.80       140
           1       0.53      0.57      0.55        60

    accuracy                           0.72       200
   macro avg       0.67      0.68      0.67       200
weighted avg       0.73      0.72      0.72       200



<pre>

>> This lgbm was quite bad, we will not focus or tune on tha for now, 

>> The best model we got is form XGBoost, and we will save that model and will deploy it.



## Model Saving for Deployment.

In [204]:
best_pipeline = random_xgb.best_estimator_

In [205]:
import joblib
import os

In [206]:
folder_path = "../best_model"

os.makedirs(folder_path, exist_ok = True)

joblib.dump(best_pipeline, os.path.join(folder_path, "xgb_pipeline.joblib"))

['../best_model\\xgb_pipeline.joblib']

<pre>

Model has been saved and ready to deploy!!

Good Luck!

