# Modeling

## Navigation
<ul>
<li><a href="#oversampling">Oversampling using SMOTE</a></li>
<li><a href="#lr">Logistic Regression model</a></li>
    <ul>
      <li><a href="#thresh">Threshold adjustment</a></li>
    </ul>

<li><a href="#rf">Random Forest model</a></li>
<li><a href="#knn">K-Nearest Neighbor model</a></li>
<li><a href="#xgbm">XGBoost model</a></li>
<li><a href="#voting">Ensembling: Voting Classifier</a></li>
    <ul>
    <li><a href="#majority">Majority voting</a></li>
    <li><a href="#average">Average voting</a></li>
    </ul>
<li><a href="#xgb">Final fit</a></li>
<li><a href="#joblib">Export joblib</a></li>

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import imblearn.over_sampling
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, precision_score, recall_score, accuracy_score, f1_score, roc_auc_score
from sklearn.metrics import confusion_matrix
%matplotlib inline

In [3]:
df = pd.read_csv("../data/processed/clean.csv")

In [4]:
# split data
X = df.drop("LOAN_DEFAULT" , axis = 1)
y = df["LOAN_DEFAULT"]
X_train , X_val , y_train , y_val = train_test_split(X, y, test_size = 0.2, random_state=7)
# X_train_sub_set , X_vald_sub_set , y_train_sub_set , y_vald_sub_set = train_test_split(x[0:1000], y[0:1000] , test_size = 0.2, random_state=42)

# scaling data
ss = StandardScaler()
X_train_scaled = ss.fit_transform(X_train)
X_val_scaled = ss.transform(X_val)

In [207]:
# check for imbalance
y.value_counts(normalize=True)

0    0.779556
1    0.220444
Name: LOAN_DEFAULT, dtype: float64

> Data is imbalanced

## Oversampling using SMOTE
<a id='oversampling'></a>
<a href="#">Back to top</a>

In [5]:
n_pos = np.sum(y_train == 1)
n_neg = np.sum(y_train == 0)
ratio = {1 : n_pos * 4, 0 : n_neg} 

smote = imblearn.over_sampling.SMOTE(sampling_strategy=ratio, random_state = 10)
X_tr_smote, y_tr_smote = smote.fit_resample(X_train, y_train)

  f"After over-sampling, the number of samples ({n_samples})"


In [209]:
y_tr_smote.value_counts(normalize=True)

1    0.531496
0    0.468504
Name: LOAN_DEFAULT, dtype: float64

# Logistics Regression
<a id='lr'></a>
<a href="#">Back to top</a>

In [19]:
#SMOTE
lr = LogisticRegression(C=0.01, penalty='l2', solver='lbfgs')
lr.fit(ss.fit_transform(X_tr_smote), y_tr_smote)
y_train_pred = lr.predict(ss.fit_transform(X_tr_smote))
y_val_pred = lr.predict(X_val_scaled)
print(f"training score: {lr.score(ss.fit_transform(X_tr_smote), y_tr_smote)}")
print(f"validation score: {lr.score(X_val_scaled, y_val)}")
print("*"*50)
print("Training classification report")
print(classification_report(y_tr_smote, y_train_pred))
print("*"*50)
print("Validation classification report")
print(classification_report(y_val, y_val_pred))

training score: 0.6364855430597701
validation score: 0.46002482644476117
**************************************************
Training classification report
              precision    recall  f1-score   support

           0       0.65      0.49      0.56    135558
           1       0.63      0.77      0.69    153784

    accuracy                           0.64    289342
   macro avg       0.64      0.63      0.62    289342
weighted avg       0.64      0.64      0.63    289342

**************************************************
Validation classification report
              precision    recall  f1-score   support

           0       0.81      0.40      0.54     34000
           1       0.24      0.67      0.35      9502

    accuracy                           0.46     43502
   macro avg       0.53      0.54      0.44     43502
weighted avg       0.69      0.46      0.50     43502



### Perform threshhold adjustment to see if there is any changes
<a id='thresh'></a>
<a href="#">Back to top</a>

In [248]:
thresh_ps = np.linspace(.10,.50,1000)
model_val_probs = lr.predict_proba(ss.fit_transform(X_val))[:,1] # positive class probs

f1_scores, prec_scores, rec_scores, acc_scores = [], [], [], []
for p in thresh_ps:
    model_val_labels = model_val_probs >= p
    f1_scores.append(f1_score(y_val, model_val_labels))    
    prec_scores.append(precision_score(y_val, model_val_labels))
    rec_scores.append(recall_score(y_val, model_val_labels))
    acc_scores.append(accuracy_score(y_val, model_val_labels))

best_f1_score = np.max(f1_scores) 
best_thresh_p = thresh_ps[np.argmax(f1_scores)]

print('Logistic Regression Model best F1 score %.3f at prob decision threshold >= %.3f' 
      % (best_f1_score, best_thresh_p))

Logistic Regression Model best F1 score 0.364 at prob decision threshold >= 0.324


In [247]:
print(classification_report(y_val, lr.predict_proba(ss.fit_transform(X_val))[:,1]>=0.324))

              precision    recall  f1-score   support

           0       0.85      0.12      0.21     34000
           1       0.23      0.92      0.36      9502

    accuracy                           0.30     43502
   macro avg       0.54      0.52      0.29     43502
weighted avg       0.71      0.30      0.25     43502



> Cutoff is showing worst results; therefore, discarded

# Random Forest
<a id='rf'></a>
<a href="#">Back to top</a>

In [6]:
# Random Forest
# Best params:  {'max_depth': 25, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 250}
# Best estimator:  RandomForestClassifier(max_depth=25, min_samples_leaf=2, n_estimators=250,
#                        random_state=7)
    
rf = RandomForestClassifier(max_depth=25, min_samples_leaf=2, n_estimators=250,
                       random_state=7, n_jobs=-1)
rf.fit(X_tr_smote, y_tr_smote)
y_train_pred = rf.predict(X_tr_smote)
print(classification_report(y_tr_smote, y_train_pred))

              precision    recall  f1-score   support

           0       0.84      0.67      0.74    135558
           1       0.75      0.89      0.82    153784

    accuracy                           0.79    289342
   macro avg       0.80      0.78      0.78    289342
weighted avg       0.80      0.79      0.78    289342



In [7]:
y_val_pred = rf.predict(X_val)
print(classification_report(y_val, y_val_pred))

              precision    recall  f1-score   support

           0       0.81      0.60      0.69     34000
           1       0.26      0.50      0.34      9502

    accuracy                           0.58     43502
   macro avg       0.54      0.55      0.52     43502
weighted avg       0.69      0.58      0.61     43502



# K-Nearest Neighbor
<a id='knn'></a>
<a href="#">Back to top</a>

In [12]:
knn = KNeighborsClassifier(n_neighbors=24, metric='euclidean', n_jobs=-1)
knn.fit(ss.fit_transform(X_tr_smote), y_tr_smote)
y_train_pred = knn.predict(ss.fit_transform(X_tr_smote))
y_val_pred = knn.predict(ss.fit_transform(X_val))

In [13]:
print(classification_report(y_tr_smote, y_train_pred))

              precision    recall  f1-score   support

           0       0.67      0.68      0.67    135558
           1       0.71      0.71      0.71    153784

    accuracy                           0.69    289342
   macro avg       0.69      0.69      0.69    289342
weighted avg       0.69      0.69      0.69    289342



In [14]:
print(classification_report(y_val_pred, y_val))

              precision    recall  f1-score   support

           0       0.62      0.80      0.70     26516
           1       0.43      0.24      0.31     16986

    accuracy                           0.58     43502
   macro avg       0.53      0.52      0.51     43502
weighted avg       0.55      0.58      0.55     43502



# XGBoost model
<a id='xgbm'></a>
<a href="#">Back to top</a>

In [15]:
xgb = XGBClassifier(max_depth=25, min_samples_leaf=2, n_estimators=250, learning_rate=0.1)
xgb.fit(X_tr_smote, y_tr_smote)
y_train_pred = xgb.predict(X_tr_smote)
y_val_pred = xgb.predict(X_val)



Parameters: { "min_samples_leaf" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




In [16]:
print(classification_report(y_tr_smote, y_train_pred))
print("*"*50)
print(classification_report(y_val, y_val_pred))


              precision    recall  f1-score   support

           0       0.96      0.88      0.92    135558
           1       0.90      0.97      0.93    153784

    accuracy                           0.93    289342
   macro avg       0.93      0.92      0.93    289342
weighted avg       0.93      0.93      0.93    289342

**************************************************
              precision    recall  f1-score   support

           0       0.80      0.70      0.75     34000
           1       0.25      0.35      0.29      9502

    accuracy                           0.63     43502
   macro avg       0.52      0.53      0.52     43502
weighted avg       0.68      0.63      0.65     43502



# Voting Classifier
<a id='voting'></a>
<a href="#">Back to top</a>

### Majority voting
<a id='majority'></a>


In [24]:
model = VotingClassifier([
    ('lr' , lr) ,
    ('rf' , rf) ,
    ('knn' , knn) ,
    ('xgb', xgb)
    ], voting='hard', n_jobs=-1 )
model.fit(ss.fit_transform(X_tr_smote), y_tr_smote)
y_train_pred = model.predict(ss.fit_transform(X_tr_smote))
y_val_pred = model.predict(ss.fit_transform(X_val))
print(classification_report(y_tr_smote, y_train_pred))
print("*"*50)
print(classification_report(y_val, y_val_pred))


              precision    recall  f1-score   support

           0       0.76      0.75      0.76    135558
           1       0.78      0.79      0.79    153784

    accuracy                           0.77    289342
   macro avg       0.77      0.77      0.77    289342
weighted avg       0.77      0.77      0.77    289342

**************************************************
              precision    recall  f1-score   support

           0       0.81      0.49      0.61     34000
           1       0.24      0.58      0.34      9502

    accuracy                           0.51     43502
   macro avg       0.53      0.54      0.48     43502
weighted avg       0.69      0.51      0.55     43502



### Average voting
<a id='average'></a>
<a href="#">Back to top</a>

In [25]:
model = VotingClassifier([
    ('lr' , lr) ,
    ('rf' , rf) ,
    ('knn' , knn) ,
    ('xgb', xgb)
    ], voting='soft', n_jobs=-1 )
model.fit(ss.fit_transform(X_tr_smote), y_tr_smote)
y_train_pred = model.predict(ss.fit_transform(X_tr_smote))
y_val_pred = model.predict(ss.fit_transform(X_val))
print(classification_report(y_tr_smote, y_train_pred))
print("*"*50)
print(classification_report(y_val, y_val_pred))


              precision    recall  f1-score   support

           0       0.87      0.72      0.79    135558
           1       0.78      0.91      0.84    153784

    accuracy                           0.82    289342
   macro avg       0.83      0.81      0.81    289342
weighted avg       0.83      0.82      0.82    289342

**************************************************
              precision    recall  f1-score   support

           0       0.81      0.46      0.59     34000
           1       0.24      0.62      0.35      9502

    accuracy                           0.49     43502
   macro avg       0.53      0.54      0.47     43502
weighted avg       0.69      0.49      0.54     43502



> Based on our results above, we confidently going to choose XGBoost since our measure here is better F1 score.

# Fit XGBoost on test data
<a id='final'></a>
<a href="#">Back to top</a>

In [146]:
# merge train and validation
val = X_val.merge(y_val, left_index=True, right_index=True)
train = X_tr_smote.merge(y_tr_smote, left_index=True, right_index=True)


In [147]:
train = pd.concat([train,val])

In [189]:
# read test csv
test = pd.read_csv("../data/raw/test.csv")

In [149]:
X = train.drop("LOAN_DEFAULT" , axis = 1)
y = train["LOAN_DEFAULT"]

In [191]:
X_test = test.drop("LOAN_DEFAULT" , axis = 1)
y_test = test["LOAN_DEFAULT"]

In [190]:
xgb.fit(X, y)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=16,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [192]:
y_pred = xgb.predict(X_test)

In [193]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.68      0.68      0.68     36303
           1       0.65      0.65      0.65     33654

    accuracy                           0.67     69957
   macro avg       0.66      0.66      0.66     69957
weighted avg       0.67      0.67      0.67     69957



> Slight improvement

# Export models to joblib file
<a id='joblib'></a>
<a href="#">Back to top</a>

In [45]:
import joblib

models = ["lr", "knn", "rf", "xgb"]

for model in models:
    filename = f'../models/{model}.sav'
    joblib.dump(eval(model), filename)