# Set Up

In [2]:
import pandas as pd
import numpy as np 
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, roc_auc_score

df = pd.read_csv('/content/preprocessed_upsampled.csv', header=0)
df.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_1,PAY_2,PAY_3,PAY_4,...,Client_4,Client_3,Client_2,Client_1,Avg_exp_6,Avg_exp_5,Avg_exp_4,Avg_exp_3,Avg_exp_2,Avg_exp_1
0,1.0,0,1,1,0,0,4,3,1,1,...,0,0,0,0,0,0,3,6,6,4
1,2.0,4,1,1,1,1,1,3,2,2,...,0,0,0,0,1,2,1,1,1,2
2,3.0,4,1,1,1,4,2,2,2,2,...,0,0,0,0,2,2,1,2,6,4
3,4.0,1,1,1,0,5,2,2,2,2,...,0,0,0,0,3,3,8,8,7,8
4,5.0,1,0,1,0,9,1,2,1,2,...,0,0,0,0,5,8,9,9,9,4


In [15]:
from sklearn.model_selection import train_test_split

features = ['LIMIT_BAL', 'EDUCATION', 'MARRIAGE', 'PAY_2', 'PAY_3', 
            'PAY_4', 'PAY_5', 'PAY_6','BILL_AMT1', 'BILL_AMT2',
            'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1',
            'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6', 
            'Avg_exp_6', 'Avg_exp_5', 'Avg_exp_4', 
            'Avg_exp_3', 'Avg_exp_2', 'Avg_exp_1', 'Closeness_6', 'Closeness_5',
            'Closeness_4', 'Closeness_3', 'Closeness_2','Closeness_1']
y = df['default payment next month'].copy() # target
X = df[features].copy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

In [4]:
def get_feature_importance(clsf, ftrs):
    imp = clsf.feature_importances_.tolist()
    feat = ftrs
    result = pd.DataFrame({'feat':feat,'score':imp})
    result = result.sort_values(by=['score'],ascending=False)
    return result

In [5]:
def get_metrics(model, X_test):
  predictions = model.predict(X_test)
  print("-------------")
  print("f1 score: {}".format(round(f1_score(y_true = y_test, y_pred = predictions),3)))
  print("Accuracy: {}".format(round(accuracy_score(y_true = y_test, y_pred = predictions),3)))
  print("ROC AUC: {}".format(round(roc_auc_score(y_test, predictions),3)))
  print("-------------")
  TP = np.sum(np.logical_and(predictions == 1, y_test == 1))
  TN = np.sum(np.logical_and(predictions == 0, y_test == 0))
  FP = np.sum(np.logical_and(predictions == 1, y_test == 0))
  FN = np.sum(np.logical_and(predictions == 0, y_test == 1))
  pred = len(predictions)

  print('True Positives: {}'.format(TP))
  print('False Positive: {}'.format(FP))
  print('True Negative: {}'.format(TN))
  print('False Negative: {}'.format(FN))
  print('Precision: {}'.format(round(TP/(TP+FP),3)))
  print('Recall: {}'.format(round(TP/(TP+FN),3)))
  print('Problematic ratio: {}'.format(round(FN/(FN+TP),3)))

# GBDT

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

gbdt = GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.5, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=200,
              random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

%time gbdt.fit(X_train, y_train)

CPU times: user 17.6 s, sys: 35.1 ms, total: 17.6 s
Wall time: 17.6 s


GradientBoostingClassifier(learning_rate=0.5, n_estimators=200)

In [None]:
get_metrics(gbdt, X_test)

-------------
f1 score: 0.856
Accuracy: 0.863
ROC AUC: 0.864
-------------
True Positives: 3779
False Positive: 373
True Negative: 4291
False Negative: 903
Precision: 0.91
Recall: 0.807
Problematic ratio: 0.193


# Random Forest

In [20]:
from sklearn.metrics import f1_score, accuracy_score
from sklearn.ensemble import RandomForestClassifier 

rf = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=600, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

%time rf.fit(X_train, y_train)

CPU times: user 991 ms, sys: 10 ms, total: 1 s
Wall time: 678 ms


RandomForestClassifier(criterion='entropy', n_estimators=10, n_jobs=-1)

In [21]:
get_metrics(rf, X_test)

-------------
f1 score: 0.851
Accuracy: 0.858
ROC AUC: 0.859
-------------
True Positives: 3805
False Positive: 372
True Negative: 4210
False Negative: 959
Precision: 0.911
Recall: 0.799
Problematic ratio: 0.201


# AdaBoost


In [None]:
from sklearn.metrics import f1_score, accuracy_score
from sklearn.ensemble import AdaBoostClassifier 

ada = AdaBoostClassifier(algorithm='SAMME', base_estimator=None, learning_rate=0.5,
          n_estimators=200, random_state=None)

%time ada.fit(X_train, y_train)

CPU times: user 8.35 s, sys: 67.6 ms, total: 8.41 s
Wall time: 8.41 s


AdaBoostClassifier(algorithm='SAMME', learning_rate=0.5, n_estimators=200)

In [None]:
get_metrics(ada, X_test)

-------------
f1 score: 0.842
Accuracy: 0.853
ROC AUC: 0.853
-------------
True Positives: 3669
False Positive: 360
True Negative: 4304
False Negative: 1013
Precision: 0.911
Recall: 0.784
Problematic ratio: 0.216


# XGboost

In [None]:
from xgboost import XGBClassifier

xgb = XGBClassifier()

%time xgb.fit(X_train, y_train)

CPU times: user 3.77 s, sys: 36.9 ms, total: 3.81 s
Wall time: 3.8 s


XGBClassifier()

In [None]:
get_metrics(xgb, X_test)

-------------
f1 score: 0.85
Accuracy: 0.859
ROC AUC: 0.86
-------------
True Positives: 3709
False Positive: 341
True Negative: 4323
False Negative: 973
Precision: 0.916
Recall: 0.792
Problematic ratio: 0.208
