In [1]:
#!pip install xgboost
#!pip install scikit-learn
#!pip install xgboost

import numpy as np
import pandas as pd
import xgboost as xgb
from numpy import loadtxt
from scipy import stats
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import auc as sk_auc
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

In [24]:
# Read the cleaned dataset; the CSV's first column is used as the row index
cleaned_csv_path = "../data/default_clean_v1.csv"
df_cleaned = pd.read_csv(cleaned_csv_path, index_col=0)
df_cleaned

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month,default
0,1,20000,Female,Undergraduate,Married,24,2,2,-1,-1,...,0,0,0,689,0,0,0,0,1,Yes
1,2,120000,Female,Undergraduate,Single,26,-1,2,0,0,...,3455,3261,0,1000,1000,1000,0,2000,1,Yes
2,3,90000,Female,Undergraduate,Single,34,0,0,0,0,...,14948,15549,1518,1500,1000,1000,1000,5000,0,No
3,4,50000,Female,Undergraduate,Married,37,0,0,0,0,...,28959,29547,2000,2019,1200,1100,1069,1000,0,No
4,5,50000,Male,Undergraduate,Married,57,-1,0,-1,0,...,19146,19131,2000,36681,10000,9000,689,679,0,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,29996,220000,Male,High School,Married,39,0,0,0,0,...,31237,15980,8500,20000,5003,3047,5000,1000,0,No
29996,29997,150000,Male,High School,Single,43,-1,-1,-1,-1,...,5190,0,1837,3526,8998,129,0,0,0,No
29997,29998,30000,Male,Undergraduate,Single,37,4,3,2,-1,...,20582,19357,0,0,22000,4200,2000,3100,1,Yes
29998,29999,80000,Male,High School,Married,41,1,-1,0,0,...,11855,48944,85900,3409,1178,1926,52964,1804,1,Yes


In [25]:
# Encode target variable as 0/1 ('Yes' -> 1, anything else -> 0).
# Guarded so the cell can be re-run: an unconditional mapping would turn every
# already-encoded label into 0 on a second run (1 != 'Yes').
target_col = df_cleaned['default']
if not pd.api.types.is_numeric_dtype(target_col):
    df_cleaned = df_cleaned.assign(default=target_col.eq('Yes').astype(int))

# Remove the original numeric label column now that 'default' carries the 0/1 target;
# errors="ignore" makes a repeat run a no-op instead of a KeyError.
df_cleaned = df_cleaned.drop(columns=["default payment next month"], errors="ignore")

In [26]:
# Re-display the frame after the target-encoding cell above
df_cleaned

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default
0,1,20000,Female,Undergraduate,Married,24,2,2,-1,-1,...,0,0,0,0,689,0,0,0,0,1
1,2,120000,Female,Undergraduate,Single,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,3,90000,Female,Undergraduate,Single,34,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,4,50000,Female,Undergraduate,Married,37,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,5,50000,Male,Undergraduate,Married,57,-1,0,-1,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,29996,220000,Male,High School,Married,39,0,0,0,0,...,88004,31237,15980,8500,20000,5003,3047,5000,1000,0
29996,29997,150000,Male,High School,Single,43,-1,-1,-1,-1,...,8979,5190,0,1837,3526,8998,129,0,0,0
29997,29998,30000,Male,Undergraduate,Single,37,4,3,2,-1,...,20878,20582,19357,0,0,22000,4200,2000,3100,1
29998,29999,80000,Male,High School,Married,41,1,-1,0,0,...,52774,11855,48944,85900,3409,1178,1926,52964,1804,1


## Model 1 -- All variables included

In [5]:
# One hot encoding of categorical variables.
# The encoded frame gets its own name so that `df_cleaned` keeps the raw
# "SEX" / "EDUCATION" / "MARRIAGE" columns: the Model 2 and Model 3 cells below
# drop those columns by name and would raise KeyError if they had already been
# replaced by dummy columns.
categorical_cols = ["SEX", "EDUCATION", "MARRIAGE"]
df_model1 = pd.get_dummies(df_cleaned, columns=categorical_cols)

# Features: everything except the target and the row identifier
X = df_model1.drop(columns=['default', 'ID'])
# Target kept as a one-column DataFrame, as in the other model cells
y = df_model1[['default']]

## Model 2 -- Excluding Protected Variables (age, education, sex, marriage)

In [18]:
# Features: remove the target, the row identifier and all four protected variables
model2_excluded = ['default', 'ID', "SEX", "EDUCATION", "AGE", "MARRIAGE"]
X = df_cleaned.drop(columns=model2_excluded)
# Target as a one-column DataFrame
y = df_cleaned.loc[:, ['default']]

## Model 3 -- Excluding Protected Variables (age, education, sex)

In [27]:
# Features: remove the target, the row identifier and three protected variables;
# "MARRIAGE" is kept here and one-hot encoded in the next cell
model3_excluded = ['default', 'ID', "SEX", "EDUCATION", "AGE"]
X = df_cleaned.drop(columns=model3_excluded)
# Target as a one-column DataFrame
y = df_cleaned.loc[:, ['default']]

In [28]:
# Expand the remaining categorical feature into indicator (dummy) columns
categorical_cols = ["MARRIAGE"]
X = pd.get_dummies(data=X, columns=categorical_cols)

## Evaluation Metrics

In [29]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible
split_options = {"test_size": 0.30, "random_state": 12}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [30]:
# Train an XGBoost classifier with the library's default hyperparameters
# (fit returns the fitted estimator itself)
model = xgb.XGBClassifier().fit(X_train, y_train)

# Hard class predictions for the held-out rows
y_pred = model.predict(X_test)

In [31]:
# Accuracy on the held-out rows, reported to three decimals
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: ", round(test_accuracy, 3))

Accuracy:  0.808


In [32]:
# Precision, Recall, and F1-Score on the held-out rows, each rounded to three decimals
labelled_scorers = (
    ("Precision: ", precision_score),
    ("Recall: ", recall_score),
    ("F1: ", f1_score),
)
for label, scorer in labelled_scorers:
    print(label, round(scorer(y_test, y_pred), 3))

Precision:  0.615
Recall:  0.359
F1:  0.453


In [23]:
# Feature importance: pair each training column with the fitted model's importance score
feature_names = X_train.columns.tolist()
importance = pd.DataFrame({'Feature': feature_names, 'Importance': model.feature_importances_})
# Show the table ordered from most to least important (the stored frame stays unsorted)
importance.sort_values(by='Importance', ascending=False)

Unnamed: 0,Feature,Importance
1,PAY_0,0.389861
2,PAY_2,0.123371
3,PAY_3,0.060994
6,PAY_6,0.038466
5,PAY_5,0.036866
4,PAY_4,0.036854
0,LIMIT_BAL,0.027243
14,PAY_AMT2,0.02634
7,BILL_AMT1,0.025949
15,PAY_AMT3,0.024948


In [12]:
# AUC: area under the ROC curve built from the predicted positive-class probabilities
predicted_prob = model.predict_proba(X_test)[:,1]
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test.values, predicted_prob, pos_label = 1)
# Call the function through its `sk_auc` alias: the assignment below rebinds the
# name `auc` to a float, so calling `auc(...)` here would fail on a second run.
roc_auc = sk_auc(false_positive_rate, true_positive_rate)
auc = roc_auc  # kept under this name because the Somers' D cell reads `auc`
round(roc_auc, 3)

0.766

In [13]:
# Somers' D, computed from the AUC score as 2*AUC - 1.
# `auc` here is the numeric score assigned in the AUC cell, not the sklearn function.
somersd = 2 * auc - 1
round(somersd, 3)

0.533

In [14]:
# KS: two-sample Kolmogorov-Smirnov test comparing the predicted probabilities
# of the actual class-0 rows against those of the actual class-1 rows
df = pd.DataFrame(
    {
        'real': y_test.values.ravel(),  # flatten so a one-column frame or a Series both work
        'proba': predicted_prob,
    },
    index=y_test.index,
)
class0 = df[df['real'] == 0]
class1 = df[df['real'] == 1]

# Run the test once and read both fields from the same result object
ks_result = stats.ks_2samp(class0['proba'], class1['proba'])
ks = ks_result.statistic
p = ks_result.pvalue
print("The KS is", round(ks,3), " (p-value:", round(p,3),")")

The KS is 0.411  (p-value: 0.0 )
