In [1]:
# Dependencies (uncomment to install them if they are missing):
#!pip install xgboost
#!pip install scikit-learn

import numpy as np
from numpy import loadtxt
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import roc_curve, auc
from scipy import stats
import pandas as pd
import xgboost as xgb
import pickle

In [19]:
# Read the cleaned dataset from disk, using the first CSV column as the row
# index, then show the frame as this cell's output.
df_cleaned = pd.read_csv(
    "../data/default_clean_v1.csv",
    index_col=0,
)
df_cleaned

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month,default
0,1,20000,Female,Undergraduate,Married,24,2,2,-1,-1,...,0,0,0,689,0,0,0,0,1,Yes
1,2,120000,Female,Undergraduate,Single,26,-1,2,0,0,...,3455,3261,0,1000,1000,1000,0,2000,1,Yes
2,3,90000,Female,Undergraduate,Single,34,0,0,0,0,...,14948,15549,1518,1500,1000,1000,1000,5000,0,No
3,4,50000,Female,Undergraduate,Married,37,0,0,0,0,...,28959,29547,2000,2019,1200,1100,1069,1000,0,No
4,5,50000,Male,Undergraduate,Married,57,-1,0,-1,0,...,19146,19131,2000,36681,10000,9000,689,679,0,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,29996,220000,Male,High School,Married,39,0,0,0,0,...,31237,15980,8500,20000,5003,3047,5000,1000,0,No
29996,29997,150000,Male,High School,Single,43,-1,-1,-1,-1,...,5190,0,1837,3526,8998,129,0,0,0,No
29997,29998,30000,Male,Undergraduate,Single,37,4,3,2,-1,...,20582,19357,0,0,22000,4200,2000,3100,1,Yes
29998,29999,80000,Male,High School,Married,41,1,-1,0,0,...,11855,48944,85900,3409,1178,1926,52964,1804,1,Yes


In [20]:
# Encode target variable: 'Yes' -> 1, anything else -> 0.
# Only encode while the column still holds labels. Without this guard a re-run
# of the cell would compare the already-encoded integers with 'Yes' and
# silently overwrite every label with 0.
if not pd.api.types.is_numeric_dtype(df_cleaned['default']):
    df_cleaned['default'] = df_cleaned['default'].eq('Yes').astype('int64')

# Remove the "default payment next month" column so that it does not end up
# among the features later built from df_cleaned. errors="ignore" keeps the
# cell re-runnable once the column is already gone.
df_cleaned = df_cleaned.drop(columns=["default payment next month"], errors="ignore")

In [21]:
# Display df_cleaned as the cell output to inspect its current columns.
df_cleaned

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default
0,1,20000,Female,Undergraduate,Married,24,2,2,-1,-1,...,0,0,0,0,689,0,0,0,0,1
1,2,120000,Female,Undergraduate,Single,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,3,90000,Female,Undergraduate,Single,34,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,4,50000,Female,Undergraduate,Married,37,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,5,50000,Male,Undergraduate,Married,57,-1,0,-1,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,29996,220000,Male,High School,Married,39,0,0,0,0,...,88004,31237,15980,8500,20000,5003,3047,5000,1000,0
29996,29997,150000,Male,High School,Single,43,-1,-1,-1,-1,...,8979,5190,0,1837,3526,8998,129,0,0,0
29997,29998,30000,Male,Undergraduate,Single,37,4,3,2,-1,...,20878,20582,19357,0,0,22000,4200,2000,3100,1
29998,29999,80000,Male,High School,Married,41,1,-1,0,0,...,52774,11855,48944,85900,3409,1178,1926,52964,1804,1


## Model 1 -- All variables included

In [14]:
# One hot encoding of categorical variables.
# NOTE: this rebinds df_cleaned, so from this cell on the "SEX", "EDUCATION"
# and "MARRIAGE" columns are replaced by their dummy columns.
# Only the columns that are still present are encoded, so re-running the cell
# is a no-op instead of a KeyError.
categorical_cols = [
    col for col in ["SEX", "EDUCATION", "MARRIAGE"] if col in df_cleaned.columns
]
if categorical_cols:
    df_cleaned = pd.get_dummies(df_cleaned, columns=categorical_cols)

In [15]:
# Feature matrix: every column except the target and the row identifier.
X = df_cleaned.drop(columns=['default', 'ID'])
# Target, kept as a single-column DataFrame (list selector).
y = df_cleaned.loc[:, ['default']]

In [16]:
# Hold out 30% of the rows for testing; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=12,
)

In [17]:
# Remember the feature names (and their order) used for training.
col_X_train = X_train.columns

# Fit an XGBoost classifier with its default settings.
model = xgb.XGBClassifier()
model.fit(X_train, y_train)

# Test-set outputs: the second column of predict_proba (presumably the
# probability of class 1 -- confirm against model.classes_) and the
# predicted labels.
predicted_prob = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)

# Bundle the evaluation artefacts into one tuple.
data_to_store = (
    y_test,
    y_pred,
    predicted_prob,
    col_X_train,
)

In [18]:
# Save the fitted model and its evaluation data.
# Context managers close each file right after it is written, so the pickles
# are flushed to disk and no file handle is left open in the kernel.
with open("../model/Model1.pkl", "wb") as model_file:
    pickle.dump(model, model_file)
with open('../model/Model1_data.pkl', 'wb') as data_file:
    pickle.dump(data_to_store, data_file)

## Model 2 -- Excluding Protected Variables (age, education, sex, marriage)

In [5]:
# Protected attributes to leave out of the feature matrix. A column is
# excluded when it is the attribute itself or a one-hot column derived from it
# (e.g. "SEX_Female"), so this cell works whether or not df_cleaned has
# already been one-hot encoded by an earlier cell.
protected_attrs = ("SEX", "EDUCATION", "AGE", "MARRIAGE")
excluded_cols = [
    col
    for col in df_cleaned.columns
    if col in ("default", "ID")
    or any(col == attr or col.startswith(attr + "_") for attr in protected_attrs)
]
X = df_cleaned.drop(columns=excluded_cols)
# Target, kept as a single-column DataFrame.
y = df_cleaned[['default']]

In [6]:
# 70/30 train/test partition with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.30,
    random_state=12,
)

In [7]:
# Keep the training feature names alongside the predictions.
col_X_train = X_train.columns

# Default-configured XGBoost classifier trained on the training split.
model = xgb.XGBClassifier()
model.fit(X_train, y_train)

# Predictions on the held-out split: second predict_proba column (presumably
# the class-1 probability -- confirm against model.classes_) and hard labels.
predicted_prob = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)

# Tuple of evaluation artefacts for this model.
data_to_store = (
    y_test,
    y_pred,
    predicted_prob,
    col_X_train,
)

In [8]:
# Save the fitted model and its evaluation data.
# Each file is opened in a with-block so it is closed (and flushed) as soon as
# the pickle has been written, instead of leaving the handle open.
with open("../model/Model2.pkl", "wb") as model_file:
    pickle.dump(model, model_file)
with open('../model/Model2_data.pkl', 'wb') as data_file:
    pickle.dump(data_to_store, data_file)

## Model 3 -- Excluding Protected Variables (age, education, sex)

In [22]:
# Protected attributes to leave out of the feature matrix (MARRIAGE is kept
# for this model). A column is excluded when it is the attribute itself or a
# one-hot column derived from it (e.g. "EDUCATION_High School"), so the drop
# does not raise a KeyError if df_cleaned has already been one-hot encoded.
protected_attrs = ("SEX", "EDUCATION", "AGE")
excluded_cols = [
    col
    for col in df_cleaned.columns
    if col in ("default", "ID")
    or any(col == attr or col.startswith(attr + "_") for attr in protected_attrs)
]
X = df_cleaned.drop(columns=excluded_cols)
# Target, kept as a single-column DataFrame.
y = df_cleaned[['default']]

In [23]:
# One hot encoding of categorical variables.
# Encode only the columns that are still present in X: if "MARRIAGE" has
# already been replaced by its dummy columns (for example because this cell
# is being re-run), there is nothing left to encode and X is left unchanged
# instead of raising a KeyError.
categorical_cols = [col for col in ["MARRIAGE"] if col in X.columns]
if categorical_cols:
    X = pd.get_dummies(X, columns=categorical_cols)

In [24]:
# Partition features and target into train and test sets (30% test), seeded
# for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=12,
    test_size=0.30,
)

In [25]:
# Column labels of the training features, stored with the predictions.
col_X_train = X_train.columns

# XGBoost classifier with default hyper-parameters, fitted on the train split.
model = xgb.XGBClassifier()
model.fit(X_train, y_train)

# Held-out predictions: second predict_proba column (presumably the class-1
# probability -- confirm against model.classes_) plus the predicted labels.
predicted_prob = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)

# Evaluation artefacts gathered into a single tuple.
data_to_store = (
    y_test,
    y_pred,
    predicted_prob,
    col_X_train,
)

In [26]:
# Save the fitted model and its evaluation data.
# The with-blocks guarantee each output file is closed once its pickle has
# been written, rather than leaving the handle open in the kernel.
with open("../model/Model3.pkl", "wb") as model_file:
    pickle.dump(model, model_file)
with open('../model/Model3_data.pkl', 'wb') as data_file:
    pickle.dump(data_to_store, data_file)