# Week 16 Project:
## Random Forest - Credit Card Default

Source: https://archive.ics.uci.edu/ml/datasets/default+of+credit+card+clients

In [44]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import auc, roc_curve
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier

In [23]:
# Import data from UCI ML Repo
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00350/default%20of%20credit%20card%20clients.xls' 

df = pd.read_excel(io = url, header = 1)

df.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,0,689,0,0,0,0,1
1,2,120000,2,2,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,3,90000,2,2,2,34,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,4,50000,2,2,1,37,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,5,50000,1,2,1,57,-1,0,-1,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


### Attribute Information:
* This research employed a binary variable, default payment (Yes = 1, No = 0), as the response variable. This study reviewed the literature and used the following 23 variables as explanatory variables:*
 - X1: Amount of the given credit (NT dollar): it includes both the individual consumer credit and his/her family (supplementary) credit.
 - X2: Gender (1 = male; 2 = female).
 - X3: Education (1 = graduate school; 2 = university; 3 = high school; 4 = others).
 - X4: Marital status (1 = married; 2 = single; 3 = others).
 - X5: Age (year).
 - X6 - X11: History of past payment. We tracked the past monthly payment records (from April to September, 2005) as follows: X6 = the repayment status in September, 2005; X7 = the repayment status in August, 2005; . . .;X11 = the repayment status in April, 2005. The measurement scale for the repayment status is: -1 = pay duly; 1 = payment delay for one month; 2 = payment delay for two months; . . .; 8 = payment delay for eight months; 9 = payment delay for nine months and above.
 - X12-X17: Amount of bill statement (NT dollar). X12 = amount of bill statement in September, 2005; X13 = amount of bill statement in August, 2005; . . .; X17 = amount of bill statement in April, 2005.
 - X18-X23: Amount of previous payment (NT dollar). X18 = amount paid in September, 2005; X19 = amount paid in August, 2005; . . .;X23 = amount paid in April, 2005.

### EDA

In [24]:
df.describe()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
count,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,...,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0
mean,15000.5,167484.322667,1.603733,1.853133,1.551867,35.4855,-0.0167,-0.133767,-0.1662,-0.220667,...,43262.948967,40311.400967,38871.7604,5663.5805,5921.163,5225.6815,4826.076867,4799.387633,5215.502567,0.2212
std,8660.398374,129747.661567,0.489129,0.790349,0.52197,9.217904,1.123802,1.197186,1.196868,1.169139,...,64332.856134,60797.15577,59554.107537,16563.280354,23040.87,17606.96147,15666.159744,15278.305679,17777.465775,0.415062
min,1.0,10000.0,1.0,0.0,0.0,21.0,-2.0,-2.0,-2.0,-2.0,...,-170000.0,-81334.0,-339603.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,7500.75,50000.0,1.0,1.0,1.0,28.0,-1.0,-1.0,-1.0,-1.0,...,2326.75,1763.0,1256.0,1000.0,833.0,390.0,296.0,252.5,117.75,0.0
50%,15000.5,140000.0,2.0,2.0,2.0,34.0,0.0,0.0,0.0,0.0,...,19052.0,18104.5,17071.0,2100.0,2009.0,1800.0,1500.0,1500.0,1500.0,0.0
75%,22500.25,240000.0,2.0,2.0,2.0,41.0,0.0,0.0,0.0,0.0,...,54506.0,50190.5,49198.25,5006.0,5000.0,4505.0,4013.25,4031.5,4000.0,0.0
max,30000.0,1000000.0,2.0,6.0,3.0,79.0,8.0,8.0,8.0,8.0,...,891586.0,927171.0,961664.0,873552.0,1684259.0,896040.0,621000.0,426529.0,528666.0,1.0


In [25]:
df['default payment next month'].value_counts()

0    23364
1     6636
Name: default payment next month, dtype: int64

In [26]:
# Check target balance
(df['default payment next month']==1).sum() / len(df)

0.2212

In [27]:
df.columns


Index(['ID', 'LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0',
       'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2',
       'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1',
       'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6',
       'default payment next month'],
      dtype='object')

In [28]:
# Fill zero values with mean
avg_pay_hist_cols = ['PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']

for col in avg_pay_hist_cols:
    df[col]=df[col].replace(0,df[col].mean())
    
df[avg_pay_hist_cols].describe()
    

Unnamed: 0,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6
count,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0
mean,-0.024904,-0.203905,-0.253533,-0.341702,-0.416576,-0.449128
std,1.123711,1.191198,1.187581,1.151315,1.105201,1.118702
min,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0
25%,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
50%,-0.0167,-0.133767,-0.1662,-0.220667,-0.2662,-0.2911
75%,-0.0167,-0.133767,-0.1662,-0.220667,-0.2662,-0.2911
max,8.0,8.0,8.0,8.0,8.0,8.0


In [29]:
# Create average pay history columns by averaging all associated columns
df['avg_pay_history'] = df[avg_pay_hist_cols].mean(axis=1)



In [31]:
#drop columns
df.drop(columns=avg_pay_hist_cols, inplace = True)


df.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month,avg_pay_history
0,1,20000,2,2,1,24,3913,3102,689,0,0,0,0,689,0,0,0,0,1,-0.333333
1,2,120000,2,2,2,26,2682,1725,2682,3272,3455,3261,0,1000,1000,1000,0,2000,1,0.391156
2,3,90000,2,2,2,34,29239,14027,13559,14331,14948,15549,1518,1500,1000,1000,1000,5000,0,-0.182439
3,4,50000,2,2,1,37,46990,48233,49291,28314,28959,29547,2000,2019,1200,1100,1069,1000,0,-0.182439
4,5,50000,1,2,1,57,8617,5670,35835,20940,19146,19131,2000,36681,10000,9000,689,679,0,-0.485289


In [32]:
df.columns

Index(['ID', 'LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'BILL_AMT1',
       'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6',
       'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6',
       'default payment next month', 'avg_pay_history'],
      dtype='object')

In [34]:
# Fill zero values (bill amount)  with mean
avg_bill_amt_cols = ['BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6']

for col in avg_bill_amt_cols:
    df[col]=df[col].replace(0,df[col].mean())
    
df[avg_bill_amt_cols].describe()

# Create average bill amount columns by averaging all associated columns
df['avg_bill_amt'] = df[avg_bill_amt_cols].mean(axis=1)

#drop columns
df.drop(columns=avg_bill_amt_cols, inplace = True)


df.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month,avg_pay_history,avg_bill_amt
0,1,20000,2,2,1,24,0,689,0,0,0,0,1,-0.333333,21691.685056
1,2,120000,2,2,2,26,0,1000,1000,1000,0,2000,1,0.391156,2846.166667
2,3,90000,2,2,2,34,1518,1500,1000,1000,1000,5000,0,-0.182439,16942.166667
3,4,50000,2,2,1,37,2000,2019,1200,1100,1069,1000,0,-0.182439,38555.666667
4,5,50000,1,2,1,57,2000,36681,10000,9000,689,679,0,-0.485289,18223.166667


In [36]:
# Fill zero values (pay amount)  with mean
avg_pay_amt_cols = ['PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6']

for col in avg_pay_amt_cols:
    df[col]=df[col].replace(0,df[col].mean())
    
df[avg_pay_amt_cols].describe()

# Create average pay amount columns by averaging all associated columns
df['avg_pay_amt'] = df[avg_pay_amt_cols].mean(axis=1)

#drop columns
df.drop(columns=avg_pay_amt_cols, inplace = True)


df.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,default payment next month,avg_pay_history,avg_bill_amt,avg_pay_amt
0,1,20000,2,2,1,24,1,-0.333333,21691.685056,4403.204844
1,2,120000,2,2,2,26,1,0.391156,2846.166667,2577.161356
2,3,90000,2,2,2,34,0,-0.182439,16942.166667,1836.333333
3,4,50000,2,2,1,37,0,-0.182439,38555.666667,1398.0
4,5,50000,1,2,1,57,0,-0.485289,18223.166667,9841.5


In [37]:
df.dtypes

ID                              int64
LIMIT_BAL                       int64
SEX                             int64
EDUCATION                       int64
MARRIAGE                        int64
AGE                             int64
default payment next month      int64
avg_pay_history               float64
avg_bill_amt                  float64
avg_pay_amt                   float64
dtype: object

### Modeling

In [40]:
#Define X & y
X = df.drop('default payment next month', axis=1)
y = df['default payment next month']

print(X.shape)
print(y.shape)

(30000, 9)
(30000,)


In [42]:
y.value_counts()

0    23364
1     6636
Name: default payment next month, dtype: int64

In [43]:
# Train Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=24)

#### Perform GridSearchCV

In [45]:
# Instantiate and fit a RandomForestClassifier
ran_forest = RandomForestClassifier(random_state=24)

In [46]:
# generate param grid to perform gridSearch
param_grid = { 
    'n_estimators': [200, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['gini', 'entropy']
}

In [48]:
CV_rfc = GridSearchCV(estimator=ran_forest, param_grid=param_grid, cv= 5)
CV_rfc.fit(X_train, y_train)

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


In [49]:
CV_rfc.best_params_

{'criterion': 'gini',
 'max_depth': 8,
 'max_features': 'auto',
 'n_estimators': 200}

#### Fit & Train Random Forest

In [50]:
#Generate model with best params identified during grid search
rand_forest_f = RandomForestClassifier(max_features = 'auto', n_estimators=200, max_depth=8, criterion='gini', random_state=24)


In [51]:
#Fit model
rand_forest_f.fit(X_train, y_train)

  warn(


In [52]:
#predict on model
preds = rand_forest_f.predict(X_test)
print("Accuracy for Random Forest on CV data: ",accuracy_score(y_test,preds))

Accuracy for Random Forest on CV data:  0.8108
