#Import Packages and data loading

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from pydotplus import graph_from_dot_data
from sklearn.tree import export_graphviz
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

In [3]:
data = pd.read_csv("/content/ccdefault.csv", index_col = 'ID')


In [4]:
data

Unnamed: 0_level_0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,DEFAULT
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,20000,2,2,1,24,2,2,-1,-1,-2,...,0,0,0,0,689,0,0,0,0,1
2,120000,2,2,2,26,-1,2,0,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
3,90000,2,2,2,34,0,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
4,50000,2,2,1,37,0,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
5,50000,1,2,1,57,-1,0,-1,0,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29996,220000,1,3,1,39,0,0,0,0,0,...,88004,31237,15980,8500,20000,5003,3047,5000,1000,0
29997,150000,1,3,2,43,-1,-1,-1,-1,0,...,8979,5190,0,1837,3526,8998,129,0,0,0
29998,30000,1,2,2,37,4,3,2,-1,0,...,20878,20582,19357,0,0,22000,4200,2000,3100,1
29999,80000,1,3,1,41,1,-1,0,0,0,...,52774,11855,48944,85900,3409,1178,1926,52964,1804,1


In [5]:
x = data.drop("DEFAULT", axis =1).values
y = data["DEFAULT"].values

1 for defaults and 0 for non-defaults

#Train - Test Split

In [9]:
RANDOM_STATE = []
DEPTH_RES = []
IN_ACC_RES = []
OUT_ACC_RES = []
best_depth = 0
accuracy_in = 0
accuracy_out = 0
for rnd_state in range(1, 11):
  X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.1, random_state = rnd_state, stratify =y)
  for i in range(1,15):
    tree = DecisionTreeClassifier(criterion = 'gini', max_depth =i,random_state=1)
    tree.fit(X_train, y_train)
    y_pred_train = tree.predict(X_train)
    y_pred = tree.predict(X_test)
    temp_accuracy_in = metrics.accuracy_score(y_train, y_pred_train)
    temp_accuracy_out = metrics.accuracy_score(y_test, y_pred)
    if (accuracy_out <= temp_accuracy_out):
      accuracy_out = temp_accuracy_out
      accuracy_in = temp_accuracy_in
      best_depth = i
  

  RANDOM_STATE.append(rnd_state) 
  DEPTH_RES.append(best_depth)
  IN_ACC_RES.append(accuracy_in)
  OUT_ACC_RES.append(accuracy_out)



In [13]:
data_display = {
    "Random State" : RANDOM_STATE,
    "Max Depth" : DEPTH_RES,
    "In-sample accuracy" : IN_ACC_RES,
    "Out-sample accuracy" : OUT_ACC_RES
}
data_display = pd.DataFrame(data_display)
data_display = data_display.set_index('Random State', drop = True)
display(data_display)

Unnamed: 0_level_0,Max Depth,In-sample accuracy,Out-sample accuracy
Random State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,4,0.822556,0.828333
2,4,0.822556,0.828333
3,4,0.822556,0.828333
4,4,0.822556,0.828333
5,4,0.822556,0.828333
6,4,0.822556,0.828333
7,4,0.822556,0.828333
8,4,0.822556,0.828333
9,4,0.822556,0.828333
10,4,0.822556,0.828333


In [14]:
data_display2 = {
    "In-sample Scores" : [np.mean(IN_ACC_RES), np.std(IN_ACC_RES)],
    "Out-sample Scores" : [np.mean(OUT_ACC_RES), np.std(OUT_ACC_RES)]
}
data_display2 = pd.DataFrame(data_display2, index=['mean', 'std'])
display(data_display2)

Unnamed: 0,In-sample Scores,Out-sample Scores
mean,0.8225556,0.828333
std,1.110223e-16,0.0


#Cross validation

In [18]:
kf = StratifiedKFold(n_splits = 10)
tree = DecisionTreeClassifier(max_depth = best_depth, min_samples_leaf = 4, min_samples_split= 2, random_state = 42)
cv_scores = cross_val_score(tree, x, y, cv=kf)


In [20]:
data_display_cv = {
    "Fold" : list(range(1, 11)),
    "Scores" : cv_scores
}
data_display_cv = pd.DataFrame(data_display_cv)
data_display_cv = data_display_cv.set_index('Fold', drop = True)
display(data_display_cv)

Unnamed: 0_level_0,Scores
Fold,Unnamed: 1_level_1
1,0.809667
2,0.808333
3,0.817333
4,0.811
5,0.819
6,0.828
7,0.833333
8,0.831667
9,0.827667
10,0.823667


In [23]:
data_display2_cv = {
    "Metric" : list(['Mean','Std']),
    "CV_Scores" : [np.mean(cv_scores), np.std(cv_scores)]
}
data_display2_cv = pd.DataFrame(data_display2_cv)
data_display2_cv = data_display2_cv.set_index('Metric', drop = True)
display(data_display2_cv)

Unnamed: 0_level_0,CV_Scores
Metric,Unnamed: 1_level_1
Mean,0.820967
Std,0.008791


#Conclusion



*   Max depth for tree used is 4.
*   We observe very similar in-sample and out-sample results.
*   CV_scores results look promising and since we did not have to reiterate or run a for loop to do same implementation to get different random states, the code performs tuning on it's own and therefore it is more efficient.





#Signing 

Name : Ananya Singh

NetID : as133

I hereby certify that I have read the University policy on Academic Integrity and that I am not in violation.