In [39]:
import numpy as np
import pandas as pd
import seaborn as sns
%matplotlib inline

In [40]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics

In [41]:
df = pd.read_csv('credit.csv')
df.head()

Unnamed: 0,checking_balance,months_loan_duration,credit_history,purpose,amount,savings_balance,employment_duration,percent_of_income,years_at_residence,age,other_credit,housing,existing_loans_count,job,dependents,phone,default
0,< 0 DM,6,critical,furniture/appliances,1169,unknown,> 7 years,4,4,67,none,own,2,skilled,1,yes,no
1,1 - 200 DM,48,good,furniture/appliances,5951,< 100 DM,1 - 4 years,2,2,22,none,own,1,skilled,1,no,yes
2,unknown,12,critical,education,2096,< 100 DM,4 - 7 years,2,3,49,none,own,1,unskilled,2,no,no
3,< 0 DM,42,good,furniture/appliances,7882,< 100 DM,4 - 7 years,2,4,45,none,other,1,skilled,2,no,no
4,< 0 DM,24,poor,car,4870,< 100 DM,1 - 4 years,3,4,53,none,other,2,skilled,2,no,yes


In [42]:
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
months_loan_duration,1000.0,20.903,12.058814,4.0,12.0,18.0,24.0,72.0
amount,1000.0,3271.258,2822.736876,250.0,1365.5,2319.5,3972.25,18424.0
percent_of_income,1000.0,2.973,1.118715,1.0,2.0,3.0,4.0,4.0
years_at_residence,1000.0,2.845,1.103718,1.0,2.0,3.0,4.0,4.0
age,1000.0,35.546,11.375469,19.0,27.0,33.0,42.0,75.0
existing_loans_count,1000.0,1.407,0.577654,1.0,1.0,1.0,2.0,4.0
dependents,1000.0,1.155,0.362086,1.0,1.0,1.0,1.0,2.0


In [44]:
df.shape

(1000, 17)

In [45]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 17 columns):
checking_balance        1000 non-null object
months_loan_duration    1000 non-null int64
credit_history          1000 non-null object
purpose                 1000 non-null object
amount                  1000 non-null int64
savings_balance         1000 non-null object
employment_duration     1000 non-null object
percent_of_income       1000 non-null int64
years_at_residence      1000 non-null int64
age                     1000 non-null int64
other_credit            1000 non-null object
housing                 1000 non-null object
existing_loans_count    1000 non-null int64
job                     1000 non-null object
dependents              1000 non-null int64
phone                   1000 non-null object
default                 1000 non-null object
dtypes: int64(7), object(10)
memory usage: 132.9+ KB


In [46]:
for col in df.columns:
    if df[col].dtype == 'object':
        df[col] = pd.Categorical(df[col]).codes

In [47]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 17 columns):
checking_balance        1000 non-null int8
months_loan_duration    1000 non-null int64
credit_history          1000 non-null int8
purpose                 1000 non-null int8
amount                  1000 non-null int64
savings_balance         1000 non-null int8
employment_duration     1000 non-null int8
percent_of_income       1000 non-null int64
years_at_residence      1000 non-null int64
age                     1000 non-null int64
other_credit            1000 non-null int8
housing                 1000 non-null int8
existing_loans_count    1000 non-null int64
job                     1000 non-null int8
dependents              1000 non-null int64
phone                   1000 non-null int8
default                 1000 non-null int8
dtypes: int64(7), int8(10)
memory usage: 64.5 KB


In [48]:
# splitting data into train and test set
X = df.drop('default', axis=1)
y = df.pop('default')

In [51]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=1, test_size=0.30)

In [52]:
model = DecisionTreeClassifier(criterion='entropy')

In [54]:
model.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [70]:
print(pd.DataFrame(model.feature_importances_,columns=['Imp'], index=X_train.columns))

                           Imp
checking_balance      0.117763
months_loan_duration  0.076999
credit_history        0.066657
purpose               0.066562
amount                0.181946
savings_balance       0.080218
employment_duration   0.038207
percent_of_income     0.030094
years_at_residence    0.054601
age                   0.131190
other_credit          0.045786
housing               0.007996
existing_loans_count  0.024294
job                   0.033347
dependents            0.025472
phone                 0.018867


In [90]:
from IPython.display import Image
from sklearn import tree
from os import system

credit_tree_file = open('credit_tree.dot', 'w')
dot_data = tree.export_graphviz(model, out_file=credit_tree_file, feature_names=list(X_train), class_names=str(list(y_train)))
credit_tree_file.close()

In [91]:
model.score(X_test, y_test)

0.6766666666666666

In [92]:
model.score(X_train, y_train)

1.0

In [94]:
y_predict = model.predict(X_test)
metrics.confusion_matrix(y_test, y_predict)

array([[166,  48],
       [ 49,  37]])

## Regularization

In [95]:
model = DecisionTreeClassifier(criterion='entropy', random_state=1, max_depth=5)
model.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=5,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=1, splitter='best')

In [97]:
credit_tree_file = open('credit_tree.dot', 'w')
dot_data = tree.export_graphviz(model, out_file=credit_tree_file, feature_names=list(X_train), class_names=str(list(y_train)))
credit_tree_file.close()

In [98]:
model.score(X_test, y_test)

0.7466666666666667

In [99]:
model.score(X_train, y_train)

0.7885714285714286

In [100]:
y_pred = model.predict(X_test)
metrics.confusion_matrix(y_test, y_pred)

array([[195,  19],
       [ 57,  29]])