In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
%matplotlib inline

In [2]:
from sklearn.tree import DecisionTreeClassifier

In [3]:
df = pd.read_csv('credit.csv')
df.head()

Unnamed: 0,checking_balance,months_loan_duration,credit_history,purpose,amount,savings_balance,employment_duration,percent_of_income,years_at_residence,age,other_credit,housing,existing_loans_count,job,dependents,phone,default
0,< 0 DM,6,critical,furniture/appliances,1169,unknown,> 7 years,4,4,67,none,own,2,skilled,1,yes,no
1,1 - 200 DM,48,good,furniture/appliances,5951,< 100 DM,1 - 4 years,2,2,22,none,own,1,skilled,1,no,yes
2,unknown,12,critical,education,2096,< 100 DM,4 - 7 years,2,3,49,none,own,1,unskilled,2,no,no
3,< 0 DM,42,good,furniture/appliances,7882,< 100 DM,4 - 7 years,2,4,45,none,other,1,skilled,2,no,no
4,< 0 DM,24,poor,car,4870,< 100 DM,1 - 4 years,3,4,53,none,other,2,skilled,2,no,yes


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 17 columns):
checking_balance        1000 non-null object
months_loan_duration    1000 non-null int64
credit_history          1000 non-null object
purpose                 1000 non-null object
amount                  1000 non-null int64
savings_balance         1000 non-null object
employment_duration     1000 non-null object
percent_of_income       1000 non-null int64
years_at_residence      1000 non-null int64
age                     1000 non-null int64
other_credit            1000 non-null object
housing                 1000 non-null object
existing_loans_count    1000 non-null int64
job                     1000 non-null object
dependents              1000 non-null int64
phone                   1000 non-null object
default                 1000 non-null object
dtypes: int64(7), object(10)
memory usage: 132.9+ KB


In [5]:
for col in df.columns:
    if df[col].dtype == 'object':
        df[col] = pd.Categorical(df[col]).codes

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 17 columns):
checking_balance        1000 non-null int8
months_loan_duration    1000 non-null int64
credit_history          1000 non-null int8
purpose                 1000 non-null int8
amount                  1000 non-null int64
savings_balance         1000 non-null int8
employment_duration     1000 non-null int8
percent_of_income       1000 non-null int64
years_at_residence      1000 non-null int64
age                     1000 non-null int64
other_credit            1000 non-null int8
housing                 1000 non-null int8
existing_loans_count    1000 non-null int64
job                     1000 non-null int8
dependents              1000 non-null int64
phone                   1000 non-null int8
default                 1000 non-null int8
dtypes: int64(7), int8(10)
memory usage: 64.5 KB


In [7]:
train_set = df.head(700)
test_set = df.tail(300)

train_labels = train_set.pop('default')
test_labels = test_set.pop('default')

In [8]:
model = DecisionTreeClassifier(criterion='entropy')

In [9]:
model.fit(train_set,train_labels)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [10]:
model.score(test_set, test_labels)

0.6933333333333334

In [11]:
model.score(train_set, train_labels)

1.0

### Bagging

In [12]:
credit_labels = df.pop('default')

In [13]:
from sklearn.ensemble import BaggingClassifier

In [14]:
bgcl = BaggingClassifier(n_estimators=50, max_samples=0.8, oob_score=True)
# oob_score: out of bag score. Do you want to run ensemble on out of bag data set
# max_samples: of the 100% records i original dataframe, use 80% to create data in ensemble model


In [15]:
bgcl.fit(df, credit_labels)

BaggingClassifier(base_estimator=None, bootstrap=True, bootstrap_features=False,
                  max_features=1.0, max_samples=0.8, n_estimators=50,
                  n_jobs=None, oob_score=True, random_state=None, verbose=0,
                  warm_start=False)

In [16]:
bgcl.oob_score_

0.755

### Regularization

In [17]:
model = DecisionTreeClassifier(criterion='entropy', max_depth=5, max_leaf_nodes=5)

In [18]:
model.fit(train_set, train_labels)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=5,
                       max_features=None, max_leaf_nodes=5,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [19]:
model.score(test_set, test_labels)

0.72

In [20]:
model.score(train_set, train_labels)

0.7371428571428571

### Ada Boosting

In [21]:
from sklearn.ensemble import AdaBoostClassifier

In [22]:
abc1 = AdaBoostClassifier(base_estimator=model, n_estimators=50)
# abc1 = AdaBoostClassifier(n_estimators=50)

In [23]:
abc1.fit(train_set, train_labels)

AdaBoostClassifier(algorithm='SAMME.R',
                   base_estimator=DecisionTreeClassifier(class_weight=None,
                                                         criterion='entropy',
                                                         max_depth=5,
                                                         max_features=None,
                                                         max_leaf_nodes=5,
                                                         min_impurity_decrease=0.0,
                                                         min_impurity_split=None,
                                                         min_samples_leaf=1,
                                                         min_samples_split=2,
                                                         min_weight_fraction_leaf=0.0,
                                                         presort=False,
                                                         random_state=None,
                             

In [29]:
test_pred = abc1.predict(test_set)
abc1.score(test_set, test_labels)

0.6566666666666666

### Gradient Boosting

In [25]:
from sklearn.ensemble import GradientBoostingClassifier
gbc1 = GradientBoostingClassifier(n_estimators=50)
gbc1 = gbc1.fit(train_set, train_labels)

In [26]:
test_pred = gbc1.predict(test_set)

In [28]:
gbc1.score(test_set, test_labels)

0.7566666666666667

### Random Forest

In [37]:
from sklearn.ensemble import RandomForestClassifier
rfc1 = RandomForestClassifier(n_estimators=6)
rfc1 = rfc1.fit(train_set, train_labels)

In [38]:
test_pred = rfc1.predict(test_set)
rfc1.score(test_set, test_labels)

0.7233333333333334