## Load csv data

In [6]:
import numpy as np
import pandas as pd
df = pd.read_csv('./loans_data.csv')
df

Unnamed: 0,safe_loans,grade,sub_grade_num,short_emp,emp_length_num,home_ownership,dti,purpose,payment_inc_ratio,delinq_2yrs,...,pub_rec,pub_rec_zero,revol_util,total_rec_late_fee,int_rate,total_rec_int,annual_inc,funded_amnt,funded_amnt_inv,installment
0,1,B,0.6,0,11,OWN,11.18,credit_card,7.93824,0,...,0,1,82.4,0.0,11.71,1902.56,50000,10000,10000,330.76
1,1,B,0.2,0,3,MORTGAGE,29.44,credit_card,6.30496,0,...,0,1,93.9,0.0,9.91,823.48,92000,15000,15000,483.38
2,1,B,0.6,1,1,RENT,12.19,credit_card,13.49520,0,...,0,1,59.1,0.0,11.71,1622.21,25000,8500,8500,281.15
3,1,A,0.8,0,6,MORTGAGE,14.03,debt_consolidation,15.93310,0,...,0,1,27.4,0.0,7.90,3061.08,75000,31825,31825,995.82
4,1,C,1.0,0,8,RENT,6.35,credit_card,8.68129,0,...,0,1,60.5,0.0,15.96,1848.94,34000,7000,7000,245.97
5,1,B,0.4,0,11,RENT,11.80,credit_card,11.82180,0,...,0,1,57.2,0.0,10.65,2137.46,41000,12400,12400,403.91
6,1,B,0.8,0,2,RENT,10.62,debt_consolidation,6.52882,0,...,0,1,66.5,0.0,12.42,1125.28,36852,6000,6000,200.50
7,1,A,0.4,0,6,RENT,10.85,debt_consolidation,5.79000,0,...,0,1,36.4,0.0,6.62,1157.38,70000,11000,11000,337.75
8,1,B,0.2,0,2,RENT,8.11,credit_card,10.08780,0,...,0,1,52.1,0.0,9.91,1748.21,46000,12000,12000,386.70
9,1,B,0.4,0,3,RENT,19.14,credit_card,7.60482,0,...,0,1,59.1,0.0,10.65,575.94,51400,10000,10000,325.74


## Map string to number

In [7]:
grade_map = {'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7}
home_map = {'MORTGAGE': 1, 'OTHER': 2, 'OWN': 3, 'RENT': 4}
purpose_map = {'car': 1, 'credit_card': 2, 'debt_consolidation': 3, 'home_improvement': 4, 'house': 5, 'major_purchase': 6, 'medical': 7, 'moving': 8, 'other': 9, 'small_business': 10, 'vacation': 11, 'wedding': 12}

df = df.replace({'grade': grade_map, 'home_ownership': home_map, 'purpose': purpose_map})
df

Unnamed: 0,safe_loans,grade,sub_grade_num,short_emp,emp_length_num,home_ownership,dti,purpose,payment_inc_ratio,delinq_2yrs,...,pub_rec,pub_rec_zero,revol_util,total_rec_late_fee,int_rate,total_rec_int,annual_inc,funded_amnt,funded_amnt_inv,installment
0,1,2,0.6,0,11,3,11.18,2,7.93824,0,...,0,1,82.4,0.0,11.71,1902.56,50000,10000,10000,330.76
1,1,2,0.2,0,3,1,29.44,2,6.30496,0,...,0,1,93.9,0.0,9.91,823.48,92000,15000,15000,483.38
2,1,2,0.6,1,1,4,12.19,2,13.49520,0,...,0,1,59.1,0.0,11.71,1622.21,25000,8500,8500,281.15
3,1,1,0.8,0,6,1,14.03,3,15.93310,0,...,0,1,27.4,0.0,7.90,3061.08,75000,31825,31825,995.82
4,1,3,1.0,0,8,4,6.35,2,8.68129,0,...,0,1,60.5,0.0,15.96,1848.94,34000,7000,7000,245.97
5,1,2,0.4,0,11,4,11.80,2,11.82180,0,...,0,1,57.2,0.0,10.65,2137.46,41000,12400,12400,403.91
6,1,2,0.8,0,2,4,10.62,3,6.52882,0,...,0,1,66.5,0.0,12.42,1125.28,36852,6000,6000,200.50
7,1,1,0.4,0,6,4,10.85,3,5.79000,0,...,0,1,36.4,0.0,6.62,1157.38,70000,11000,11000,337.75
8,1,2,0.2,0,2,4,8.11,2,10.08780,0,...,0,1,52.1,0.0,9.91,1748.21,46000,12000,12000,386.70
9,1,2,0.4,0,3,4,19.14,2,7.60482,0,...,0,1,59.1,0.0,10.65,575.94,51400,10000,10000,325.74


## Split features, target and labels

In [8]:
features = np.array(df.iloc[:, 1:])
target = np.array(df.iloc[:, 0])
labels = np.array(list(df))

## Normalize and binning to continuous data

In [9]:
def binning(arr):
    ret = []
    for x in arr:
        ret.append(round(x, 1))
    return ret

from sklearn import preprocessing
fix_list = [6, 8, 14, 17, 18, 19, 20, 21, 22, 23]
features[:, fix_list] = preprocessing.scale(features[:, fix_list])
for x in fix_list:
    features[:, x] = binning(features[:, x])
features

array([[ 2. ,  0.6,  0. , ..., -0.4, -0.3, -0.3],
       [ 2. ,  0.2,  0. , ...,  0.2,  0.3,  0.3],
       [ 2. ,  0.6,  1. , ..., -0.6, -0.5, -0.5],
       ..., 
       [ 5. ,  1. ,  1. , ..., -0.9, -0.8, -1. ],
       [ 4. ,  0.6,  0. , ..., -0.6, -0.5, -0.8],
       [ 4. ,  1. ,  0. , ...,  1.1,  1.1,  0.7]])

## Define functions to validate

In [10]:
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import KFold
# import pydotplus

def valid(clf):
    print("- Resubstitution validation")
    resub(clf)
    print("- K-fold cross validation")
    kfold(clf)

def accurancy(mat):
    return np.trace(mat) / np.sum(mat)

def resub(clf):
    global features, target
    X = features
    y = target
    clf = clf.fit(X, y)
    pred = clf.predict(X)
    mat = confusion_matrix(y, pred)
    print("Confusion matrix:")
    print(mat)
    print("Accuracy score:")
    print(accurancy(mat))

def kfold(clf):
    global features, target
    X = features
    y = target
    mat = np.zeros((2, 2), dtype = 'i')
    kf = KFold(n_splits = 30, shuffle = True)
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        clf = clf.fit(X_train, y_train)
        pred = clf.predict(X_test)
        tmp = confusion_matrix(y_test, pred, labels=[-1, 1])
        mat += tmp
    print("Confusion matrix:")
    print(mat)
    print("Accuracy score:")
    print(accurancy(mat))
    return mat

def picout(clf, filename):
    global labels
    dot_data = tree.export_graphviz(clf, out_file = None, feature_names = labels, class_names = ['save', 'unsave'], filled = True)
    graph = pydotplus.graph_from_dot_data(dot_data)
    graph.write_png(filename + ".png")

## Decision Tree Classifier

In [11]:
from sklearn import tree
clf = tree.DecisionTreeClassifier(max_depth = 10)
valid(clf)

- Resubstitution validation
Confusion matrix:
[[15708  7439]
 [ 6839 16517]]
Accuracy score:
0.692966045201
- K-fold cross validation
Confusion matrix:
[[14571  8576]
 [ 7891 15465]]
Accuracy score:
0.645893813302


## Gradient Boosting Classifier

In [14]:
from sklearn.ensemble import GradientBoostingClassifier
clf = GradientBoostingClassifier(max_depth = 9, max_features = 23, min_samples_split = 35)
valid(clf)

- Resubstitution validation
Confusion matrix:
[[18848  4299]
 [ 3998 19358]]
Accuracy score:
0.82158140335
- K-fold cross validation
Confusion matrix:
[[15821  7326]
 [ 7471 15885]]
Accuracy score:
0.681805474916


## Bagging Classifier

In [13]:
from sklearn.ensemble import BaggingClassifier
clf = BaggingClassifier(n_estimators = 30)
valid(clf)

- Resubstitution validation
Confusion matrix:
[[23136    11]
 [   27 23329]]
Accuracy score:
0.999182848418
- K-fold cross validation
Confusion matrix:
[[15878  7269]
 [ 8663 14693]]
Accuracy score:
0.657398447412


## Random Forest Classifier

In [23]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(min_samples_split = 400)
valid(clf)

- Resubstitution validation
Confusion matrix:
[[15865  7282]
 [ 7715 15641]]
Accuracy score:
0.677504677118
- K-fold cross validation
Confusion matrix:
[[15392  7755]
 [ 8158 15198]]
Accuracy score:
0.657807023203
