# Tree Classification, Random Forest and SVM on a Credit Risk Dataset

In [55]:
import os
#project directory holding the data file; the original author's checkout is the
#default, and the CREDIT_RISK_DIR environment variable overrides it on other machines
PROJECT_DIR = os.environ.get('CREDIT_RISK_DIR','/Users/antoinetroadec/Documents/GitHub/Credit_risk/')
#only change directory when the target exists, so that a notebook already
#started from inside the project does not crash on a missing path
if os.path.isdir(PROJECT_DIR):
    os.chdir(PROJECT_DIR)

## Data Preparation
### Importing data from the txt file

In [74]:
#importing the data in a DataFrame
import pandas
#the file is whitespace-separated and has no header row;
#sep=r"\s+" replaces the deprecated delim_whitespace=True (same parsing)
data = pandas.read_table("german.data-numeric.txt",sep=r"\s+",header=None,decimal= ".")

In [75]:
#shape of the full table (rows, columns)
print(data.shape) # (1000, 25)

(1000, 25)


### Train Data

In [102]:
#training sample: 650 rows drawn at random without replacement
#random_state is fixed so that the train/test split (and therefore every
#error rate computed below) is the same on each Restart & Run All
data_train = data.sample(650,random_state=0)


In [103]:
#shape of the training sample (rows, columns)
print(data_train.shape)

(650, 25)


In [104]:
#transformation into a numpy array
#DataFrame.as_matrix() was removed in pandas 1.0; .values returns the same array
d_train = data_train.values

In [105]:
#vector for the target attribute: column 24, the last of the 25 columns
y_app = d_train[:,24]
#matrix for the predictive attributes
#NOTE(review): 0:23 keeps columns 0..22 only, so column 23 is used neither as a
#predictor nor as the target - confirm whether 0:24 was intended (the test
#split and the graphviz feature names use the same 0:23 slice)
X_app = d_train[:,0:23]

### Test Data

In [106]:
#test sample: every row whose index label was not drawn into the training sample
data_test = data.loc[~data.index.isin(data_train.index)]

In [107]:
#shape of the test sample (rows, columns)
print(data_test.shape)

(350, 25)


In [108]:
#convert the test sample to a numpy array once
#(DataFrame.as_matrix() was removed in pandas 1.0; .values returns the same array)
d_test = data_test.values
#target attribute (column 24) and predictive attributes (same 0:23 slice as X_app)
y_test = d_test[:,24]
X_test = d_test[:,0:23]

## Function for performance evaluation

In [109]:
#module for the evaluation of the classifiers
from sklearn import metrics
#function for the performance evaluation
def error_rate(model,y_test,X_test):
    """Return the misclassification rate of a fitted model on a test sample.

    model  -- fitted classifier exposing predict()
    y_test -- true labels of the test sample
    X_test -- predictive attributes of the test sample

    The result is 1 - accuracy, i.e. the fraction of wrongly predicted rows.
    """
    accuracy = metrics.accuracy_score(y_test,model.predict(X_test))
    return 1.0 - accuracy
#end function

## Classification tree
### Decision tree Classifier

In [110]:
#Decision tree - import of the class
from sklearn.tree import DecisionTreeClassifier
#instantiation with the default settings (no arguments are passed)
dtree = DecisionTreeClassifier()

In [111]:
#learning: fit the tree on the training sample
dtree.fit(X_app,y_app)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [112]:
from sklearn import tree
#write the fitted tree to "tree.dot"; the file can then be rendered with graphviz
#the feature labels are the labels of the first 23 columns of the training table
tree.export_graphviz(
    dtree,
    out_file="tree.dot",
    feature_names=data_train.columns[0:23],
)

In [113]:
#per-variable importance in the fitted tree, one row per predictor
#(an importance of 0 means the variable does not appear in the tree)
imp = dict(Importance=dtree.feature_importances_)
print(pandas.DataFrame(data=imp))

    Importance
0     0.120323
1     0.111642
2     0.043464
3     0.145181
4     0.055113
5     0.039868
6     0.043736
7     0.066120
8     0.042501
9     0.151196
10    0.043509
11    0.036572
12    0.016425
13    0.019902
14    0.002852
15    0.002716
16    0.017656
17    0.019184
18    0.007239
19    0.000000
20    0.005409
21    0.009393
22    0.000000


In [114]:
#error rate of the single tree on the test sample
print(error_rate(dtree,y_test,X_test))

0.322857142857


### Bagging
20 trees

In [115]:
#class bagging
from sklearn.ensemble import BaggingClassifier
#instantiation: an ensemble of 20 decision trees
baggingTree = BaggingClassifier(DecisionTreeClassifier(),n_estimators=20)
#show the settings of the ensemble
print(baggingTree)

BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'),
         bootstrap=True, bootstrap_features=False, max_features=1.0,
         max_samples=1.0, n_estimators=20, n_jobs=1, oob_score=False,
         random_state=None, verbose=0, warm_start=False)


In [116]:
#training: fit the bagged trees on the training sample
baggingTree.fit(X_app,y_app)

BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'),
         bootstrap=True, bootstrap_features=False, max_features=1.0,
         max_samples=1.0, n_estimators=20, n_jobs=1, oob_score=False,
         random_state=None, verbose=0, warm_start=False)

In [117]:
#test: error rate of the bagged trees on the test sample
print(error_rate(baggingTree,y_test,X_test))

0.228571428571


The error rate dropped from 32.3% (single tree) to 22.9% (bagging).

### Tree number
What is the best number of trees?

In [119]:
#train-test function for a given m
def train_test_bagging(m,X_app,y_app,X_test,y_test):
    """Fit a bagging ensemble of m decision trees and return its test error rate.

    m            -- number of trees in the ensemble
    X_app, y_app -- predictive attributes and labels of the training sample
    X_test, y_test -- predictive attributes and labels of the test sample
    """
    #fit() returns the fitted ensemble itself, so build and train in one step
    ensemble = BaggingClassifier(DecisionTreeClassifier(),n_estimators=m).fit(X_app,y_app)
    return error_rate(ensemble,y_test,X_test)
#end train-test
#values of m (number of bagged trees) to evaluate
m_a_tester = [1,5,10,20,50,100,500]
import numpy
#number of times the whole experiment is repeated
n_repetitions = 20
#error-rate matrix: one row per repetition, one column per value of m
#built in one go, so its width follows len(m_a_tester) and no dummy
#first row has to be created and deleted afterwards
result = numpy.array([
    [train_test_bagging(m,X_app,y_app,X_test,y_test) for m in m_a_tester]
    for expe in range(n_repetitions)
])
#average error rate over the repetitions, for each m
mresult = numpy.mean(result,axis=0)
print(mresult)

[ 0.31742857  0.25614286  0.25214286  0.24185714  0.235       0.23142857
  0.23014286]


In [120]:
#graphical tool
import matplotlib.pyplot as plt
#axis labels: number of trees against average test error rate
plt.ylabel("Err. Rate")
plt.xlabel("m")
#the curve is drawn last so that it remains the value displayed by the cell
plt.plot(m_a_tester,mresult,lw=2)

[<matplotlib.lines.Line2D at 0x1093a70b8>]

500 trees gives the lowest average error rate (about 23.0%), although the improvement beyond 50 trees is small.

Let's do it using scikit-learn now.

In [121]:
# detecting the "optimal" number of trees
# using the grid search tool
# GridSearchCV lives in sklearn.model_selection since scikit-learn 0.18;
# the old sklearn.grid_search module was removed in 0.20
try:
    from sklearn.model_selection import GridSearchCV
except ImportError:
    from sklearn.grid_search import GridSearchCV
# the parameters to vary
# the name of the parameter must be explicit
# we enumerate the values to try
parametres = [{"n_estimators":[1,5,10,20,50,100,500]}]
# instantiate the classifier
bag = BaggingClassifier(DecisionTreeClassifier())
#instantiation of the grid search tool
#the metric used is the accuracy rate (error rate = 1 - accuracy rate)
grid_bag = GridSearchCV(estimator=bag,param_grid=parametres,scoring="accuracy")
#launching the exploration (cross-validation on the training sample only)
grille_bag = grid_bag.fit(X_app,y_app)
#print the results: mean and standard deviation of the accuracy for each setting
if hasattr(grille_bag,"cv_results_"):
    #cv_results_ replaces grid_scores_ in scikit-learn >= 0.18
    cv = grille_bag.cv_results_
    for params,mean,std in zip(cv["params"],cv["mean_test_score"],cv["std_test_score"]):
        print("mean: %.5f, std: %.5f, params: %s" % (mean,std,params))
else:
    print(grille_bag.grid_scores_)

[mean: 0.65231, std: 0.03867, params: {'n_estimators': 1}, mean: 0.70000, std: 0.01604, params: {'n_estimators': 5}, mean: 0.73846, std: 0.00588, params: {'n_estimators': 10}, mean: 0.74923, std: 0.03020, params: {'n_estimators': 20}, mean: 0.76154, std: 0.01654, params: {'n_estimators': 50}, mean: 0.75385, std: 0.01532, params: {'n_estimators': 100}, mean: 0.76462, std: 0.00710, params: {'n_estimators': 500}]


In [122]:
#best cross-validated score found by the grid search
print(grille_bag.best_score_)
#parameter setting that achieved the best score
print(grille_bag.best_params_) 

0.764615384615
{'n_estimators': 500}


In [123]:
#evaluation of the best solution on the test set
print(error_rate(grille_bag,y_test,X_test))

0.24


## Random Forest
## n=20

In [127]:
# RandomForest class
from sklearn.ensemble import RandomForestClassifier
# build a forest of 20 trees and train it (fit() returns the fitted forest)
rf = RandomForestClassifier(n_estimators=20).fit(X_app,y_app)
# test error rate
print(error_rate(rf,y_test,X_test))
# importance of variables, first as a raw array...
print(rf.feature_importances_)
# ...then as a one-column table indexed by predictor position
imp = dict(Importance=rf.feature_importances_)
print(pandas.DataFrame(data=imp))

0.28
[ 0.09868516  0.1166408   0.07614889  0.12864958  0.05127822  0.05604085
  0.03597425  0.0495375   0.05963165  0.10042489  0.0374859   0.01984932
  0.01451595  0.03104455  0.00366922  0.02643353  0.0098401   0.01743267
  0.01030885  0.01165811  0.02282914  0.0057007   0.01622019]
    Importance
0     0.098685
1     0.116641
2     0.076149
3     0.128650
4     0.051278
5     0.056041
6     0.035974
7     0.049537
8     0.059632
9     0.100425
10    0.037486
11    0.019849
12    0.014516
13    0.031045
14    0.003669
15    0.026434
16    0.009840
17    0.017433
18    0.010309
19    0.011658
20    0.022829
21    0.005701
22    0.016220


### Boosting

In [128]:
# Adaboost
from sklearn.ensemble import AdaBoostClassifier
# instantiation: the base tree is passed positionally (as in the BaggingClassifier
# call), because the keyword was renamed from base_estimator to estimator in
# later scikit-learn releases while the first positional slot stayed the same
# NOTE(review): this run and the n_estimators=500 run report the same test error;
# presumably the fully grown base tree fits the training sample perfectly, so
# boosting stops after the first round - TODO confirm, and consider a shallow tree
ab=AdaBoostClassifier(DecisionTreeClassifier(),n_estimators=20,algorithm="SAMME")
print(ab)
# training phase
ab.fit(X_app,y_app)
# test error rate
print(error_rate(ab,y_test,X_test))

AdaBoostClassifier(algorithm='SAMME',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'),
          learning_rate=1.0, n_estimators=20, random_state=None)
0.311428571429


## n=500

In [129]:
# RandomForest class
from sklearn.ensemble import RandomForestClassifier
# build a forest of 500 trees and train it (fit() returns the fitted forest)
rf = RandomForestClassifier(n_estimators=500).fit(X_app,y_app)
# test error rate
print(error_rate(rf,y_test,X_test))
# importance of variables, first as a raw array...
print(rf.feature_importances_)
# ...then as a one-column table indexed by predictor position
imp = dict(Importance=rf.feature_importances_)
print(pandas.DataFrame(data=imp))

0.245714285714
[ 0.10217331  0.1152557   0.06493912  0.12998077  0.05173975  0.05944774
  0.04216673  0.05042849  0.05265149  0.11242279  0.03206363  0.02490707
  0.01456531  0.02508909  0.00384332  0.02633619  0.01194166  0.01405179
  0.01028584  0.01396855  0.01933744  0.00382522  0.018579  ]
    Importance
0     0.102173
1     0.115256
2     0.064939
3     0.129981
4     0.051740
5     0.059448
6     0.042167
7     0.050428
8     0.052651
9     0.112423
10    0.032064
11    0.024907
12    0.014565
13    0.025089
14    0.003843
15    0.026336
16    0.011942
17    0.014052
18    0.010286
19    0.013969
20    0.019337
21    0.003825
22    0.018579


## Boosting

In [130]:
# Adaboost
from sklearn.ensemble import AdaBoostClassifier
# instantiation: the base tree is passed positionally (as in the BaggingClassifier
# call), because the keyword was renamed from base_estimator to estimator in
# later scikit-learn releases while the first positional slot stayed the same
# NOTE(review): this run and the n_estimators=20 run report the same test error;
# presumably the fully grown base tree fits the training sample perfectly, so
# boosting stops after the first round - TODO confirm, and consider a shallow tree
ab=AdaBoostClassifier(DecisionTreeClassifier(),n_estimators=500,algorithm="SAMME")
print(ab)
# training phase
ab.fit(X_app,y_app)
# test error rate
print(error_rate(ab,y_test,X_test))

AdaBoostClassifier(algorithm='SAMME',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'),
          learning_rate=1.0, n_estimators=500, random_state=None)
0.311428571429


## SVM

In [133]:
# import SVM
from sklearn import svm
# support vector classifier with a linear kernel and penalty C=1
svmmod = svm.SVC(C=1,kernel='linear')
# train on the training sample
svmmod.fit(X_app,y_app)
# report the error rate measured on the test sample
test_error = error_rate(svmmod,y_test,X_test)
print(test_error)


0.24


## Neural Network
The neural network classifier (`MLPClassifier`) is not available in scikit-learn 0.17 (checked on Feb 2nd); it requires scikit-learn 0.18 or later.

In [135]:
# import Neural Network
# MLPClassifier lives in the sklearn.neural_network sub-package (scikit-learn >= 0.18);
# importing it from the top-level sklearn package raises ImportError
from sklearn.neural_network import MLPClassifier
# instantiation: two hidden layers (5 and 2 units), fixed seed for reproducibility
# the optimizer is selected with solver='lbfgs' (released name of the former
# algorithm='l-bfgs' argument)
neural = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)
# training phase
neural.fit(X_app,y_app)
# test error rate
print(error_rate(neural,y_test,X_test))

ImportError: cannot import name 'MLPClassifier'