In [79]:
import graphviz
import numpy as np
import pandas as pd
from IPython.display import IFrame
from sklearn import tree
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

try:
    # scikit-learn >= 0.18 ships the cross-validation helpers in model_selection
    from sklearn.model_selection import cross_val_score
except ImportError:
    # fallback for old releases; this module was removed in scikit-learn 0.20
    from sklearn.cross_validation import cross_val_score

# Decision tree demo for Titanic data

# Location of the semicolon-separated input file (replace with your own path)
csv_path = 'C:\\Users\\olliv\\Documents\\cogsys\\titanic_v2.csv'
df = pd.read_csv(csv_path, sep=';')

# Keep only complete rows: a row is discarded as soon as any of its values is missing
df = df.dropna(axis=0, how='any')

# Preview the first ten remaining rows
df.head(10)



Unnamed: 0,pclass,sex,age,survived
0,1,female,29.0,1
1,1,male,0.9167,1
2,1,female,2.0,0
3,1,male,30.0,0
4,1,female,25.0,0
5,1,male,48.0,1
6,1,female,63.0,1
7,1,male,39.0,0
8,1,female,53.0,1
9,1,male,71.0,0


In [80]:
# type conversions
# DecisionTreeClassifier can't use strings in explanatory variables,
# so the two sex labels are replaced by integer codes.
sex_codes = {'male': 1, 'female': 2}
df['sex'] = df['sex'].replace(sex_codes)

# The response variable becomes an unordered categorical.
unordered_category = pd.api.types.CategoricalDtype(ordered=False)
df['survived'] = df['survived'].astype(unordered_category)

# Show the resulting column types
df.dtypes

pclass         int64
sex            int64
age          float64
survived    category
dtype: object

In [81]:
# save column headings into a NumPy array
# Index.get_values() was deprecated in pandas 0.25 and removed in 1.0;
# .values returns the same object ndarray on old and new pandas alike.
colnames = df.columns.values
colnames

array(['pclass', 'sex', 'age', 'survived'], dtype=object)

In [82]:
# Explanatory variables: every column from 'pclass' through 'age'
# (label-based slice, both end points included), kept as a data frame
X = df.loc[:, slice('pclass', 'age')]

# Response variable (class variable), kept as a series
Y = df['survived']

In [83]:
# decision tree classification
# The tree is limited to two levels of splits.
classifier = tree.DecisionTreeClassifier(max_depth=2)

# fit() is the last expression, so the fitted estimator is shown as the cell output
classifier.fit(X=X, y=Y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [84]:
# visualize
# Feature labels are read from the training frame itself, so they always match
# the columns the classifier was fitted on (no separately maintained name list
# or magic slice index to keep in sync).
# NOTE(review): class_names are presumably matched to the classes in ascending
# order (0 -> 'no', 1 -> 'yes') -- confirm against the export_graphviz docs.
dot_data = tree.export_graphviz(
    classifier,
    out_file=None,
    feature_names=list(X.columns),
    class_names=['no', 'yes'],
)
graph = graphviz.Source(dot_data)

# Renders the DOT source to disk; the PDF produced here is embedded below
graph.render("titanic")

IFrame("titanic.pdf", width=900, height=700)

In [85]:
# predict (on the same rows the model was trained on)
Y_pred = classifier.predict(X)

# output confusion matrix
cm = confusion_matrix(Y, Y_pred)
print("Confusion matrix:\n",cm)

# Correct predictions lie on the diagonal of the confusion matrix, so
# accuracy = diagonal sum / total count. Unlike spelling out cm[0][0], cm[1][1],
# ... by hand, this is not tied to a 2x2 matrix and yields the same value here.
accuracy = np.trace(cm) / cm.sum()
print("Accuracy calculated from the training set = %.3f" % (accuracy))

print(classification_report(Y, Y_pred, target_names=['no', 'yes']))

Confusion matrix:
 [[585  34]
 [182 245]]
Accuracy calculated from the training set = 0.793
             precision    recall  f1-score   support

         no       0.76      0.95      0.84       619
        yes       0.88      0.57      0.69       427

avg / total       0.81      0.79      0.78      1046



In [86]:
# cross-validate
# number of folds
k = 10

# One accuracy value per fold for the classifier configured above.
# NOTE(review): with an integer cv the folds are presumably taken in file order
# without shuffling; the preview rows are all pclass 1, so the file may be
# sorted, which would make fold scores vary widely -- confirm and consider
# a shuffled splitter.
scores = cross_val_score(classifier, X, Y, scoring="accuracy", cv=k)

print("Accuracies from %d individual folds:" % k)
print(scores)

mean_accuracy = scores.mean()
print("Accuracy calculated using %d-fold cross validation = %.3f" % (k, mean_accuracy))

Accuracies from 10 individual folds:
[ 0.83809524  0.86666667  0.84761905  0.82857143  0.76190476  0.83809524
  0.8         0.59615385  0.52884615  0.61165049]
Accuracy calculated using 10-fold cross validation = 0.752


In [91]:
# Experiment with min_samples_leaf to find the best model.
# For each candidate value the tree is refitted on the full data, then scored
# twice: on the training rows themselves and with k-fold cross validation
# (k comes from the cross-validation cell above).

# leaf takes the values 201, 181, ..., 21, 1
for leaf in range(201, 0, -20):
    print("min_samples_leaf: %d" % leaf)
    classifier = tree.DecisionTreeClassifier(min_samples_leaf=leaf)
    classifier.fit(X, Y)

    # training-set accuracy: diagonal of the confusion matrix over its total,
    # which does not assume a 2x2 matrix and gives the same value for two classes
    Y_pred = classifier.predict(X)
    cm = confusion_matrix(Y, Y_pred)
    accuracy = np.trace(cm) / cm.sum()
    print("  Accuracy calculated from the training set = %.3f" % (accuracy))

    # cross-validated accuracy, averaged over the k folds
    scores = cross_val_score(estimator=classifier,
                             X=X,
                             y=Y,
                             scoring="accuracy",
                             cv=k)
    print("  Accuracy calculated using %d-fold cross validation = %.3f" % (k, scores.mean()))
    
    

min_samples_leaf: 201
  Accuracy calculated from the training set = 0.779
  Accuracy calculated using 10-fold cross validation = 0.779
min_samples_leaf: 181
  Accuracy calculated from the training set = 0.779
  Accuracy calculated using 10-fold cross validation = 0.779
min_samples_leaf: 161
  Accuracy calculated from the training set = 0.779
  Accuracy calculated using 10-fold cross validation = 0.779
min_samples_leaf: 141
  Accuracy calculated from the training set = 0.787
  Accuracy calculated using 10-fold cross validation = 0.779
min_samples_leaf: 121
  Accuracy calculated from the training set = 0.787
  Accuracy calculated using 10-fold cross validation = 0.779
min_samples_leaf: 101
  Accuracy calculated from the training set = 0.787
  Accuracy calculated using 10-fold cross validation = 0.753
min_samples_leaf: 81
  Accuracy calculated from the training set = 0.787
  Accuracy calculated using 10-fold cross validation = 0.753
min_samples_leaf: 61
  Accuracy calculated from the trai