### We implement a decision tree.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn import preprocessing

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [3]:
train.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch',
       'Fare', 'Embarked', 'AgeBin', 'NRelatives', 'HasRelatives',
       'FareOverHundred', 'Embarked_Q', 'Embarked_S', 'Pclass_2', 'Pclass_3',
       'EmbarkedCat'],
      dtype='object')

In [4]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,AgeBin,NRelatives,HasRelatives,FareOverHundred,Embarked_Q,Embarked_S,Pclass_2,Pclass_3,EmbarkedCat
0,1,0,3,1,22.0,1,0,7.25,S,1,1,1,0.0725,0,1,0,1,1.0
1,2,1,1,0,38.0,1,0,71.2833,C,1,1,1,0.712833,0,0,0,0,2.0
2,3,1,3,0,26.0,0,0,7.925,S,1,0,0,0.07925,0,1,0,1,1.0
3,4,1,1,0,35.0,1,0,53.1,S,1,1,1,0.531,0,1,0,0,1.0
4,5,0,3,1,35.0,0,0,8.05,S,1,0,0,0.0805,0,1,0,1,1.0


In [5]:
# One needs to only use one-hot-encoded categorical variables for scikit-learn decision tree.
variables = ['Sex', 'AgeBin', 'HasRelatives', 'Pclass_2', 'Pclass_3', 'Embarked_Q', 'Embarked_S', 'FareOverHundred']

y = train['Survived']
X = np.array(train[variables])
print(X.shape, y.shape)

(891, 8) (891,)


In [6]:
Xtest = np.array(test[variables])
testlabeled = test[variables].copy()

In [7]:
clf = tree.DecisionTreeClassifier(ccp_alpha = 0, criterion = 'entropy', max_depth=5)
clf.fit(X, y)

DecisionTreeClassifier(ccp_alpha=0, criterion='entropy', max_depth=5)

In [8]:
pred = clf.predict(Xtest)
testlabeled['Survived'] = pred

In [9]:
testlabeled.groupby('Sex').agg({'Survived': 'mean'})

Unnamed: 0_level_0,Survived
Sex,Unnamed: 1_level_1
0,0.730263
1,0.018797


In [10]:
error = np.mean(np.abs(clf.predict(X) - y))
print("Training Accuracy:", 1 - error)

Training Accuracy: 0.8361391694725028


In [11]:
#tree.plot_tree(clf)

In [12]:
#import graphviz 
#dot_data = tree.export_graphviz(clf2, out_file = None) 
#graph = graphviz.Source(dot_data) 
#graph.render("titanic")

In [13]:
from sklearn.tree import export_text
r = export_text(clf)
print(r)

|--- feature_0 <= 0.50
|   |--- feature_4 <= 0.50
|   |   |--- feature_7 <= 0.29
|   |   |   |--- feature_7 <= 0.28
|   |   |   |   |--- feature_7 <= 0.26
|   |   |   |   |   |--- class: 1
|   |   |   |   |--- feature_7 >  0.26
|   |   |   |   |   |--- class: 1
|   |   |   |--- feature_7 >  0.28
|   |   |   |   |--- class: 0
|   |   |--- feature_7 >  0.29
|   |   |   |--- feature_7 <= 1.49
|   |   |   |   |--- class: 1
|   |   |   |--- feature_7 >  1.49
|   |   |   |   |--- feature_7 <= 1.53
|   |   |   |   |   |--- class: 0
|   |   |   |   |--- feature_7 >  1.53
|   |   |   |   |   |--- class: 1
|   |--- feature_4 >  0.50
|   |   |--- feature_7 <= 0.23
|   |   |   |--- feature_6 <= 0.50
|   |   |   |   |--- feature_7 <= 0.16
|   |   |   |   |   |--- class: 1
|   |   |   |   |--- feature_7 >  0.16
|   |   |   |   |   |--- class: 1
|   |   |   |--- feature_6 >  0.50
|   |   |   |   |--- feature_7 <= 0.08
|   |   |   |   |   |--- class: 1
|   |   |   |   |--- feature_7 >  0.08
|   |   | 

In [14]:
submission = pd.read_csv('gender_submission.csv')
submission['Survived'] = pred
submission.to_csv('submission-tree.csv', index = False)