In [52]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn import tree

In [66]:
# Load the passenger records from the CSV file.
df = pd.read_csv('titanic.csv')

# Clean up: drop rows with a missing Age, then recode the Sex column as a
# number (male -> 1, female -> 0).
cleanup_nums = {"Sex": {"male": 1, "female": 0}}
df = df.dropna(subset=['Age']).replace(cleanup_nums)

# Survived is the target; every other column is kept as a feature.
X = df.drop(columns='Survived')
y = df['Survived']

# Split up data: hold out 20% of the rows for testing. random_state pins the
# shuffle so the split -- and every score computed from it -- is identical on
# each "Restart & Run All" instead of changing from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train the model: a shallow decision tree (depth capped at 5; a node is only
# split when the weighted impurity decrease is at least 0.01).
# random_state fixes the estimator's internal randomness so that the same
# training data always yields the same fitted tree.
classifier = DecisionTreeClassifier(
    min_impurity_decrease=0.01, max_depth=5, random_state=42
)
classifier.fit(X_train, y_train)

# Predict result: label every held-out row with the fitted tree.
y_pred = classifier.predict(X_test)

# Evaluate result: build the text report comparing the predictions with the
# true labels, then print it.
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

# Generate the tree file: write the fitted tree in Graphviz DOT format.
# class_names=True labels the classes symbolically as y[0] and y[1], where
# y[0] means the person did not survive and y[1] means the person survived.
# The `with` block closes the file even if the export raises. The export
# call's return value is not kept: it is None when out_file is given.
with open("./dtree1.dot", 'w') as dotfile:
    tree.export_graphviz(
        classifier,
        out_file=dotfile,
        feature_names=list(X.columns),
        class_names=True,
    )

# Screenshot of the tree is in the project's folder. The file is called tree.png.


   Survived     Sex   Age  Class
0         0    male  22.0      3
1         1  female  38.0      1
2         1  female  26.0      3
3         1  female  35.0      1
4         0    male  35.0      3
[1 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 1 0 0 1 0 1 0 0 1 0 0 0 1 0 1 0 0
 1 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1
 0 0 0 1 0 1 1 1 1 1 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1
 1 0 0 0 0 1 0 0 1 0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 1 0 1 1 0 0 0]
              precision    recall  f1-score   support

           0       0.73      0.95      0.82        81
           1       0.89      0.53      0.67        62

    accuracy                           0.77       143
   macro avg       0.81      0.74      0.75       143
weighted avg       0.80      0.77      0.76       143

