# Titanic data set - machine learning

## Run this first

In [None]:
#https://mljar.com/blog/visualize-decision-tree/
# Pandas - Data manipulation and analysis library
import pandas as pd
# NumPy - mathematical functions on multi-dimensional arrays and matrices
import numpy as np
# Matplotlib - plotting library to create graphs and charts
import matplotlib.pyplot as plt

# Scikit-learn algorithms and functions
from sklearn.neighbors import KNeighborsClassifier # Scikit-learn KNeighbors Classifier
from sklearn.tree import DecisionTreeClassifier # Scikit-learn Decision Tree Classifier
from sklearn import tree
from sklearn.model_selection import KFold # Scikit-learn K-Folds cross-validator
from sklearn.model_selection import cross_val_score # evaluating cross-validator performance
k_fold = KFold(n_splits=10, shuffle=True, random_state=0) # KFold configuration

# Settings for Matplotlib graphs and charts
from pylab import rcParams
rcParams['figure.figsize'] = 12, 8

# Display Matplotlib output inline
%matplotlib inline

import os.path
if not os.path.isfile('sampledata/machine-learning/titanic_training.xlsx'):
  !rm -rf sampledata
  !git clone https://github.com/awchisholm/sampledata.git

## Read the raw data and have a look at it

In [None]:
# Read the three workbooks from the relative 'sampledata' checkout, i.e. the
# same location the setup cell tests for and clones into. A hard-coded
# '/content/...' prefix only resolves when the working directory is /content.
titanic_training = pd.read_excel('sampledata/machine-learning/titanic_training.xlsx')
titanic_test = pd.read_excel('sampledata/machine-learning/titanic_test.xlsx')
titanic_test_labels = pd.read_excel('sampledata/machine-learning/titanic_test_labels.xlsx')

# Show the column names and the first few rows of each table
for frame in (titanic_training, titanic_test, titanic_test_labels):
    print(frame.columns)
    print(frame.head())

## Make data for the machine learning model to use

In [None]:
# The value the model learns to predict: the 'Survived' column
labels = titanic_training['Survived']
# Readable version of the same labels (1 -> 'Yes', 0 -> 'No')
labels_as_text = labels.map({1: 'Yes', 0: 'No'})
# Features: every training column except the label itself
train_data = titanic_training.drop(columns='Survived')
# Take a copy rather than a second name for the same table, so columns added
# to test_data later on do not silently change the raw titanic_test data
test_data = titanic_test.copy()

## Get an estimate for how good the model is

In [None]:
# Estimate how well a decision tree generalises by cross-validating on the
# training data with the shared k_fold splitter
clf = DecisionTreeClassifier()
scoring = 'accuracy'
score = cross_val_score(clf, train_data, labels, cv=k_fold, n_jobs=1, scoring=scoring)
# Scale to a percentage first and round afterwards: rounding the fraction and
# then multiplying by 100 can print float noise such as 56.99999999999999
accuracy_percent = round(100 * float(np.mean(score)))
print("The estimated accuracy of the model is", accuracy_percent, "percent")

## Build a model using the training data and the already known labels

In [None]:
# Start from a new, untrained decision tree
clf = DecisionTreeClassifier()
# Train it on all of the training rows; fit() hands back the estimator it was
# called on, so cc is another name for the fitted clf
cc = clf.fit(X=train_data, y=labels)

## But this time make predictions using the model and the test data

In [None]:
# Predict from exactly the columns the model was trained on, in the same order.
# Selecting them explicitly keeps this cell re-runnable after extra columns
# (such as 'Prediction' or 'Actual') have been added to test_data.
prediction = clf.predict(test_data[train_data.columns])

In [None]:
# Store the model's output in a new 'Prediction' column of the test table
test_data['Prediction'] = prediction
# Last expression in the cell, so the notebook displays the first rows
test_data.head()

## Sneakily, we have the actual fate of these passengers, so we can add it next to the predictions to see how good our model is


In [None]:
# Put the known outcomes in an 'Actual' column for comparison with the predictions.
# NOTE(review): a whole DataFrame is assigned to a single column here, so this
# presumably relies on titanic_test_labels holding exactly one column -
# confirm against the workbook.
test_data['Actual'] = titanic_test_labels
# Last expression in the cell, so the notebook displays the first rows
test_data.head()

## We can see how well we did by using a confusion matrix and a classification report

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Known outcomes versus what the model predicted for the same passengers
y_actu, y_pred = titanic_test_labels, prediction

# Counts of each actual/predicted combination
print(confusion_matrix(y_true=y_actu, y_pred=y_pred))
# Per-class precision/recall report, listing class 1 before class 0
print(classification_report(y_true=y_actu, y_pred=y_pred, labels=[1, 0]))

## We can also print out the decision tree model so we can see how it decides.

In [None]:
# Plain-text alternative to the diagram below:
#text_representation = tree.export_text(clf)
#print(text_representation)
import graphviz

# export_graphviz wants ONE display name per class, ordered like clf.classes_
# (ascending class value). Passing the per-passenger label column instead only
# gives the right names if its first entries happen to be 'No' then 'Yes'.
survival_names = {0: 'No', 1: 'Yes'}
dot_data = tree.export_graphviz(
    clf,
    out_file=None,
    # A plain list of column names rather than a pandas Index
    feature_names=list(train_data.columns),
    class_names=[survival_names[cls] for cls in clf.classes_],
    filled=True,
)

graph = graphviz.Source(dot_data, format="png")
# Last expression in the cell, so the notebook renders the tree
graph