In [None]:
# data analysis and wrangling
import pandas as pd
import numpy as np
# visualization

import seaborn as sns

import matplotlib.pyplot as plt
import matplotlib.image as mpimg
%matplotlib inline

# machine learning
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report

import graphviz
import sklearn.tree as tree

import warnings
warnings.filterwarnings('ignore')


# Titanic Database Overview

**First see the link:**
- <https://www.kaggle.com/c/titanic/data>


### Question and problem definition

Competition sites like Kaggle define the problem to solve or questions to ask while providing the datasets for training your data science model and testing the model results against a test dataset. The question or problem definition for Titanic Survival competition is [described here at Kaggle](https://www.kaggle.com/c/titanic).

> Knowing from a training set of samples listing passengers who survived or did not survive the Titanic disaster, can our model determine, based on a given test dataset that does not contain the survival information, whether the passengers in the test dataset survived or not?

We may also want to develop some early understanding about the domain of our problem. This is described on the [Kaggle competition description page here](https://www.kaggle.com/c/titanic). Here are the highlights to note.

- On April 15, 1912, during her maiden voyage, the Titanic sank after colliding with an iceberg, killing 1502 out of 2224 passengers and crew. This translates to a 32% survival rate.
- One of the reasons that the shipwreck led to such loss of life was that there were not enough lifeboats for the passengers and crew.
- **Hypothesis**: Although there was some element of luck involved in surviving the sinking, some groups of people were more likely to survive than others, such as women, children, and the upper-class.


# I. Acquire data

We start by acquiring the training and testing datasets into Pandas DataFrames. We also combine these datasets to run certain operations on both datasets together.

In [None]:
# Read the Titanic train/test CSV files (paths are relative to this notebook)
# into two DataFrames, in that order.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../../dataset/titanic/train.csv', '../../dataset/titanic/test.csv')
)

In [None]:
# Summarise the training frame: its (rows, columns) shape and its column names.
frame_shape = train_df.shape
column_names = train_df.columns.values
print(f"Data features: \n Shape {frame_shape}\n Columns {column_names}")

In [None]:
# Print each column's dtype and non-null count for the training frame;
# a non-null count below the number of rows means the column has missing values.
train_df.info()

# II. Exploratory Data Analysis

In [None]:
# One 50-bin histogram per numeric column of the training set.
# The result is bound to `_` so the array of Axes is not echoed as cell output.
_ = train_df.hist(figsize=(10, 10), bins=50)

In [None]:
# Same per-column histograms (50 bins) for the test set, to compare its
# distributions with those of the training set.
_ = test_df.hist(figsize=(10, 10), bins=50)

In [None]:
# Visualize the correlation matrix of the numeric columns.
# Non-numeric (string) columns are excluded explicitly: recent pandas versions
# raise on them in DataFrame.corr() instead of silently dropping them.
numeric_corr = train_df.select_dtypes(include=[np.number]).corr()

# Only the last expression of a cell is rendered automatically, so the styled
# table is shown explicitly with display() (available by default in Jupyter/IPython).
display(numeric_corr.style.background_gradient(cmap='Reds'))

# Scatter plot of the fare paid against the survival flag.
_ = train_df.plot.scatter(x='Fare', y='Survived', s=100)

In [None]:
# Histograms of the test set's numeric columns again, with a finer
# resolution (100 bins instead of 50).
_ = test_df.hist(figsize=(10, 10), bins=100)

# III. Let's build our first model

## One attribute model

In [None]:
# Single-feature design matrix (kept as a one-column DataFrame) and target vector.
X = train_df.loc[:, ['Fare']].copy()
y = train_df.loc[:, 'Survived'].copy()  # alternatively: train_df.Survived.values

# Number of non-null labels: if it equals the number of rows, no label is
# missing and the target needs no preprocessing.
y.count()

In [None]:
# Step 1: create a decision tree with default hyper-parameters.
model1 = DecisionTreeClassifier()

# Step 2: fit it on the single 'Fare' feature.
model1.fit(X, y)

# Step 3: predict on the very same rows that were used for fitting.
y_pred = model1.predict(X)

# Training accuracy, printed as a percentage rounded to two decimals.
n_correct = np.sum(y == y_pred)
print(round(n_correct / len(y) * 100, 2), "%")

In [None]:
# Export the fitted tree as Graphviz DOT source; with out_file=None the source
# is returned as a string instead of being written to a file.
dot_data = tree.export_graphviz(
    model1,
    out_file=None,
    feature_names=['Fare'],    # the feature names
    class_names=['0', '1'],    # the target names
    filled=True,               # fill the boxes with colours
    rounded=True,              # round the corners of the boxes
    special_characters=True,
)

# Wrap the DOT source and render it as a PNG named after the given file stem
# ("decision_tree_graphivz.png" is what the image-display cell reads back).
graph = graphviz.Source(dot_data, format="png")
graph.render("decision_tree_graphivz")

In [None]:
# Display the rendered tree image on a large (30 x 30 inch) canvas so the
# node labels stay legible.
fig, ax = plt.subplots(figsize=(30, 30))
img = mpimg.imread('decision_tree_graphivz.png')
imgplot = ax.imshow(img)
plt.show()

In [None]:
# Confusion matrix with rows = true class and columns = predicted class,
# both ordered as [1, 0] (class 1 first).
confusion_matrix(y_true=y, y_pred=y_pred, labels=[1, 0])

In [None]:
# With the default (sorted) label order [0, 1] the binary confusion matrix is
# laid out as [[tn, fp], [fn, tp]]; unpack it row by row.
(tn, fp), (fn, tp) = confusion_matrix(y, y_pred)
(tn, fp, fn, tp)

In [None]:
# Print the accuracy score (as a percentage rounded to two decimals) of the
# predictions `y_pred` against the true labels `y`.
print(round(accuracy_score(y, y_pred) * 100, 2), "%")

In [None]:
# Global report of the per-class metrics (precision, recall, F1, support).
# target_names are paired positionally with the labels, so the label order is
# given explicitly: 1 -> 'Survives', 0 -> 'Death'. Without `labels`, the sorted
# order [0, 1] would attach 'Survives' to class 0 (passengers who died).
target_names = ['Survives', 'Death']
print(classification_report(y, y_pred, labels=[1, 0], target_names=target_names))

In [None]:
# Draw the confusion matrix as an annotated heatmap.
# seaborn is imported as `sns` in this notebook, so that alias is used here.
# labels=[1, 0] orders rows (true class) and columns (predicted class) with class 1 first.
sns.set(font_scale=1.4)  # enlarge seaborn's default font sizes
sns.heatmap(
    confusion_matrix(y, y_pred, labels=[1, 0]),
    annot=True,               # write the value inside each cell
    annot_kws={"size": 12},
    fmt='.2f',
    cmap="YlGnBu",
    cbar=False,
)

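The previous evaluation is not realistic: the model was scored on the same data it was trained on, so there is no held-out outcome to compare against (only the training set is used).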

In [None]:
# Hold out the rows from position 700 onwards so the model is also scored on
# rows it has not seen during fitting.
train_set = train_df.iloc[:700].copy()
test_set = train_df.iloc[700:].copy()

print(train_set.shape,test_set.shape)

X_train = train_set[['Fare']]
y_train = train_set['Survived']

X_test = test_set[['Fare']]
y_test = test_set['Survived']

# Step 1: create a decision tree with default hyper-parameters.
dt_model = DecisionTreeClassifier()

# Step 2: fit it on the training rows only.
dt_model.fit(X_train, y_train)

# Step 3: predict on both subsets.
pred_train = dt_model.predict(X_train)
pred_test = dt_model.predict(X_test)

# Error rate = 1 - accuracy, computed once per subset.
train_error = 1 - accuracy_score(y_train, pred_train)
test_error = 1 - accuracy_score(y_test, pred_test)

# Each line also shows the same quantity obtained through the estimator's own
# score() method, as a cross-check.
print('Train Score (error) :', train_error, "or: ", 1 - dt_model.score(X_train, y_train))
print('Test Score (error) :', test_error, "or: ", 1 - dt_model.score(X_test, y_test))

# Visualization with Graphviz -- installation steps:
# 1) https://graphviz.gitlab.io/_pages/Download/Download_windows.html
# 2) conda install -c anaconda graphviz     or  conda install -c conda-forge python-graphviz

# A different way to plot a tree, without Graphviz
# (see https://mljar.com/blog/visualize-decision-tree/):
#
# _ = tree.plot_tree(dt_model,
#                class_names=['0','1'], # the target names.
#                feature_names=['Fare'], # the feature names.
#                filled=True, # Whether to fill in the boxes with colours.
#                rounded=True # Whether to round the corners of the boxes.
#               )

# Export the hold-out model as Graphviz DOT source; with out_file=None the
# source is returned as a string instead of being written to a file.
dot_data = tree.export_graphviz(
    dt_model,
    out_file=None,
    feature_names=['Fare'],    # the feature names
    class_names=['0', '1'],    # the target names
    filled=True,               # fill the boxes with colours
    rounded=True,              # round the corners of the boxes
    special_characters=True,
)

# Wrap the DOT source in a Source object configured for PNG output.
# Nothing is rendered here: the object is only created.
graph = graphviz.Source(dot_data, format="png")

So, our model is very bad!!!