# The Kaggle Titanic Competition

# Change Log:
1.0 - Created and run

1.01 - Updated links after long dormant period and verified functionality - 9/11/21

1.02 - Obtained Deck level from Cabin.  Dropped Cabin and Ticket.  Optimized the decision tree model hyperparameters: method, depth, leaf, and split.

## Project Goals:
* 1 - Create an initial simple classifier using a decision tree.
* 2 - Revise the initial version with increasing sophistication, measured by significantly improved accuracy
* 3 - Research Kaggle winners' strategies, and implement some of what I learn to improve accuracy
* 4 - Submit the model to the competition as my first Kaggle entry

In [1]:
# Import libraries necessary for this project
import numpy as np
import pandas as pd
from IPython.display import display # Allows the use of display() for DataFrames

# Pretty display for notebooks
%matplotlib inline

# Seed the random number generators used in this notebook.
# scikit-learn estimators created without an explicit random_state draw from
# NumPy's global generator, so seeding only Python's `random` module would
# leave model fitting unseeded.
import os
import random

SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Load the dataset
# The path is assembled from its components so that it resolves on Windows,
# macOS and Linux alike; on Windows this yields the same '.\\data\\train.csv'
# string as before.
in_file = os.path.join('.', 'data', 'train.csv')
full_data = pd.read_csv(in_file)

# Print the first few entries of the RMS Titanic data
display(full_data.head())

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [2]:
# Keep the 'Survived' label in its own variable.
outcomes = full_data['Survived']

# Build the predictor table: everything except 'Survived', plus a 'Deck' column
# holding the first character of each cabin code (e.g. 'C85' -> 'C').
# The raw Cabin column is discarded in a later cell so that it does not get
# one-hot encoded.
features_raw = (
    full_data
    .drop(columns='Survived')
    .assign(Deck=lambda frame: frame['Cabin'].str[:1])
)

# Preview the predictors now that 'Survived' has been removed
display(features_raw.head())

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Deck
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,C
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,


In [3]:
# Columns left out of the feature set: Name, Cabin (its deck letter is already
# captured in 'Deck') and Ticket (random dist).
unused_columns = ['Name', 'Cabin', 'Ticket']
features_no_names = features_raw.drop(columns=unused_columns)

# One-hot encode the remaining text columns.
# NOTE(review): PassengerId is still part of the feature set after this step -
# confirm whether a row identifier should be fed to the model.
features = pd.get_dummies(features_no_names)

In [4]:
# List the name of every feature column, one per line
for column_name in features.columns:
    print(column_name)

PassengerId
Pclass
Age
SibSp
Parch
Fare
Sex_female
Sex_male
Embarked_C
Embarked_Q
Embarked_S
Deck_A
Deck_B
Deck_C
Deck_D
Deck_E
Deck_F
Deck_G
Deck_T


In [5]:
# Replace every missing value in the feature table with 0.0, then preview it
features = features.fillna(value=0.0)
display(features.head())

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Deck_A,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T
0,1,3,22.0,1,0,7.25,0,1,0,0,1,0,0,0,0,0,0,0,0
1,2,1,38.0,1,0,71.2833,1,0,1,0,0,0,0,1,0,0,0,0,0
2,3,3,26.0,0,0,7.925,1,0,0,0,1,0,0,0,0,0,0,0,0
3,4,1,35.0,1,0,53.1,1,0,0,0,1,0,0,1,0,0,0,0,0
4,5,3,35.0,0,0,8.05,0,1,0,0,1,0,0,0,0,0,0,0,0


In [6]:

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# partition the same from run to run.
split_parts = train_test_split(
    features,
    outcomes,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

In [7]:
# Import the classifier from sklearn
from sklearn.tree import DecisionTreeClassifier

# Baseline decision tree.
# Tuning notes: entropy scored better than gini, with max depth = 5 and
# min samples leaf = 5; min samples split had no effect, so it stays at its
# default.
# random_state is pinned because the tree shuffles the features at each split
# and ties between equally good splits are otherwise broken differently from
# run to run.
model = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=5,
    min_samples_leaf=5,
    random_state=42,
)

model = model.fit(X_train, y_train)

In [8]:
from sklearn.metrics import accuracy_score

# Score the fitted model on the data it was trained on ...
y_train_pred = model.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_pred)

# ... and on the held-out rows
y_test_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)

# Report both accuracies
print('The training accuracy is', train_accuracy)
print('The test accuracy is', test_accuracy)

The training accuracy is 0.8398876404494382
The test accuracy is 0.8268156424581006


# improve the model

In [None]:
# Iterate hyperparameters to improve accuracy.
# Fits one decision tree per combination of criterion, max_depth,
# min_samples_leaf and min_samples_split, and prints a row for every
# combination whose accuracy on the held-out split exceeds the threshold.
#
# NOTE(review): candidates are filtered on the accuracy measured on
# X_test / y_test, so any setting picked from this table has been tuned against
# the test split and its test accuracy is an optimistic estimate. Consider
# ranking candidates with cross-validation on X_train instead.
from itertools import product

from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

min_test_accuracy = 0.85  # only combinations scoring above this are printed

print('Method, depth, leaf, split, training, test\n')

# product() walks the grid in the same order as four nested loops
# (criterion outermost, min_samples_split innermost).
search_space = product(
    ['entropy', 'gini'],  # criterion
    range(1, 30),         # max_depth
    range(1, 30),         # min_samples_leaf
    range(2, 30),         # min_samples_split
)

for method, depth, leaf, split in search_space:
    # Candidate-scoped names keep the sweep from overwriting the notebook-level
    # `model`, predictions and accuracy variables with the last combination.
    # random_state is pinned so each row can be reproduced.
    candidate = DecisionTreeClassifier(
        criterion=method,
        max_depth=depth,
        min_samples_leaf=leaf,
        min_samples_split=split,
        random_state=42,
    )
    candidate.fit(X_train, y_train)

    # Calculate the accuracy on both splits
    candidate_train_accuracy = accuracy_score(y_train, candidate.predict(X_train))
    candidate_test_accuracy = accuracy_score(y_test, candidate.predict(X_test))

    if candidate_test_accuracy > min_test_accuracy:
        print(method, ",", depth, ",", leaf, ",", split, ",",
              "{:.4f}".format(candidate_train_accuracy), ",",
              "{:.4f}".format(candidate_test_accuracy))

# My optimized solution

In [9]:
# Import the classifier from sklearn
from sklearn.tree import DecisionTreeClassifier

# Tuned decision tree.
# Tuning notes recorded for this cell: gini better than entropy, max depth
# 7-11 good, min samples leaf = 6 looked best, min samples split had no effect.
# NOTE(review): the call below uses min_samples_leaf=3 and min_samples_split=16,
# which does not match those notes - confirm which values are intended.
# random_state is pinned because the tree shuffles the features at each split
# and ties between equally good splits are otherwise broken differently from
# run to run.
model = DecisionTreeClassifier(
    criterion='gini',
    max_depth=10,
    min_samples_leaf=3,
    min_samples_split=16,
    random_state=42,
)

model = model.fit(X_train, y_train)

In [10]:
from sklearn.metrics import accuracy_score

# Predict with the current model on the training rows and on the held-out rows
y_train_pred, y_test_pred = model.predict(X_train), model.predict(X_test)

# Fraction of correct predictions on each split
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

# Report both accuracies
print('The training accuracy is', train_accuracy)
print('The test accuracy is', test_accuracy)

The training accuracy is 0.8862359550561798
The test accuracy is 0.8547486033519553
