# The Kaggle Titanic Competition

# Change Log:
1.0 - Created and run

1.01 - Updated links after long dormant period and verified functionality - 9/11/21

1.02 - Obtained Deck level from Cabin.  Dropped Cabin and Ticket.  Optimized the decision tree model hyperparameters: method (criterion), max depth, min samples per leaf, and min samples per split.

1.03 - Eliminated the train/test split, trained on the full training data, ran inference on the test data, and output the test set results to the csv file 'bt_titanic_v1.csv'.

## Project Goals:
* 1 - Create an initial simple classifier using a decision tree.
* 2 - Revise the initial version with increasing sophistication, measured by significantly improved accuracy
* 3 - Research Kaggle winners' strategies, and implement some of the lessons learned to improve accuracy
* 4 - Submit the model to the competition as my first Kaggle entry

In [1]:
# Import libraries necessary for this project
import numpy as np
import pandas as pd
from IPython.display import display # Allows the use of display() for DataFrames

# Pretty display for notebooks
%matplotlib inline

# Set a random seed
import random
random.seed(42)

# Load the dataset
in_file = '.\\data\\train.csv'
test_file = '.\\data\\test.csv'

full_data = pd.read_csv(in_file)
test_data = pd.read_csv(test_file)

# Print the first few entries of the RMS Titanic data
    ###display(full_data.head())

In [2]:
# Keep the 'Survived' column as the target and use every other column as raw features
outcomes = full_data['Survived']
features_raw = full_data.drop(columns='Survived')

# Deck level = first character of the Cabin value.
# Cabin itself is dropped in a later cell so it does not go through one-hot encoding.
features_raw['Deck'] = features_raw['Cabin'].str.slice(stop=1)
test_data['Deck'] = test_data['Cabin'].str.slice(stop=1)

# Show the new dataset with 'Survived' removed
    ###display(features_raw.head())

In [3]:
# Removing the names, Cabin (have deck), Ticket (random dist)
features_no_names = features_raw.drop(['Name', 'Cabin', 'Ticket'], axis=1)
test_data = test_data.drop(['Name', 'Cabin', 'Ticket'], axis=1)

# One-hot encoding
features = pd.get_dummies(features_no_names)
test_set = pd.get_dummies(test_data)

# Give the test matrix exactly the training columns, in the training order.
# Indicator columns that only occur in the training data (previously patched
# by hand with a hard-coded 'Deck_T') are added and filled with 0; columns the
# model was never trained on are dropped.
test_set = test_set.reindex(columns=features.columns, fill_value=0)

In [4]:
# Number of columns in the encoded training features
features.shape[1]

19

In [5]:
# Number of columns in the encoded test set
test_set.shape[1]

19

In [6]:
# Replace every missing value with 0.0 in both the training and the test frame
features, test_set = (frame.fillna(0.0) for frame in (features, test_set))

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled data with a fixed seed.
# NOTE: the following cell reassigns X_train, X_test and y_train.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    outcomes,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Train on the complete labelled data and predict on the encoded test.csv data
# (this replaces the hold-out split made in the previous cell).
X_train, y_train = features, outcomes
X_test = test_set

In [9]:
# Import the classifier from sklearn
from sklearn.tree import DecisionTreeClassifier

# Define the classifier and fit it to the data.
# Tuning notes from manual iteration: Gini better than Entropy, max depth = 7-11 good,
# min samples leaf = 6 noted as best, min samples split = no effect.
# NOTE(review): the notes favour min_samples_leaf = 6 but the value used below is 3 -
# confirm which one is intended before the next submission.
model = DecisionTreeClassifier(
    criterion='gini',
    max_depth=10,
    min_samples_leaf=3,
    min_samples_split=16,
    random_state=42,  # fixed seed so repeated runs build the same tree
)

model = model.fit(X_train, y_train)

In [10]:
from sklearn.metrics import accuracy_score

# Predict labels for the test data and for the training data
y_test_pred = model.predict(X_test)
y_train_pred = model.predict(X_train)

# Only the training accuracy is reported; no accuracy is computed for the
# test predictions in this cell.
train_accuracy = accuracy_score(y_train, y_train_pred)
print('The training accuracy is', train_accuracy)

The training accuracy is 0.8787878787878788


In [11]:
# Passenger ids of the test rows, as a NumPy array, for the submission file
test_ids = X_test.loc[:, 'PassengerId'].to_numpy()

In [12]:
# Display the predicted 'Survived' labels for the test set
y_test_pred

array([0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1,
       1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [13]:
# Submission table: one 'Survived' prediction per test passenger, indexed by PassengerId
df = pd.DataFrame({'PassengerId': test_ids, 'Survived': y_test_pred}).set_index('PassengerId')

In [14]:
# Write the submission file; the PassengerId index is written as the first column
df.to_csv('bt_titanic_v1.csv', index=True)