# The Kaggle Titanic Competition

# Change Log:
1.0 - Created and run

1.01 - Updated links after long dormant period and verified functionality - 9/11/21

1.02 - Obtained Deck level from Cabin.  Dropped Cabin and Ticket.  Optimized decision tree model hyperparameters methd, depth, leaf, and split.

1.03 - Eliminated the train test split, trained on full training data, ran inferrence on test data, output test set result to csv file 'bt_titanic_v1.csv'.

1.04 - Further feature engineering including keeping ticket but stripping numbers.

## Project Goals:
* 1 - Create an initial simple classifier using a decision tree.
* 2 - Revise the initial version with increasing sophistication, measured by significantly improved accuracy
* 3 - Research Kaggle winner's for strategy, and impelement some learning to improve accuracy
* 4 - Submit the model to the competition as my first Kaggle entry

In [49]:
# Import libraries necessary for this project
import numpy as np
import pandas as pd
from IPython.display import display # Allows the use of display() for DataFrames

# Pretty display for notebooks
%matplotlib inline

# Set a random seed
import random
random.seed(42)

# Load the dataset
in_file = '.\\data\\train.csv'
test_file = '.\\data\\test.csv'

full_data = pd.read_csv(in_file)
test_data = pd.read_csv(test_file)

# Print the first few entries of the RMS Titanic data
    ###display(full_data.head())

In [50]:
# Store the 'Survived' feature in a new variable and remove it from the dataset
outcomes = full_data['Survived']
############features_raw = full_data.drop('Survived', axis = 1)
features_raw = full_data


#Get deck level from Cabin # then discard Cabin (below) (or it will jack up one hot)
features_raw['Deck'] = features_raw.Cabin.str[:1]
test_data['Deck'] = test_data.Cabin.str[:1]

# Show the new dataset with 'Survived' removed
    ###display(features_raw.head())

In [51]:
features_raw['ticket_str'] = features_raw['Ticket'].str.replace(' ','').str.replace('1','').str.replace('2','').str.replace('3','').str.replace('4','').str.replace('5','').str.replace('6','').str.replace('7','').str.replace('8','').str.replace('9','').str.replace('0','').str.replace('.','').str.replace('/','').replace('','none')
test_data['ticket_str'] = test_data['Ticket'].str.replace(' ','').str.replace('1','').str.replace('2','').str.replace('3','').str.replace('4','').str.replace('5','').str.replace('6','').str.replace('7','').str.replace('8','').str.replace('9','').str.replace('0','').str.replace('.','').str.replace('/','').replace('','none')

In [52]:
features_raw['ticket_str'] = features_raw['ticket_str'].str.upper()
test_data['ticket_str'] = test_data['ticket_str'].str.upper()

In [53]:
df1 = features_raw.groupby('ticket_str')['Survived'].sum()

In [54]:
df2 = features_raw.groupby('ticket_str')['Survived'].count()

In [55]:
df1

ticket_str
A              2
AS             0
C              2
CA            14
CASOTON        0
FA             0
FC             0
FCC            4
LINE           1
NONE         254
PC            39
PP             2
PPP            1
SC             1
SCA            0
SCAH           1
SCAHBASLE      1
SCOW           0
SCPARIS        5
SOC            1
SOP            0
SOPP           0
SOTONO         0
SOTONOQ        2
SP             0
STONO          8
SWPP           2
WC             1
WEP            1
Name: Survived, dtype: int64

In [56]:
df2

ticket_str
A             28
AS             1
C              5
CA            41
CASOTON        1
FA             1
FC             1
FCC            5
LINE           4
NONE         661
PC            60
PP             3
PPP            2
SC             1
SCA            1
SCAH           2
SCAHBASLE      1
SCOW           1
SCPARIS       11
SOC            6
SOP            1
SOPP           3
SOTONO         2
SOTONOQ       15
SP             1
STONO         18
SWPP           2
WC            10
WEP            3
Name: Survived, dtype: int64

In [59]:
df3 = (df1 / df2) *100

In [60]:
df3

ticket_str
A              7.142857
AS             0.000000
C             40.000000
CA            34.146341
CASOTON        0.000000
FA             0.000000
FC             0.000000
FCC           80.000000
LINE          25.000000
NONE          38.426626
PC            65.000000
PP            66.666667
PPP           50.000000
SC           100.000000
SCA            0.000000
SCAH          50.000000
SCAHBASLE    100.000000
SCOW           0.000000
SCPARIS       45.454545
SOC           16.666667
SOP            0.000000
SOPP           0.000000
SOTONO         0.000000
SOTONOQ       13.333333
SP             0.000000
STONO         44.444444
SWPP         100.000000
WC            10.000000
WEP           33.333333
Name: Survived, dtype: float64

In [37]:
%matplotlib inline

import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter


def _plot_pareto_by(df_, group_by, column):

    df = df_.groupby(group_by)[column].sum().reset_index()
    df = df.sort_values(by=column,ascending=False)

    df["cumpercentage"] = df[column].cumsum()/df[column].sum()*100


    fig, ax = plt.subplots(figsize=(20,5))
    ax.bar(df[group_by], df[column], color="C0")
    ax2 = ax.twinx()
    ax2.plot(df[group_by], df["cumpercentage"], color="C1", marker="D", ms=7)
    ax2.yaxis.set_major_formatter(PercentFormatter())

    ax.tick_params(axis="y", colors="C0")
    ax2.tick_params(axis="y", colors="C1")

    for tick in ax.get_xticklabels():
        tick.set_rotation(45)
    plt.show()

In [5]:
for a in features_raw['ticket_str'].unique():
    print(a)


A
PC
STONO
NONE
PP
CA
SCPARIS
SCA
SP
SOC
WC
SOTONOQ
WEP
C
SOP
FA
LINE
FCC
SWPP
SCOW
PPP
SC
SCAH
AS
SCAHBASLE
SOPP
FC
SOTONO
CASOTON


In [6]:
for a in test_data['ticket_str'].unique():
    print(a)


NONE
A
WEP
SCPARIS
STONO
PC
C
SCAH
CA
WC
SOTONOQ
SCA
FCC
FC
PP
STONOQ
SOPP
SOC
SOTONO
AQ
SC
LP


In [7]:
features_raw['ticket_str'].unique()

array(['A', 'PC', 'STONO', 'NONE', 'PP', 'CA', 'SCPARIS', 'SCA', 'SP',
       'SOC', 'WC', 'SOTONOQ', 'WEP', 'C', 'SOP', 'FA', 'LINE', 'FCC',
       'SWPP', 'SCOW', 'PPP', 'SC', 'SCAH', 'AS', 'SCAHBASLE', 'SOPP',
       'FC', 'SOTONO', 'CASOTON'], dtype=object)

In [8]:
test_data['ticket_str'].unique()

array(['NONE', 'A', 'WEP', 'SCPARIS', 'STONO', 'PC', 'C', 'SCAH', 'CA',
       'WC', 'SOTONOQ', 'SCA', 'FCC', 'FC', 'PP', 'STONOQ', 'SOPP', 'SOC',
       'SOTONO', 'AQ', 'SC', 'LP'], dtype=object)

In [44]:
# Removing the names, Cabin (have deck), Ticket (random dist)
features_no_names = features_raw.drop(['Name', 'Cabin', 'Ticket'], axis=1)
test_data = test_data.drop(['Name', 'Cabin', 'Ticket'], axis=1)

# One-hot encoding
features = pd.get_dummies(features_no_names)
test_set = pd.get_dummies(test_data)
test_set['Deck_T']=0

In [45]:
features['LP'] = 0
features['AQ'] = 0
features['STONOQ'] = 0

test_data['AS'] = 0
test_data['CASOTON'] = 0
test_data['FA'] = 0
test_data['LINE'] = 0
test_data['PPP'] = 0
test_data['SCAHBASLE'] = 0
test_data['SCOW'] = 0
test_data['SOP'] = 0
test_data['SP'] = 0
test_data['SWPP'] = 0


In [46]:
len(features.columns)

51

In [47]:
len(test_set.columns)

41

In [48]:
for col in features.columns:
    print(col)

PassengerId
Pclass
Age
SibSp
Parch
Fare
Sex_female
Sex_male
Embarked_C
Embarked_Q
Embarked_S
Deck_A
Deck_B
Deck_C
Deck_D
Deck_E
Deck_F
Deck_G
Deck_T
ticket_str_A
ticket_str_AS
ticket_str_C
ticket_str_CA
ticket_str_CASOTON
ticket_str_FA
ticket_str_FC
ticket_str_FCC
ticket_str_LINE
ticket_str_NONE
ticket_str_PC
ticket_str_PP
ticket_str_PPP
ticket_str_SC
ticket_str_SCA
ticket_str_SCAH
ticket_str_SCAHBASLE
ticket_str_SCOW
ticket_str_SCPARIS
ticket_str_SOC
ticket_str_SOP
ticket_str_SOPP
ticket_str_SOTONO
ticket_str_SOTONOQ
ticket_str_SP
ticket_str_STONO
ticket_str_SWPP
ticket_str_WC
ticket_str_WEP
LP
AQ
STONOQ


In [49]:
for col in test_data.columns:
    print(col)

PassengerId
Pclass
Sex
Age
SibSp
Parch
Fare
Embarked
Deck
ticket_str
AS
CASOTON
FA
LINE
PPP
SCAHBASLE
SCOW
SOP
SP
SWPP


In [6]:
features = features.fillna(0.0)
test_set = test_set.fillna(0.0)

In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(features, outcomes, test_size=0.2, random_state=42)

In [8]:
X_train = features
X_test = test_set
y_train = outcomes

In [9]:
# Import the classifier from sklearn
from sklearn.tree import DecisionTreeClassifier

# TODO: Define the classifier, and fit it to the data
#Note: I iterated on hyperparameters below to tune them.  Gini better than Entropy, max depth = 7-11 good, min samples leaf = 6 is golden!, min samples split = no effect
model = DecisionTreeClassifier(criterion='gini', max_depth = 10, min_samples_leaf = 3,min_samples_split =16)

model = model.fit(X_train,y_train)

In [10]:
# Making predictions
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# Calculate the accuracy
from sklearn.metrics import accuracy_score
train_accuracy = accuracy_score(y_train, y_train_pred)
#test_accuracy = accuracy_score(y_test, y_test_pred)
print('The training accuracy is', train_accuracy)
#print('The test accuracy is', test_accuracy)

The training accuracy is 0.8787878787878788


In [11]:
test_ids = X_test['PassengerId'].to_numpy()

In [12]:
y_test_pred

array([0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1,
       1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [13]:
df = pd.DataFrame({'PassengerId':test_ids, 'Survived':y_test_pred})
df = df.set_index('PassengerId')

In [14]:
df.to_csv('bt_titanic_v1.csv')