Part 1: Data Exploration

In [27]:
# Load zone
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [28]:
# Import train data
# Path is relative to the notebook's working directory.
df_train = pd.read_csv("data/train.csv")

In [29]:
# General Description
# info() prints dtypes and non-null counts straight to stdout.
df_train.info()
# describe() only renders when it is the cell's last expression, so it goes last;
# placed before info() its summary table was silently discarded.
df_train.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [30]:
# How does CLASS affect survivor status?
# Print the fraction of survivors within each passenger class, in the order 3rd, 2nd, 1st
survived_mask = df_train['Survived'] == 1
for pclass in (3, 2, 1):
    in_class = df_train['Pclass'] == pclass
    print((in_class & survived_mask).sum() / in_class.sum())
# Clearly survival status is correlated to class status

0.24236252545824846
0.47282608695652173
0.6296296296296297


In [31]:
# How does GENDER affect survivor status?
# Print the fraction of survivors for men first, then for women
survived_mask = df_train['Survived'] == 1
for sex in ('male', 'female'):
    of_sex = df_train['Sex'] == sex
    print((of_sex & survived_mask).sum() / of_sex.sum())
# Men survived at a much lower rate than women

0.18890814558058924
0.7420382165605095


In [32]:
# How does AGE affect survivor status?
# Bucket ages into labelled brackets; passengers with a missing Age fall into no bracket
labels_age = ['infant','toddler','kid','teen','young adult','adult','elderly']
bins_age = [0,2,4,12,18,30,70,200]
age_group = pd.cut(df_train['Age'], bins=bins_age, labels=labels_age)
# Counts per bracket by survival (not rendered: a cell only displays its last expression)
df_train.pivot_table(
    values='Name',
    index='Survived',
    columns=age_group,
    aggfunc='count',
)
# Add gender to the equation
df_train.pivot_table(
    values='Name',
    index=['Survived','Sex'],
    columns=age_group,
    aggfunc='count',
)
# Interestingly, female children did not have as high of a chance of survival as female adults
# Male children much more likely to survive

Unnamed: 0_level_0,Age,infant,toddler,kid,teen,young adult,adult,elderly
Survived,Sex,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,female,4,1,8,9,22,20,0
0,male,5,3,8,31,152,157,4
1,female,6,6,7,27,68,83,0
1,male,9,6,6,3,28,40,1


In [33]:
# How does NUMBER OF CABINS affect survivor status?
def _cabin_count(entry):
    """Return how many space-separated cabins an entry lists; 0 when the entry is missing."""
    if pd.isna(entry):
        return 0
    return len(entry.split(' '))

# Quick peek at the raw column (not rendered: only the last expression is displayed)
df_train['Cabin']
# NaN might mean no personal cabin
cabin_multiple = df_train['Cabin'].apply(_cabin_count)
df_train.pivot_table(
    values='Name',
    index=['Survived'],
    columns=cabin_multiple,
    aggfunc='count',
)
# Having multiple cabins worked to your advantage

Cabin,0,1,2,3,4
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,481.0,58.0,7.0,3.0,
1,206.0,122.0,9.0,3.0,2.0


In [34]:
# How does CABIN LETTER affect survivor status?
# Perhaps cabin letter indicates the floor (deck) someone was on
def _first_char(entry):
    """Return the first character of the entry's string form (missing values become 'n' from 'nan')."""
    return str(entry)[0]

cabin_let = df_train['Cabin'].apply(_first_char)
df_train.pivot_table(
    values='Name',
    index=['Survived'],
    columns=cabin_let,
    aggfunc='count',
)
# Does not seem to be an obvious pattern between letters

Cabin,A,B,C,D,E,F,G,T,n
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,8.0,12.0,24.0,8.0,8.0,5.0,2.0,1.0,481.0
1,7.0,35.0,35.0,25.0,24.0,8.0,2.0,,206.0


In [35]:
# How does TITLE affect survivor status?
def _title_from_name(full_name):
    """Return the text between the first comma and the following period, stripped of whitespace."""
    after_comma = full_name.split(',')[1]
    return after_comma.split('.')[0].strip()

# Quick peek at the raw names (not rendered: only the last expression is displayed)
df_train.Name.head(50)
name_title = df_train['Name'].apply(_title_from_name)
df_train.pivot_table(
    values='Name',
    index=['Survived'],
    columns=name_title,
    aggfunc='count',
)

# Special titles were generally helpful, although -- notably -- "Reverend" was not

Name,Capt,Col,Don,Dr,Jonkheer,Lady,Major,Master,Miss,Mlle,Mme,Mr,Mrs,Ms,Rev,Sir,the Countess
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0,1.0,1.0,1.0,4.0,1.0,,1.0,17.0,55.0,,,436.0,26.0,,6.0,,
1,,1.0,,3.0,,1.0,1.0,23.0,127.0,2.0,1.0,81.0,99.0,1.0,,1.0,1.0


In [36]:
# How does FARE affect survivor status
# Bucket fares into labelled bands; the first band (-0.1, 0.1] captures zero fares
labels_fare = ['zero','low','medium','high','very high']
bins_fare = [-.1,.1,10,40,100,600]
fare_group = pd.cut(df_train['Fare'], bins=bins_fare, labels=labels_fare)
df_train.pivot_table(
    values='Name',
    index='Survived',
    columns=fare_group,
    aggfunc='count',
)
# A higher fare indicates survival

Fare,zero,low,medium,high,very high
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,14,255,217,49,14
1,1,66,162,74,39


In [37]:
# How does PARENT/CHILDREN relations affect survivor status?
df_train.pivot_table(
    values='Name',
    index=['Survived'],
    columns='Parch',
    aggfunc='count',
)
# Having one or two parents/children aboard was helpful, but having more than that was detrimental it seems

Parch,0,1,2,3,4,5,6
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,445.0,53.0,40.0,2.0,4.0,4.0,1.0
1,233.0,65.0,40.0,3.0,,1.0,


In [38]:
# How does SIBLING/SPOUSE relations affect survivor status?
df_train.pivot_table(
    values='Name',
    index=['Survived'],
    columns='SibSp',
    aggfunc='count',
)
# Similar effect as parent/child

SibSp,0,1,2,3,4,5,8
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,398.0,97.0,15.0,12.0,15.0,5.0,7.0
1,210.0,112.0,13.0,4.0,3.0,,


In [39]:
# How does PORT OF EMBARKMENT affect survivor status?
df_train.pivot_table(
    values='Name',
    index=['Survived'],
    columns='Embarked',
    aggfunc='count',
)
# Southampton had worst survival rate, then Queenstown

Embarked,C,Q,S
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,75,47,427
1,93,30,217


Part 2: Data Cleaning

In [40]:
# Import test data
# Path is relative to the notebook's working directory.
df_test = pd.read_csv("data/test.csv")

In [41]:
# Add valuable mutations from exploratory section into both datasets
def _title_from_name(full_name):
    """Return the text between the first comma and the following period, stripped of whitespace."""
    after_comma = full_name.split(',')[1]
    return after_comma.split('.')[0].strip()

labels_fare = ['zero','low','medium','high','very high']
bins_fare = [-.1,.1,10,40,100,600]

# Apply the same two derived columns to train and test so their schemas stay parallel
for frame in (df_train, df_test):
    frame['name_title'] = frame['Name'].apply(_title_from_name)
    frame['fare_group'] = pd.cut(frame['Fare'], bins=bins_fare, labels=labels_fare)


In [42]:
# The cabin_multiple variable looked promising, but Cabin has so many missing
# values that every Cabin-related variable is dropped from both datasets
df_train = df_train.drop(columns=['Cabin'])
df_test = df_test.drop(columns=['Cabin'])


In [43]:
# Drop rows with a missing 'Embarked' value (2 rows in the training data)
for frame in (df_train, df_test):
    frame.dropna(subset=['Embarked'], inplace=True)

In [44]:
# Fill NAs: Age with the column mean, Fare with the column median
df_train.Age = df_train.Age.fillna(df_train.Age.mean())
df_test.Age = df_test.Age.fillna(df_test.Age.mean())

df_train.Fare = df_train.Fare.fillna(df_train.Fare.median())
df_test.Fare = df_test.Fare.fillna(df_test.Fare.median())

# fare_group was derived from Fare before the imputation above, so any row whose
# Fare was missing still carries a NaN fare_group (and would get all-zero fare
# dummies later). Rebuild it from the filled Fare so the imputation takes effect.
df_train['fare_group'] = pd.cut(df_train['Fare'], bins = bins_fare, labels = labels_fare)
df_test['fare_group'] = pd.cut(df_test['Fare'], bins = bins_fare, labels = labels_fare)


In [45]:
# Change Pclass to string for dummy creation
# (get_dummies only one-hot encodes object/category columns by default)
for frame in (df_train, df_test):
    frame['Pclass'] = frame['Pclass'].astype(str)

In [46]:
# Create dummy variables
feature_cols = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'fare_group', 'Embarked', 'name_title']

# df_train is about to be overwritten with feature columns only, which would
# discard the 'Survived' label. Keep it (row-aligned with the features, since
# it is taken after the row drops above) so the models have a target to fit.
y_train = df_train['Survived'].copy()

df_train = pd.get_dummies(df_train[feature_cols])
df_test = pd.get_dummies(df_test[feature_cols])


In [47]:
# Test set did not have all the same titles
# Keep only the columns present in both frames (each frame keeps its own column order)
train_keep = [name for name in df_train.columns if name in df_test.columns]
df_train = df_train[train_keep]
test_keep = [name for name in df_test.columns if name in df_train.columns]
df_test = df_test[test_keep]
# Print new columns
for frame in (df_train, df_test):
    print(frame.columns.tolist())
# Now they have the same columns

['Age', 'SibSp', 'Parch', 'Pclass_1', 'Pclass_2', 'Pclass_3', 'Sex_female', 'Sex_male', 'fare_group_zero', 'fare_group_low', 'fare_group_medium', 'fare_group_high', 'fare_group_very high', 'Embarked_C', 'Embarked_Q', 'Embarked_S', 'name_title_Col', 'name_title_Dr', 'name_title_Master', 'name_title_Miss', 'name_title_Mr', 'name_title_Mrs', 'name_title_Ms', 'name_title_Rev']
['Age', 'SibSp', 'Parch', 'Pclass_1', 'Pclass_2', 'Pclass_3', 'Sex_female', 'Sex_male', 'fare_group_zero', 'fare_group_low', 'fare_group_medium', 'fare_group_high', 'fare_group_very high', 'Embarked_C', 'Embarked_Q', 'Embarked_S', 'name_title_Col', 'name_title_Dr', 'name_title_Master', 'name_title_Miss', 'name_title_Mr', 'name_title_Mrs', 'name_title_Ms', 'name_title_Rev']


In [49]:
# Scale non-categorical data
from sklearn.preprocessing import StandardScaler
scale = StandardScaler()

numeric_cols = ['Age', 'SibSp', 'Parch']

# Fit the scaler on the training data only (get_dummies on purely numeric
# columns was a no-op, so the columns are passed directly).
df_train[numeric_cols] = scale.fit_transform(df_train[numeric_cols])
# Apply the *same* fitted transform to the test set; otherwise the test features
# stay on their raw scale and do not match what the models were trained on.
df_test[numeric_cols] = scale.transform(df_test[numeric_cols])


In [50]:
# Display the processed training features (pandas truncates the middle rows/columns when rendering)
df_train

Unnamed: 0,Age,SibSp,Parch,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,fare_group_zero,fare_group_low,...,Embarked_Q,Embarked_S,name_title_Col,name_title_Dr,name_title_Master,name_title_Miss,name_title_Mr,name_title_Mrs,name_title_Ms,name_title_Rev
0,-5.896199e-01,0.431350,-0.474326,0,0,1,0,1,0,1,...,0,1,0,0,0,0,1,0,0,0
1,6.448480e-01,0.431350,-0.474326,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,-2.810029e-01,-0.475199,-0.474326,0,0,1,1,0,0,1,...,0,1,0,0,0,1,0,0,0,0
3,4.133853e-01,0.431350,-0.474326,1,0,0,1,0,0,0,...,0,1,0,0,0,0,0,1,0,0
4,4.133853e-01,-0.475199,-0.474326,0,0,1,0,1,0,1,...,0,1,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,-2.038487e-01,-0.475199,-0.474326,0,1,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,1
887,-8.210826e-01,-0.475199,-0.474326,1,0,0,1,0,0,0,...,0,1,0,0,0,1,0,0,0,0
888,-5.482138e-16,0.431350,2.006119,0,0,1,1,0,0,0,...,0,1,0,0,0,1,0,0,0,0
889,-2.810029e-01,-0.475199,-0.474326,1,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0


Part 3: Model Building

In [54]:
# Load SKlearn tools
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [52]:
# Naive Bayes (as baseline)
gnb = GaussianNB()
cv = cross_val_score(gnb,)