# Modelling 

Within this jupyter notebook - we will be creating our predictive model for the survivors of the titanic shipwreck. 

Evaluation metric being Accuracy for our classification models. 

After preparing the data we will then create the following models: 
- Logistic Regression 
- Random Forest Classifier 
- XGBoost Classifier 

In [1]:
# setting up the packages and modules that are needed for analysis
import pandas as pd 
import matplotlib.pyplot as plt 
import numpy as np
# Models from scikit learn 

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

# Model Evaluations 

from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV, GridSearchCV

from sklearn.metrics import confusion_matrix, classification_report, precision_score, recall_score, f1_score, roc_curve

%matplotlib inline

In [2]:
# read the data in to the notebook - we will have a look at the training set first 
training_df = pd.read_csv('../Data/train.csv')

In [3]:
training_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
training_df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [5]:
# transform for the missing values: numeric first: 
training_df['Age'] = training_df['Age'].fillna(training_df.groupby(['Sex', 'Pclass'])['Age'].transform('mean').round(1))

In [6]:
training_df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

For the Cabin feature now: 

In [7]:
training_df['Cabin'].head(10)
# we can take the first letter to signify which cabin they sat in

0     NaN
1     C85
2     NaN
3    C123
4     NaN
5     NaN
6     E46
7     NaN
8     NaN
9     NaN
Name: Cabin, dtype: object

In [8]:
#creates categories based on the cabin letter (n stands for null)
#in this case we will treat null values like it's own category

training_df['cabin_adv'] = training_df.Cabin.apply(lambda x: str(x)[0])
training_df['cabin_adv'] = training_df['cabin_adv'].str.lower()

In [9]:
training_df.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,cabin_adv
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,n
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,c
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,n
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,c
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,n
5,6,0,3,"Moran, Mr. James",male,26.5,0,0,330877,8.4583,,Q,n
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S,e
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S,n
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S,n
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C,n


In [10]:
training_df['cabin_adv'].value_counts()

n    687
c     59
b     47
d     33
e     32
a     15
f     13
g      4
t      1
Name: cabin_adv, dtype: int64

In [11]:
#comparing surivial rate by cabin
pd.pivot_table(training_df,index='Survived',columns='cabin_adv', values = 'Name', aggfunc='count')

cabin_adv,a,b,c,d,e,f,g,n,t
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,8.0,12.0,24.0,8.0,8.0,5.0,2.0,481.0,1.0
1,7.0,35.0,35.0,25.0,24.0,8.0,2.0,206.0,


In [12]:
# we can now drop the cabin feature 

training_df.drop(['Cabin'],axis=1, inplace = True)

In [13]:
training_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,cabin_adv
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S,n
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,c
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S,n
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S,c
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S,n


In [15]:
training_df['Ticket']

0             A/5 21171
1              PC 17599
2      STON/O2. 3101282
3                113803
4                373450
             ...       
886              211536
887              112053
888          W./C. 6607
889              111369
890              370376
Name: Ticket, Length: 891, dtype: object

In [16]:
training_df['numeric_ticket'] = training_df.Ticket.apply(lambda x: 1 if x.isnumeric() else 0)
training_df['ticket_letters'] = training_df.Ticket.apply(lambda x: ''.join(x.split(' ')[:-1]).replace('.','').replace('/','').lower() if len(x.split(' ')[:-1]) >0 else 0)

In [18]:
training_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,cabin_adv,numeric_ticket,ticket_letters
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S,n,0,a5
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,c,0,pc
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S,n,0,stono2
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S,c,1,0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S,n,1,0


In [19]:
training_df['numeric_ticket'].value_counts()

1    661
0    230
Name: numeric_ticket, dtype: int64

In [20]:
training_df['ticket_letters'].value_counts()

0            665
pc            60
ca            41
a5            21
stono2        18
sotonoq       15
scparis       11
wc            10
a4             7
soc            6
fcc            5
c              5
wep            3
pp             3
sopp           3
ppp            2
swpp           2
scah           2
sotono2        2
casoton        1
sca4           1
sc             1
fa             1
scahbasle      1
sop            1
as             1
sp             1
fc             1
scow           1
Name: ticket_letters, dtype: int64

In [22]:
pd.pivot_table(training_df,index='Survived',columns='numeric_ticket', values = 'Ticket', aggfunc='count')

numeric_ticket,0,1
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1
0,142,407
1,88,254


In [24]:
#survival rate across different ticket types 
pd.pivot_table(training_df,index='Survived',columns='ticket_letters', values = 'Ticket', aggfunc='count')

ticket_letters,0,a4,a5,as,c,ca,casoton,fa,fc,fcc,...,soc,sop,sopp,sotono2,sotonoq,sp,stono2,swpp,wc,wep
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,410.0,7.0,19.0,1.0,3.0,27.0,1.0,1.0,1.0,1.0,...,5.0,1.0,3.0,2.0,13.0,1.0,10.0,,9.0,2.0
1,255.0,,2.0,,2.0,14.0,,,,4.0,...,1.0,,,,2.0,,8.0,2.0,1.0,1.0


We will now remove columns that we do not need: 
1. Name 
2. PassengerId
3. Ticket