##### Titanic Prediction



Some of the methods used to produce the predictions are borrowed from [YASSINE GHOUZAM](https://www.kaggle.com/yassineghouzam), but the data preprocessing, feature engineering, and model selection differ in many places because of a different understanding of the data.

In [None]:
import re
import pandas as pd
import numpy as np
from scipy import stats
from collections import Counter

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve
import xgboost as xgb

In [None]:
# Read the training and test CSV files from the current working directory.
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

1. Data Preprocessing

In [None]:
def find_outliers(df, features, whisker=1.5):
    """Locate outliers in each requested column using Tukey's fences.

    A value is an outlier when it lies outside
    [Q1 - whisker * (Q3 - Q1), Q3 + whisker * (Q3 - Q1)], where Q1 and Q3 are
    the 25th and 75th percentiles of the column's non-missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to inspect.
    features : iterable of str
        Names of the numeric columns to check.
    whisker : float, default 1.5
        Multiplier applied to the inter-quartile range to build the fences.

    Returns
    -------
    dict
        Maps each column name to a NumPy array with the positions (0-based row
        offsets as returned by ``np.where``, not index labels) of its outliers.
        Missing values compare False against both fences and are never flagged.
    """
    outliers = {}
    for col in features:
        column = df.loc[:, col]
        # Compute both quartiles once on the non-missing values.
        q1, q3 = np.percentile(column.dropna().values, [25, 75])
        Qrange = q3 - q1
        up_level = q3 + whisker * Qrange
        down_level = q1 - whisker * Qrange
        values = column.values
        outliers[col] = np.where((values > up_level) | (values < down_level))[0]
    return outliers

In [None]:
# Drop the training rows that are flagged as outliers in more than two of the
# inspected columns.
features = ['Age','SibSp','Parch','Fare']
outliers = find_outliers(train_data,features)
# Stack the per-column outlier positions and count how often each row appears.
flag_counts = Counter(np.hstack([outliers[col] for col in features]))
outliers_index = [row for row, hits in flag_counts.items() if hits > 2]
train_data = train_data.drop(outliers_index,axis=0).reset_index(drop=True)

In [None]:
# Stack the training and test features into a single frame.
# This assumes both sets share the same characteristics and distribution, so
# the combined frame gives a more realistic picture of that distribution.
print(len(train_data))
# Keep the target and the test ids aside before removing them from the features.
train_label = train_data['Survived']
train_data = train_data.drop(columns=['Survived', 'PassengerId'])
# Number of training rows: rows [0, split_point) of `data` come from train.csv.
split_point = len(train_data)
PassengerId_test = test_data['PassengerId']
test_data = test_data.drop(columns='PassengerId')
data = pd.concat([train_data, test_data], axis=0).reset_index(drop=True)

880


In [None]:
# Inspect column dtypes and count the missing values per column
# (nothing is filled in this cell).
data.info()
data.isna().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1298 entries, 0 to 1297
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    1298 non-null   int64  
 1   Name      1298 non-null   object 
 2   Sex       1298 non-null   object 
 3   Age       1042 non-null   float64
 4   SibSp     1298 non-null   int64  
 5   Parch     1298 non-null   int64  
 6   Ticket    1298 non-null   object 
 7   Fare      1297 non-null   float64
 8   Cabin     291 non-null    object 
 9   Embarked  1296 non-null   object 
dtypes: float64(2), int64(3), object(5)
memory usage: 101.5+ KB


Pclass         0
Name           0
Sex            0
Age          256
SibSp          0
Parch          0
Ticket         0
Fare           1
Cabin       1007
Embarked       2
dtype: int64

In [None]:
# Fill the missing Embarked values with the most frequent port (the mode).
print(data.groupby('Embarked')['Embarked'].count())
# Derive the mode from the data instead of hard-coding it, and assign the
# filled column back: calling fillna(inplace=True) on a selected column is
# chained assignment, which newer pandas warns about and which leaves `data`
# unchanged when Copy-on-Write is enabled.
embarked_mode = data['Embarked'].mode().iloc[0]
data['Embarked'] = data['Embarked'].fillna(embarked_mode)
data['Embarked'].isna().sum()

Embarked
C    270
Q    123
S    903
Name: Embarked, dtype: int64


0

In [None]:
# Treat a missing Cabin as "no cabin" and encode it as the new class 'N0'
# (letter 'N', number 0), then split Cabin into its letters and its digits.
print(data.groupby('Cabin')['Cabin'].count())
# Assign the filled column back instead of fillna(inplace=True) on a selected
# column, which is chained assignment and has no effect under Copy-on-Write.
data['Cabin'] = data['Cabin'].fillna('N0')
print(data['Cabin'].isna().sum())
# Cabin_letter: every ASCII letter of the cabin string, concatenated.
data['Cabin_letter'] = data['Cabin'].apply(lambda cabin: ''.join(re.findall(r'[A-Za-z]', cabin)))
# Cabin_number: the cabin string with every non-digit removed. The rename keeps
# the column at its original position. The pattern is a raw string because
# "\D" in a normal string literal is an invalid escape sequence.
data = data.rename(columns={'Cabin': 'Cabin_number'})
data['Cabin_number'] = data['Cabin_number'].apply(lambda cabin: re.sub(r'\D', '', cabin))
data.head()

Cabin
A10    1
A11    1
A14    1
A16    1
A18    1
      ..
F33    4
F38    1
F4     4
G6     5
T      1
Name: Cabin, Length: 186, dtype: int64
0


Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin_number,Embarked,Cabin_letter
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,0,S,N
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,85,C,C
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,0,S,N
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,123,S,C
4,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,0,S,N


In [None]:
# Show how often each extracted digit string and each letter string occurs.
print(data.groupby('Cabin_number')['Cabin_number'].count())
print(data.groupby('Cabin_letter')['Cabin_letter'].count())
def how_many_cabins(string):
    """Return the number of cabin letters in ``string``.

    The placeholder 'N' (no cabin) counts as 0; any other string counts one
    cabin per character.
    """
    return 0 if string == 'N' else len(string)
    
# Count the cabin letters per passenger, then reduce Cabin_letter to its
# first letter only.
cabin_letters = data['Cabin_letter']
data['How_many_cabins'] = cabin_letters.apply(how_many_cabins)
data['Cabin_letter'] = cabin_letters.map(lambda letters: letters[0])

print(data.groupby('Cabin_letter')['Cabin_letter'].count())

Cabin_number
           6
0       1007
10         3
101        7
1012       2
        ... 
94         1
95         1
9698       4
97         1
99         1
Name: Cabin_number, Length: 115, dtype: int64
Cabin_letter
A         22
B         47
BB         8
BBB        4
BBBB       5
C         80
CC         8
CCC        3
D         44
DD         2
E         40
EE         1
F         14
FE         3
FG         4
G          5
N       1007
T          1
Name: Cabin_letter, dtype: int64
Cabin_letter
A      22
B      64
C      91
D      46
E      41
F      21
G       5
N    1007
T       1
Name: Cabin_letter, dtype: int64


In [None]:
def merge_number(string):
    """Collapse a string of concatenated cabin digits into one integer.

    Strings of 4, 6 or 8 digits are read as two, three or four 2-digit numbers
    and replaced by their mean, truncated to an int. An empty string gives 0.
    Any other string is converted with ``int`` as a single number.
    """
    if string == '':
        return 0
    if len(string) in (4, 6, 8):
        # Cut the string into consecutive 2-character pieces.
        pieces = [int(string[start:start + 2]) for start in range(0, len(string), 2)]
        return int(sum(pieces) / len(pieces))
    return int(string)
# Replace each digit string with the single integer computed by merge_number.
data['Cabin_number'] = data['Cabin_number'].map(merge_number)
print(data.groupby('Cabin_number')['Cabin_number'].count())

Cabin_number
0      1013
2         6
3         1
4         5
5         3
       ... 
126       2
128       1
130       1
132       1
148       1
Name: Cabin_number, Length: 103, dtype: int64


In [None]:
# Impute missing Age with the most common age among passengers that share the
# same (Pclass, SibSp, Parch). When no passenger with a known age has that
# combination, fall back to the most common known age overall.
group_keys = ['Pclass', 'SibSp', 'Parch']
# Only a missing Age disqualifies a row here; NaNs in other columns are irrelevant.
known_age = data.dropna(subset=['Age'])
# Series.mode() is used instead of stats.mode(x)[0][0], which depends on the
# array-shaped result of older SciPy releases. On ties, .iloc[0] takes the
# smallest of the modes.
age_mode = known_age.groupby(group_keys)['Age'].agg(lambda ages: ages.mode().iloc[0])
overall_age_mode = known_age['Age'].mode().iloc[0]
# Look up the group mode for every row at once (left join on the three keys);
# rows whose key combination has no known age get NaN and use the fallback.
group_age_mode = data[group_keys].join(age_mode.rename('Age_mode'), on=group_keys)['Age_mode']
data['Age'] = data['Age'].fillna(group_age_mode).fillna(overall_age_mode)
data['Age'].isna().sum()

0

In [None]:
# Fill every missing Fare with the mean of the known fares. Unlike indexing
# the first missing row, this also works when zero or several fares are missing.
data['Fare'] = data['Fare'].fillna(data['Fare'].mean())

In [None]:
# Re-check the column dtypes and non-null counts after imputation.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1298 entries, 0 to 1297
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Pclass           1298 non-null   int64  
 1   Name             1298 non-null   object 
 2   Sex              1298 non-null   object 
 3   Age              1298 non-null   float64
 4   SibSp            1298 non-null   int64  
 5   Parch            1298 non-null   int64  
 6   Ticket           1298 non-null   object 
 7   Fare             1298 non-null   float64
 8   Cabin_number     1298 non-null   int64  
 9   Embarked         1298 non-null   object 
 10  Cabin_letter     1298 non-null   object 
 11  How_many_cabins  1298 non-null   int64  
dtypes: float64(2), int64(5), object(5)
memory usage: 121.8+ KB


2. Feature Engineering

In [None]:
# Encode Sex as 1 for 'male' and 0 for anything else
data['Sex'] = (data['Sex'] == 'male').astype('int64')
# Features derived from Name: its length, whether it contains '(' and the
# title, i.e. the text between the first ',' and the following '.'
data['Name_length'] = data['Name'].map(len)
data['Contain_parentheses'] = data['Name'].str.contains('(', regex=False).astype('int64')
data['appellation'] = [name.split(',')[1].split('.')[0] for name in data['Name']]
print(data.groupby('appellation')['appellation'].count())
# One indicator per frequent title (the extracted titles keep their leading
# space) plus one indicator for every other title
common_titles = [' Master', ' Miss', ' Mr', ' Mrs']
for title in common_titles:
    data['appellation_' + title.strip()] = (data['appellation'] == title).astype('int64')
data['appellation_others'] = (~data['appellation'].isin(common_titles)).astype('int64')
data = data.drop(columns=['Name', 'appellation'])

appellation
 Col               4
 Don               1
 Dona              1
 Dr                8
 Jonkheer          1
 Lady              1
 Major             2
 Master           60
 Miss            255
 Mlle              2
 Mme               1
 Mr              753
 Mrs             197
 Ms                2
 Rev               8
 Sir               1
 the Countess      1
Name: appellation, dtype: int64


In [None]:
# Family size
# Assumption: the size of a family affects how quickly and decisively its
# members could escape.
family_size = data['SibSp'] + data['Parch'] + 1
# Indicator columns for four size bands: 1, 2, 3-4 and 5 or more.
data['Single'] = (family_size == 1).astype('int64')
data['Small_Family'] = (family_size == 2).astype('int64')
data['Med_Family'] = family_size.between(3, 4).astype('int64')
data['Large_Family'] = (family_size >= 5).astype('int64')

In [None]:
# Ticket
def _ticket_prefix(ticket):
    """Return 'N' for an all-digit ticket, else the text before the first '/', ' ' or '.'."""
    if ticket.isdigit():
        return 'N'
    return ticket.replace('/', '.').replace(' ', '.').split('.')[0]

data['Ticket'] = data['Ticket'].map(_ticket_prefix)
# Not the last expression of the cell, so these counts are computed but not shown.
data.groupby('Ticket')['Ticket'].count()
# One-hot encode the ticket prefixes.
data = pd.get_dummies(data, columns=['Ticket'], prefix='Ticket')

In [None]:
# One-hot encode the port of embarkation and the cabin letter in one pass;
# the Embarked_* columns are appended first, then the CL_* columns.
data = pd.get_dummies(
    data,
    columns=['Embarked', 'Cabin_letter'],
    prefix=['Embarked', 'CL'],
)

In [None]:
# Reduce the skew of Fare and Name_length with a natural log (zeros stay 0),
# printing the skewness before and after the transform.
# Note: re-running this cell applies the log a second time.
skewed_columns = ['Fare', 'Name_length']
for column in skewed_columns:
    print(data[column].skew())
for column in skewed_columns:
    data[column] = data[column].map(lambda value: 0 if value == 0 else np.log(value))
for column in skewed_columns:
    print(data[column].skew())

4.514424626776879
1.3344777500697589
0.5730905149465129
0.5012722690552052


In [None]:
# Print the column sums (for the 0/1 indicator columns this is the number of
# rows in each class) and the frame summary.
print(data.sum())
data.info()

Pclass                  2979.000000
Sex                      837.000000
Age                    37026.850000
SibSp                    587.000000
Parch                    483.000000
Fare                    3759.861572
Cabin_number           14305.000000
How_many_cabins          346.000000
Name_length             4214.693574
Contain_parentheses      221.000000
appellation_Master        60.000000
appellation_Miss         255.000000
appellation_Mr           753.000000
appellation_Mrs          197.000000
appellation_others        33.000000
Single                   790.000000
Small_Family             235.000000
Med_Family               201.000000
Large_Family              72.000000
Ticket_A                  39.000000
Ticket_A4                  1.000000
Ticket_AQ                  2.000000
Ticket_C                  55.000000
Ticket_CA                 15.000000
Ticket_F                  12.000000
Ticket_Fa                  1.000000
Ticket_LINE                4.000000
Ticket_LP                  1

In [None]:
# Preview the first ten rows of the engineered feature frame.
data.head(10)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin_number,How_many_cabins,Name_length,Contain_parentheses,...,Embarked_S,CL_A,CL_B,CL_C,CL_D,CL_E,CL_F,CL_G,CL_N,CL_T
0,3,1,22.0,1,0,1.981001,0,0,3.135494,0,...,1,0,0,0,0,0,0,0,1,0
1,1,0,38.0,1,0,4.266662,85,1,3.931826,1,...,0,0,0,1,0,0,0,0,0,0
2,3,0,26.0,0,0,2.070022,0,0,3.091042,0,...,1,0,0,0,0,0,0,0,1,0
3,1,0,35.0,1,0,3.972177,123,1,3.78419,1,...,1,0,0,1,0,0,0,0,0,0
4,3,1,35.0,0,0,2.085672,0,0,3.178054,0,...,1,0,0,0,0,0,0,0,1,0
5,3,1,22.0,0,0,2.135148,0,0,2.772589,0,...,0,0,0,0,0,0,0,0,1,0
6,1,1,54.0,0,0,3.948596,46,1,3.135494,0,...,1,0,0,0,0,1,0,0,0,0
7,3,1,2.0,3,1,3.048088,0,0,3.401197,0,...,1,0,0,0,0,0,0,0,1,0
8,3,0,27.0,0,2,2.409941,0,0,3.89182,1,...,1,0,0,0,0,0,0,0,1,0
9,2,0,14.0,1,0,3.403555,0,0,3.555348,1,...,0,0,0,0,0,0,0,0,1,0


In [None]:
# Undo the concatenation: the first `split_point` rows came from the training
# file, the remaining rows from the test file.
X_train = data.iloc[:split_point]
X_test = data.iloc[split_point:]
Y_train = train_label
# Sanity checks: both prints are expected to show True.
print(len(X_test) == len(PassengerId_test))
print(len(X_train) == len(Y_train))

True
True


3. Model Training

In [None]:
import warnings
# Silence all warnings for the rest of the session.
warnings.filterwarnings('ignore')

# 10-fold stratified cross-validation shared by every model below.
kfold = StratifiedKFold(n_splits=10)
random_state = 12

# Candidate classifiers, all with default hyper-parameters except where shown.
CLmodels = [
    SVC(random_state=random_state),
    DecisionTreeClassifier(random_state=random_state),
    AdaBoostClassifier(DecisionTreeClassifier(random_state=random_state),random_state=random_state,learning_rate=0.1),
    RandomForestClassifier(random_state=random_state),
    GradientBoostingClassifier(random_state=random_state),
    MLPClassifier(random_state=random_state),
    KNeighborsClassifier(),
    LogisticRegression(random_state=random_state),
    xgb.XGBClassifier(random_state=random_state,n_jobs=-1,tree_method='hist'),
    ExtraTreesClassifier(random_state=random_state),
]

# One array of per-fold accuracies per classifier, in the order of CLmodels.
cv_results = [
    cross_val_score(CLmodel,X_train,Y_train,scoring="accuracy",cv=kfold)
    for CLmodel in CLmodels
]



In [None]:
# Labels for the rows of the summary, in the same order as CLmodels.
cl = ['SVC','DecisionTree','AdaBoost','RandomForest','GradientBoosting','MLPClassifier','KNC','LogitReg','XGB','EXT']
# Mean and standard deviation of the per-fold accuracies of each model.
cv_mean = [np.mean(scores) for scores in cv_results]
cv_std = [np.std(scores) for scores in cv_results]
result = pd.DataFrame({'mean': cv_mean, 'std': cv_std}, index=cl)
# Ratio of mean accuracy to its spread across folds.
print(result['mean']/result['std'])
# Keep only the models whose mean accuracy exceeds 0.8.
result[result['mean']>0.8]

SVC                 18.494350
DecisionTree        15.693344
AdaBoost            14.055459
RandomForest        17.924658
GradientBoosting    17.797539
MLPClassifier       19.341932
KNC                 24.439662
LogitReg            20.212120
XGB                 18.843261
EXT                 17.196085
dtype: float64


Unnamed: 0,mean,std
RandomForest,0.817045,0.045582
GradientBoosting,0.825,0.046355
MLPClassifier,0.823864,0.042595
LogitReg,0.820455,0.040592
XGB,0.818182,0.04342
EXT,0.811364,0.047183


In [None]:
# Tune the random forest with an exhaustive grid search (10-fold CV, accuracy).
rfc_grid = {"max_depth": [None],
            "max_features": [4, 5, 6],
            "min_samples_split": [9, 10, 12],
            "min_samples_leaf": [4, 5, 6],
            "bootstrap": [False],
            "n_estimators" :[50,100],
            "criterion": ["gini"]}
# Seed the forest with the notebook-wide `random_state` (as the baseline
# models are) so the search and the selected estimator are reproducible.
RFC = GridSearchCV(RandomForestClassifier(random_state=random_state),
                   param_grid=rfc_grid,cv=kfold,scoring="accuracy",verbose=1)
RFC.fit(X_train,Y_train)
RFC_best=RFC.best_estimator_

Fitting 10 folds for each of 54 candidates, totalling 540 fits


In [None]:
# Tune gradient boosting with an exhaustive grid search (10-fold CV, accuracy).
# `loss` is left at its default: the explicit value "deviance" is the old name
# of that default and is rejected by recent scikit-learn releases, which call
# it "log_loss". Omitting it selects the same loss on every version and leaves
# the number of candidates unchanged.
gb_grid = {'n_estimators' : [200,500,800,1000],
           'learning_rate': [0.1,0.05,0.01],
           'max_depth': [3,4,6],
           'min_samples_leaf': [80,100,120],
           'max_features': [0.5,0.3]}
# Seed the estimator with the notebook-wide `random_state`: with
# max_features < 1 the feature subsets are drawn at random.
GBC = GridSearchCV(GradientBoostingClassifier(random_state=random_state),
                   param_grid=gb_grid,cv=kfold,scoring="accuracy",verbose=1)
GBC.fit(X_train,Y_train)
GBC_best = GBC.best_estimator_

Fitting 10 folds for each of 216 candidates, totalling 2160 fits


In [None]:
# Tune the multi-layer perceptron with an exhaustive grid search
# (10-fold CV, accuracy).
mlp_grid = {'hidden_layer_sizes': [(20,30,20),(20,35,20),(10,30,20),(10,30,10),(10,20,20,10),(20,40,20),(10,20,10),(5,10,5)],
            'solver': ['adam'],
            'max_iter': [500,1000],
            'alpha': [0.00025,0.0002],
            'learning_rate': ['adaptive'],
            'learning_rate_init': [0.03,0.05,0.06]}
# Seed the network with the notebook-wide `random_state` (as the baseline MLP
# is) so weight initialisation, and therefore the selected model, is reproducible.
MLP = GridSearchCV(MLPClassifier(random_state=random_state),
                   mlp_grid,cv=kfold,scoring='accuracy',verbose=1)
MLP.fit(X_train,Y_train)
MLP_best = MLP.best_estimator_

Fitting 10 folds for each of 96 candidates, totalling 960 fits


In [None]:
# Tune logistic regression with an exhaustive grid search (10-fold CV, accuracy).
lr_C = [0.008,0.009,0.01,0.011,0.012,0.013,0.015]
# Only 'liblinear' supports the l1 penalty; 'lbfgs', 'newton-cg' and 'sag'
# accept l2 only. A single crossed grid would therefore contain invalid
# penalty/solver pairs whose fits fail, so the grid is split into two valid
# sub-grids (35 candidates instead of 56).
lr_grid = [{'penalty': ['l1','l2'],
            'C': lr_C,
            'solver': ['liblinear']},
           {'penalty': ['l2'],
            'C': lr_C,
            'solver': ['lbfgs','newton-cg','sag']}]
# Seed the estimator with the notebook-wide `random_state`, as the baseline is.
LR = GridSearchCV(LogisticRegression(random_state=random_state),
                  lr_grid,cv=kfold,scoring='accuracy',verbose=1)
LR.fit(X_train,Y_train)
LR_best = LR.best_estimator_

Fitting 10 folds for each of 56 candidates, totalling 560 fits


In [None]:
# Tune XGBoost with an exhaustive grid search (10-fold CV, accuracy).
xgb_grid = {'learning_rate': [0.01,0.02,0.025,0.03,0.035],
            'max_depth': [10,12,15,17],
            'subsample': [0.8,0.85,0.9]}
# Pin the seed explicitly with the notebook-wide `random_state`, matching the
# baseline XGBClassifier evaluated in the cross-validation cell. Row
# subsampling (subsample < 1) makes training depend on the seed.
XGB = GridSearchCV(xgb.XGBClassifier(tree_method='hist',random_state=random_state),
                   xgb_grid,cv=kfold,scoring='accuracy',verbose=1)
XGB.fit(X_train,Y_train)
XGB_best = XGB.best_estimator_

Fitting 10 folds for each of 60 candidates, totalling 600 fits


In [None]:
# Disabled: the ExtraTrees grid search is kept only as a string literal, so
# nothing below runs and EXT_best is not defined. The voting ensemble further
# down does not include an ExtraTrees model.
# NOTE(review): the saved "Fitting ... 200 candidates" output under this cell
# appears to come from an earlier run when the search was enabled.
'''
EXT = ExtraTreesClassifier()
ext_grid = {"max_depth": [None],
            "max_features": [10,15,20,25,30],
            "min_samples_split": [2, 3, 4, 5],
            "min_samples_leaf": [6,8,10,12,15],
            "bootstrap": [False],
            "n_estimators" :[100,300],
            "criterion": ["gini"]}
EXT = GridSearchCV(EXT,param_grid=ext_grid,cv=kfold,scoring="accuracy",verbose=1)
EXT.fit(X_train,Y_train)
EXT_best = EXT.best_estimator_
'''

Fitting 10 folds for each of 200 candidates, totalling 2000 fits


In [None]:
# Disabled: the SVC grid search is kept only as a string literal, so nothing
# below runs and SV_best is not defined. The voting ensemble further down does
# not include an SVC model.
# NOTE(review): the saved "Fitting ... 28 candidates" output under this cell
# appears to come from an earlier run when the search was enabled.
'''
SV = SVC(probability=True)
sv_grid = {'kernel': ['rbf'], 
           'gamma': [ 0.001, 0.01, 0.1, 1],
           'C': [1, 10, 50, 100,200,300, 1000]}
SV = GridSearchCV(SV,param_grid=sv_grid,cv=kfold,scoring="accuracy",verbose=1)
SV.fit(X_train,Y_train)
SV_best = SV.best_estimator_
'''

Fitting 10 folds for each of 28 candidates, totalling 280 fits


In [None]:
# Column names of the training matrix, used below to label feature importances.
feature_names = X_train.columns.values
print(feature_names)

['Pclass' 'Sex' 'Age' 'SibSp' 'Parch' 'Fare' 'Cabin_number'
 'How_many_cabins' 'Name_length' 'Contain_parentheses'
 'appellation_Master' 'appellation_Miss' 'appellation_Mr'
 'appellation_Mrs' 'appellation_others' 'Single' 'Small_Family'
 'Med_Family' 'Large_Family' 'Ticket_A' 'Ticket_A4' 'Ticket_AQ' 'Ticket_C'
 'Ticket_CA' 'Ticket_F' 'Ticket_Fa' 'Ticket_LINE' 'Ticket_LP' 'Ticket_N'
 'Ticket_P' 'Ticket_PC' 'Ticket_PP' 'Ticket_S' 'Ticket_SC' 'Ticket_SCO'
 'Ticket_SO' 'Ticket_SOTON' 'Ticket_STON' 'Ticket_SW' 'Ticket_W'
 'Ticket_WE' 'Embarked_C' 'Embarked_Q' 'Embarked_S' 'CL_A' 'CL_B' 'CL_C'
 'CL_D' 'CL_E' 'CL_F' 'CL_G' 'CL_N' 'CL_T']


In [None]:
# Ten most important features of the tuned random forest, highest first.
importances = RFC_best.feature_importances_
for position in np.argsort(importances)[::-1][:10]:
    print(feature_names[position], importances[position])

appellation_Mr 0.1855746752578586
Sex 0.1813736188846318
Fare 0.06812835380227397
Pclass 0.061949757972109894
appellation_Miss 0.057178845065727985
Contain_parentheses 0.054934530414978394
Name_length 0.054604920465279545
appellation_Mrs 0.051306291051187704
Age 0.04222231144578152
Cabin_number 0.03484458953084653


In [None]:
# Ten most important features of the tuned gradient boosting model, highest first.
importances = GBC_best.feature_importances_
for position in np.argsort(importances)[::-1][:10]:
    print(feature_names[position], importances[position])

appellation_Mr 0.25126829162624886
Sex 0.22818643195209162
Fare 0.14334258649374693
Pclass 0.09328387450694826
Age 0.08604686096493905
Name_length 0.060350727675980514
Cabin_number 0.03153622451960316
CL_N 0.023653993015168726
Embarked_S 0.014678695615660932
Contain_parentheses 0.011193863395210767


In [None]:
# Ten most important features of the tuned XGBoost model, highest first.
importances = XGB_best.feature_importances_
for position in np.argsort(importances)[::-1][:10]:
    print(feature_names[position], importances[position])

appellation_Mr 0.38061327
Sex 0.18129635
Large_Family 0.078544326
Pclass 0.05316829
appellation_others 0.030835776
appellation_Master 0.018003583
Cabin_number 0.017934164
CL_A 0.015267054
Contain_parentheses 0.014491682
Fare 0.012505015


In [None]:
# Disabled: kept only as a string literal because EXT_best is not defined
# while the ExtraTrees grid search is disabled.
# NOTE(review): the saved importances under this cell appear to come from an
# earlier run when the ExtraTrees search was enabled.
'''
indices = np.argsort(EXT_best.feature_importances_)[::-1][:10]
for i in indices:
    print(feature_names[i],EXT_best.feature_importances_[i])
'''

appellation_Mr 0.23893232449519758
Sex 0.20705535652961884
appellation_Miss 0.08128346230135777
Pclass 0.06453964936569048
Large_Family 0.051202619414818296
Contain_parentheses 0.05118838468707835
appellation_Mrs 0.04298958984410358
CL_N 0.040725208882774346
Fare 0.024215235428416675
Med_Family 0.02325176500889594


* The models clearly rank the features differently, so we combine several of them and let them vote on each prediction.

In [None]:
# Soft-voting ensemble of the five tuned models, fitted on the full training set.
voters = [
    ('RFC', RFC_best),
    ('GBC', GBC_best),
    ('MLP', MLP_best),
    ('LR', LR_best),
    ('XGB', XGB_best),
]
ultimate_CL = VotingClassifier(estimators=voters, voting='soft', n_jobs=-1).fit(X_train, Y_train)

In [None]:
# Pair each test PassengerId with the ensemble's prediction and write the
# submission file without the row index.
results = PassengerId_test.to_frame().assign(Survived=ultimate_CL.predict(X_test))
results.to_csv('result_4_ens.csv', index=False)