# Titanic Survival Rate Predictions

### Importing packages

In [270]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder


### Explorating Data

In [271]:
# Importing data csv as pandas df.
raw_train = pd.read_csv('train.csv')
raw_test = pd.read_csv('test.csv')

In [272]:
# General overview of the data
raw_train.head(20)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [273]:
# General information.
raw_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [274]:
# Null values. Seems like we'll have some null values to handle in Age, Cabin, Fare and Embarked.
print(raw_train.isnull().sum())
print(raw_test.isnull().sum())

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64


In [275]:
raw_train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [276]:
# There are no duplicates in out datasets.
dup_train = raw_train.duplicated().any()
dup_test = raw_test.duplicated().any()
print(dup_train)
print(dup_test)

False
False


In [277]:
# Splitting out master train file into train and valid 80/20 in order to be able to measure performance.
x = raw_train.drop(["Survived"],axis=1)
y = pd.DataFrame(raw_train["Survived"])
x_train, x_valid, y_train, y_valid = train_test_split(x, y, test_size=0.20, random_state=8)

### Data cleaning

In [278]:
# We'll replace NaN values for Age, Fare and Embarked. Cabin has too many NaN to be useful in our models.
# Where applicable, we'll always use the same values calculated for train in test. To avoid leakage.

x_train["Age"] = x_train["Age"].fillna(x_train["Age"].median())
x_valid["Age"] = x_valid["Age"].fillna(x_train["Age"].median())
raw_test["Age"] = raw_test["Age"].fillna(x_train["Age"].median())

x_train["Embarked"] = x_train["Embarked"].fillna(x_train["Embarked"].mode()[0])
x_valid["Embarked"] = x_valid["Embarked"].fillna(x_train["Embarked"].mode()[0])
raw_test["Embarked"] = raw_test["Embarked"].fillna(x_train["Embarked"].mode()[0])

x_train["Fare"] = x_train["Fare"].fillna(x_train["Fare"].mean())
x_valid["Fare"] = x_valid["Fare"].fillna(x_train["Fare"].mean())
raw_test["Fare"] = raw_test["Fare"].fillna(x_train["Fare"].mean())

x_train['Cabin'].fillna('Missing', inplace=True)
x_valid['Cabin'].fillna('Missing', inplace=True)
raw_test['Cabin'].fillna('Missing', inplace=True)

In [279]:
print(x_train.isnull().sum())
print(x_valid.isnull().sum())
print(raw_test.isnull().sum())

PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64
PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64
PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64


In [280]:
print(x_train.head(10))


     PassengerId  Pclass                               Name     Sex   Age  \
768          769       3                Moran, Mr. Daniel J    male  29.0   
387          388       2                   Buss, Miss. Kate  female  36.0   
823          824       3                 Moor, Mrs. (Beila)  female  27.0   
778          779       3            Kilgannon, Mr. Thomas J    male  29.0   
157          158       3                    Corn, Mr. Harry    male  30.0   
661          662       3                  Badt, Mr. Mohamed    male  40.0   
392          393       3       Gustafsson, Mr. Johan Birger    male  28.0   
57            58       3                Novel, Mr. Mansouer    male  28.5   
863          864       3  Sage, Miss. Dorothy Edith "Dolly"  female  29.0   
250          251       3             Reed, Mr. James George    male  29.0   

     SibSp  Parch           Ticket     Fare    Cabin Embarked  
768      1      0           371110  24.1500  Missing        Q  
387      0      0       

In [281]:
#Creating the label encoder
encoder_c = LabelEncoder()
encoder_e = LabelEncoder()

#Fitting the encoder on the raw_train['Cabin'] column
encoder_c.fit(pd.concat([x_train['Cabin'], x_valid['Cabin'], raw_test['Cabin']]))
encoder_e.fit(x_train['Embarked'])

#Transforming the x_train['Cabin'], x_valid['Cabin'] and raw_test['Cabin'] columns using the fitted encoder
x_train['Cabin'] = encoder_c.transform(x_train['Cabin'])
x_valid['Cabin'] = encoder_c.transform(x_valid['Cabin'])
raw_test['Cabin'] = encoder_c.transform(raw_test['Cabin'])

x_train['Embarked'] = encoder_e.transform(x_train['Embarked'])
x_valid['Embarked'] = encoder_e.transform(x_valid['Embarked'])
raw_test['Embarked'] = encoder_e.transform(raw_test['Embarked'])
x_train.head(50)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
768,769,3,"Moran, Mr. Daniel J",male,29.0,1,0,371110,24.15,185,1
387,388,2,"Buss, Miss. Kate",female,36.0,0,0,27849,13.0,185,2
823,824,3,"Moor, Mrs. (Beila)",female,27.0,0,1,392096,12.475,150,2
778,779,3,"Kilgannon, Mr. Thomas J",male,29.0,0,0,36865,7.7375,185,1
157,158,3,"Corn, Mr. Harry",male,30.0,0,0,SOTON/OQ 392090,8.05,185,2
661,662,3,"Badt, Mr. Mohamed",male,40.0,0,0,2623,7.225,185,0
392,393,3,"Gustafsson, Mr. Johan Birger",male,28.0,2,0,3101277,7.925,185,2
57,58,3,"Novel, Mr. Mansouer",male,28.5,0,0,2697,7.2292,185,0
863,864,3,"Sage, Miss. Dorothy Edith ""Dolly""",female,29.0,8,2,CA. 2343,69.55,185,2
250,251,3,"Reed, Mr. James George",male,29.0,0,0,362316,7.25,185,2


In [282]:
# Dictionary for numerical conversion of column sex
factors = {'male': 0, 'female': 1}

# Applying the same conversion to all df
x_train['Sex'] = x_train['Sex'].map(factors)
x_valid['Sex'] = x_valid['Sex'].map(factors)
raw_test['Sex'] = raw_test['Sex'].map(factors)

In [283]:
x_train.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
768,769,3,"Moran, Mr. Daniel J",0,29.0,1,0,371110,24.15,185,1
387,388,2,"Buss, Miss. Kate",1,36.0,0,0,27849,13.0,185,2
823,824,3,"Moor, Mrs. (Beila)",1,27.0,0,1,392096,12.475,150,2
778,779,3,"Kilgannon, Mr. Thomas J",0,29.0,0,0,36865,7.7375,185,1
157,158,3,"Corn, Mr. Harry",0,30.0,0,0,SOTON/OQ 392090,8.05,185,2


In [284]:
# Preparing files before starting to fit ML models to the data
passengerid_test = raw_test['PassengerId']
x_train = x_train.drop(['PassengerId','Name', 'Ticket'], axis=1)
x_valid = x_valid.drop(['PassengerId','Name', 'Ticket'], axis=1)

### Decision Tree

In [285]:
dtree = DecisionTreeClassifier(random_state=6, max_depth=5)
dtree.fit(x_train,y_train)

In [286]:
predictions_tree = dtree.predict(x_valid)

In [287]:
# Comparaison de la performance sur train vs test afin de repérer un écart de performance important.
print('Training set score: {:.4f}'.format(dtree.score(x_train, y_train)))
print('Test set score: {:.4f}'.format(dtree.score(x_valid, y_valid)))

Training set score: 0.8581
Test set score: 0.8045


In [288]:
# Classification report
print(classification_report(y_valid,predictions_tree))

              precision    recall  f1-score   support

           0       0.81      0.88      0.84       108
           1       0.79      0.69      0.74        71

    accuracy                           0.80       179
   macro avg       0.80      0.78      0.79       179
weighted avg       0.80      0.80      0.80       179



### Random Forest

In [914]:
rfc = RandomForestClassifier(n_estimators=500, random_state=6)
rfc.fit(x_train,y_train)

RandomForestClassifier(n_estimators=500, random_state=6)

In [648]:
predictions_rfc = rfc.predict(x_test)

In [649]:
print(predictions_rfc.mean())
print(y_train.mean())

0.36363636363636365
0.3838383838383838


### Neural Network

In [650]:
x_train['Age'] = StandardScaler().fit_transform(x_train[['Age']])
x_test['Age'] = StandardScaler().fit_transform(x_test[['Age']])

x_train['Fare'] = StandardScaler().fit_transform(x_train[['Fare']])
x_test['Fare'] = StandardScaler().fit_transform(x_test[['Fare']])


In [651]:
columns_to_encode = ['Pclass','SibSp','Parch','Embarked']

train_df_encoded = pd.get_dummies(x_train, columns=columns_to_encode)
test_df_encoded = pd.get_dummies(x_test, columns=columns_to_encode)

In [652]:
train_df_encoded.head()

Unnamed: 0,Sex,Age,Fare,Pclass_1,Pclass_2,Pclass_3,SibSp_0,SibSp_1,SibSp_2,SibSp_3,...,Parch_0,Parch_1,Parch_2,Parch_3,Parch_4,Parch_5,Parch_6,Embarked_0,Embarked_1,Embarked_2
0,0,-0.565736,-0.502445,0,0,1,0,1,0,0,...,1,0,0,0,0,0,0,1,0,0
1,1,0.663861,0.786845,1,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,1,0
2,1,-0.258337,-0.488854,0,0,1,1,0,0,0,...,1,0,0,0,0,0,0,1,0,0
3,1,0.433312,0.42073,1,0,0,0,1,0,0,...,1,0,0,0,0,0,0,1,0,0
4,0,0.433312,-0.486337,0,0,1,1,0,0,0,...,1,0,0,0,0,0,0,1,0,0


In [653]:
# Initialize the model
nn = MLPClassifier(hidden_layer_sizes=(100,), max_iter=2000, alpha=0.0001,
                     solver='adam', verbose=10,  random_state=21,tol=0.000000001)

# Fit the model to the training data
nn.fit(x_train, y_train)

Iteration 1, loss = 0.71518787
Iteration 2, loss = 0.68547307
Iteration 3, loss = 0.66272712
Iteration 4, loss = 0.64285876
Iteration 5, loss = 0.62545594
Iteration 6, loss = 0.60968258
Iteration 7, loss = 0.59603509
Iteration 8, loss = 0.58290934
Iteration 9, loss = 0.57145536
Iteration 10, loss = 0.56083268
Iteration 11, loss = 0.55068792
Iteration 12, loss = 0.54170329
Iteration 13, loss = 0.53163854
Iteration 14, loss = 0.52347509
Iteration 15, loss = 0.51556166
Iteration 16, loss = 0.50771419
Iteration 17, loss = 0.50017529
Iteration 18, loss = 0.49355782
Iteration 19, loss = 0.48697126
Iteration 20, loss = 0.48125549
Iteration 21, loss = 0.47555459
Iteration 22, loss = 0.47028133
Iteration 23, loss = 0.46554089
Iteration 24, loss = 0.46136652
Iteration 25, loss = 0.45705378
Iteration 26, loss = 0.45329041
Iteration 27, loss = 0.45004952
Iteration 28, loss = 0.44649878
Iteration 29, loss = 0.44379930
Iteration 30, loss = 0.44104774
Iteration 31, loss = 0.43841510
Iteration 32, los

Iteration 271, loss = 0.37383434
Iteration 272, loss = 0.37382358
Iteration 273, loss = 0.37369956
Iteration 274, loss = 0.37361888
Iteration 275, loss = 0.37339192
Iteration 276, loss = 0.37331199
Iteration 277, loss = 0.37316095
Iteration 278, loss = 0.37293798
Iteration 279, loss = 0.37301104
Iteration 280, loss = 0.37272511
Iteration 281, loss = 0.37287184
Iteration 282, loss = 0.37266780
Iteration 283, loss = 0.37240260
Iteration 284, loss = 0.37235388
Iteration 285, loss = 0.37275856
Iteration 286, loss = 0.37236653
Iteration 287, loss = 0.37188023
Iteration 288, loss = 0.37219919
Iteration 289, loss = 0.37180961
Iteration 290, loss = 0.37198453
Iteration 291, loss = 0.37190991
Iteration 292, loss = 0.37157896
Iteration 293, loss = 0.37136566
Iteration 294, loss = 0.37150712
Iteration 295, loss = 0.37144343
Iteration 296, loss = 0.37124346
Iteration 297, loss = 0.37127428
Iteration 298, loss = 0.37112430
Iteration 299, loss = 0.37102321
Iteration 300, loss = 0.37083649
Iteration 

Iteration 521, loss = 0.35208668
Iteration 522, loss = 0.35202494
Iteration 523, loss = 0.35205277
Iteration 524, loss = 0.35197563
Iteration 525, loss = 0.35218088
Iteration 526, loss = 0.35194746
Iteration 527, loss = 0.35174044
Iteration 528, loss = 0.35196718
Iteration 529, loss = 0.35158386
Iteration 530, loss = 0.35204324
Iteration 531, loss = 0.35185749
Iteration 532, loss = 0.35176543
Iteration 533, loss = 0.35157834
Iteration 534, loss = 0.35168552
Iteration 535, loss = 0.35186045
Iteration 536, loss = 0.35122024
Iteration 537, loss = 0.35104012
Iteration 538, loss = 0.35136941
Iteration 539, loss = 0.35149337
Iteration 540, loss = 0.35094666
Iteration 541, loss = 0.35097031
Iteration 542, loss = 0.35111434
Iteration 543, loss = 0.35097975
Iteration 544, loss = 0.35133027
Iteration 545, loss = 0.35054314
Iteration 546, loss = 0.35046747
Iteration 547, loss = 0.35111755
Iteration 548, loss = 0.35024638
Iteration 549, loss = 0.35060035
Iteration 550, loss = 0.35045021
Iteration 

Iteration 795, loss = 0.33680227
Iteration 796, loss = 0.33658660
Iteration 797, loss = 0.33675096
Iteration 798, loss = 0.33642605
Iteration 799, loss = 0.33669443
Iteration 800, loss = 0.33636171
Iteration 801, loss = 0.33645774
Iteration 802, loss = 0.33630474
Iteration 803, loss = 0.33637699
Iteration 804, loss = 0.33622880
Iteration 805, loss = 0.33621596
Iteration 806, loss = 0.33629401
Iteration 807, loss = 0.33588310
Iteration 808, loss = 0.33604859
Iteration 809, loss = 0.33562015
Iteration 810, loss = 0.33612681
Iteration 811, loss = 0.33597556
Iteration 812, loss = 0.33559908
Iteration 813, loss = 0.33587057
Iteration 814, loss = 0.33569055
Iteration 815, loss = 0.33544994
Iteration 816, loss = 0.33554771
Iteration 817, loss = 0.33549889
Iteration 818, loss = 0.33576919
Iteration 819, loss = 0.33495150
Iteration 820, loss = 0.33703042
Iteration 821, loss = 0.33570982
Iteration 822, loss = 0.33590792
Iteration 823, loss = 0.33586422
Iteration 824, loss = 0.33493502
Iteration 

MLPClassifier(max_iter=2000, random_state=21, tol=1e-09, verbose=10)

In [654]:
# Predict on the test set
y_pred = nn.predict(x_train)

In [655]:
print(y_pred.mean())
print(y_train.mean())

0.31425364758698093
0.3838383838383838


In [656]:
final_predictions = pd.DataFrame({'PassengerId': passengerid_test, 'Survived': predictions_rfc})
final_predictions = final_predictions.set_index('PassengerId')
final_predictions

Unnamed: 0_level_0,Survived
PassengerId,Unnamed: 1_level_1
892,0
893,0
894,0
895,1
896,0
...,...
1305,0
1306,1
1307,0
1308,0


final_predictions.to_csv('survival_submission.csv')