# Titanic - Machine Learning from Disaster
- https://www.kaggle.com/competitions/titanic/data

In [1]:
import sys
import os
import warnings
warnings.filterwarnings('ignore')

In [2]:
import IPython
# from IPython.display import Image
# from IPython.core.display import HTML
from IPython.display import clear_output
from IPython.display import display
from tqdm.notebook import tqdm # FOR FANCY GREEN BAR

In [3]:
import numpy as np
import pandas as pd
import polars as pr # new pkg similar to pandas but faster
import glob

In [4]:
import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib import cm
import seaborn as sns
import plotly

# plt.rcParams['figure.figsize'] = [12, 8]
plt.rcParams['figure.dpi'] = 100  # figure resolution in dpi: 100 is the usual default; 300 gives a much finer plot but renders slower
plt.style.use('fivethirtyeight')  # style sheet applied to the figures drawn in this notebook

# custom = {"axes.edgecolor": "black", "grid.linestyle": "dashed", "grid.color": "red"}
# sns.set_style("white", rc = custom)

In [5]:
import re
import random
import time
from datetime import datetime as dt
import scipy
import statsmodels.api as sm

In [6]:
import sklearn
from sklearn import datasets
from sklearn.utils import shuffle
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from xgboost import XGBRegressor

from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, jaccard_score, f1_score, r2_score, roc_curve, auc, log_loss, classification_report

# Candidate models keyed by a short display name.
# Every estimator is constructed with no arguments, i.e. with its library defaults.
classifiers = dict(
    LogisticRegression=LogisticRegression(),
    KNeighbors=KNeighborsClassifier(),
    SVC=SVC(),
    DecisionTree=DecisionTreeClassifier(),
    RandomForest=RandomForestClassifier(),
    XGBoost=XGBClassifier(),
)

In [7]:
from lazypredict.Supervised import LazyClassifier, LazyRegressor

# Data

In [8]:
print(os.listdir("../notebooks/input_data"))

['bird.png', 'Ekush-Regular.ttf', 'gender_submission.csv', 'test.csv', 'train.csv', 'Umeå1.png', 'Umeå2.png']


In [9]:
# Both competition splits live in the same input folder.
input_dir = '../notebooks/input_data'
df_train = pd.read_csv(input_dir + '/train.csv')
df_test = pd.read_csv(input_dir + '/test.csv')

In [10]:
df_train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.00,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.00,1,0,PC 17599,71.28,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.00,0,0,STON/O2. 3101282,7.92,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.00,1,0,113803,53.10,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.00,0,0,373450,8.05,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.00,0,0,211536,13.00,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.00,0,0,112053,30.00,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.00,0,0,111369,30.00,C148,C


In [11]:
df_test

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.50,0,0,330911,7.83,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.00,1,0,363272,7.00,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.00,0,0,240276,9.69,,Q
3,895,3,"Wirz, Mr. Albert",male,27.00,0,0,315154,8.66,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.00,1,1,3101298,12.29,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.05,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.00,0,0,PC 17758,108.90,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.50,0,0,SOTON/O.Q. 3101262,7.25,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.05,,S


In [12]:
df_train.shape, df_test.shape

((891, 12), (418, 11))

- The training set has 891 rows and 12 columns.
- The test set has 418 rows and 11 columns.
- The one extra column in the training set is **Survived** (the target variable).

In [13]:
def concat_df(train_data, test_data):
    """Stack the training and test frames into a single frame.

    Rows of `train_data` come first, followed by the rows of `test_data`.
    `sort=True` is passed to `pd.concat`, so the column axis is sorted when
    the two frames' columns are not already aligned. The original row labels
    are discarded and replaced by a fresh 0..n-1 index.
    """
    stacked = pd.concat([train_data, test_data], sort=True)
    return stacked.reset_index(drop=True)

def divide_df(all_data, n_train=891):
    """Split a combined frame back into its training and test parts.

    Parameters
    ----------
    all_data : pandas.DataFrame
        Combined frame in which the index labels 0 .. n_train-1 are the
        training rows and the remaining labels are the test rows. It must
        contain a 'Survived' column.
    n_train : int, default 891
        Number of training rows. The default reproduces the previously
        hard-coded split point, so `divide_df(df_all)` behaves as before.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The training rows with all columns, and the test rows with the
        'Survived' column dropped.
    """
    # .loc slices by label and includes both endpoints, hence the "- 1".
    train_part = all_data.loc[:n_train - 1]
    test_part = all_data.loc[n_train:].drop(['Survived'], axis=1)
    return train_part, test_part

In [14]:
# Combine both splits into one frame so preprocessing can be applied jointly.
df_all = concat_df(df_train, df_test)

# Tag each frame with a label; the missing-value report prints it as a heading.
df_train.name = 'Training Set'
df_test.name = 'Test Set'
df_all.name = 'All Set'

dfs = [df_train, df_test]

# Row counts and shapes of the two splits, then their column lists.
print(f'Number of Training Examples = {len(df_train)}')
print(f'Number of Test Examples = {len(df_test)}\n')
print(f'Training X Shape = {df_train.shape}')
print(f"Training y Shape = {len(df_train['Survived'])}\n")
print(f'Test X Shape = {df_test.shape}')
print(f'Test y Shape = {len(df_test)}\n')
print(df_train.columns)
print(df_test.columns)

df_all.shape

Number of Training Examples = 891
Number of Test Examples = 418

Training X Shape = (891, 12)
Training y Shape = 891

Test X Shape = (418, 11)
Test y Shape = 418

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')


(1309, 12)

## 1. Exploratory Data Analysis
### 1.1 Overview
- **PassengerId** is the unique id of the row and it doesn't have any effect on target
- **Survived** is the target variable we are trying to predict (0 or 1):
    - 1 = Survived
    - 0 = Not Survived
- **Pclass (Passenger Class)** is the socio-economic status of the passenger and it is a categorical ordinal feature which has 3 unique values (1, 2 or 3):
    - 1 = Upper Class
    - 2 = Middle Class
    - 3 = Lower Class
- Name, Sex and Age are self-explanatory
- SibSp is the total number of the passengers' siblings and spouse
- Parch is the total number of the passengers' parents and children
- Ticket is the ticket number of the passenger
- Fare is the passenger fare
- Cabin is the cabin number of the passenger
- Embarked is port of embarkation and it is a categorical feature which has 3 unique values (C, Q or S):
    - C = Cherbourg
    - Q = Queenstown
    - S = Southampton

In [15]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [16]:
df_train.describe()  # summary statistics of the numeric columns (this includes the target, Survived)

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.38,2.31,29.7,0.52,0.38,32.2
std,257.35,0.49,0.84,14.53,1.1,0.81,49.69
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.12,0.0,0.0,7.91
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.45
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.33


## Missing Values

In [17]:
df_train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [18]:
df_test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [19]:
def display_missing(df):
    """Print the number of missing values in each column of `df`.

    One line is written per column, followed by blank lines that separate
    this report from whatever is printed next.
    """
    for col in df.columns.tolist():
        n_missing = df[col].isnull().sum()
        print(f'{col} has missing values \t :: {n_missing}')
    print('\n')
    
# Print each split's label, then its per-column missing-value counts.
for df in dfs:
    print(df.name)
    display_missing(df)

Training Set
PassengerId has missing values 	 :: 0
Survived has missing values 	 :: 0
Pclass has missing values 	 :: 0
Name has missing values 	 :: 0
Sex has missing values 	 :: 0
Age has missing values 	 :: 177
SibSp has missing values 	 :: 0
Parch has missing values 	 :: 0
Ticket has missing values 	 :: 0
Fare has missing values 	 :: 0
Cabin has missing values 	 :: 687
Embarked has missing values 	 :: 2


Test Set
PassengerId has missing values 	 :: 0
Pclass has missing values 	 :: 0
Name has missing values 	 :: 0
Sex has missing values 	 :: 0
Age has missing values 	 :: 86
SibSp has missing values 	 :: 0
Parch has missing values 	 :: 0
Ticket has missing values 	 :: 0
Fare has missing values 	 :: 1
Cabin has missing values 	 :: 327
Embarked has missing values 	 :: 0




### Observation

**Age** and **Cabin** have a high number of missing values.

Missing values in Age are filled with a median age, but the median of the whole data set is not a good choice. The median age within each Pclass group is a better choice, because Pclass has a high correlation with both Age (0.408106) and Survived (0.338481). It is also more logical to group ages by passenger class than by other features. The cells below additionally split each class by Sex, since the median age differs between females and males within the same class.

In [20]:
df_all

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket
0,22.00,,S,7.25,"Braund, Mr. Owen Harris",0,1,3,male,1,0.00,A/5 21171
1,38.00,C85,C,71.28,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,female,1,1.00,PC 17599
2,26.00,,S,7.92,"Heikkinen, Miss. Laina",0,3,3,female,0,1.00,STON/O2. 3101282
3,35.00,C123,S,53.10,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,female,1,1.00,113803
4,35.00,,S,8.05,"Allen, Mr. William Henry",0,5,3,male,0,0.00,373450
...,...,...,...,...,...,...,...,...,...,...,...,...
1304,,,S,8.05,"Spector, Mr. Woolf",0,1305,3,male,0,,A.5. 3236
1305,39.00,C105,C,108.90,"Oliva y Ocana, Dona. Fermina",0,1306,1,female,0,,PC 17758
1306,38.50,,S,7.25,"Saether, Mr. Simon Sivertsen",0,1307,3,male,0,,SOTON/O.Q. 3101262
1307,,,S,8.05,"Ware, Mr. Frederick",0,1308,3,male,0,,359309


In [21]:
# Absolute pairwise correlations of the numeric columns, flattened to one row
# per (feature, feature) pair and ordered from strongest to weakest.
df_all_corr = (
    df_all.corr(numeric_only=True)
    .abs()
    .unstack()
    .sort_values(kind="quicksort", ascending=False)
    .reset_index()
)

In [22]:
# Give the flattened correlation table readable column names.
df_all_corr = df_all_corr.rename(
    columns={"level_0": "Feature 1", "level_1": "Feature 2", 0: 'Correlation Coefficient'}
)
# Keep only the pairs whose first feature is Age.
df_all_corr.loc[df_all_corr['Feature 1'] == 'Age']

Unnamed: 0,Feature 1,Feature 2,Correlation Coefficient
0,Age,Age,1.0
10,Age,Pclass,0.41
17,Age,SibSp,0.24
22,Age,Fare,0.18
25,Age,Parch,0.15
29,Age,Survived,0.08
41,Age,PassengerId,0.03


In [23]:
# Median age of every (Sex, Pclass) group. 'Age' is selected before the
# aggregation so that medians are not computed for the other numeric columns.
age_by_pclass_sex = df_all.groupby(['Sex', 'Pclass'])['Age'].median()

for pclass in range(1, 4):
    for sex in ['female', 'male']:
        # One tuple lookup on the (Sex, Pclass) index instead of chained [sex][pclass] indexing.
        print('Median age of Pclass {} {}s: {}'.format(pclass, sex, age_by_pclass_sex.loc[(sex, pclass)]))
print('Median age of all passengers: {}'.format(df_all['Age'].median()))

Median age of Pclass 1 females: 36.0
Median age of Pclass 1 males: 42.0
Median age of Pclass 2 females: 28.0
Median age of Pclass 2 males: 29.5
Median age of Pclass 3 females: 22.0
Median age of Pclass 3 males: 25.0
Median age of all passengers: 28.0


In [24]:
# Fill the missing values in Age with the median age of the passenger's
# (Sex, Pclass) group. transform('median') returns one value per row of
# df_all, aligned on its index, so it can be handed straight to fillna.
grouped = df_all.groupby(['Sex', 'Pclass'])['Age'].transform('median')

# Assign the result back rather than calling fillna(inplace=True) on the
# df_all['Age'] selection: the in-place form works on an intermediate object,
# is deprecated by pandas, and leaves df_all unchanged under Copy-on-Write.
df_all['Age'] = df_all['Age'].fillna(grouped)

In [25]:
df_all

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket
0,22.00,,S,7.25,"Braund, Mr. Owen Harris",0,1,3,male,1,0.00,A/5 21171
1,38.00,C85,C,71.28,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,female,1,1.00,PC 17599
2,26.00,,S,7.92,"Heikkinen, Miss. Laina",0,3,3,female,0,1.00,STON/O2. 3101282
3,35.00,C123,S,53.10,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,female,1,1.00,113803
4,35.00,,S,8.05,"Allen, Mr. William Henry",0,5,3,male,0,0.00,373450
...,...,...,...,...,...,...,...,...,...,...,...,...
1304,25.00,,S,8.05,"Spector, Mr. Woolf",0,1305,3,male,0,,A.5. 3236
1305,39.00,C105,C,108.90,"Oliva y Ocana, Dona. Fermina",0,1306,1,female,0,,PC 17758
1306,38.50,,S,7.25,"Saether, Mr. Simon Sivertsen",0,1307,3,male,0,,SOTON/O.Q. 3101262
1307,25.00,,S,8.05,"Ware, Mr. Frederick",0,1308,3,male,0,,359309


In [26]:
df_all.isnull().sum()

Age               0
Cabin          1014
Embarked          2
Fare              1
Name              0
Parch             0
PassengerId       0
Pclass            0
Sex               0
SibSp             0
Survived        418
Ticket            0
dtype: int64

In [27]:
df_all[df_all['Embarked'].isnull()]

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket
61,38.0,B28,,80.0,"Icard, Miss. Amelie",0,62,1,female,0,1.0,113572
829,62.0,B28,,80.0,"Stone, Mrs. George Nelson (Martha Evelyn)",0,830,1,female,0,1.0,113572


# Reference

- https://scikit-learn.org/stable/api/sklearn.metrics.html
- https://python-charts.com/seaborn/themes/
- https://www.kaggle.com/code/gunesevitan/titanic-advanced-feature-engineering-tutorial
- https://www.kaggle.com/code/alisarpsunay/titanic-survival-prediction-logistic-regression
- https://www.kaggle.com/code/gusthema/titanic-competition-w-tensorflow-decision-forests