### Titanic Task to predict survival rate of passengers
#### '0' means the passengar won't survive and '1' means the passenger will survive.


Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

load data

In [2]:
df = pd.read_csv('./Titanic-Dataset.csv', na_values="?")


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [4]:
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [5]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
df.shape

(891, 12)

Data cleaning


In [7]:
# check for empty values
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

Age has 177 empty values, Cabin has 687, and Embarked has 2. 

1. Can replace Age and Cabin missing values with either the mean, median, bayesian formula, drop. 
    *   Fill age missing values with mean of dataset and drop missing values in Cabin and Embarked


In [8]:
# Handle age missing values
age_mean = df['Age'].mean()
df['Age'].fillna(age_mean, inplace=True)
df['Age'].isna().sum()

0

In [11]:
# for Embarked, let's take the mode of data
df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)

# recheck for missing vals
df['Embarked'].isna().sum()

0

In [12]:
# for Cabin, drop the whole column as there are too many missing values
df.drop('Cabin', axis = 1, inplace=True)

In [13]:
df.shape

(891, 11)

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          891 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Embarked     891 non-null    object 
dtypes: float64(2), int64(5), object(4)
memory usage: 76.7+ KB


In [15]:
df.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S


Notice that we have 4 categorical features and two type float. 
Before modeling, we must ensure all data is of same data type and within the same scale
This means, we must apply Encoding and Scaling tecnhiques. 

One-Hot Encoding:
* Apply this for features: Sex and Embarked
* Why? because low cardinality and won't increase dimensionality of data much


In [16]:
df_encoded_low = pd.get_dummies(df, columns=['Sex', 'Embarked'])
# let's look at the encoded data
df_encoded_low.head()


Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,1,0,3,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,0,1,0,0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,1,0,1,0,0
2,3,1,3,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,1,0,0,0,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,1,0,0,0,1
4,5,0,3,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,0,1,0,0,1


### Name and Ticket are both high cardinality data types with no ordinance. Therefore, after exploring the data, it is noticable that names have Prefixes like Mr./Mrs. which could be extracted for meaningful insights. Tickets with a similar pattern can indicate a family or group of friends traveling together vs. Individual passenger. 


For this reason, after feature extraction, we can apply one-hot encoding to reduce dimentionality. 

In [25]:
# Extract prefixes from 'Name'
df['Pre'] = df['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)

# Check the dist. of extracted prefixes
prefix_counts = df['Pre'].value_counts()

# Perform one-hot encoding on the extracted prefixes
df_prefix_encoded = pd.get_dummies(df['Pre'], prefix='Pre')

# Merge the encoded prefixes back to the main dataframe
df_final = pd.concat([df, df_prefix_encoded], axis=1)

# For ticket, create a group size feature based on shared 'Ticket' numbers
ticket_group_size = df.groupby('Ticket')['PassengerId'].transform('count')
df_final['TicketGroupSize'] = ticket_group_size

# Display the dist. of prefixes to check for changes
prefix_counts, df_final[['Name', 'Pre', 'TicketGroupSize']].head()


(Mr          517
 Miss        182
 Mrs         125
 Master       40
 Dr            7
 Rev           6
 Mlle          2
 Major         2
 Col           2
 Countess      1
 Capt          1
 Ms            1
 Sir           1
 Lady          1
 Mme           1
 Don           1
 Jonkheer      1
 Name: Pre, dtype: int64,
                                                 Name   Pre  TicketGroupSize
 0                            Braund, Mr. Owen Harris    Mr                1
 1  Cumings, Mrs. John Bradley (Florence Briggs Th...   Mrs                1
 2                             Heikkinen, Miss. Laina  Miss                1
 3       Futrelle, Mrs. Jacques Heath (Lily May Peel)   Mrs                2
 4                           Allen, Mr. William Henry    Mr                1)

In [27]:
# concat sex and embarked encoded cols to final df as well
df_final = pd.concat([df_final, df_encoded_low], axis=1)

In [29]:
df_final.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Embarked', 'Pre', 'Pre_Capt', 'Pre_Col',
       'Pre_Countess', 'Pre_Don', 'Pre_Dr', 'Pre_Jonkheer', 'Pre_Lady',
       'Pre_Major', 'Pre_Master', 'Pre_Miss', 'Pre_Mlle', 'Pre_Mme', 'Pre_Mr',
       'Pre_Mrs', 'Pre_Ms', 'Pre_Rev', 'Pre_Sir', 'TicketGroupSize',
       'PassengerId', 'Survived', 'Pclass', 'Name', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Sex_female', 'Sex_male', 'Embarked_C', 'Embarked_Q',
       'Embarked_S'],
      dtype='object')

Split data into train and test sets

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split