# Titanic Classification

using Titanic Dataset, it is required to design different classifiers

# Load the Data

In [1]:
from math import nan

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# import seaborn as sns


%matplotlib inline

In [2]:
titanic = pd.read_csv("datasets/train.csv")

# Take a Quick Look at the Data Structure

In [3]:
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
titanic.shape

(891, 12)

In [5]:
def expand_cab(x):
    x = str(x)

    if x == "nan":
        return nan

    cabins = x.split(" ")
    
    deck = sorted([cabin[0] for cabin in cabins])[0]

    rooms = sorted([cabin[1:] for cabin in cabins])[0]

    num_cabins = len(cabins)

    return {
        "deck": deck,
        "rooms": rooms,
        "num_cabins": num_cabins
    }


titanic[["highest_class_deck", "lowest_room_num", "num_cabins"]] = pd.json_normalize(titanic["Cabin"].map(expand_cab))
titanic.drop(columns=["Cabin"], inplace=True)

In [6]:
titanic.isna().sum(axis=0).sort_values()

PassengerId             0
Survived                0
Pclass                  0
Name                    0
Sex                     0
SibSp                   0
Parch                   0
Ticket                  0
Fare                    0
Embarked                2
Age                   177
highest_class_deck    687
lowest_room_num       687
num_cabins            687
dtype: int64

In [7]:
titanic = titanic[~titanic["Embarked"].isna()]

In [8]:
titanic.isna().sum(axis=0).sort_values()

PassengerId             0
Survived                0
Pclass                  0
Name                    0
Sex                     0
SibSp                   0
Parch                   0
Ticket                  0
Fare                    0
Embarked                0
Age                   177
highest_class_deck    687
lowest_room_num       687
num_cabins            687
dtype: int64

In [9]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 0 to 890
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   PassengerId         889 non-null    int64  
 1   Survived            889 non-null    int64  
 2   Pclass              889 non-null    int64  
 3   Name                889 non-null    object 
 4   Sex                 889 non-null    object 
 5   Age                 712 non-null    float64
 6   SibSp               889 non-null    int64  
 7   Parch               889 non-null    int64  
 8   Ticket              889 non-null    object 
 9   Fare                889 non-null    float64
 10  Embarked            889 non-null    object 
 11  highest_class_deck  202 non-null    object 
 12  lowest_room_num     202 non-null    object 
 13  num_cabins          202 non-null    float64
dtypes: float64(3), int64(5), object(6)
memory usage: 104.2+ KB


In [10]:
titanic.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,num_cabins
count,889.0,889.0,889.0,712.0,889.0,889.0,889.0,202.0
mean,446.0,0.382452,2.311586,29.642093,0.524184,0.382452,32.096681,1.168317
std,256.998173,0.48626,0.8347,14.492933,1.103705,0.806761,49.697504,0.509986
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0,1.0
25%,224.0,0.0,2.0,20.0,0.0,0.0,7.8958,1.0
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542,1.0
75%,668.0,1.0,3.0,38.0,1.0,0.0,31.0,1.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292,4.0


In [11]:
titanic["Survived"].value_counts()

0    549
1    340
Name: Survived, dtype: int64

## Encoding the categorical variables

In [12]:
from sklearn.preprocessing import OneHotEncoder

In [13]:
def one_hot_encode(df, col):
    enc = OneHotEncoder()

    # titanic["Embarked"] = emb_encoder.fit_transform(titanic["Embarked"])
    cat_df = pd.DataFrame(enc.fit_transform(df[[col]]).toarray(), columns=[f"{col}_{cat}" for cat in enc.categories_[0]])
    return pd.concat([df.drop(columns=[col]), cat_df], axis=1), enc

In [14]:
titanic, embarked_encoder = one_hot_encode(titanic, "Embarked")

titanic["Sex"] = titanic["Sex"].map({"male":1, "female":0})

In [15]:
titanic

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,highest_class_deck,lowest_room_num,num_cabins,Embarked_C,Embarked_Q,Embarked_S
0,1.0,0.0,3.0,"Braund, Mr. Owen Harris",1.0,22.0,1.0,0.0,A/5 21171,7.2500,,,,0.0,0.0,1.0
1,2.0,1.0,1.0,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0.0,38.0,1.0,0.0,PC 17599,71.2833,C,85,1.0,1.0,0.0,0.0
2,3.0,1.0,3.0,"Heikkinen, Miss. Laina",0.0,26.0,0.0,0.0,STON/O2. 3101282,7.9250,,,,0.0,0.0,1.0
3,4.0,1.0,1.0,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0.0,35.0,1.0,0.0,113803,53.1000,C,123,1.0,0.0,0.0,1.0
4,5.0,0.0,3.0,"Allen, Mr. William Henry",1.0,35.0,0.0,0.0,373450,8.0500,,,,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
888,889.0,0.0,3.0,"Johnston, Miss. Catherine Helen ""Carrie""",0.0,,1.0,2.0,W./C. 6607,23.4500,,,,0.0,1.0,0.0
889,890.0,1.0,1.0,"Behr, Mr. Karl Howell",1.0,26.0,0.0,0.0,111369,30.0000,C,148,1.0,,,
890,891.0,0.0,3.0,"Dooley, Mr. Patrick",1.0,32.0,0.0,0.0,370376,7.7500,,,,,,
61,,,,,,,,,,,,,,0.0,0.0,1.0


The following function is used to save figures for the report

In [16]:
from pathlib import Path

IMAGES_PATH = Path() / "images"
IMAGES_PATH.mkdir(parents=True, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = IMAGES_PATH / f"{fig_id}.{fig_extension}"
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

In [37]:
# import matplotlib.pyplot as plt

# plt.rc('font', size=14)
# plt.rc('axes', labelsize=14, titlesize=14)
# plt.rc('legend', fontsize=14)
# plt.rc('xtick', labelsize=10)
# plt.rc('ytick', labelsize=10)

# titanic.hist(bins=50, figsize=(12, 8))
# save_fig("attribute_histogram_plots")  # extra code

# plt.show()

In [38]:
# pd.plotting.scatter_matrix(titanic[[c for c in titanic if "Embarked" not in c]], alpha=0.2, figsize=(15, 10), diagonal="kde")
plt.show()

## Looking for Correlations

In [36]:
corr_matrix = titanic.corr(numeric_only=True)
corr_matrix["Survived"].sort_values(ascending=False)

Survived       1.000000
Fare           0.255290
Parch          0.083151
Embarked_Q     0.037671
Embarked_C     0.010614
PassengerId   -0.005028
num_cabins    -0.032010
Embarked_S    -0.033024
SibSp         -0.034040
Age           -0.082446
Pclass        -0.335549
Sex           -0.541585
Name: Survived, dtype: float64

# Experimenting with Attribute Combinations

In [None]:
titanic["Family_Size"] = titanic["SibSp"] + titanic["Parch"] + 1

# bins = [0, 12, 18, 60, float("inf")]
# labels = ['Child', 'Teen', 'Adult', 'Senior']
# titanic["Age_Category"] = pd.cut(titanic["Age"], bins=bins, labels=labels)
# titanic["Is_Alone"] = (titanic["Family_Size"] == 1).astype(int)
titanic.head()