In [1]:
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

<h2> I) Data Loading </h2>

In [2]:
# Load the cleaned Titanic passenger table from a CSV file. The path is
# relative, so it is resolved against the notebook's working directory.
titanic_df = pd.read_csv("../data/clean_titanic.csv")
# Bare last expression: the frame is rendered as this cell's output.
titanic_df

Unnamed: 0,Survived,Class,Sex,Age,Fare,SiblingsSpouses,ParentsChildren,embarked
0,0,3,M,22.0,7.2500,1,0,S
1,1,1,F,38.0,71.2833,1,0,C
2,1,3,F,26.0,7.9250,0,0,S
3,1,1,F,35.0,53.1000,1,0,S
4,0,3,M,35.0,8.0500,0,0,S
...,...,...,...,...,...,...,...,...
886,0,2,M,27.0,13.0000,0,0,S
887,1,1,F,19.0,30.0000,0,0,S
888,0,3,F,28.0,23.4500,1,2,S
889,1,1,M,26.0,30.0000,0,0,C


In [3]:
# Show the exact column labels; later cells select columns by these names
# (note that 'embarked' is lower-case, unlike the other labels).
titanic_df.columns

Index(['Survived', 'Class', 'Sex', 'Age', 'Fare', 'SiblingsSpouses',
       'ParentsChildren', 'embarked'],
      dtype='object')

<h2>II) Accident Overview </h2>

<h3>a) Survival Rate</h3>

In [4]:
# 'Survived' holds 0/1 flags, so the column sum is the number of survivors.
survived_col = titanic_df['Survived']
survived_count = float(survived_col.sum())

# Survival rate: number of survivors divided by the total number of passengers.
survival_rate = survived_count / len(survived_col)

print(f"Number of survivals: {survived_count:.2f}")
print(f"Chance of survive: {survival_rate:.2f}")

Number of survivals: 342.00
Chance of survive: 0.38


In [10]:
# Restrict the table to the survival flag and the passenger's gender.
survived_df = titanic_df.loc[:, ['Survived', 'Sex']]

# Number of passengers for every (Survived, Sex) combination, most frequent
# first. Despite its name, this Series holds counts, not rates.
survival_rate_gender = survived_df.value_counts()
survival_rate_gender

Survived  Sex
0         M      468
1         F      233
          M      109
0         F       81
Name: count, dtype: int64

In [13]:
# Look each group up by its (Survived, Sex) label instead of by row position:
# value_counts() orders its rows by frequency, so positional access would
# silently pick the wrong group as soon as the counts change.
men_dead = int(survival_rate_gender.loc[(0, 'M')])
women_alive = int(survival_rate_gender.loc[(1, 'F')])
men_alive = int(survival_rate_gender.loc[(1, 'M')])
women_dead = int(survival_rate_gender.loc[(0, 'F')])

print(f"Men Dead: {men_dead:.0f}")
print(f"Men Survived: {men_alive:.0f}")
print(f"Woman Dead: {women_dead:.0f}")
print(f"Women Survived: {women_alive:.0f}")

Men Dead: 468
Men Survived: 109
Woman Dead: 81
Women Survived: 233


In [14]:
# Group sizes: everyone of that gender, whether they survived or not.
women_total = women_dead + women_alive
men_total = men_dead + men_alive

# Share of each gender that died.
women_death_ratio = women_dead / women_total
men_death_ration = men_dead / men_total

print(f"Women death ratio: {women_death_ratio:.2f}")
print(f"Men death ratio: {men_death_ration:.2f}")

Women death ratio: 0.26
Men death ratio: 0.81


<h3>b) Ticket class distribution</h3>

In [None]:
class_ticket = titanic_df[["Class"]]

# Count the passengers holding a ticket of each class (1, 2 and 3) by summing
# the boolean mask of matching rows.
class_ticket_1, class_ticket_2, class_ticket_3 = (
    int((class_ticket["Class"] == ticket_class).sum())
    for ticket_class in (1, 2, 3)
)

# One-row summary table with one column per ticket class.
class_ticket_counts = pd.DataFrame(
    {
        'Class 1': [class_ticket_1],
        'Class 2': [class_ticket_2],
        'Class 3': [class_ticket_3],
    }
)

class_ticket_counts


<h3>c) Gender and family distribution</h3>

In [None]:
# Number of passengers per gender.
gender_df = titanic_df.loc[:, ['Sex']]
gender_counts = gender_df.value_counts()

# Column total of 'ParentsChildren'. This adds up the per-passenger values; it
# is not a head count of children.
# NOTE(review): presumably each value is the number of parents/children that
# passenger travelled with — confirm against the dataset description.
family_df = titanic_df.loc[:, ['ParentsChildren']]
children_df = family_df.sum()


In [None]:
# Print a plain-text label, then show the per-gender passenger counts
# (`gender_counts`) as this cell's output.
print("Gender Distribution")
gender_counts

In [None]:
# Print a plain-text label, then show `children_df`, which holds the column
# total of 'ParentsChildren'.
print("Number of Parents and Children")
children_df

<h3>d) Age distribution</h3>

In [None]:
# Keep 'Age' as a one-column DataFrame (double brackets) rather than a Series.
age_df = titanic_df[["Age"]]
# Standard summary statistics of the ages: count, mean, std, min, quartiles, max.
age_info = age_df.describe()
age_info


In [None]:
# Histogram of passenger ages (30 bins) with a kernel density estimate overlaid.
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(age_df, bins=30, kde=True, ax=ax)
ax.set_title("Age Distribution")
ax.set_xlabel("Age")
ax.set_ylabel("Frequency")
plt.show()

<h3>e) Ticket Price in £</h3>

In [None]:
# Keep 'Fare' as a one-column DataFrame (double brackets) rather than a Series;
# a later cell in this section adds a second column to this frame.
ticket_price_df = titanic_df[['Fare']]
# Standard summary statistics of the fares: count, mean, std, min, quartiles, max.
ticket_price_df.describe()

In [None]:
# Histogram of ticket fares (50 bins) with a kernel density estimate overlaid.
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(ticket_price_df, bins=50, kde=True, ax=ax)
ax.set_title("Ticket Price distribution")
ax.set_xlabel("Price in £")
ax.set_ylabel("Frequency")
plt.show()

We adjust the fares for inflation to estimate what the Titanic tickets would cost today.

In [None]:
# Inflation adjustment: every historical fare is multiplied by the ratio of two
# retail price index (RPI) levels. Both index values were estimated with
# ChatGPT and should be checked against an official series.
# NOTE(review): the name `rpi_2012` presumably refers to the index at the time
# of the voyage (1912), not 2012 — confirm and rename if so.
rpi_2025 = 330
rpi_2012 = 10.1
inflation_factor = rpi_2025 / rpi_2012

# Vectorised multiplication over the whole 'Fare' column: no Python loop and no
# dependence on the row labels being 0..n-1.
adjusted_price = ticket_price_df['Fare'] * inflation_factor

# assign() returns a new frame instead of writing a column into a frame that
# was sliced from `titanic_df`. Re-running this cell gives the same result.
ticket_price_df = ticket_price_df.assign(**{'Adjusted Price': adjusted_price})

# Same values as a plain list, kept for any cell that still reads this name.
list_adjusted_price = adjusted_price.tolist()

ticket_price_df

In [None]:
# Standard summary statistics of the inflation-adjusted fares.
ticket_price_df[['Adjusted Price']].describe()

In [None]:
# Histogram of the inflation-adjusted fares (50 bins) with a kernel density
# estimate overlaid.
fig, ax = plt.subplots(figsize=(8, 5))
adjusted_price_df = ticket_price_df[['Adjusted Price']]
sns.histplot(adjusted_price_df, bins=50, kde=True, ax=ax)
ax.set_title("Ticket Price distribution inflation adjusted")
ax.set_xlabel("Price in £")
ax.set_ylabel("Frequency")
plt.show()

<h2>III) Correlation metrics </h2>

<h3>a) Correlation Matrix</h3>

In [None]:
# NOTE(review): plain assignment does not copy. `titanic_df_num` is a second
# name for the very same DataFrame object as `titanic_df`, so any in-place
# change made through one name is visible through the other. Use
# `titanic_df.copy()` if the two are meant to be independent, and then make
# sure every downstream cell reads from `titanic_df_num` only.
titanic_df_num = titanic_df
titanic_df_num

In [None]:
# Correlation matrix
#
# The two text columns are replaced by integer codes so that every column is
# numeric before the pairwise correlations are computed.
# NOTE(review): `titanic_df_num` was bound to `titanic_df` by plain assignment,
# so these column assignments presumably rewrite `titanic_df` as well — confirm
# that this is intended.
sex_codes = {'M': 0, 'F': 1}

# Exactly one code per port of embarkation (a dict literal keeps only the last
# value of a repeated key, so each port must appear once). The codes are
# arbitrary labels for a nominal variable: correlations involving 'embarked'
# depend on this choice and should be read with care.
embarked_codes = {'S': 0, 'Q': 1, 'C': 2}

titanic_df_num['Sex'] = titanic_df_num['Sex'].replace(sex_codes)
titanic_df_num['embarked'] = titanic_df_num['embarked'].replace(embarked_codes)
corr_matrix = titanic_df_num.corr()

# Annotated heatmap of the pairwise correlations, two decimals per cell.
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5)
plt.title("Correlation Matrix")
plt.show()

<h3>b) Pairwise Plots</h3>

In [None]:
# Grid of pairwise plots over every column of the numeric table. The returned
# grid object is the cell's last expression, so its repr is shown too.
sns.pairplot(titanic_df_num)

<h3>c) PCA</h3>

In [None]:
from sklearn.decomposition import PCA

# Project the numeric table onto its first three principal components.
# The model is fitted and applied on the same frame (`titanic_df_num`), so the
# projection never depends on what another variable happens to contain.
# NOTE(review): the columns are used unscaled and still include 'Survived';
# without standardisation the columns with the largest raw variance dominate
# the components — confirm this is the intended analysis.
pca = PCA(n_components=3)
components = pca.fit_transform(titanic_df_num)


In [None]:
# Column order of the frame the PCA was fitted on; the columns of
# `pca.components_` follow this same order.
titanic_df_num.columns

In [None]:
# Loadings of the fitted PCA: one row per principal component, one column per
# input feature.
r = pca.components_

print(r)

In [None]:
# Fraction of the total variance captured by each of the fitted components
# (values are ratios between 0 and 1, not percentages).
explained_variance = pca.explained_variance_ratio_
explained_variance

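Principal component 1 (PC 1) is dominated by Fare and holds 93% of the variance.

Principal component 2 (PC 2) is dominated by Age and holds 6.5% of the variance.

Principal component 3 (PC 3) is dominated by embarked and holds 0.0005% of the variance.

Because the columns are not standardised before the PCA, this ranking mainly reflects each column's raw variance.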

<h2>IV) Features linked to Survival </h2>

<h3>a) Mutual information</h3>

In [None]:
# Target: the survival flag, kept as a one-column DataFrame.
target_feature = titanic_df_num.loc[:, ['Survived']]

# Feature matrix: every remaining column of the numeric table.
X = titanic_df_num.drop(columns=['Survived'])
X

In [None]:
from sklearn.feature_selection import mutual_info_classif

# Columns that hold category codes or small integer counts. They are flagged as
# discrete because the estimator uses different formulas for discrete and
# continuous features, and by default treats every column of a dense table as
# continuous.
discrete_columns = ['Class', 'Sex', 'SiblingsSpouses', 'ParentsChildren', 'embarked']
is_discrete = X.columns.isin(discrete_columns)

# Mutual information between each feature and the survival flag.
# - np.ravel() flattens the one-column target into the 1-D array scikit-learn
#   expects for `y`.
# - random_state fixes the small random noise the estimator adds to continuous
#   features, so the scores are the same on every run.
mi = mutual_info_classif(
    X,
    np.ravel(target_feature),
    discrete_features=is_discrete,
    random_state=42,
)
mi

<h3>b) Tree-based feature importance</h3>

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest on all features and read its impurity-based feature
# importances.
# - random_state makes the bootstrap samples and feature sub-sampling
#   repeatable, so the importances are the same on every run.
# - np.ravel() flattens the one-column target into the 1-D array scikit-learn
#   expects for `y`.
rf = RandomForestClassifier(random_state=42).fit(X, np.ravel(target_feature))
importances = rf.feature_importances_

importances

<h3>c) Plot</h3>

In [None]:
# Feature names in the same order as the columns of X, which is also the order
# of the two score arrays.
feature_name = list(X.columns)

# Side-by-side table of both scores, indexed by feature name.
feature_importances = pd.DataFrame(
    {
        'Mutual Information': mi,
        'RF Importance': importances,
    },
    index=pd.Index(feature_name, name='Features'),
)
feature_importances

In [None]:
# Grouped bar chart: one group per feature, one bar per scoring method.
# The two scores are measured on different scales, so compare the ranking of
# the features within each method rather than bar heights across methods.
ax = feature_importances.plot(
    kind='bar',
    title="Feature relevance for survival",
)
ax.set_ylabel("Score")
plt.show()