# Titanic ML Competition

...

## Introduction

## Objectives

## Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import squarify

In [None]:
from math import pi

## Importing the dataset

In [None]:
# Load the training split; the path is relative to the notebook's working directory.
titanic_df = pd.read_csv("Datasets/train.csv")
# Preview the first rows to check that the file was parsed as expected.
titanic_df.head()

In [None]:
# Column names, non-null counts and dtypes of the raw frame, before any cleaning.
titanic_df.info()

### Renaming of the columns & changing index

In [None]:
# Replace the terse dataset column names with self-explanatory ones.
renamed_columns = {
    "Pclass": "Economic status",
    "SibSp": "Number of siblings/spouses",
    "Parch": "Number of parents/children",
}
titanic_df = titanic_df.rename(columns=renamed_columns)

In [None]:
# Use the passenger id as the row index; set_index by column name moves the
# column into the index, so no separate drop is needed.
titanic_df = titanic_df.set_index("PassengerId")

### Data Types

In [None]:
# Number of distinct values in each object-dtype column; low counts point to
# columns that can be stored as categoricals.
titanic_df.select_dtypes(include=['object']).nunique()

In [None]:
# Store the two low-cardinality text columns as categoricals.
titanic_df = titanic_df.astype({"Sex": "category", "Embarked": "category"})

In [None]:
# The class code is a label rather than a quantity, so treat it as categorical.
titanic_df = titanic_df.astype({"Economic status": "category"})

In [None]:
# Largest value in each family-count column, checked before narrowing their integer dtype.
titanic_df[["Number of siblings/spouses","Number of parents/children"]].max()

In [None]:
# Narrow both family-count columns to 8-bit integers.
titanic_df = titanic_df.astype(
    {
        "Number of siblings/spouses": "int8",
        "Number of parents/children": "int8",
    }
)

In [None]:
# The survival flag also fits in an 8-bit integer.
titanic_df = titanic_df.astype({"Survived": "int8"})

### Missing values

In [None]:
# Mark unknown cabins with "-" instead of leaving them missing.
# The result is assigned back to the column: calling fillna(inplace=True) on a
# column selection is chained assignment, which recent pandas versions warn
# about and which leaves the frame unchanged once copy-on-write is active.
titanic_df["Cabin"] = titanic_df["Cabin"].fillna("-")

In [None]:
# The ticket column is not used in this analysis, so remove it.
titanic_df = titanic_df.drop(columns="Ticket")

### Data conversions

In [None]:
# Replace the numeric class codes with readable labels.
new_economic_status_names = {1: "Upper", 2: "Middle", 3: "Lower"}
# rename_categories returns a new Series; its `inplace` argument no longer
# exists in current pandas, so the result is assigned back to the column.
titanic_df["Economic status"] = titanic_df["Economic status"].cat.rename_categories(
    new_economic_status_names
)

In [None]:
# Expand the single-letter embarkation codes to port names.
# NOTE(review): the port is spelled "Southampton"; the original label
# "Southhampton" is kept here in case other cells match on it. Correct it
# once that has been checked.
new_port_names = {"C": "Cherbourg", "Q": "Queenstown", "S": "Southhampton"}
# rename_categories returns a new Series; its `inplace` argument no longer
# exists in current pandas, so the result is assigned back to the column.
titanic_df["Embarked"] = titanic_df["Embarked"].cat.rename_categories(new_port_names)

In [None]:
# Capitalise the sex labels so they read well in plot legends and axes.
new_sex = {"male": "Male", "female": "Female"}
# rename_categories returns a new Series; its `inplace` argument no longer
# exists in current pandas, so the result is assigned back to the column.
titanic_df["Sex"] = titanic_df["Sex"].cat.rename_categories(new_sex)

### Reordering columns

In [None]:
# Group related columns together: who the passenger is, how wealthy they were,
# who travelled with them, and what happened on the journey.
personal_info = ["Name", "Sex", "Age"]
economic_status = ["Economic status", "Fare"]
family = ["Number of siblings/spouses", "Number of parents/children"]
journey = ["Cabin", "Embarked", "Survived"]

new_order = [*personal_info, *economic_status, *family, *journey]
titanic_df = titanic_df.reindex(columns=new_order)

### Dataset after handling it

In [None]:
# Column names, non-null counts and dtypes after the cleaning steps above.
titanic_df.info()

In [None]:
# First rows of the cleaned frame, with the new column order and labels.
titanic_df.head()

## Exploratory Data Analysis

First, let's start by asking some simple questions that will get us closer to the question that matters. What sorts of people were more likely to survive?
* How many survived?
* How much does the sex determine the chances of survival?
* What about the age?
* Does the economic status help to determine it?

### How many survived the disaster?

In [None]:
# Passengers per outcome code. sort_index() fixes the order to 0, 1 so the
# positional labels and colours below always describe the right bar and wedge,
# whichever outcome happens to be more frequent.
survival_values = titanic_df['Survived'].value_counts().sort_index()
names = ['Died',"Survived"]
outcome_colors = ['lightcoral', 'lightgreen']

fig, (count_ax, share_ax) = plt.subplots(nrows=1, ncols=2, figsize=(10, 5), dpi=100)

# Left panel: absolute counts.
count_ax.bar(x=survival_values.index, height=survival_values.values, color=outcome_colors)
count_ax.set_xticks(survival_values.index)
count_ax.set_xticklabels(names)
count_ax.set_title("Amount of people that survived")

# Right panel: the same counts as shares of all passengers.
share_ax.pie(survival_values, labels=names, colors=outcome_colors, autopct='%1.0f%%')
share_ax.set_title("Proportion of people that survived")

fig.suptitle('Survival numbers')
plt.show()

We can see that approximately 40% of the passengers survived.

### How much does the sex affect the chances of survival?

Let's begin by seeing the proportions of the passengers.

In [None]:
# Share of passengers by sex, drawn as a donut chart.
sex_proportions = titanic_df["Sex"].value_counts()

fig, ax = plt.subplots(dpi=80)
ax.pie(
    sex_proportions.values,
    labels=sex_proportions.index,
    colors=['goldenrod', 'salmon'],
    autopct='%1.0f%%',
)
# A white disc over the centre turns the pie into a donut.
ax.add_artist(plt.Circle((0, 0), 0.7, color='white'))
fig.suptitle('Proportion of passengers')
plt.show()

Now let's see the survival rate.

In [None]:
# The mean of the 0/1 survival flag within each sex is that group's survival rate.
survival_by_sex = titanic_df.groupby("Sex")["Survived"].mean()
survival_by_sex

In [None]:
# Survival rate per sex on a 0-1 scale.
plt.figure(figsize=(4, 4), dpi=100)
plt.bar(x=survival_by_sex.index,height=survival_by_sex.values,color=['palevioletred', 'cadetblue'])
# Fix the upper limit at 1 so bar heights read directly as proportions.
plt.ylim(top=1)
plt.title("Proportions of people that survived by sex")
# Render explicitly, as the other plotting cells do, so the cell shows the
# figure instead of the repr of the title's Text object.
plt.show()

We can clearly see that women proportionally had a greater survival rate than men.

### What about the age?

Can we see a pattern by exploring the age? Let's try to see if the children were more likely to survive.

In [None]:
# Histogram of passenger ages (counts only, no density curve).
# histplot replaces the deprecated distplot; its automatic bin selection may
# produce slightly different bin edges than distplot did.
sns.histplot(x=titanic_df["Age"], kde=False)
plt.title("Age distribution")
plt.show()

We can see that most of the people on the ship were adults, followed by children and then by elderly people.

In [None]:
# Compare the age distribution of the two outcome groups side by side.
sns.violinplot(data=titanic_df, x="Survived", y="Age")
plt.title("Age distribution")
plt.show()

Looking at the lower end of the age axis, the violin for survivors is wider than the one for non-survivors, which suggests that young children were more likely to survive.

In [None]:
# Boolean masks: passengers younger than 13, and passengers who survived.
# Rows with a missing age compare as False, so they are never counted as children.
children = titanic_df["Age"].lt(13)
survived = titanic_df["Survived"].eq(1)

# Ages of the children in each outcome group.
survived_age = titanic_df.loc[children & survived, "Age"]
died_age = titanic_df.loc[children & ~survived, "Age"]

In [None]:
# Number of children (age under 13) who survived.
survived_age.count()

In [None]:
# Number of children (age under 13) who did not survive.
died_age.count()

### Does the economic status help to determine it?

In [None]:
# Share of passengers in each economic class, rounded to two decimals.
total_class_members = (
    titanic_df["Economic status"]
    .value_counts(normalize=True)
    .round(decimals=2)
)

In [None]:
# Share of each outcome within every economic class: one row per class, one
# column per outcome code (0 / 1).
# unstack() already leaves "Economic status" as the row index, so the earlier
# reset_index / re-assign index / drop-column round trip is not needed.
survival_by_status = (
    titanic_df
    .groupby(by="Economic status")["Survived"]
    .value_counts(normalize=True)
    .round(2)
    .unstack()
)

In [None]:
# Give the outcome columns readable names and drop the leftover axis label.
renamed_columns = {
    0: "Died",
    1: "Survived",
}
survival_by_status = survival_by_status.rename(columns=renamed_columns)
survival_by_status.columns.name = None

survival_by_status

In [None]:
# One colour per class, looked up by label so both panels stay consistent
# regardless of the order each index happens to be in.
status_colors = {"Upper": "gold", "Middle": "lawngreen", "Lower": "sienna"}

plt.figure(figsize=(10, 5), dpi=100)

# Left panel: treemap of how the passengers split across the classes.
plt.subplot2grid(shape=(1,2),loc=(0,0))
squarify.plot(
    sizes=total_class_members,
    value=total_class_members,
    label=total_class_members.index,
    alpha=.8,
    color=[status_colors[status] for status in total_class_members.index],
)
plt.axis('off')
plt.title("% of people according to economic status")

# Right panel: survival rate within each class on a 0-1 scale.
plt.subplot2grid(shape=(1,2),loc=(0,1))
plt.bar(
    x=survival_by_status.index,
    height=survival_by_status["Survived"].values,
    color=[status_colors[status] for status in survival_by_status.index],
)
plt.ylim(top=1)
plt.title("% of people that survived by class")

plt.show()

The graphs show that lower-class passengers were more likely to die than upper-class passengers.

### What happened to the families?

### Is there any relationship with the embarkation port?

### Let's see the relationship between variables now

## Baseline