In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
titanic_data = pd.read_csv('data/train.csv')

Handling the Missing Value 

In [3]:
titanic_data['Age'].fillna(titanic_data['Age'].mean(), inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  titanic_data['Age'].fillna(titanic_data['Age'].mean(), inplace = True)


In [4]:
titanic_data['Embarked'].fillna(titanic_data['Embarked'].mode()[0], inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  titanic_data['Embarked'].fillna(titanic_data['Embarked'].mode()[0], inplace = True)


In [6]:
titanic_data = titanic_data.drop(columns = "Cabin", axis = 1)

Feature Creation

In [8]:
titanic_data['FamilySize'] = titanic_data['SibSp']+ titanic_data['Parch']+1
titanic_data['IsAlone'] = (titanic_data['FamilySize'] == 1).astype(int)

In [9]:
titanic_data['Title'] = titanic_data['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)

In [11]:
titanic_data['Title'].value_counts()

Title
Mr          517
Miss        182
Mrs         125
Master       40
Dr            7
Rev           6
Col           2
Mlle          2
Major         2
Ms            1
Mme           1
Don           1
Lady          1
Sir           1
Capt          1
Countess      1
Jonkheer      1
Name: count, dtype: int64

In [12]:
titanic_data['Title'] = titanic_data['Title'].replace(
    ['Dr','Rev','Col','Major','Capt','Sir','Lady','Countess','Don','Jonkheer'],
    'Rare'
)


In [13]:
titanic_data['Title'].value_counts()

Title
Mr        517
Miss      182
Mrs       125
Master     40
Rare       23
Mlle        2
Mme         1
Ms          1
Name: count, dtype: int64

Catergorial Encoding

Binary categories

In [14]:
titanic_data['Sex'] = titanic_data['Sex'].map({'male': 0, 'female': 1})


One-Hot Encoding

In [15]:
titanic_data = pd.get_dummies(titanic_data, columns=['Embarked'], drop_first=True) 
# drop_first=True avoids dummy variable trap (important for linear models).

In [16]:
titanic_data = pd.get_dummies(titanic_data, columns=['Title'], drop_first=True)


In [18]:
titanic_data.drop(['PassengerId','Name','Ticket'], axis=1, inplace=True)

In [19]:
titanic_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,FamilySize,IsAlone,Embarked_Q,Embarked_S,Title_Miss,Title_Mlle,Title_Mme,Title_Mr,Title_Mrs,Title_Ms,Title_Rare
0,0,3,0,22.0,1,0,7.25,2,0,False,True,False,False,False,True,False,False,False
1,1,1,1,38.0,1,0,71.2833,2,0,False,False,False,False,False,False,True,False,False
2,1,3,1,26.0,0,0,7.925,1,1,False,True,True,False,False,False,False,False,False
3,1,1,1,35.0,1,0,53.1,2,0,False,True,False,False,False,False,True,False,False
4,0,3,0,35.0,0,0,8.05,1,1,False,True,False,False,False,True,False,False,False


In [20]:
titanic_data.to_csv("data/processed_titanic.csv", index=False)
