# Titanic: Machine Learning from Disaster

The Titanic Kaggle competition can be found [at this link](https://www.kaggle.com/c/titanic/overview).

 - [Feature engineering](#Feature-engineering)
 - [Checking for correlations](#Checking-for-correlations)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib notebook

In [2]:
import warnings
# Install a blanket 'ignore' filter: every warning category is silenced from here on.
# NOTE(review): this also hides pandas FutureWarning/DeprecationWarning messages, so
# deprecated or soon-to-break API usage in later cells goes unnoticed -- consider
# narrowing the filter to the specific warnings that are known to be harmless.
warnings.filterwarnings('ignore')

In [3]:
# Show every column and never truncate long cell text when a frame is displayed.
# `None` means "no limit" for both options. The former `-1` sentinel for
# `display.max_colwidth` was deprecated in pandas 1.0 and is rejected by pandas 2.x.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

In [4]:
# Load the train and test splits, indexing rows by `PassengerId` and treating
# the literal string 'NaN' as a missing value.
train_df = pd.read_csv(
    'data/train.csv',
    index_col='PassengerId',
    na_values='NaN',
)
test_df = pd.read_csv(
    'data/test.csv',
    index_col='PassengerId',
    na_values='NaN',
)

In [5]:
# Column dtypes and non-null counts: a quick way to spot features with missing values.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
Survived    891 non-null int64
Pclass      891 non-null int64
Name        891 non-null object
Sex         891 non-null object
Age         714 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Ticket      891 non-null object
Fare        891 non-null float64
Cabin       204 non-null object
Embarked    889 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB


In [6]:
# Summary statistics of the numeric columns, rendered as strings with two decimals.
# `DataFrame.applymap` is deprecated since pandas 2.1, so the formatter is applied
# column by column through `Series.map`, which yields the same frame on every version.
train_df.describe().apply(lambda column: column.map('{:.2f}'.format))

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.38,2.31,29.7,0.52,0.38,32.2
std,0.49,0.84,14.53,1.1,0.81,49.69
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.12,0.0,0.0,7.91
50%,0.0,3.0,28.0,0.0,0.0,14.45
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.33


In [7]:
# Same dtype / non-null overview for the test split.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 892 to 1309
Data columns (total 10 columns):
Pclass      418 non-null int64
Name        418 non-null object
Sex         418 non-null object
Age         332 non-null float64
SibSp       418 non-null int64
Parch       418 non-null int64
Ticket      418 non-null object
Fare        417 non-null float64
Cabin       91 non-null object
Embarked    418 non-null object
dtypes: float64(2), int64(3), object(5)
memory usage: 35.9+ KB


In [8]:
# Summary statistics of the numeric test columns, rendered as strings with two decimals.
# `DataFrame.applymap` is deprecated since pandas 2.1, so the formatter is applied
# column by column through `Series.map`, which yields the same frame on every version.
test_df.describe().apply(lambda column: column.map('{:.2f}'.format))

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare
count,418.0,332.0,418.0,418.0,417.0
mean,2.27,30.27,0.45,0.39,35.63
std,0.84,14.18,0.9,0.98,55.91
min,1.0,0.17,0.0,0.0,0.0
25%,1.0,21.0,0.0,0.0,7.9
50%,3.0,27.0,0.0,0.0,14.45
75%,3.0,39.0,1.0,0.0,31.5
max,3.0,76.0,8.0,9.0,512.33


In [9]:
# `Cabin` is sparsely populated (only 204 of the 891 training rows have a value);
# remove the column from both splits.
train_df.drop(columns='Cabin', inplace=True)
test_df.drop(columns='Cabin', inplace=True)

In [10]:
# Discard the raw `Ticket` strings from both splits.
train_df.drop(columns='Ticket', inplace=True)
test_df.drop(columns='Ticket', inplace=True)

## Feature engineering

I'm going to extract the title from the `Name` feature and create the `Title` feature to see if the title has any effect on the chance of survival, after which I'm going to discard the `Name` feature.

In [11]:
def _title_from_name(full_name):
    """Return the second single-space-delimited piece of the text after the first comma.

    The text after the comma is expected to start with a space, so the piece at
    index 0 is empty and index 1 is the first real word.
    """
    after_comma = full_name.split(',')[1]
    return after_comma.split(' ')[1]


# Names appear to follow "Surname, Title. Given names", so that word is the honorific.
train_df['Title'] = train_df['Name'].apply(_title_from_name)

Just a quick fix: the `the` title produced above is actually part of `the Countess. of`, so I'm going to replace `the` with `Countess.`.

In [12]:
# Assign the result back instead of calling `replace(..., inplace=True)` on the
# selected column: that chained in-place form stops updating `train_df` once
# pandas Copy-on-Write is active, and the warning about it is easy to miss.
train_df['Title'] = train_df['Title'].replace('the', 'Countess.')

In [13]:
# Frequency of each title, most common first (the default ordering of `value_counts`).
train_df['Title'].value_counts()

Mr.          517
Miss.        182
Mrs.         125
Master.      40 
Dr.          7  
Rev.         6  
Col.         2  
Mlle.        2  
Major.       2  
Ms.          1  
Don.         1  
Mme.         1  
Countess.    1  
Jonkheer.    1  
Sir.         1  
Lady.        1  
Capt.        1  
Name: Title, dtype: int64

In [14]:
# The title has been extracted into `Title`, so the raw `Name` column is removed.
train_df.drop(columns='Name', inplace=True)

I'm going to create the `Family Size` feature to see if that has any effect on survival.

In [15]:
# `SibSp` + `Parch` plus one -- the extra one presumably counts the passenger themself.
train_df['Family Size'] = 1 + train_df['SibSp'] + train_df['Parch']

In [16]:
# Distribution of family sizes, most common first (the default ordering of `value_counts`).
train_df['Family Size'].value_counts()

1     537
2     161
3     102
4     29 
6     22 
5     15 
7     12 
11    7  
8     6  
Name: Family Size, dtype: int64

I'm going to replace the `NaN` values in `Embarked` with the mode of the feature column.

In [17]:
# Fill missing `Embarked` values with the column's most frequent value.
# The result is assigned back rather than using `fillna(..., inplace=True)` on the
# selected column: that chained in-place call no longer updates `train_df` once
# pandas Copy-on-Write is active.
train_df['Embarked'] = train_df['Embarked'].fillna(train_df['Embarked'].mode().iloc[0])

In a similar vein, I'm replacing the `NaN` values in `Fare` with the median of the feature column.

In [18]:
# Fill missing `Fare` values with the column median.
# The result is assigned back rather than using `fillna(..., inplace=True)` on the
# selected column: that chained in-place call no longer updates `train_df` once
# pandas Copy-on-Write is active.
train_df['Fare'] = train_df['Fare'].fillna(train_df['Fare'].median())

Finally, I'm replacing the `NaN` values in `Age` with the median of the feature.

In [19]:
# Fill missing `Age` values (only 714 of 891 rows are populated) with the column median.
# The result is assigned back rather than using `fillna(..., inplace=True)` on the
# selected column: that chained in-place call no longer updates `train_df` once
# pandas Copy-on-Write is active.
train_df['Age'] = train_df['Age'].fillna(train_df['Age'].median())

In [21]:
# Sanity check after imputation: every column should report False (no missing values left).
train_df.isna().any()

Survived       False
Pclass         False
Sex            False
Age            False
SibSp          False
Parch          False
Fare           False
Embarked       False
Title          False
Family Size    False
dtype: bool

In [20]:
# Preview the first rows of the prepared training frame, including the new
# `Title` and `Family Size` columns.
train_df.head()

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,Family Size
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,0,3,male,22.0,1,0,7.25,S,Mr.,2
2,1,1,female,38.0,1,0,71.2833,C,Mrs.,2
3,1,3,female,26.0,0,0,7.925,S,Miss.,1
4,1,1,female,35.0,1,0,53.1,S,Mrs.,2
5,0,3,male,35.0,0,0,8.05,S,Mr.,1


Let's do the same for the `test_df` data set.

In [32]:
# Mirror the training-set preparation on the test split.

# Title: first word after the comma in the passenger name.
test_df['Title'] = test_df['Name'].apply(lambda x: x.split(',')[1].split(' ')[1])
# Same clean-up as for `train_df`: a bare 'the' is the start of 'the Countess. of'.
test_df['Title'] = test_df['Title'].replace('the', 'Countess.')
test_df.drop('Name', axis=1, inplace=True)

test_df['Family Size'] = test_df['Parch'] + test_df['SibSp'] + 1

# Impute missing values with the split's own mode / medians. Results are assigned
# back rather than using `fillna(..., inplace=True)` on a selected column: that
# chained in-place call no longer updates `test_df` once pandas Copy-on-Write is active.
test_df['Embarked'] = test_df['Embarked'].fillna(test_df['Embarked'].mode().values[0])
test_df['Fare'] = test_df['Fare'].fillna(test_df['Fare'].median())
test_df['Age'] = test_df['Age'].fillna(test_df['Age'].median())

In [33]:
# Sanity check after imputation: every test column should report False (no missing values left).
test_df.isna().any()

Pclass         False
Sex            False
Age            False
SibSp          False
Parch          False
Fare           False
Embarked       False
Title          False
Family Size    False
dtype: bool

In [34]:
# Preview the first rows of the prepared test frame; its columns should match
# `train_df` apart from the missing `Survived` target.
test_df.head()

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,Family Size
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
892,3,male,34.5,0,0,7.8292,Q,Mr.,1
893,3,female,47.0,1,0,7.0,S,Mrs.,2
894,2,male,62.0,0,0,9.6875,Q,Mr.,1
895,3,male,27.0,0,0,8.6625,S,Mr.,1
896,3,female,22.0,1,1,12.2875,S,Mrs.,3


## Checking for correlations

In [37]:
# Pairwise correlation of each numeric feature with `Survived`.
# The numeric columns are selected explicitly: since pandas 2.0 `DataFrame.corr()`
# no longer drops string columns (`Sex`, `Embarked`, `Title`) silently and raises
# on them instead. `select_dtypes` gives the same result on every pandas version.
train_df.select_dtypes(include='number').corr()['Survived']

Survived       1.000000
Pclass        -0.338481
Age           -0.064910
SibSp         -0.035322
Parch          0.081629
Fare           0.257307
Family Size    0.016639
Name: Survived, dtype: float64