# Titanic
[Kaggle URL](https://www.kaggle.com/c/titanic)

## Get the Data

In [1]:
KAGGEL_CMD = "kaggle competitions download -c titanic"
LOCAL_DATA_PATH = "./dataset/"

In [2]:
import os
import subprocess

if not os.path.isdir(LOCAL_DATA_PATH):
  os.mkdir(LOCAL_DATA_PATH)

project_root = os.getcwd()
print(project_root)
os.chdir(LOCAL_DATA_PATH)
subprocess.call(KAGGEL_CMD.split(" "))
os.chdir(project_root)

/home/anurag/code/titanic-kaggle


## Load Libraries and Data

In [3]:
import pandas as pd
import numpy as np
import os

from sklearn.preprocessing import LabelEncoder, LabelBinarizer, StandardScaler

train_data_csv = pd.read_csv(os.path.join(LOCAL_DATA_PATH, "train.csv"))

## Peek Data

In [7]:
train_data_csv.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [8]:
train_data_csv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [9]:
train_data_csv['Sex'].value_counts()

male      577
female    314
Name: Sex, dtype: int64

In [10]:
train_data_csv['Pclass'].value_counts()

3    491
1    216
2    184
Name: Pclass, dtype: int64

In [11]:
train_data_csv.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


## Corelation

In [13]:
corr_matrix = train_data_csv.corr()
corr_matrix["Survived"].sort_values(ascending=False)

Survived       1.000000
Fare           0.257307
Parch          0.081629
PassengerId   -0.005007
SibSp         -0.035322
Age           -0.077221
Pclass        -0.338481
Name: Survived, dtype: float64

## Preprocessing

In [12]:
transformations = {}

### Remove Useless Columns

In [16]:
def remove_columns(data):
    data = data.drop("Name", axis=1)
    data = data.drop("Ticket", axis=1)
    data = data.drop("PassengerId", axis=1)
    data = data.drop("Embarked", axis=1)
    data = data.drop("SibSp", axis=1)
    data = data.drop("Cabin", axis=1)
    data = data.drop("Parch", axis=1)
    return data

transformations["remove columns"] = remove_columns
remove_columns(train_data_csv.copy()).head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare
0,0,3,male,22.0,7.25
1,1,1,female,38.0,71.2833
2,1,3,female,26.0,7.925
3,1,1,female,35.0,53.1
4,0,3,male,35.0,8.05


### Missing Values

#### Age

In [17]:
def add_missing_age(data):
    data["Age"] = data["Age"].fillna(data["Age"].median())
    return data

transformations["add missing age"] = add_missing_age
add_missing_age(train_data_csv.copy()).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            891 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


#### Fare

In [18]:
def add_missing_fare(data):
    data["Fare"] = data["Fare"].fillna(data["Fare"].median())
    return data

transformations["Add missing fare"] = add_missing_fare
add_missing_fare(train_data_csv.copy()).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


### Encoding

#### Sex

In [20]:
def encode_sex(data):
    sex_encoder = LabelEncoder()
    data["Sex"] = sex_encoder.fit_transform(data["Sex"])
    return data

transformations["encode sex"] = encode_sex
encode_sex(train_data_csv.copy())["Sex"][:5]

0    1
1    0
2    0
3    0
4    1
Name: Sex, dtype: int64

#### Pclass

In [21]:
def encode_pclass(data):
    pclass_encoder = LabelBinarizer()
    data["Pclass"] = pclass_encoder.fit_transform(data["Pclass"])
    return data

transformations["encode Pclass"] = encode_pclass
encode_pclass(train_data_csv.copy())["Pclass"][:5]

0    0
1    1
2    0
3    1
4    0
Name: Pclass, dtype: int64

### Apply Transformation

In [22]:
data = train_data_csv.copy()
for transform_name in transformations:
    print("Applying: ", transform_name)
    data = transformations[transform_name](data)
preprocessed_data = data
print(preprocessed_data.info())

Applying:  remove columns
Applying:  add missing age
Applying:  Add missing fare
Applying:  encode sex
Applying:  encode Pclass
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 5 columns):
Survived    891 non-null int64
Pclass      891 non-null int64
Sex         891 non-null int64
Age         891 non-null float64
Fare        891 non-null float64
dtypes: float64(2), int64(3)
memory usage: 34.9 KB
None


In [23]:
 preprocessed_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare
0,0,0,1,22.0,7.25
1,1,1,0,38.0,71.2833
2,1,0,0,26.0,7.925
3,1,1,0,35.0,53.1
4,0,0,1,35.0,8.05


## Corelation

In [24]:
pre_corr_matrix = preprocessed_data.corr()
pre_corr_matrix["Survived"].sort_values(ascending=False)

Survived    1.000000
Pclass      0.285904
Fare        0.257307
Age        -0.064910
Sex        -0.543351
Name: Survived, dtype: float64

compare this with original data

In [25]:
corr_matrix["Survived"].sort_values(ascending=False)

Survived       1.000000
Fare           0.257307
Parch          0.081629
PassengerId   -0.005007
SibSp         -0.035322
Age           -0.077221
Pclass        -0.338481
Name: Survived, dtype: float64