# Titanic data preprocessing

In [116]:
import numpy as np
from pandas import DataFrame
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

In [296]:
# Load the raw passenger table; header=0 takes the column names from the first row of the file.
titanic = pd.read_csv("titanic.csv", header=0)

In [297]:
# Print the frame summary (row count, per-column non-null counts and dtypes) to see
# which columns contain missing values before any cleaning is done.
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


## Drop non-informative features

In [298]:
# Remove the columns that are not used as model inputs.
# Reassigning the result (instead of inplace=True) together with errors="ignore"
# keeps this cell idempotent: re-running it after the columns are already gone
# no longer raises a KeyError.
titanic = titanic.drop(
    ["PassengerId", "Name", "Ticket", "Cabin"],
    axis=1,
    errors="ignore",
)

_Note:_ The "Cabin" field should ideally be very informative for predicting passenger survival, but in this dataset it has too many missing values (only 204 of 891 entries are non-null). Since the core purpose here is not data preprocessing itself but preparing data for practicing building models from scratch (to better understand model mechanisms), I decided to drop the "Cabin" field for simplicity.

## Impute missing values in Age

_Note:_ There are many imputation techniques for missing values, and mean imputation is among the most naive. Again, given the core purpose of this notebook, I decided to use mean imputation for simplicity. The mean is computed separately for male and female passengers.

In [299]:
# Column-wise mean imputer used to fill the missing Age values.
# preprocessing.Imputer was removed from scikit-learn (0.22); SimpleImputer is its
# replacement and always works column-wise, so the old axis=0 argument has no
# counterpart. The fallback keeps the cell working on installs that predate
# sklearn.impute.
try:
    from sklearn.impute import SimpleImputer

    NaImputer = SimpleImputer(strategy="mean")
except ImportError:
    NaImputer = preprocessing.Imputer(strategy="mean", axis=0)

In [300]:
# Fill missing ages of male passengers with the mean age of the male passengers.
# The imputer needs a 2-D (n, 1) input and returns the same shape; ravel() flattens
# the result so it matches the 1-D column selection it is assigned to instead of
# relying on pandas to coerce the shape.
titanic.loc[titanic["Sex"] == "male", "Age"] = NaImputer.fit_transform(
    titanic.loc[titanic["Sex"] == "male", "Age"].values.reshape(-1, 1)
).ravel()

In [301]:
# Fill missing ages of female passengers with the mean age of the female passengers.
# The imputer is re-fit here, so the mean comes from the female rows only.
# ravel() flattens the (n, 1) result so it matches the 1-D column selection it is
# assigned to instead of relying on pandas to coerce the shape.
titanic.loc[titanic["Sex"] == "female", "Age"] = NaImputer.fit_transform(
    titanic.loc[titanic["Sex"] == "female", "Age"].values.reshape(-1, 1)
).ravel()

## Scale numeric features

In [302]:
# Standardizer for the numeric feature columns. copy=False asks scikit-learn to
# scale in place when it can rather than working on a copy (see StandardScaler docs).
stdScaler = StandardScaler(copy=False)

In [303]:
# Names of every float64/int64 column, as a plain Python list so that entries
# can be taken out of it afterwards.
num_cols_index = titanic.select_dtypes(include=["float64", "int64"]).columns.tolist()

In [304]:
# "Survived" is the target label, so it must not be standardized with the features.
# Filtering (instead of list.remove) keeps the cell safe to re-run: remove() raises
# ValueError once the entry is already gone.
num_cols_index = [col for col in num_cols_index if col != "Survived"]

In [305]:
# Standardize the numeric feature columns (zero mean, unit variance).
# Assigning through titanic[cols] replaces the columns with the float result;
# the previous .loc[:, cols] form wrote floats into the existing int64 columns
# in place, which newer pandas versions deprecate as an incompatible-dtype set.
# NOTE(review): the scaler is fit on the full table before the train/test split
# later in the notebook, so test rows influence the scaling statistics.
titanic[num_cols_index] = stdScaler.fit_transform(titanic[num_cols_index])

## Dummy encode categorical features

In [306]:
# Column labels of everything that is neither float nor int64, i.e. the columns
# treated as categorical. Not referenced again in the cells shown here.
cat_cols_index = titanic.select_dtypes(exclude=["float", "int64"]).columns

In [307]:
# One-hot encode the remaining categorical columns; drop_first=True omits the first
# level of each so the indicator columns are not perfectly collinear.
# NOTE(review): the info() output lists Embarked with 889 of 891 non-null values and
# no cell imputes it. With get_dummies' default handling of NaN those two rows
# presumably get 0 in every Embarked_* column, which makes them look like the
# dropped baseline level — confirm this is intended.
titanic = pd.get_dummies(data=titanic, drop_first=True)

## Save scaled and encoded data locally

In [315]:
# Persist the full preprocessed table (imputed, scaled, dummy-encoded) to the
# current working directory.
# No index= argument is passed, so pandas' default applies — presumably the row
# index is written as an extra leading column; verify before reloading the file.
titanic.to_csv("titanic_scaled_encoded.csv")

## Split data into train and test set

In [308]:
# 70/30 split of the preprocessed table. Stratifying on "Survived" keeps the label
# proportions the same in both parts; the fixed random_state makes the split
# reproducible.
train, test = train_test_split(
    titanic,
    train_size=0.7,
    test_size=0.3,
    random_state=123,
    stratify=titanic["Survived"],
)

In [311]:
# Write the train and test partitions (label column still included) to ../Data/.
# NOTE(review): assumes the ../Data directory already exists — nothing in this
# notebook creates it.
train.to_csv("../Data/train.csv")
test.to_csv("../Data/test.csv")

## Separate labels from features

In [312]:
# Training set: the label column on its own, and every other column as features.
train_y = train["Survived"]
train_X = train.drop(labels=["Survived"], axis=1)

In [313]:
# Test set: the label column on its own, and every other column as features.
test_y = test["Survived"]
test_X = test.drop(labels=["Survived"], axis=1)

In [318]:
# Write the feature tables and label series to ../Data/, each with a header row
# and without the row index. Files are written in the order listed.
for csv_path, split_frame in (
    ("../Data/train_X.csv", train_X),
    ("../Data/test_X.csv", test_X),
    ("../Data/train_y.csv", train_y),
    ("../Data/test_y.csv", test_y),
):
    split_frame.to_csv(csv_path, index=False, header=True)