# Data Preparation

In [1]:
import pandas as pd

from sklearn.preprocessing import minmax_scale

In [2]:
# Load the raw training and test splits from the working directory.
df_train, df_test = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv')
)

I add a `Train` column indicating whether each row belongs to the training set (1) or the test set (0).

I also add a `Survived` column (filled with the placeholder value -1) to the test data, so that both dataframes have the same columns and can be concatenated.

In [3]:
# Tag every row with its origin: 1 for the training split, 0 for the test split.
for frame, is_train in ((df_train, 1), (df_test, 0)):
    frame['Train'] = is_train

# The test split gets a -1 placeholder in 'Survived' so both frames
# expose the same columns.
df_test['Survived'] = -1

In [4]:
# Stack the training rows on top of the test rows. ignore_index=True gives
# the combined frame a fresh 0..n-1 index; without it both inputs keep
# their own labels and the result carries duplicated index values, which
# makes any later label-based alignment ambiguous.
df = pd.concat([df_train, df_test], sort=False, ignore_index=True)

In [5]:
# Preview the first five rows of the combined frame.
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Train
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,1


# Categorical and Numerical Features

For each feature considered important, I either create dummy (one-hot) columns, for categorical features, or normalize the values, for numerical features.


In [6]:
def add_dummies(df, df_corr, feature):
    """Append one-hot indicator columns for ``feature`` to ``df_corr``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the raw ``feature`` column.
    df_corr : pandas.DataFrame
        Frame the indicator columns are appended to. It is not modified;
        it is combined column-wise with the indicators, so it is expected
        to share its index with ``df``.
    feature : str
        Name of the column to encode; also used as the prefix of the
        generated column names (``<feature>_<value>``).

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns of ``df_corr`` followed by the
        indicator columns.
    """
    # pandas 2.0 changed the default get_dummies dtype to bool, which would
    # be written to CSV as True/False. Pinning an integer dtype keeps the
    # indicators as 0/1 regardless of the installed pandas version.
    df_dummies = pd.get_dummies(df[feature], prefix=feature, dtype='uint8')
    return pd.concat([df_corr, df_dummies], axis=1)

In [7]:
def add_normalized(df, df_corr, feature):
    """Return ``df_corr`` extended with a min-max scaled copy of ``feature``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the raw ``feature`` column.
    df_corr : pandas.DataFrame
        Frame the scaled column is added to. It is left untouched; the
        scaled values are attached positionally, so it must have as many
        rows as ``df``.
    feature : str
        Name of the column to scale; the scaled column keeps this name.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns of ``df_corr`` plus ``feature``.
    """
    # minmax_scale receives a single-column 2-D frame and returns an (n, 1)
    # array; flatten it so it can be attached as one ordinary column.
    scaled = minmax_scale(df[[feature]].astype(float)).ravel()
    # assign() builds a new frame instead of writing into df_corr, so a
    # caller passing a slice of another frame gets no SettingWithCopyWarning
    # and its input is never mutated.
    return df_corr.assign(**{feature: scaled})

In [8]:
# Start the cleaned frame from the identifier, target and split-flag columns.
# .copy() makes it independent of df, so adding columns to it later cannot
# raise SettingWithCopyWarning or touch the original frame.
df_clean = df[['PassengerId', 'Survived', 'Train']].copy()

### Pclass

In [9]:
# Encode 'Pclass' as indicator columns.
df_clean = add_dummies(df=df, df_corr=df_clean, feature='Pclass')

### Sex

In [10]:
# Encode 'Sex' as indicator columns.
df_clean = add_dummies(df=df, df_corr=df_clean, feature='Sex')

### Age

In [11]:
# Add the min-max scaled 'Age' column.
df_clean = add_normalized(df=df, df_corr=df_clean, feature='Age')

### SibSp

In [12]:
# Add the min-max scaled 'SibSp' column.
df_clean = add_normalized(df=df, df_corr=df_clean, feature='SibSp')

### Parch

In [13]:
# Add the min-max scaled 'Parch' column.
df_clean = add_normalized(df=df, df_corr=df_clean, feature='Parch')

### Fare

In [14]:
# Add the min-max scaled 'Fare' column.
df_clean = add_normalized(df=df, df_corr=df_clean, feature='Fare')

### Embarked

In [15]:
# Encode 'Embarked' as indicator columns.
df_clean = add_dummies(df=df, df_corr=df_clean, feature='Embarked')

### Name

In [16]:
# One named capture group per title. The alternatives are tried from left to
# right, so each name fills at most one group and leaves the others as NaN.
# NOTE(review): 'Master' is the only alternative without the escaped dot
# after the title - confirm whether that is intentional.
title_pattern = (
    r'(?P<title_mr>.*Mr\..*)'
    r'|(?P<title_mrs>.*Mrs\..*)'
    r'|(?P<title_miss>.*Miss\..*)'
    r'|(?P<title_master>.*Master.*)'
    r'|(?P<title_don>.*Don\..*)'
    r'|(?P<title_rev>.*Rev\..*)'
    r'|(?P<title_dr>.*Dr\..*)'
    r'|(?P<title_mme>.*Mme\..*)'
    r'|(?P<title_ms>.*Ms\..*)'
    r'|(?P<title_major>.*Major\..*)'
    r'|(?P<title_lady>.*Lady\..*)'
    r'|(?P<title_sir>.*Sir\..*)'
    r'|(?P<title_mlle>.*Mlle\..*)'
    r'|(?P<title_col>.*Col\..*)'
    r'|(?P<title_capt>.*Capt\..*)'
    r'|(?P<title_countess>.*Countess\..*)'
    r'|(?P<title_jonkheer>.*Jonkheer\..*)'
    r'|(?P<title_dona>.*Dona\..*)'
)
title_groups = df['Name'].str.extract(title_pattern)

# A filled group becomes 1 and an empty (NaN) group becomes 0, giving one
# indicator column per title, named after its capture group.
df_title = title_groups.notna().astype(int)
df_clean = pd.concat([df_clean, df_title], axis=1, sort=False)

### NaNs

In [17]:
# Replace every remaining missing value with 0.
# NOTE(review): for a min-max scaled column, 0 is also the value of the
# column minimum - confirm this is the intended imputation for 'Age'.
df_clean = df_clean.fillna(value=0)

In [18]:
# Persist the prepared features; the index is not written to the file.
df_clean.to_csv(path_or_buf='df_clean.csv', index=False)