In [202]:
import pandas as pd
import numpy as np

import copy

In [203]:
# Read the raw train and test splits from the local data directory, then
# list the training column names as a quick schema check.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('data/train.csv', 'data/test.csv')
)
print(df_train.columns.tolist())

['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']


In [204]:
def one_hot_encode(df, ls_variables_to_encode):
    """Return a copy of ``df`` with the given columns one-hot encoded.

    Each listed column is replaced by integer (0/1) indicator columns named
    ``<column>_<value>``. The input frame is never modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns to encode.
    ls_variables_to_encode : list of str (or a single str)
        Column names to encode. A bare string is treated as one column name
        rather than being iterated character by character.

    Returns
    -------
    pandas.DataFrame
        New frame with every requested column replaced by its indicators.
        With an empty list, an unchanged copy of ``df`` is returned.
    """
    if isinstance(ls_variables_to_encode, str):
        ls_variables_to_encode = [ls_variables_to_encode]

    df_encoded = df.copy()

    for var in ls_variables_to_encode:
        # Encode the accumulated result, not the original `df`; otherwise
        # each iteration throws away the columns encoded before it.
        df_encoded = pd.get_dummies(df_encoded, columns=[var], dtype=int, prefix=var)

    return df_encoded

### Drop Irrelevant Variables

In [205]:
# Remove the same four columns from both splits, modifying each frame in
# place. Re-running this cell on already-trimmed frames raises KeyError.
for frame in (df_train, df_test):
    frame.drop(columns=['PassengerId', 'Ticket', 'Name', 'Cabin'], inplace=True)

### Transform SibSp Variable
- SibSp is 0 for most passengers, meaning most travelled without a sibling or spouse. Convert it to a binary indicator (0 = none, 1 = at least one) to help balance the distribution.

In [206]:
# Add a float indicator column to both splits:
#   1 when SibSp > 0, 0 when SibSp == 0, NaN for anything else
# (e.g. a missing SibSp value).
for frame in (df_train, df_test):
    sibsp = frame['SibSp']
    frame['SibSp_binary'] = np.select(
        [sibsp > 0, sibsp == 0],
        [1.0, 0.0],
        default=np.nan,
    )

### Encode Variables
- Pclass is a proxy for socioeconomic status, so it is one-hot encoded to be treated as a category rather than as a numeric value.

In [207]:
# One-hot encode the categorical columns of both splits as 0/1 integers.
df_train_encoded = pd.get_dummies(data=df_train, columns=['Pclass', 'Sex', 'Embarked'], dtype=int)
df_test_encoded = pd.get_dummies(data=df_test, columns=['Pclass', 'Sex', 'Embarked'], dtype=int)

# The two splits are encoded independently, so a category present in only
# one of them would give the frames different columns. The imputer below
# is fed positional arrays, so the test frame is forced onto the train
# feature layout (everything except the target): columns missing from test
# are added as all-zero, columns unseen in train are dropped, and the
# order is made identical.
feature_columns = df_train_encoded.columns.drop('Survived', errors='ignore')
df_test_encoded = df_test_encoded.reindex(columns=feature_columns, fill_value=0)

### Impute Missing Values
- Age (Cabin also has missing values, but it was dropped above, so it is not imputed here)

In [208]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [209]:
# Set the target column aside, then rebind the training frame to a copy
# without it so that only feature columns remain.
df_train_target_values = df_train_encoded.loc[:, 'Survived']
df_train_encoded = df_train_encoded.drop(columns='Survived')

In [210]:
# Iterative imputer capped at 10 rounds, with a fixed seed so repeated
# runs give the same result.
imp = IterativeImputer(
    random_state=0,
    max_iter=10,
)

In [211]:
# Fit on the training features only; the array is positional, so column
# order matters for any later transform.
imp.fit(X=df_train_encoded.values)

In [212]:
# Fill missing values in both splits with the fitted imputer, round the
# result, and rebuild DataFrames with the original column names (the new
# frames get a fresh default index).
# NOTE(review): np.round is applied to every column, not only the imputed
# ones, so fractional features are rounded to whole numbers as well --
# confirm this is intended.
train_filled = imp.transform(df_train_encoded.values)
df_train_imputed = pd.DataFrame(
    np.round(train_filled),
    columns=df_train_encoded.columns.tolist(),
)

test_filled = imp.transform(df_test_encoded.values)
df_test_imputed = pd.DataFrame(
    np.round(test_filled),
    columns=df_test_encoded.columns.tolist(),
)

## Save Newly Preprocessed Data

In [216]:
# Put the target back next to the imputed features (aligned on the row
# index) and write the result to disk without the index column.
df_train_final = pd.concat(
    objs=[df_train_imputed, df_train_target_values],
    axis=1,
)
df_train_final.to_csv(
    'data/train_imputed.csv',
    encoding='utf-8',
    index=False,
)

In [217]:
# Write the imputed test features to disk without the index column.
df_test_imputed.to_csv(
    'data/test_imputed.csv',
    encoding='utf-8',
    index=False,
)