In [1]:
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import sklearn

In [2]:
# Display the installed TensorFlow version so the run environment is recorded.
tf.__version__

'2.0.0'

In [3]:

from utils.eda import reduce_mem_usage, missing_data, get_column_types

In [4]:
# Folder holding the Titanic CSV files, relative to this notebook.
path = "../data/titanic/"

# Read each split and pass it straight through the project helper
# reduce_mem_usage (verbose=0 keeps it silent).
train = reduce_mem_usage(pd.read_csv(f"{path}train.csv"), verbose=0)
test = reduce_mem_usage(pd.read_csv(f"{path}test.csv"), verbose=0)

# EDA

In [5]:
# Lower-case every column label on both splits so later cells can refer to
# columns with a single naming convention.
for frame in (train, test):
    frame.columns = [label.lower() for label in frame.columns]

In [6]:
# Preview the first rows of the training split (column names are lower-case now).
train.head()

Unnamed: 0,passengerid,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.283302,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.099998,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [7]:
# missing_data(train)

# Preprocessing

In [8]:
# Drop the columns that are not used as model features.
drop_cols = ['name', 'ticket', 'cabin']

# Rebind the result instead of mutating with inplace=True, and ignore labels
# that are already gone, so re-running this cell does not raise KeyError.
train = train.drop(columns=drop_cols, errors='ignore')
test = test.drop(columns=drop_cols, errors='ignore')

In [9]:
# Impute missing values (an explicit "imputed" indicator column could be added
# as an extra feature instead).

# Numeric gaps get the sentinel value -1 on both splits.
for col in ('age', 'fare'):
    train[col] = train[col].fillna(-1)
    test[col] = test[col].fillna(-1)

# The embarkation port is filled with the most frequent value of the training
# split; the same value is applied to the test split.
embarked_mode = train['embarked'].mode()[0]
train['embarked'] = train['embarked'].fillna(embarked_mode)
test['embarked'] = test['embarked'].fillna(embarked_mode)

In [10]:
# Ask the project helper for the categorical and numeric column names of the
# training frame; the bare tuple on the last line displays both lists.
categorical_cols, numeric_cols = get_column_types(train)
categorical_cols, numeric_cols

(['sex', 'embarked'],
 ['passengerid', 'survived', 'pclass', 'age', 'sibsp', 'parch', 'fare'])

In [11]:
# The passenger identifier and the target must not be used as model inputs.
# Filtering (rather than list.remove) keeps this cell re-runnable: remove()
# raises ValueError on a second run because the names are already gone.
non_feature_cols = ('passengerid', 'survived')
numeric_cols = [col for col in numeric_cols if col not in non_feature_cols]

# Final ordered list of input columns: categorical first, then numeric.
feature_cols = categorical_cols + numeric_cols
feature_cols

['sex', 'embarked', 'pclass', 'age', 'sibsp', 'parch', 'fare']

# Categorical Encoding and Normalization

In [12]:
from utils.feature_preprocessor import DataFramePreprocessor
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler

# Describe how each column is preprocessed. A column that needs no
# preprocessing would simply be mapped to None.
one_hot_cols = ['sex', 'embarked', 'pclass']
scaled_cols = ['age', 'sibsp', 'parch', 'fare']

# Every column gets its own transformer instance; key order is one-hot
# columns first, then the min-max scaled ones.
preprocessor_map = {
    **{col: OneHotEncoder() for col in one_hot_cols},
    **{col: MinMaxScaler((0,1)) for col in scaled_cols},
}


In [17]:
# Build the project-level preprocessor from the per-column map.
df_preprocessor = DataFramePreprocessor(preprocessor_map)

# Fit and transform the training frame, showing only the resulting shape; the
# transformed data itself is not stored in a variable here.
# NOTE(review): the recorded output is (891, 12); the model input width used
# further down is hard-coded to the same 12 — confirm they stay in sync.
df_preprocessor.fit_transform(train).shape

(891, 9)
(891, 2)
(891, 3)
(891, 3)
(891, 1)
(891, 1)
(891, 1)
(891, 1)


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


(891, 12)

# Model Creation

In [37]:
from keras.models import Model
from keras.layers import Dense, Input, Dropout
from keras.optimizers import Adam
from keras.losses import CategoricalCrossentropy

In [36]:
# Number of input features fed to the network; matches the second dimension of
# the (891, 12) shape reported by df_preprocessor.fit_transform(train).
input_width = 12



In [44]:
def instanciate_neural_network_model(input_width):
    """Build and compile a small feed-forward binary classifier.

    Architecture: Input -> Dense(200, relu) -> Dropout(0.2) -> Dense(1, sigmoid).

    Parameters
    ----------
    input_width : int
        Number of features per sample (columns of the preprocessed matrix).

    Returns
    -------
    Model
        A compiled Keras model (Adam optimizer, binary cross-entropy loss)
        whose single output is a probability in [0, 1].
    """
    # Each sample is a flat feature vector, so the per-sample shape is
    # (input_width,). Keras adds the batch axis itself; (None, input_width)
    # would declare an extra variable-length axis that 2-D tabular data lacks.
    input_layer = Input(shape=(input_width,))
    hidden_layer = Dense(units=200, activation='relu')(input_layer)
    dropout_layer = Dropout(0.2)(hidden_layer)
    # A single sigmoid unit yields the probability of the positive class.
    output_layer = Dense(1, activation='sigmoid')(dropout_layer)

    nn_model = Model(inputs=[input_layer], outputs=[output_layer])
    nn_model.compile(
        optimizer=Adam(),
        # Categorical cross-entropy expects one-hot multi-class targets; with a
        # single output unit and a 0/1 target the binary form is the right one.
        loss='binary_crossentropy'
    )
    return nn_model

# Cross Validation Splits

In [12]:
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import accuracy_score

In [13]:
# 3-fold stratified splitter; shuffling with a fixed random_state keeps the
# folds reproducible between runs.
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=13)
# The split generator itself is created in the training section:
# skf_splitted = skf.split(train[feature_cols], train['survived'])

In [14]:
def get_data_from_split(X,y,train_idx, val_idx):
    """Split features and target into fit / validation parts for one fold.

    Parameters
    ----------
    X : pd.DataFrame, pd.Series or array-like
        Feature data, one row per sample.
    y : pd.Series, pd.DataFrame or array-like
        Target values aligned with ``X``.
    train_idx : array-like of int
        Integer positions of the samples used for fitting.
    val_idx : array-like of int
        Integer positions of the samples used for validation.

    Returns
    -------
    tuple
        ``(X_fit, X_val, y_fit, y_val)``.
    """
    # Use the function's own parameters: the body must not depend on a
    # notebook-level variable that happens to hold the fold indexes.
    X_fit = _take_rows(X, train_idx)
    X_val = _take_rows(X, val_idx)

    y_fit = _take_rows(y, train_idx)
    y_val = _take_rows(y, val_idx)

    return X_fit, X_val, y_fit, y_val


def _take_rows(data, idx):
    """Return the rows of ``data`` located at the integer positions ``idx``."""
    # pandas objects need .iloc for positional selection; plain [] would be
    # label-based on a Series and column-based on a DataFrame.
    if isinstance(data, (pd.DataFrame, pd.Series)):
        return data.iloc[idx]
    return data[idx]

# Cross Validation Training

In [15]:
# X should be a pd.DataFrame or an array; y should be a pd.Series or an array.
X = train[feature_cols]
y = train['survived']

# Lazily yields one (fit indexes, validation indexes) pair per fold.
skf_split_generator = skf.split(X, y)

# Only the first fold is inspected for now, hence the break at the end.
for fit_idx, val_idx in skf_split_generator:
    X_fit, X_val, y_fit, y_val = get_data_from_split(X, y, fit_idx, val_idx)

    display(X_fit.head(), y_fit.head())
    print("=============")
    break

Unnamed: 0,sex,embarked,pclass,age,sibsp,parch,fare
1,female,C,1,38.0,1,0,71.283302
2,female,S,3,26.0,0,0,7.925
3,female,S,1,35.0,1,0,53.099998
4,male,S,3,35.0,0,0,8.05
6,male,S,1,54.0,0,0,51.862499


1    1
2    1
3    1
4    0
6    0
Name: survived, dtype: uint8



In [None]:
# Shape of the fitting part of the fold produced by the loop above.
X_fit.shape

In [None]:
# Shape of the full feature frame, for comparison with the fold parts.
X.shape

In [None]:
# Shape of the validation part of the fold produced by the loop above.
X_val.shape