# Titanic Survival with DNN

**Predicting survival on the Titanic using an artificial neural network in Keras**


**Supervised Learning. Binary classification**


This project is based on a dataset containing demographics and passenger information from 891 of the 2224 passengers and crew on board the Titanic. A description of this dataset is on the [Kaggle website](https://www.kaggle.com/c/titanic/data), where the data was obtained.

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import helper_ds
import keras

helper_ds.info_system()
helper_ds.reproducible(seed=0)  # Setup reproducible results from run to run using Keras

%matplotlib inline

## 1. Data Processing and Exploratory Data Analysis

In [None]:
# Location of the Titanic CSV file and the name of the column to predict.
data_path = "data/titanic_data.csv"
target = ["Survived"]  # one-element list, reused unchanged by every later cell

# Raw dataset; later cells work on copies so this frame stays unmodified.
df_original = pd.read_csv(data_path)

n_rows, n_columns = df_original.shape
print("{} rows \n{} columns \ntarget: {}".format(n_rows, n_columns, target))

### Show the data

In [None]:
df_original.head(3)

#### Numerical Data

In [None]:
df_original.describe(percentiles=[0.5])

#### Non-numerical Data

In [None]:
df_original.describe(include=["O"])

#### Missing values

In [None]:
helper_ds.missing(df_original)

- Binary target "Survived": ~38% ones; the F1 score won't be used <br>
- Some values are missing for key features (e.g. Age)
- Some features (e.g. PassengerId, Name, Ticket) seem irrelevant to survival probabilities <br> 

### Transform the data

#### Enhance and add new features

In [None]:
df = df_original.copy()  # modified dataset


def enhance_features(df, dict_categories=None):
    """Return an enhanced copy of df together with the category dictionary.

    The input dataframe is not modified. On the copy:
      - "Cabin" is reduced to its first character,
      - "Title" is extracted from "Name" (the letters right before a period),
      - helper_ds.remove_categories is applied (see the note below),
      - "Alone" is added: 1 when SibSp + Parch == 0, else 0.

    Returns a tuple (df, dict_categories), where dict_categories is the second
    value returned by helper_ds.remove_categories.
    """

    df = df.copy()

    # filter Cabin to first letter
    df["Cabin"] = df["Cabin"].str[0]

    # get Title from Name
    # (raw string: "\." must reach the regex engine as an escaped period and
    # is not a valid escape sequence in a normal string literal)
    df["Title"] = df["Name"].str.extract(r"([A-Za-z]+)\.", expand=False)

    # remove low frequency values for the new features
    # NOTE(review): the original code built an unused list ["Cabin", "Title"]
    # here, and the incoming `dict_categories` argument is never forwarded to
    # the helper. Presumably both were meant to be passed to
    # helper_ds.remove_categories - confirm against its signature.
    df, dict_categories = helper_ds.remove_categories(df, target=target, show=False)

    # Alone passenger
    df["Alone"] = ((df["SibSp"] + df["Parch"]) == 0).astype(int)

    return df, dict_categories


df, dict_categories = enhance_features(df)

#### Remove irrelevant features

In [None]:
def drop_irrelevant_features(df, inplace=False):
    """Remove the columns PassengerId, Name and Ticket from dataframe df.

    With inplace=False (default) df is left untouched and a modified copy is
    returned. With inplace=True df itself is modified and None is returned.
    """

    frame = df if inplace else df.copy()
    frame.drop(["PassengerId", "Name", "Ticket"], axis="columns", inplace=True)

    return None if inplace else frame


drop_irrelevant_features(df, inplace=True)

#### Classify variables

Convert categorical variables to dtype 'category' and sort the columns: numerical + categorical + target

In [None]:
df = helper_ds.classify_data(df, target, numerical=["Age", "SibSp", "Parch", "Fare"])

pd.DataFrame(dict(df.dtypes), index=["Type"])[df.columns].head()  # show data types

### Visualize the data

#### Categorical features

In [None]:
helper_ds.show_categorical(df, target=target, sharey=True)

#### Target vs Categorical features

In [None]:
helper_ds.show_target_vs_categorical(df, target)
plt.ylim([0, 1]);

#### Numerical features

In [None]:
helper_ds.show_numerical(df, kde=True)

#### Target vs numerical features

In [None]:
helper_ds.show_target_vs_numerical(df, target, jitter=0.2)
plt.ylim([-0.4, 1.4])
plt.yticks([0, 1]);
# df.groupby('Survived')['Age'].hist(alpha=0.4)
# helper_ds.show_target_vs_numerical(df_3sigma, target, numerical, jitter=0.2)

#### Correlation between numerical features and target

In [None]:
helper_ds.correlation(df, target)

#### Most relevant features

In [None]:
sns.FacetGrid(df, row="Sex", col="Pclass", hue="Survived", size=3, margin_titles=True).map(
    plt.hist, "Age", alpha=0.7
).add_legend()
plt.ylim([0, 70]);
# df[['Title', 'Survived']].groupby(['Title'], as_index=False).mean().sort_values(
#     by='Survived', ascending=False)
# helper_ds.show_target_vs_categorical(df.loc[(df['Age']<12) | (df['Sex']=='female')],
#                                   target, categorical)

-  Unlike in third class, most children and women in first and second classes survived.

### Fill missing values

In [None]:
helper_ds.missing(df)

In [None]:
plt.figure(figsize=(7, 3))
sns.countplot(data=df, x="Pclass", hue="Cabin");

In [None]:
helper_ds.show_target_vs_categorical(df, ["Age"], figsize=(17, 2))  # Age vs categorical

In [None]:
def _first_mode(values):
    """Return the most frequent non-null value of a Series, or NaN if it has none."""
    modes = values.mode()
    return modes.iloc[0] if len(modes) else np.nan


def fill_missing_values(df, inplace=False):
    """Fill missing values of the dataframe df.

    Rules, applied in this order:
      1. Embarked: the overall mode of the column.
      2. Cabin: the mode of the row's (Pclass, Embarked) group.
      3. Age: the median of the row's (Pclass, Title) group.
      4. Any remaining NaN in a 'category' column: that column's mode.
      5. Any remaining NaN in a numeric column: that column's median.

    Requires the columns Embarked, Cabin, Pclass, Title and Age.

    With inplace=False (default) df is left untouched and the filled copy is
    returned. With inplace=True df itself is modified and None is returned.
    """

    if not inplace:
        df = df.copy()

    # Columns are reassigned (df[col] = df[col].fillna(...)) rather than using
    # the chained df[col].fillna(..., inplace=True): the chained form acts on
    # an intermediate object and does not reliably update df in recent pandas.

    # fill Embarked with mode
    embarked_modes = df["Embarked"].mode()
    if len(embarked_modes):
        df["Embarked"] = df["Embarked"].fillna(embarked_modes.iloc[0])

    # fill Cabin: the mode for grouped Pclass and Embarked.
    # The whole row-aligned Series is used so every row receives the mode of
    # its own group (previously only the first row's group mode was used for
    # all rows). Groups without any known Cabin yield NaN and are left to the
    # generic fallbacks below.
    cabin_by_group = df.groupby(["Pclass", "Embarked"])["Cabin"].transform(_first_mode)
    df["Cabin"] = df["Cabin"].fillna(cabin_by_group)

    # fill Age: the median for grouped Pclass and Title
    age_by_group = df.groupby(["Pclass", "Title"])["Age"].transform("median")
    df["Age"] = df["Age"].fillna(age_by_group)

    # Title has no dedicated rule; when it is a 'category' column the generic
    # mode fallback below covers it.

    # fill missing categorical values with the mode (if any)
    for column in list(df.select_dtypes(include=["category"])):
        column_modes = df[column].mode()
        if len(column_modes):
            df[column] = df[column].fillna(column_modes.iloc[0])

    # fill missing numeric NaN values with the median (if any);
    # numeric_only=True keeps the median from being attempted on non-numeric
    # columns, which raises in recent pandas versions
    df.fillna(df.median(numeric_only=True), inplace=True)

    if not inplace:
        return df


# bins = list(range(0,80,10))
# # bins = (0, 5, 10, 15, 20, 30, 40, 50, 60)
# labels = ["{}-{}".format(i, j) for i,j in zip(bins[:-1],bins[:-1])]
# df['Age_cat'] = pd.cut(df['Age'], bins, labels=labels).astype('category')
# df = df.drop(['Age'], axis='columns')

fill_missing_values(df, inplace=True)

## 2. Neural Network model

### Select the features

In [None]:
droplist = []  # features to drop from the model

# For the model 'data' instead of 'df'.
# The features are dropped from the model copy 'data' (not from 'df'): 'df'
# stays the reference for the column names and dtypes used by the prediction
# cells further down.
# NOTE(review): with a non-empty droplist, confirm that helper_ds.scale and
# helper_ds.replace_by_dummies accept the extra columns at prediction time.
data = df.drop(droplist, axis="columns")
data.head(3)

### Scale numerical variables

Shift and scale numerical variables to a standard normal distribution. The scaling factors are saved to be used for predictions.

In [None]:
data, scale_param = helper_ds.scale(data)

### Create dummy features

Replace categorical features (excluding the target) with dummy features

In [None]:
data, dict_dummies = helper_ds.replace_by_dummies(data, target)

model_features = [f for f in data if f not in target]  # sorted neural network inputs

data.head(3)

### Split the data into training and test sets
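Note on data leakage: the test set is hidden while training the model, but it has already been seen during preprocessing (missing values were filled and the numerical variables were scaled on the full dataset before this split).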

In [None]:
from sklearn.model_selection import train_test_split


def split(data, target, test_size=0.15):
    """Split data into train and test arrays, stratified on the target.

    data: dataframe holding both the feature columns and the target column(s)
    target: list with the target column name(s)
    test_size: value forwarded to train_test_split (default 0.15)

    Returns x_train, y_train, x_test, y_test as arrays (the `.values` of the
    feature and target columns). The split uses a fixed random_state of 9.
    """

    train_part, test_part = train_test_split(
        data, test_size=test_size, random_state=9, stratify=data[target]
    )

    def _features_and_target(part):
        # x = every column except the target, y = the target column(s)
        return part.drop(target, axis=1).values, part[target].values

    x_train, y_train = _features_and_target(train_part)
    x_test, y_test = _features_and_target(test_part)

    return x_train, y_train, x_test, y_test


x_train, y_train, x_test, y_test = split(data, target, test_size=0.2)

### One-hot encode the output

In [None]:
def one_hot_output(y_train, y_test):
    """One-hot encode the train and test targets with keras.utils.to_categorical.

    The number of classes is the number of distinct values found in y_train
    and is used for both arrays. Returns (y_train, y_test) encoded.
    """

    num_classes = np.unique(y_train).size
    encoded_train = keras.utils.to_categorical(y_train, num_classes)
    encoded_test = keras.utils.to_categorical(y_test, num_classes)

    return encoded_train, encoded_test


y_train, y_test = one_hot_output(y_train, y_test)

print("train size \t X:{} \t Y:{}".format(x_train.shape, y_train.shape))
print("test size  \t X:{} \t Y:{} ".format(x_test.shape, y_test.shape))

### Build the Neural Network for Binary Classification

In [None]:
from keras.models import Sequential
from keras.layers.core import Dense, Dropout


def build_nn(input_size, output_size, summary=False):
    """Build and compile the classification network.

    Architecture: Dense(input_size units, relu) -> LeakyReLU -> Dropout(0.3)
    -> Dense(output_size units, softmax).

    input_size: number of input features (also used as the hidden layer width)
    output_size: number of output units
    summary: when truthy, print the Keras model summary

    Returns the compiled Sequential model (loss "binary_crossentropy",
    optimizer "adam", metric "accuracy").
    """

    input_nodes = input_size  # the hidden layer has as many units as inputs
    # a single initializer instance, passed to both Dense layers
    weights = keras.initializers.RandomNormal(stddev=0.001)
    leaky_relu = keras.layers.advanced_activations.LeakyReLU(alpha=0.01)

    model = Sequential()
    model.add(
        Dense(
            input_nodes,
            input_dim=input_size,
            kernel_initializer=weights,
            activation="relu",
            bias_initializer="zero",
        )
    )
    # NOTE(review): this layer receives the output of a relu-activated Dense,
    # which is never negative, so the leaky slope (alpha) cannot take effect
    # here. Presumably either activation="relu" above or this layer was meant
    # to be removed - confirm the intended activation.
    model.add(leaky_relu)

    model.add(Dropout(0.3))

    # output layer: one softmax unit per class of the one-hot encoded target
    model.add(
        Dense(
            output_size,
            activation="softmax",
            kernel_initializer=weights,
            bias_initializer="zero",
        )
    )

    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

    if summary:
        model.summary()

    return model


model = build_nn(x_train.shape[1], y_train.shape[1], summary=True)

### Train the Neural Network

In [None]:
from time import time

model_path = os.path.join("models", "titanic.h5")


def train_nn(model, x_train, y_train, validation_data=None, path=False, show=True):
    """
    Fit the neural network model and return the Keras training history.

    Training runs for up to 1000 epochs (batch size 64) with early stopping on
    "val_loss" (patience 1). If no validation_data is provided, a 25% split of
    the training data is used for validation.

    path: when truthy, the trained model is saved to this location
    show: when truthy, print progress and elapsed time and plot the training
          curves with helper_ds.show_training
    """

    if show:
        print("Training ....")

    early_stopping = keras.callbacks.EarlyStopping(monitor="val_loss", patience=1, verbose=0)
    fit_options = dict(
        epochs=1000,
        batch_size=64,
        verbose=0,
        validation_split=0.25,
        validation_data=validation_data,
        callbacks=[early_stopping],
    )

    started = time()
    history = model.fit(x_train, y_train, **fit_options)

    if show:
        print("time: \t {:.1f} s".format(time() - started))
        helper_ds.show_training(history)

    if path:
        model.save(path)
        print("\nModel saved at", path)

    return history


model = None
model = build_nn(x_train.shape[1], y_train.shape[1], summary=False)
train_nn(model, x_train, y_train, path=model_path);

### Train with Cross Validation

In [None]:
from sklearn.model_selection import StratifiedKFold


def cv_train_nn(x_train, y_train, n_splits):
    """Create and train one model per fold for cross validation. Return best model.

    x_train: 2-D array of features
    y_train: 2-D array of targets; its first column is used as the label for
             stratification (presumably one-hot encoded - confirm with caller)
    n_splits: number of stratified, shuffled folds

    For every fold a fresh model is built with build_nn and trained with
    train_nn, using the held-out part of the fold as validation data. The
    validation accuracy of the last trained epoch is the fold score. The mean
    score is printed and the model of the best-scoring fold is returned.
    """

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True)

    scores = []

    best_model = None
    # start below any possible accuracy so that a model is always selected,
    # even if every fold scores 0
    best_acc = float("-inf")

    print("Training {} models for Cross Validation ...".format(n_splits))

    # only the row indices are needed from the splitter, so a single feature
    # column is passed together with the label column
    for train, val in skf.split(x_train[:, 0], y_train[:, 0]):
        model = build_nn(x_train.shape[1], y_train.shape[1], summary=False)
        history = train_nn(
            model,
            x_train[train],
            y_train[train],
            show=False,
            validation_data=(x_train[val], y_train[val]),
        )

        # the validation accuracy is logged as "val_acc" by older Keras
        # releases and as "val_accuracy" by newer ones
        logs = history.history
        val_acc = logs["val_acc" if "val_acc" in logs else "val_accuracy"][-1]

        scores.append(val_acc)

        if val_acc > best_acc:  # save best model (fold) for evaluation and predictions
            best_model = model
            best_acc = val_acc

    print("\nCross Validation accuracy: {:.3f}".format(np.mean(scores)))

    return best_model


model = cv_train_nn(x_train, y_train, 4)

### Evaluate the model

In [None]:
def evaluate_nn(model, x_test, y_test):
    """Print the test accuracy: the second value returned by model.evaluate."""
    metrics = model.evaluate(x_test, y_test, verbose=0)
    accuracy = metrics[1]
    print("Test Accuracy: {:.3f}".format(accuracy))


# model = keras.models.load_model(model_path)
evaluate_nn(model, x_test, y_test)

In [None]:
y_pred = model.predict(x_test, verbose=2)
helper_ds.binary_classification_scores(y_test[:, 1], y_pred[:, 1], return_dataframe=True, index="Neural Network")

### Make predictions

In [None]:
def predict_manual(new_df):
    """
    Predict with the trained model for the rows of a custom dataframe.

    input: custom dataframe whose columns also exist in the notebook dataframe df
    output: column 1 of model.predict for each row

    Relies on notebook-level state: df, scale_param, dict_dummies,
    model_features, target and model.
    """

    prepared = new_df.copy()

    # cast every column to the dtype it has in the reference dataframe df
    for name in prepared:
        prepared[name] = prepared[name].astype(df.dtypes[name])

    # standardize numerical variables with the scaling factors saved earlier
    prepared, _ = helper_ds.scale(prepared, scale_param)

    # replace categorical features by dummy variables (using existing dummies)
    prepared, _ = helper_ds.replace_by_dummies(prepared, target, dict_dummies)

    # model_features: sorted list of inputs used in the model
    model_inputs = prepared[model_features].values

    return model.predict(model_inputs)[:, 1]


#     for index, row in new_data.iterrows():
#         single_pred = model.predict(np.array([row]))
#         print('{}:\t {:.0f}%'.format(index,single_pred[0,1] * 100))

In [None]:
# input data format
df.describe()

In [None]:
df.describe(include=["category"])

In [None]:
print(list(df))

In [None]:
# Hand-made passengers. Each list is positional: its values must follow the
# column order of df without the target (the order printed by `print(list(df))`).
# NOTE(review): presumably Age, SibSp, Parch, Fare, Pclass, Sex, Cabin, Embarked,
# Title, Alone - confirm against the printed column list before editing.
new_passengers = {
    "Average man": [26, 1, 0, 14, 2, "male", "C", "S", "Mr", 0],
    "Average woman": [26, 1, 0, 14, 2, "female", "C", "S", "Mrs", 0],
    "Alone woman 3c": [26, 0, 2, 8, 3, "female", "C", "S", "Miss", 1],
    "Boy 1c ": [7, 0, 2, 31, 1, "male", "C", "S", "Master", 0],
    "Boy 2c ": [7, 0, 2, 14, 2, "male", "C", "S", "Master", 0],
    "Boy 3c ": [7, 0, 2, 8, 3, "male", "C", "S", "Master", 0],
}

# create a dataframe with the new data
# (one row per passenger, indexed by the dictionary keys)
new_df = pd.DataFrame(
    data=list(new_passengers.values()),
    index=new_passengers.keys(),
    columns=[f for f in list(df) if f not in target],
)

prediction = predict_manual(new_df)
# percentage as an integer; astype(int) truncates the decimals instead of rounding
new_df["Survival prob. (%)"] = (prediction * 100).astype(int)
new_df

The results predicted by the model confirm the impact of sex on the survival probability, as well as the impact of class on the survival of women and children.

### Compare with non-enhanced features

In [None]:
# Same dataset without:
#   enhancing features
#   adding new features
#   filling missing values using grouped median


def non_enhanced_pipeline(df):
    """Train and evaluate the network on a minimally processed copy of df.

    Baseline for comparison: no engineered features, Cabin dropped, and
    missing numerical values filled with the plain column median. The test
    accuracy is printed by evaluate_nn; nothing is returned and df is not
    modified.
    """

    df = df.copy()

    # select features & classify features
    df.drop(["PassengerId", "Name", "Ticket", "Cabin"], axis="columns", inplace=True)
    df = helper_ds.classify_data(df, target, numerical=["Age", "SibSp", "Parch", "Fare"])

    # fill NaN of the numerical columns with their median;
    # numeric_only=True keeps the median from being attempted on non-numeric
    # columns, which raises in recent pandas versions
    df.fillna(df.median(numeric_only=True), inplace=True)

    # standardize and create dummies
    data, _ = helper_ds.scale(df)
    data, _ = helper_ds.replace_by_dummies(data, target)

    # split and one-hot output
    x_train, y_train, x_test, y_test = split(data, target, test_size=0.15)
    y_train, y_test = one_hot_output(y_train, y_test)

    # build, train and evaluate model
    model = build_nn(x_train.shape[1], y_train.shape[1], summary=False)
    train_nn(model, x_train, y_train, path=False, show=False)
    evaluate_nn(model, x_test, y_test)


non_enhanced_pipeline(df_original)

### Compare removing outliers

In [None]:
def remove_outliers_peline(df):
    """Run the enhanced pipeline with an extra outlier-removal step.

    Works on a copy of df: enhance features, drop the irrelevant columns,
    classify the variables, remove outliers, fill missing values, scale,
    create dummies, split, then build, train and evaluate a new model.
    The test accuracy is printed by evaluate_nn; nothing is returned.
    """

    # transform features (the returned category dictionary is not needed here)
    features, _ = enhance_features(df.copy())

    # select features & classify features
    features.drop(["PassengerId", "Name", "Ticket"], axis="columns", inplace=True)
    numerical_columns = ["Age", "SibSp", "Parch", "Fare"]
    features = helper_ds.classify_data(features, target, numerical=numerical_columns)

    # remove outliers with the helper's default settings
    # (presumably values above 3 times std - confirm in helper_ds)
    helper_ds.remove_outliers(features, inplace=True)

    # fill missing values (enhanced)
    fill_missing_values(features, inplace=True)

    # standardize and create dummies
    scaled, _ = helper_ds.scale(features)
    model_data, _ = helper_ds.replace_by_dummies(scaled, target)

    # split and one-hot output
    x_train, y_train, x_test, y_test = split(model_data, target, test_size=0.15)
    y_train, y_test = one_hot_output(y_train, y_test)

    # build, train and evaluate model
    n_inputs, n_outputs = x_train.shape[1], y_train.shape[1]
    network = build_nn(n_inputs, n_outputs, summary=False)
    train_nn(network, x_train, y_train, path=False, show=False)
    evaluate_nn(network, x_test, y_test)


remove_outliers_peline(df_original)

### Compare with non-neural network models

In [None]:
# enhanced features
helper_ds.ml_classification(x_train, y_train[:, 1], x_test, y_test[:, 1])

In [None]:
from sklearn.ensemble import RandomForestClassifier

clf_random_forest = RandomForestClassifier(
    n_estimators=30, max_depth=13, class_weight="balanced", n_jobs=-1, random_state=0
).fit(x_train, np.ravel(y_train[:, 1]))

####  Best tree-based model

In [None]:
y_pred = clf_random_forest.predict(x_test).reshape([-1, 1])
helper_ds.binary_classification_scores(y_test[:, 1], y_pred, return_dataframe=True, index="Random Forest")

#### Feature importance

In [None]:
re = helper_ds.feature_importance(model_features, clf_random_forest)