# Titanic: Machine Learning from Disaster

<img src='https://www.rd.com/wp-content/uploads/2019/08/shutterstock_242291458-1-760x506.jpg'>

In [1]:
# Imports
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from pathlib import Path
import sklearn
import pandas as pd
import datetime as dt

# Project layout: every folder is resolved relative to the current working directory
PROJECT_ROOT_DIR = Path.cwd()
DATASETS_DIR = PROJECT_ROOT_DIR.joinpath('datasets')
DATASETS_TITANIC_DIR = DATASETS_DIR.joinpath('titanic')
SUBMISSIONS_DIR = PROJECT_ROOT_DIR.joinpath('submissions')
IMAGES_DIR = PROJECT_ROOT_DIR.joinpath('images')

# Pretty plots: larger axis labels and tick labels for every figure in the notebook
mpl.rcParams.update({
    'axes.labelsize': 14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
})

# Helper functions
def save_fig(fig_id, tight_layout=True, fig_extension='png', resolution=300):
    """Save the current matplotlib figure as ``IMAGES_DIR/<fig_id>.<fig_extension>``.

    Parameters
    ----------
    fig_id : str
        Base name of the output file (without extension).
    tight_layout : bool, default True
        If true, call ``plt.tight_layout()`` before saving.
    fig_extension : str, default 'png'
        File extension, also passed to ``savefig`` as the output format.
    resolution : int, default 300
        Passed to ``savefig`` as ``dpi``.
    """
    path = IMAGES_DIR / (fig_id + '.' + fig_extension)
    # savefig does not create missing folders, so make sure the target exists first
    path.parent.mkdir(parents=True, exist_ok=True)
    print('Saving figure', fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

# Load Data

In [2]:
# Locations of the train and test CSV files inside the titanic dataset folder
TRAIN_DATA_PATH = DATASETS_TITANIC_DIR.joinpath('train.csv')
TEST_DATA_PATH = DATASETS_TITANIC_DIR.joinpath('test.csv')

# Read both splits into DataFrames (train first, then test)
titanic_train, titanic_test = (
    pd.read_csv(csv_path) for csv_path in (TRAIN_DATA_PATH, TEST_DATA_PATH)
)

# Data Exploration & Visualization

In [3]:
# Profile the raw training data, i.e. before any preprocessing has been applied
from pandas_profiling import ProfileReport

titanic_train_report = ProfileReport(
    titanic_train,
    title='Titanic Training Dataset Exploration Report',
)
# Render the report as interactive notebook widgets
titanic_train_report.to_widgets()

  import pandas.util.testing as tm


HBox(children=(FloatProgress(value=0.0, description='Summarize dataset', max=26.0, style=ProgressStyle(descrip…




HBox(children=(FloatProgress(value=0.0, description='Generate report structure', max=1.0, style=ProgressStyle(…




HBox(children=(FloatProgress(value=0.0, description='Render widgets', max=1.0, style=ProgressStyle(description…

VBox(children=(Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(valu…

# Data Preprocessing

In [39]:
# Separate the target from the features; neither result shares data with titanic_train
titanic_train_labels = titanic_train['Survived'].copy()  # 'Survived' column only
titanic_train_no_labels = titanic_train.drop(columns=['Survived'])  # everything except 'Survived'

In [40]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# numerical feature pipeline (Pclass, Age, SibSp, Parch, Fare)
# missing values are replaced by the column mean, then every column is standardized
features_num = np.array(['Pclass', 'Age', 'SibSp', 'Parch', 'Fare'])
pipeline_num = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('std_scaler', StandardScaler()),
])

# 'Sex' feature pipeline
# the category is ordinal-encoded and then standardized
# NOTE(review): there is no imputer in this pipeline, so it presumably relies on
# 'Sex' having no missing values in either train or test data -- confirm
features_sex = np.array(['Sex'])
pipeline_sex = Pipeline([
    ('ord_enc', OrdinalEncoder()),
    ('std_scaler', StandardScaler()),
])

# 'Embarked' feature pipeline
# missing values are replaced by the most frequent value, the result is one-hot
# encoded into a dense array, and each indicator column is then standardized
# NOTE(review): newer scikit-learn releases rename the `sparse` argument to
# `sparse_output` -- confirm against the installed version before upgrading
features_embarked = np.array(['Embarked'])
pipeline_embarked = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('one_hot_enc', OneHotEncoder(sparse=False)),
    ('std_scaler', StandardScaler()),
])

# full data preprocessing pipeline
# each sub-pipeline is applied only to the columns named next to it; the transformer
# names ('num', 'sex', 'embarked') and step names are looked up again later when the
# output column names are rebuilt, so they must stay in sync with that code
data_preprocessing_pipeline = ColumnTransformer([
    ('num', pipeline_num, features_num),
    ('sex', pipeline_sex, features_sex),
    ('embarked', pipeline_embarked, features_embarked),
])

# fit data preprocessing pipeline to titanic training data without labels
# and transform titanic training data without labels
# (the fitted pipeline is reused later to transform the test data without re-fitting)
titanic_train_no_labels_preprocessed = data_preprocessing_pipeline.fit_transform(titanic_train_no_labels)

In [41]:
# rebuild pandas dataframe for preprocessed titanic train data without labels

# one-hot column names are '<feature>_<category>', e.g. 'Embarked_C'
embarked_ohe_categories = data_preprocessing_pipeline.named_transformers_['embarked'].named_steps['one_hot_enc'].categories_[0]
embarked_ohe_categories = features_embarked[0] + '_' + embarked_ohe_categories

# column order must follow the transformer order of the ColumnTransformer: num, sex, embarked
column_names_after_preprocessing = np.concatenate((features_num, features_sex, embarked_ohe_categories))

# guard against the names drifting out of sync with the pipeline output
assert len(column_names_after_preprocessing) == titanic_train_no_labels_preprocessed.shape[1], \
    'column names do not match the number of preprocessed features'

# reuse the index of the source frame so that later index-aligned operations
# (e.g. concatenating the labels column-wise) pair every row with its own label
# instead of relying on both frames happening to share a default RangeIndex
titanic_train_no_labels_preprocessed_df = pd.DataFrame(
    titanic_train_no_labels_preprocessed,
    columns=column_names_after_preprocessing,
    index=titanic_train_no_labels.index,
)
titanic_train_no_labels_preprocessed_df.describe()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex,Embarked_C,Embarked_Q,Embarked_S
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,-2.031048e-16,2.562796e-16,3.456519e-16,6.716164e-17,-4.3736060000000004e-17,-4.059603e-16,-1.738851e-16,-4.017238e-16,-3.628473e-16
std,1.000562,1.000562,1.000562,1.000562,1.000562,1.000562,1.000562,1.000562,1.000562
min,-1.566107,-2.253155,-0.4745452,-0.4736736,-0.6484217,-1.355574,-0.4820427,-0.3075623,-1.623803
25%,-0.3693648,-0.5924806,-0.4745452,-0.4736736,-0.4891482,-1.355574,-0.4820427,-0.3075623,-1.623803
50%,0.8273772,0.0,-0.4745452,-0.4736736,-0.3573909,0.7376951,-0.4820427,-0.3075623,0.6158384
75%,0.8273772,0.407926,0.4327934,-0.4736736,-0.02424635,0.7376951,-0.4820427,-0.3075623,0.6158384
max,0.8273772,3.870872,6.784163,6.974147,9.667167,0.7376951,2.074505,3.251373,0.6158384


In [42]:
# Put the 'Survived' labels back next to the preprocessed features (labels become the last column)
titanic_train_after_processing = pd.concat(
    [titanic_train_no_labels_preprocessed_df, titanic_train_labels.to_frame()],
    axis='columns',
)

# Profile the preprocessed training data and render the report as notebook widgets
titanic_train_report_after_preprocessing = ProfileReport(
    titanic_train_after_processing,
    title='Titanic Training Dataset Exploration Report After Preprocessing',
)
titanic_train_report_after_preprocessing.to_widgets()

HBox(children=(FloatProgress(value=0.0, description='Summarize dataset', max=24.0, style=ProgressStyle(descrip…




HBox(children=(FloatProgress(value=0.0, description='Generate report structure', max=1.0, style=ProgressStyle(…




HBox(children=(FloatProgress(value=0.0, description='Render widgets', max=1.0, style=ProgressStyle(description…

VBox(children=(Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(valu…

# Train Linear Regression and Logistic Regression Models

In [81]:
from sklearn.linear_model import LinearRegression, LogisticRegression

# Map each estimator's class name (e.g. 'LinearRegression') to the estimator instance
models = {
    type(estimator).__name__: estimator
    for estimator in (LinearRegression(), LogisticRegression(solver='lbfgs'))
}

# Fit every model on the preprocessed training features and the survival labels
for name, model in models.items():
    model.fit(titanic_train_no_labels_preprocessed, titanic_train_labels)
    print('Fit ' + name + ' model')

Fit LinearRegression model
Fit LogisticRegression model


In [82]:
# Score every fitted model on the same data it was trained on (training-set accuracy)

true_labels = titanic_train_labels.to_numpy()
n_predictions = len(titanic_train_no_labels_preprocessed)

for name, model in models.items():
    raw_predictions = model.predict(titanic_train_no_labels_preprocessed)

    # Binarise at the 0.5 threshold: above -> 1, at or below -> 0.
    # Both masks are taken from the untouched raw output before anything is overwritten.
    above_threshold = raw_predictions > 0.5
    at_or_below_threshold = raw_predictions <= 0.5
    predictions = raw_predictions.copy()
    predictions[at_or_below_threshold] = 0
    predictions[above_threshold] = 1

    # With 0/1 values each wrong prediction adds exactly 1 to the absolute error
    abs_error = np.abs(predictions - true_labels).sum()
    mean_abs_error = abs_error / n_predictions
    percent_correct = np.round((1 - mean_abs_error) * 100, 2)

    report_lines = (
        ('Model:', name),
        ('Number of Predictions:', n_predictions),
        ('First five raw predictions:', raw_predictions[:5]),
        ('First five actual predictions:', predictions[:5]),
        ('Absolute Error:', abs_error),
        ('Mean Absolute Error:', mean_abs_error),
        ('Percent Correct:', str(percent_correct) + '%'),
    )
    for label, value in report_lines:
        print(label, value)
    print('\n')

Model: LinearRegression
Number of Predictions: 891
First five raw predictions: [0.09375837 0.92878034 0.61710431 0.87741925 0.05942609]
First five actual predictions: [0. 1. 1. 1. 0.]
Absolute Error: 180.0
Mean Absolute Error: 0.20202020202020202
Percent Correct: 79.8%


Model: LogisticRegression
Number of Predictions: 891
First five raw predictions: [0 1 1 1 0]
First five actual predictions: [0 1 1 1 0]
Absolute Error: 178
Mean Absolute Error: 0.19977553310886645
Percent Correct: 80.02%




# Make Predictions for Test Data

In [83]:
# Reuse the pipeline fitted on the training data: transform only, never re-fit on test data
titanic_test_preprocessed = data_preprocessing_pipeline.transform(titanic_test)

# model name -> int64 array of 0/1 predictions for the test set
model_test_predictions = {}

for name, model in models.items():
    titanic_test_predictions = model.predict(titanic_test_preprocessed)

    # 0.5 is the survival threshold: above -> survived (1), at or below -> died (0)
    survived_mask = titanic_test_predictions > 0.5
    died_mask = titanic_test_predictions <= 0.5
    titanic_test_predictions[survived_mask] = 1
    titanic_test_predictions[died_mask] = 0

    titanic_test_predictions = titanic_test_predictions.astype('int64')
    model_test_predictions[name] = titanic_test_predictions

In [84]:
# generate submission: one CSV per model with the columns PassengerId and Survived

# to_csv does not create missing folders, so make sure the output directory exists
SUBMISSIONS_DIR.mkdir(parents=True, exist_ok=True)

# take the timestamp once so every file written by this run carries the same suffix
submission_timestamp = dt.datetime.now().strftime('%Y%m%d_%H%M%S')

for name, predictions in model_test_predictions.items():
    submission_filename = 'submission_' + name + '_' + submission_timestamp + '.csv'
    submission_path = SUBMISSIONS_DIR / submission_filename
    # copy so that adding the 'Survived' column never touches titanic_test itself
    submission_df = titanic_test[['PassengerId']].copy()
    submission_df['Survived'] = predictions
    submission_df.to_csv(submission_path, index=False)
    print('Wrote predictions to submission file:', submission_path)

Wrote predictions to submission file: /Users/Ben/code/github/titanic_machine_learning_from_disaster/submissions/submission_LinearRegression_20200526_142926.csv
Wrote predictions to submission file: /Users/Ben/code/github/titanic_machine_learning_from_disaster/submissions/submission_LogisticRegression_20200526_142926.csv
