# Import software libraries and load the dataset #

In [None]:
import sys                             # Read system parameters.
import os                              # Interact with the operating system.
import numpy as np                     # Work with multi-dimensional arrays and matrices.
import pandas as pd                    # Manipulate and analyze data.
import matplotlib                      # Create 2D charts.
import matplotlib.pyplot as plt
import sklearn                         # Perform data mining and analysis.
from time import time                  # Calculate training time.

# Report the interpreter and library versions this notebook is running against.
print('Libraries used in this project:')
library_versions = (
    ('- Python {}', sys.version),
    ('- NumPy {}', np.__version__),
    ('- pandas {}', pd.__version__),
    ('- Matplotlib {}', matplotlib.__version__),
    ('- scikit-learn {}\n', sklearn.__version__),
)
for line_template, version in library_versions:
    print(line_template.format(version))

# Locate the data directory, list what it contains, and read the training CSV.
PROJECT_ROOT_DIR = "."
DATA_PATH = os.path.join(PROJECT_ROOT_DIR, "titanic_data")
data_files = os.listdir(DATA_PATH)
print('Data files in this project:', data_files)

data_raw_file = os.path.join(DATA_PATH, 'train.csv')
data_raw = pd.read_csv(data_raw_file)
record_count = len(data_raw)
print('Loaded {} records from {}.'.format(record_count, data_raw_file))

# Get acquainted with the dataset #

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
data_raw.info()             # View data types and see if there are missing entries.
data_raw.head(10)           # View first 10 records.

# Examine a general summary of statistics #

In [None]:
# Temporarily render floats with two decimals while the summary is printed.
# The option name is spelled out in full: pandas resolves abbreviated names by
# pattern matching, which stops working if another matching option is added.
with pd.option_context('display.float_format', '{:.2f}'.format):
    print(data_raw.describe())

# Use stacked bar visualization to show survival numbers

In [None]:
feature_list = ['Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked']

# One stacked bar chart per feature: one bar for each 'Survived' value, split
# into segments by the categories of the feature.
for feature in feature_list:
    group_sizes = data_raw.groupby([feature, 'Survived']).size()
    # reset_index() leaves the unnamed size column labelled 0, which pivot() reads.
    plot_set = group_sizes.reset_index().pivot(index='Survived', columns=feature, values=0)
    chart = plot_set.plot(kind='bar', stacked=True, rot=0, figsize=(20,3))
    chart.set_title(f'Survival Rates by {feature}');

# Look for relationships between survival, age, and sex

In [None]:
survived = 'Survived'
perished = 'Perished'

# Oldest age shown on the charts; both charts share the 0..MAX_AGE axis.
MAX_AGE = 80


def _survival_counts_by_age(passengers):
    """Count passengers per whole-year age and 'Survived' value.

    Returns a frame with one row for every age from 0 to MAX_AGE (ages with no
    passengers become all-NaN rows) and one column per 'Survived' value.
    Because a bar chart places bars at positions 0, 1, 2, ..., having a row for
    every age makes each bar's x position equal to the age it represents.
    """
    counts = passengers.groupby(['Survived', 'Age']).size().reset_index()
    counts = counts.pivot(columns='Survived', index='Age', values=0)
    # groupby() drops rows whose Age is missing, so the index has no NaN here
    # and the rounded ages can be converted to integers.
    counts.index = counts.index.astype(int)
    return counts.reindex(range(MAX_AGE + 1))


# round(0) turns fractional ages into whole years so they can be grouped.
men = data_raw[data_raw['Sex']=='male'].round(0)
women = data_raw[data_raw['Sex']=='female'].round(0)

women_plot = _survival_counts_by_age(women)
ax = women_plot.plot(kind='bar', stacked=True, figsize=(20,4), title='Female Survival by Age')
# Bars are centred on their position, so pad by half a bar to avoid clipping
# the first and last ones.
ax.set_xlim(-0.5, MAX_AGE + 0.5)
ax.set_ylim(0, 20)

men_plot = _survival_counts_by_age(men)
ax = men_plot.plot(kind='bar', stacked=True, figsize=(20,4), title='Male Survival by Age')
ax.set_xlim(-0.5, MAX_AGE + 0.5)
ax.set_ylim(0, 20)

# Identify columns with missing values

In [None]:
# Count the null entries in each column and print the per-column totals.
missing_per_column = data_raw.isnull().sum()
print('Number of missing values:\n{}\n'.format(missing_per_column))

# Split the datasets

In [None]:
# Separate training and test files already exist, so only a validation set
# needs to be carved out of the training data here.

from sklearn.model_selection import train_test_split

# 'Survived' is the value to be predicted; it is kept out of the feature
# columns and held in its own single-column DataFrame of labels.
label_columns = ['Survived']

training_columns = ['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']

features = data_raw[training_columns]
labels = data_raw[label_columns]

# Split features and labels together; the fixed random_state makes the split
# repeatable from run to run.
X_train, X_val, y_train, y_val = train_test_split(features, labels, random_state=1912)

# Show the (rows, columns) shape of the original data next to each split.
print(f'Original set:        {data_raw.shape}')
print('------------------------------')
print(f'Training features:   {X_train.shape}')
print(f'Validation features: {X_val.shape}')
print(f'Training labels:     {y_train.shape}')
print(f'Validation labels:   {y_val.shape}')

# Identify columns that should be modified or deleted from the training set

In [None]:
# Preview the first rows of the training features (the columns are still raw at this point).
X_train.head()

# Determine how to handle ticket values

In [None]:
# List every distinct ticket value in sorted order.
sorted_tickets = data_raw['Ticket'].sort_values()
sorted_tickets.unique()

# Identify all personal titles and embarked port codes

In [None]:
# Take the piece of each name after the first ", " and, within that piece, the
# text before the first "." (presumably names read "Surname, Title. Given
# names" - verify against the data).
after_comma = data_raw['Name'].str.split(", ", expand=True)[1]
title = after_comma.str.split(".", expand=True)[0]
print('Titles: ', title.unique())

# Distinct port codes, including any missing value.
embarked_loc = data_raw['Embarked']
print('Embarked locations: ', embarked_loc.unique())

# Perform common preparation on the training and validation sets

In [None]:
# Perform common cleaning and feature engineering tasks on datasets.

# Title -> integer code shared by every dataset passed to prep_dataset().
# It is learned from the first dataset prepared after this cell runs (the
# training set in this notebook) and then reused, so a given title gets the
# same code in the training, validation, and test sets.
_TITLE_ENCODING = {}


def _most_frequent(series):
    """Return the most frequent non-null value in *series*, or None if there is none."""
    modes = series.mode()
    return None if modes.empty else modes.iloc[0]


def prep_dataset(dataset, title_encode=None):
    """Fill missing values, add engineered and encoded columns, and return *dataset*.

    The frame is modified in place and also returned. It must contain the
    columns 'Age', 'Fare', 'Embarked', 'SibSp', 'Parch', 'Name' and 'Sex'.
    Null counts are printed before and after preparation.

    Columns added: 'SizeOfFamily', 'Title', 'SexEncoding', 'EmbarkedEncoding'
    and 'TitleEncoding'.

    Args:
        dataset: DataFrame to prepare.
        title_encode: optional dict mapping a title string to its numeric code.
            When omitted, the mapping learned from the first dataset prepared
            is used, so all datasets share one encoding.

    Returns:
        The same DataFrame object that was passed in.
    """
    print('Before prep:\n\n{}\n'.format(dataset.isnull().sum()))

    # PROVIDE MISSING VALUES

    # The filled column is assigned back rather than using
    # dataset[col].fillna(..., inplace=True): that chained form acts on a
    # temporary object and leaves the frame unchanged under pandas
    # Copy-on-Write.

    # Fill missing Age values with the median age.
    dataset['Age'] = dataset['Age'].fillna(dataset['Age'].median())

    # Fill missing Fare values with the median fare.
    dataset['Fare'] = dataset['Fare'].fillna(dataset['Fare'].median())

    # Fill missing Embarked values with the mode (skipped if every value is missing).
    embarked_mode = _most_frequent(dataset['Embarked'])
    if embarked_mode is not None:
        dataset['Embarked'] = dataset['Embarked'].fillna(embarked_mode)

    # FEATURE ENGINEERING

    # Size of family: siblings/spouses + parents/children + the passenger.
    dataset['SizeOfFamily'] = dataset['SibSp'] + dataset['Parch'] + 1

    # Extract the title from the passenger's name: the text between the first
    # ", " and the following ".".
    title = dataset['Name'].str.split(", ", expand=True)[1].str.split(".", expand=True)[0]
    dataset['Title'] = title

    # CONVERT CATEGORICAL VALUES TO NUMERIC ENCODINGS

    if title_encode is None:
        if not _TITLE_ENCODING:
            # First dataset seen: number its titles 1, 2, 3, ... in order of appearance.
            _TITLE_ENCODING.update(
                {name: code for code, name in enumerate(title.dropna().unique(), start=1)}
            )
        title_encode = _TITLE_ENCODING

    sex_encode = {'female': 1, 'male': 2}
    embarked_encode = {'S': 1, 'C': 2, 'Q': 3}

    dataset['SexEncoding'] = dataset['Sex'].map(sex_encode)
    dataset['EmbarkedEncoding'] = dataset['Embarked'].map(embarked_encode)

    # Titles absent from the encoding map to NaN; give them the most common
    # code in this dataset.
    dataset['TitleEncoding'] = dataset['Title'].map(title_encode)
    title_mode = _most_frequent(dataset['TitleEncoding'])
    if title_mode is not None:
        dataset['TitleEncoding'] = dataset['TitleEncoding'].fillna(title_mode)

    print('After prep:\n\n{}\n'.format(dataset.isnull().sum()))

    return dataset

# prep_dataset() modifies the frame it is given, so each split is passed as a
# copy and the prepared result replaces the original variable.
print('---- TRAINING -----')
X_train = prep_dataset(X_train.copy())

print('---- VALIDATION -----')
X_val = prep_dataset(X_val.copy())

# Preview current training data

In [None]:
# Preview the training features after prep_dataset() has added its columns.
X_train.head()

# Drop columns that won't be used for training

In [None]:
# Remove the columns that the models are not trained on.
def drop_unused(dataset):
    """Return a new frame without the identifier, free-text, and pre-encoding columns.

    The column lists are printed before and after the drop. The frame passed
    in is left untouched. A KeyError is raised if an expected column is absent.
    """
    print('Columns before drop:\n\n{}\n'.format(list(dataset.columns)))

    not_used = ['PassengerId', 'Cabin', 'Ticket', 'Name']
    # These have been replaced with numeric codes.
    already_encoded = ['Title', 'Sex', 'Embarked']

    trimmed = dataset.drop(columns=not_used + already_encoded)

    print('Columns after drop:\n\n{}\n'.format(list(trimmed.columns)))
    return trimmed

# Replace each prepared split with its trimmed version.
print('---- TRAINING -----')
X_train = drop_unused(X_train.copy())

print('--- VALIDATION ----')
X_val = drop_unused(X_val.copy())

In [None]:
# Preview the training features that remain after the unused columns were dropped.
X_train.head()

# Create a logistic regression model

In [None]:
from sklearn.linear_model import LogisticRegression

# NOTE(review): scikit-learn documents fast convergence of the 'sag' solver
# only for features on roughly the same scale; the features are not scaled
# here - confirm the fit converges or consider scaling first.
log_reg = LogisticRegression(solver='sag', C = 0.05, max_iter = 10000)

# Time only the fit() call. np.ravel flattens the single-column label frame
# into a 1-D array.
start = time()
log_reg.fit(X_train, np.ravel(y_train))
end = time()
train_time = (end - start) * 1000   # time() is in seconds; convert to milliseconds.

# Score using the validation data.
score = log_reg.score(X_val, y_val)

print('Logistic regression model took {:.2f} milliseconds to fit.'.format(train_time))
print('Score on validation set: {:.0f}%'.format(score * 100))

In [None]:
# Use validation set to evaluate.
results_comparison = X_val.copy()
results_comparison['PredictedSurvival'] = log_reg.predict(X_val)
results_comparison['ActualSurvival'] = y_val.copy()

# Run predict_proba once and reuse it for both probability columns, expressed
# as percentages rounded to two decimals. Column 0 is read as "perished" and
# column 1 as "survived" (assumes labels are coded 0 and 1 - verify against
# log_reg.classes_).
val_proba_pct = np.round(log_reg.predict_proba(X_val) * 100, 2)
results_comparison['ProbPerished'] = val_proba_pct[:, 0]
results_comparison['ProbSurvived'] = val_proba_pct[:, 1]

# View examples of the predictions compared to actual survival.
results_comparison.head(20)

# Create a *k*-nearest neighbor model

In [None]:
from math import sqrt

# Pick k with the square-root rule of thumb: the rounded square root of the
# number of training rows. (No bootstrapping or search over k is done here.)
k_num = round(sqrt(X_train.shape[0]))

# Make k odd if it is even (presumably so a two-class vote cannot tie).
if k_num % 2 == 0:
    k_num += 1

from sklearn.neighbors import KNeighborsClassifier

# Train model.
# NOTE(review): the features are not scaled, so columns with large values may
# dominate the neighbor distances - confirm whether scaling is wanted.
knn = KNeighborsClassifier(n_neighbors = k_num) 
start = time()
knn.fit(X_train, np.ravel(y_train))  
end=time()
train_time = (end - start) * 1000   # time() is in seconds; convert to milliseconds.

# Score using the validation data.
score = knn.score(X_val, y_val)

print('Value of k: {}'.format(k_num))
print('KNN model took {:.2f} milliseconds to fit.'.format(train_time))
print('Score on validation set: {:.0f}%'.format(score * 100))

In [None]:
# Use validation set to evaluate.
results_comparison = X_val.copy()
results_comparison['PredictedSurvival'] = knn.predict(X_val)
results_comparison['ActualSurvival'] = y_val.copy()

# Run predict_proba once and reuse it for both probability columns, expressed
# as percentages rounded to two decimals. Column 0 is read as "perished" and
# column 1 as "survived" (assumes labels are coded 0 and 1 - verify against
# knn.classes_).
knn_proba_pct = np.round(knn.predict_proba(X_val) * 100, 2)
results_comparison['ProbPerished'] = knn_proba_pct[:, 0]
results_comparison['ProbSurvived'] = knn_proba_pct[:, 1]

# View examples of the predictions compared to actual survival.
results_comparison.head(20)

# Use the logistic regression model to make predictions on the test data

In [None]:
# Read the test dataset from the same data directory as the training file.
X_test_file = os.path.join(DATA_PATH, 'test.csv')
X_test_raw = pd.read_csv(X_test_file)
print('Loaded {} records from {}\n'.format(len(X_test_raw), X_test_file))

In [None]:
# Prepare the dataset and drop unneeded columns, using the same two steps that
# were applied to the training and validation sets. X_test_raw is kept
# unmodified because a copy is prepared.
print('Preparing test data for prediction\n')
X_test = prep_dataset(X_test_raw.copy())
X_test = drop_unused(X_test.copy())

In [None]:
# Show example predictions with the original test data.
results_log_reg = X_test_raw.copy()
results_log_reg['PredictedSurvival'] = log_reg.predict(X_test)

# Run predict_proba once and reuse it for both probability columns, expressed
# as percentages rounded to two decimals. Column 0 is read as "perished" and
# column 1 as "survived" (assumes labels are coded 0 and 1 - verify against
# log_reg.classes_).
test_proba_pct = np.round(log_reg.predict_proba(X_test) * 100, 2)
results_log_reg['ProbPerished'] = test_proba_pct[:, 0]
results_log_reg['ProbSurvived'] = test_proba_pct[:, 1]
results_log_reg.head(20)