# Titanic

Deep Neural Networks - Version 1

Only 
1. He_Initialization & 
2. Nesterov Optimization

No Batch Optimization and Drop out regularization

Overfits the training data. Provides a dismal 70.8% accuracy.

In [2]:
import pandas as pd
import numpy as np
import math
from sklearn import linear_model as lm
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_val_score, GridSearchCV, cross_validate, train_test_split
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.preprocessing import LabelEncoder

In [3]:
#Ignore warnings 
import warnings
warnings.simplefilter("ignore", category=DeprecationWarning)

In [4]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import os

# to make this notebook's output stable across runs
def reset_graph(seed=42):
    tf.reset_default_graph()
    tf.set_random_seed(seed)
    np.random.seed(seed)

# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "ann"

def save_fig(fig_id, tight_layout=True):
    path = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID, fig_id + ".png")
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format='png', dpi=300)

In [5]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# 1. Prepare Data

### Correct the Training data
#### Step1 - Change the categorical variables to numerical values

In [6]:
# Lets use Pclass, Sex, Age, SibSp, Parch, Fare, Embarked

train_df[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']].isnull().any()
#Lets predict the age from the Fare, Sex, Pclass, parch and SubSp and use it

selected_df = train_df[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'PassengerId', 'Survived']]

#Sex
selected_df['Sex'].replace('male',0, inplace=True)
selected_df['Sex'].replace('female',1, inplace=True)

#Embarked
selected_df['Embarked'].replace('S', 0, inplace=True)
selected_df['Embarked'].replace('C', 1, inplace=True)
selected_df['Embarked'].replace('Q', 2, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


### Impute for Predictors with missing values 
#### Step 2 - Use regression to predict the age missing values

In [7]:
# Select sepcific rows
age_select = selected_df.loc[selected_df['Age'].notnull(),['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']]

# Regression
lmod_age = lm.LinearRegression()
lmod_age.fit(age_select[['Pclass', 'Sex', 'SibSp', 'Parch', 'Fare']], age_select['Age'])

selected_df['predicted_age'] = lmod_age.predict(selected_df[['Pclass', 'Sex', 'SibSp', 'Parch', 'Fare']])

# Fill back the age correctly
#selected_df[['Age', 'predicted_age']].Age
#selected_df[['Age','predicted_age']].apply(lambda x: x)

age = selected_df['Age']
pred_age = selected_df['predicted_age']

#selected_df['Age'] = np.where(math.isnan(age) is True, pred_age, age)

for ind, row in selected_df.iterrows():
    if math.isnan(row['Age']) is True:
        if(row['predicted_age'] > 0 ):
            selected_df['Age'][ind] = row['predicted_age']
        else:
            selected_df['Age'][ind] = 2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  exec(code_obj, self.user_global_ns, self.user_ns)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


### Correct test data
#### 1. Handle categorical predictors

In [8]:
test_df['Sex'].replace('male',0, inplace=True)
test_df['Sex'].replace('female',1, inplace=True)

test_df[['Pclass', 'Sex', 'SibSp', 'Parch', 'Fare', 'Age']].isnull().any()

Pclass    False
Sex       False
SibSp     False
Parch     False
Fare       True
Age        True
dtype: bool

#### 2. Impute missing values

In [9]:
# There is only one value missing in Fare and we adjust that as per ticket prices as per observation
for ind, row in test_df.iterrows():
    if math.isnan(row['Fare']) is True:
        test_df['Fare'][ind] = 7.8958

# Setting the age
test_df['predicted_age'] = lmod_age.predict(test_df[['Pclass', 'Sex', 'SibSp', 'Parch', 'Fare']])

# Correct Age
for ind, row in test_df.iterrows():
    if math.isnan(row['Age']) is True:
        if(row['predicted_age'] > 0 ):
            test_df['Age'][ind] = row['predicted_age']
        else:
            test_df['Age'][ind] = 2

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


# 2. Setup Data

In [10]:
x_train = selected_df[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']]
y_train = selected_df['Survived']

In [11]:
x_train.shape

(891, 6)

# 3. Feature Engineering
[Name based Feature Engineering](https://www.kaggle.com/cdeotte/titanic-using-name-only-0-81818)

Refer to that article to understand how someone created a new feature based on family name groupings. We use that information to build a new XGBoost model and see if it bumps up our score. It is not worth spending more time than this. So, I will just attempt this one feature engineering attempt here.

What these guys are doing is this
* If a surname-woman-child combination is found to be alive in training set then they are borrowing that same surname-woman-child combination information to the test set
* By default they are assuming that all women have survived. Which means they have only corrected for those cases where they definitely know the outcome from training set

This looks a bit like gaming rather than modeling. But let us see if it works.

In [12]:
import re

In [13]:
x_train['Name'] = train_df['Name']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [14]:
p = re.compile('.*, (.*?)\.')
x_train['Title'] = x_train['Name'].apply(lambda x: p.match(x).group(1))

# Change the title now to either man / woman
x_train['Title'] = x_train['Title'].apply(lambda x: -1 if x in ["Capt","Don","Major","Col","Rev","Dr","Sir","Mr", "Jonkheer"] 
                                          else 1 if x in ["Dona","the Countess","Mme","Mlle","Ms","Miss","Lady","Mrs"]
                                                 else 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [15]:
# Engineer "woman-child-groups"
p = re.compile('(.*?),.*')
x_train['Surname'] = x_train['Name'].apply(lambda x: p.match(x).group(1))

# Bucket people into three groups. All males go into no-group (-1)
x_train.Surname[x_train.Title==-1] = -1

# Check the bucket sizes for others
surnamefreq = x_train.groupby(['Surname']).size()
x_train['SurnameFreq'] = x_train['Surname'].apply(lambda x: surnamefreq[x])

# If the family group size is just one then call that as no-group (-1)
x_train['Surname'][x_train['SurnameFreq'] <= 1] = -1

# Update frequencies one final time
surnamefreq = x_train.groupby(['Surname']).size()
x_train['SurnameFreq'] = x_train['Surname'].apply(lambda x: surnamefreq[x])

# Now all the remaining women and children group has to be marked as a single group because mother and child fates
# identified to be intertwined (in that shared link)

# Now change Surnames and Titles into numeric data to have them be handled by XGBoost
#x_train['Surname'][x_train['Surname'] != -1] = 1
#x_train['SurnameClass'] = x_train['Surname'].apply(lambda x: int(x))

# Now write the survival rate for the women-child-family name combinations
x_train['Survived'] = y_train
name_survival = x_train.groupby(['Surname','Survived']).size()

for ind, row in x_train.iterrows():
    surived_c = 0
    not_survived_c = 0
    
    if((row['Surname'],1) in name_survival.index):
        survived_c = name_survival[row['Surname'],1]
    else:
        survived_c = 0
    
    if((row['Surname'],0) in name_survival.index):
        not_survived_c = name_survival[row['Surname'],0]
    else:
        not_survived_c = 0
        
    if(survived_c + not_survived_c > 0):
        x_train.loc[ind,'SurnameSurvival'] = survived_c/(survived_c + not_survived_c)
    else:
        x_train.loc[ind,'SurnameSurvival'] = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  exec(code_obj, self.user_global_ns, self.user_ns)
A value is trying

#### We see that there is a slight bump in the cross validated score to 86.08%. Let us see if it really works.
##### Engineering the features for Test set now

In [16]:
p = re.compile('.*, (.*?)\.')
test_df['Title'] = test_df['Name'].apply(lambda x: p.match(x).group(1))

# Change the title now to either man / woman
test_df['Title'] = test_df['Title'].apply(lambda x: -1 if x in ["Capt","Don","Major","Col","Rev","Dr","Sir","Mr", "Jonkheer"] 
                                          else 1 if x in ["Dona","the Countess","Mme","Mlle","Ms","Miss","Lady","Mrs"]
                                                 else 0)

# Engineer "woman-child-groups"
p = re.compile('(.*?),.*')
test_df['Surname'] = test_df['Name'].apply(lambda x: p.match(x).group(1))

# Bucket people into three groups. All males go into no-group (-1)
test_df.Surname[test_df.Title==-1] = -1

# Check the bucket sizes for others
surnamefreq = test_df.groupby(['Surname']).size()
test_df['SurnameFreq'] = test_df['Surname'].apply(lambda x: surnamefreq[x])

# If the family group size is just one then call that as no-group (-1)
test_df['Surname'][test_df['SurnameFreq'] <= 1] = -1

# Now all the remaining women and children group has to be marked as a single group because mother and child fates
# identified to be intertwined (in that shared link)

# Here borrow from training set
test_df['SurnameSurvival'] = 0

#test_df['Surname'].unique()

for ind, row in test_df.iterrows():
    if(row['Surname'] in x_train['Surname'].unique()):
        test_df.loc[ind,'SurnameSurvival'] = x_train['SurnameSurvival'][x_train['Surname'] == row['Surname']].iloc[0]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


### 4. Scale data

In [17]:
from sklearn.preprocessing import MinMaxScaler

# Scale data
scaler = MinMaxScaler()
data_train = scaler.fit_transform(x_train[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'SurnameSurvival']])
data_test = scaler.transform(test_df[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'SurnameSurvival']])

  return self.partial_fit(X, y)


### 4. Create a TensorFlow model

We have 7 predictors and these we will blow out into 16 dimensions and then we will reduce it to half every time.

11 X 7 > 7 X 16 > 16 X 8 > 8 X 4 > 4 X 1

#### This has
1. 1 Input layer
2. 3 Hidden layers
3. 1 Output layer

#### And output is 0 / 1

Design the initializers, variables and layers

In [18]:
# Import TensorFlow
import tensorflow as tf

In [19]:
# Neurons
n_inputs = data_train.shape[1]
n_hidden1 = 512
n_hidden2 = 256
n_hidden3 = 64
n_outputs = 2

In [21]:
reset_graph()

# Placeholder
X = tf.placeholder(dtype=tf.float32, shape=[None, n_inputs])
y = tf.placeholder(dtype=tf.int64, shape=[None])

# Performing drop_out to ensure that we do not overfit for the data.

# He Initilization
he_init = tf.contrib.layers.variance_scaling_initializer()

# Using tensor flow neuron_layer function
with tf.name_scope("dnn"):

    # This operation outputs False nothing is fed to it. So, it will return true only during the training phase
    training = tf.placeholder_with_default(False, shape=(), name='training')
    
    hidden1 = tf.layers.dense(X, n_hidden1, name="hidden1", activation=tf.nn.elu, kernel_initializer=he_init)
    
    hidden2 = tf.layers.dense(hidden1, n_hidden2, name="hidden2", activation=tf.nn.elu, kernel_initializer=he_init)
    
    hidden3 = tf.layers.dense(hidden2, n_hidden3, name="hidden3", activation=tf.nn.elu, kernel_initializer=he_init)
    
    logits = tf.layers.dense(hidden3, n_outputs, name="outputs", kernel_initializer=he_init)

In [22]:
with tf.name_scope("loss"):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
    loss = tf.reduce_mean(xentropy, name="loss")

In [23]:
# Now start optimizing the cost function for the model
learning_rate = 0.01
with tf.name_scope("train"):
    #optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate, momentum=0.9, use_nesterov=True)
    training_op = optimizer.minimize(loss)

In [24]:
#Report the overall accuracy
with tf.name_scope("eval"):
    correct = tf.nn.in_top_k(logits, y, 1)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

In [25]:
init = tf.global_variables_initializer()

In [26]:
#Execution Phase
n_epochs = 10010

# Train on full batch

# This is required to update operations related to batch_normalization at each step during the training in order to
# update the moving averages
extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        sess.run([training_op, extra_update_ops], feed_dict={training: True, X: data_train,y: y_train})
        acc_train = accuracy.eval(feed_dict={X: data_train,y: y_train})        
        
        if(epoch%1000 == 1):
            print(epoch, "Train accuracy:", acc_train)
    
    pred = logits.eval(feed_dict={X: data_test})

1 Train accuracy: 0.81930417
1001 Train accuracy: 0.86195284
2001 Train accuracy: 0.86756456
3001 Train accuracy: 0.8731762
4001 Train accuracy: 0.8810325
5001 Train accuracy: 0.8821549
6001 Train accuracy: 0.88439953
7001 Train accuracy: 0.8855219
8001 Train accuracy: 0.89001125
9001 Train accuracy: 0.89001125


### 5. How do I tune this model so that I do not overfit

I am not trying to correct for overfitting here. I will just try with the overfit model itself.

In [27]:
# No need to compute the softmax scores here. Probably if I compute them, I can use them to check the PR-Curves etc.
# But I will directly consume the data as it is with whichever logit is higher that is the class-number (0/1)

import pandas as pd

pred_df = pd.DataFrame(pred)
pred_df.columns = ['zero','one']

final = pred_df.apply(lambda x : 1 if x['zero'] < x['one'] else 0, axis = 1).astype(int)

test_df['Survived'] = final

## 6. Test and proceed

Gives us a dismal 70.8% . Just shows why overfitting is a problem

In [28]:
finalResult = test_df[['PassengerId', 'Survived']]
finalResult.to_csv("result_final_dnn_overfit.csv", index=False)