In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

['test.csv', 'train.csv', 'gender_submission.csv']


# Read Data

Here's what we'll do 
- Read data into csv files (train and test)
- Print out a small summary of the data
- Combine them into one dataset if we require later on
- Find out how many examples of each class exist in the training data (check if skewed or not)
- Find out how many features have null values
- Replace null values with mean values for numerical features
- Replace null values with some values for categorical features

 ## Read data into csv files

In [3]:
# Read data into csv files
train_df = pd.read_csv("../input/train.csv")
test_df = pd.read_csv("../input/test.csv")
print("train_df shape : ",train_df.shape)
print("test_df shape : ",test_df.shape)

train_df shape :  (891, 12)
test_df shape :  (418, 11)


## Print out summary of data

In [4]:
# Print small summary
print("A look at training data:")
train_df.head()

A look at training data:


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
print("A look at testing data:")
test_df.head()

A look at testing data:


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


> ***Obvious observation - 'Survived' column is missing in test_df***

## Combine them into one dataset for future

In [6]:
# I tried both axis value as 0 and 1 and then checked shape to find out which one was suitable for me
# Ref link : https://pandas.pydata.org/pandas-docs/stable/merging.html
dataset = pd.concat([train_df,test_df],axis=0)
dataset.shape

(1309, 12)

## Find out how many examples of each class in training data

In [7]:
train_df.groupby('Survived')['PassengerId'].count()

Survived
0    549
1    342
Name: PassengerId, dtype: int64

**Observations** : 
1. 549+342 = 891. So no data in the training data is missing its class
2. It's not such a skewed dataset 

## How many features have null values

In [9]:
train_df.isnull().any()

PassengerId    False
Survived       False
Pclass         False
Name           False
Sex            False
Age             True
SibSp          False
Parch          False
Ticket         False
Fare           False
Cabin           True
Embarked        True
dtype: bool

**Age**, **Cabin** and **Embarked** are the only ones having NaN values. We gotta fix them. 

In [11]:
# How many NaN values of Age in train_df?
train_df['Age'].isnull().sum()

177

In [12]:
# For Cabin
train_df['Cabin'].isnull().sum()

687

In [13]:
# For Embarked
train_df['Embarked'].isnull().sum()

2

## Fixing null / NaN values for each column one by one

### For embarked

In [14]:
train_df.groupby('Embarked')['PassengerId'].count()

Embarked
C    168
Q     77
S    644
Name: PassengerId, dtype: int64

We observed earlier that only 2 entries have NaN for Embarked. And here, we see there are only 3 possible values of Embarked - C, Q and S. Out of which, S has the most number. So, let's just assign the missing ones to S. 

In [15]:
train_df['Embarked'] = train_df['Embarked'].fillna('S')

Now, let's check again....

In [16]:
train_df.groupby('Embarked')['PassengerId'].count()

Embarked
C    168
Q     77
S    646
Name: PassengerId, dtype: int64

Perfect.

### For Age

In [17]:
train_df.groupby('Age')['PassengerId'].count()

Age
0.42      1
0.67      1
0.75      2
0.83      2
0.92      1
1.00      7
2.00     10
3.00      6
4.00     10
5.00      4
6.00      3
7.00      3
8.00      4
9.00      8
10.00     2
11.00     4
12.00     1
13.00     2
14.00     6
14.50     1
15.00     5
16.00    17
17.00    13
18.00    26
19.00    25
20.00    15
20.50     1
21.00    24
22.00    27
23.00    15
         ..
44.00     9
45.00    12
45.50     2
46.00     3
47.00     9
48.00     9
49.00     6
50.00    10
51.00     7
52.00     6
53.00     1
54.00     8
55.00     2
55.50     1
56.00     4
57.00     2
58.00     5
59.00     2
60.00     4
61.00     3
62.00     4
63.00     2
64.00     2
65.00     3
66.00     1
70.00     2
70.50     1
71.00     2
74.00     1
80.00     1
Name: PassengerId, Length: 88, dtype: int64

So, the first thing to note is, thie Age can be in decimals! So, it's more of a continuous variable than discrete one.
I think it would make sense to fix the missing ones by filling them with the mean?

In [18]:
train_df['Age'].mean()

29.69911764705882

In [19]:
train_df['Age'] = train_df['Age'].fillna(train_df['Age'].mean())

Now, let's check how many missing values remain.

In [21]:
train_df['Age'].isnull().sum()

0

Perfect.

### For Cabin

In [22]:
train_df.groupby('Cabin')['PassengerId'].count()

Cabin
A10      1
A14      1
A16      1
A19      1
A20      1
A23      1
A24      1
A26      1
A31      1
A32      1
A34      1
A36      1
A5       1
A6       1
A7       1
B101     1
B102     1
B18      2
B19      1
B20      2
B22      2
B28      2
B3       1
B30      1
B35      2
B37      1
B38      1
B39      1
B4       1
B41      1
        ..
E12      1
E121     2
E17      1
E24      2
E25      2
E31      1
E33      2
E34      1
E36      1
E38      1
E40      1
E44      2
E46      1
E49      1
E50      1
E58      1
E63      1
E67      2
E68      1
E77      1
E8       2
F E69    1
F G63    1
F G73    2
F2       3
F33      3
F38      1
F4       2
G6       4
T        1
Name: PassengerId, Length: 147, dtype: int64

Okay, So : 
- This can be alphanumeric
- 147 different vaulues exist for Cabin
- None of them seem to be far far greater in number than others
- A lot of values are actually missing - 687!

So, let's do one thing - Add a new 'Cabin' value as 'UNKNOWN' and fill the data with that

In [23]:
train_df['Cabin'] = train_df['Cabin'].fillna('UNKNOWN')

Check how many NaN now

In [25]:
train_df['Cabin'].isnull().sum()

0

Perfect.

### All NaN values fixed

In [27]:
# What any does is return whether any element in a particular axis is true or not. So, it works for us in this case. For each column, it checks if any column has a NaN value or not.
train_df.isnull().any()

PassengerId    False
Survived       False
Pclass         False
Name           False
Sex            False
Age            False
SibSp          False
Parch          False
Ticket         False
Fare           False
Cabin          False
Embarked       False
dtype: bool

# Preprocessing Data

- Convert Categorical values to numerical ones
- Divide train_df into train_df_X and train_df_y
- One hot values

### Convert Categorical values to numerical ones

**1. Find which columns are categorical**

Ref : https://stackoverflow.com/questions/29803093/check-which-columns-in-dataframe-are-categorical/29803290#29803290

In [28]:
all_cols = train_df.columns

In [29]:
numeric_cols = train_df._get_numeric_data().columns

In [30]:
categorical_cols = set(all_cols) - set(numeric_cols)
categorical_cols

{'Cabin', 'Embarked', 'Name', 'Sex', 'Ticket'}

**2. Convert to numerical ones using get_dummies of Pandas**

Ref : http://fastml.com/converting-categorical-data-into-numbers-with-pandas-and-scikit-learn/

In [31]:
# First, let's backup our train_df and test_df till now
train_df_backup_filledna_still_having_categorical_data = train_df
train_df_backup_filledna_still_having_categorical_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,UNKNOWN,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,UNKNOWN,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,UNKNOWN,S


In [32]:
# Now, let's convert it.
train_df_dummies = pd.get_dummies(train_df, columns=categorical_cols)
train_df_dummies.shape

(891, 1732)

In [33]:
# However, backup's shape is still 
train_df_backup_filledna_still_having_categorical_data.shape

(891, 12)

In [34]:
# Let's check out data once
train_df_dummies.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,"Name_Abbing, Mr. Anthony","Name_Abbott, Mr. Rossmore Edward","Name_Abbott, Mrs. Stanton (Rosa Hunt)",...,Cabin_F E69,Cabin_F G63,Cabin_F G73,Cabin_F2,Cabin_F33,Cabin_F38,Cabin_F4,Cabin_G6,Cabin_T,Cabin_UNKNOWN
0,1,0,3,22.0,1,0,7.25,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,2,1,1,38.0,1,0,71.2833,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,1,3,26.0,0,0,7.925,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,4,1,1,35.0,1,0,53.1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,0,3,35.0,0,0,8.05,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [35]:
train_df.shape

(891, 12)

### Another way to convert Categorical columns data into numerical is assigning them integers
Ref : https://stackoverflow.com/questions/42215354/pandas-get-mapping-of-categories-to-integer-value

In [36]:
# 2nd way to convert is having integers represent different values of each categorical column
train_df_numerical = train_df.copy()
for col in categorical_cols:
    train_df_numerical[col] = train_df_numerical[col].astype('category')
    train_df_numerical[col] = train_df_numerical[col].cat.codes
train_df_numerical.shape

(891, 12)

In [37]:
train_df_numerical.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,108,1,22.0,1,0,523,7.25,147,2
1,2,1,1,190,0,38.0,1,0,596,71.2833,81,0
2,3,1,3,353,0,26.0,0,0,669,7.925,147,2
3,4,1,1,272,0,35.0,1,0,49,53.1,55,2
4,5,0,3,15,1,35.0,0,0,472,8.05,147,2


*Perfect*.

Now, we have all of these available for our use : 

* **train_df**                    : original training dataset   (891,12)
* **train_df_dummies**  : training dataset with dummies (891, 1732)
* **train_df_numerical** : training dataset with integers for categorical attributes (891,12) 

# Running a model in Tensorflow

This will again involve a set of steps
- Get data converted to numpy arrays so tensorflow can read them
- Write tensorflow model
- Run a session of tensorflow model and check accuracy on training data set

Try the above for both train_df_dummies and train_df_numerical

In [38]:
# import tensorflow stuff...
import tensorflow as tf

In [39]:
# Dividing data between X and Y
# Ref : https://stackoverflow.com/questions/29763620/how-to-select-all-columns-except-one-column-in-pandas

train_df_dummies_Y = train_df_dummies['Survived']
# Don't worry. drop does not change the existing dataframe unless inplace=True is passed.
train_df_dummies_X = train_df_dummies.drop('Survived', axis=1)

train_df_numerical_X = train_df_numerical.drop('Survived', axis=1)
train_df_numerical_Y = train_df_numerical['Survived']

print("train_df_numerical_X shape : ",train_df_numerical_X.shape)
print("train_df_numerical_Y shape : ",train_df_numerical_Y.shape)
print("train_df_dummies_X shape : ",train_df_dummies_X.shape)
print("train_df_dummies_Y shape : ",train_df_dummies_Y.shape)

train_df_numerical_X shape :  (891, 11)
train_df_numerical_Y shape :  (891,)
train_df_dummies_X shape :  (891, 1731)
train_df_dummies_Y shape :  (891,)


### Converting to numpy arrays so tensorflow variables can pick it up

In [40]:
trainX_num = train_df_numerical_X.as_matrix()
trainY_num = train_df_numerical_Y.as_matrix()

trainX_dummies = train_df_dummies_X.as_matrix()
trainY_dummies = train_df_dummies_Y.as_matrix()

print("trainX_num.shape = ",trainX_num.shape)
print("trainY_num.shape = ",trainY_num.shape)
print("trainX_dummies.shape = ",trainX_dummies.shape)
print("trainY_dummies.shape = ",trainY_dummies.shape)

trainX_num.shape =  (891, 11)
trainY_num.shape =  (891,)
trainX_dummies.shape =  (891, 1731)
trainY_dummies.shape =  (891,)


In [41]:
trainY_num = trainY_num[:,np.newaxis]
trainY_dummies = trainY_dummies[:,np.newaxis]

print("trainX_num.shape = ",trainX_num.shape)
print("trainY_num.shape = ",trainY_num.shape)
print("trainX_dummies.shape = ",trainX_dummies.shape)
print("trainY_dummies.shape = ",trainY_dummies.shape)

trainX_num.shape =  (891, 11)
trainY_num.shape =  (891, 1)
trainX_dummies.shape =  (891, 1731)
trainY_dummies.shape =  (891, 1)


In [44]:
### Tensorflow model
def model(learning_rate, X_arg, Y_arg, num_of_epochs):
    # 1. Placeholders to hold data
    X = tf.placeholder(tf.float32, [11,None])
    Y = tf.placeholder(tf.float32, [1, None])

    # 2. Model. 2 layers NN. So, W1, b1, W2, b2.
    # This is basically coding forward propagation formulaes
    W1 = tf.Variable(tf.random_normal((20,11)))
    b1 = tf.Variable(tf.zeros((20,1)))
    Z1 = tf.matmul(W1,X) + b1
    A1 = tf.nn.relu(Z1)

    W2 = tf.Variable(tf.random_normal((1, 20)))
    b2 = tf.Variable(tf.zeros((1,1)))
    Z2 = tf.matmul(W2,A1) + b2
    A2 = tf.nn.sigmoid(Z2)

    # 3. Calculate cost
    cost = tf.nn.sigmoid_cross_entropy_with_logits(logits=Z2, labels=Y)
    cost_mean = tf.reduce_mean(cost)

    # 4. Optimizer (Gradient Descent / AdamOptimizer ) - Using this line, tensorflow automatically does backpropagation
    optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost_mean)
    
    # 5. initialize variabls
    session = tf.Session()
    init = tf.global_variables_initializer()
    session.run(init)
    
    # 6. Actual loop where learning happens
    for i in range(num_of_epochs):
        _, cost_mean_val, predicted_Y_shape = session.run([optimizer, cost_mean, tf.shape(A2)], feed_dict={X:X_arg, Y:Y_arg})
        if i % 100 == 0:
            print("cost : ",cost_mean_val,"| A2 : ",predicted_Y_shape)
            
    return session.run([W1,b1,W2,b2,A2,Y],feed_dict={X:X_arg, Y:Y_arg})

In [86]:
W1_tr,b1_tr,W2_tr,b2_tr,A2,Y = model(0.005, trainX_num.T, trainY_num.T, 3000)

cost :  421.068 | A2 :  [  1 891]
cost :  19.0897 | A2 :  [  1 891]
cost :  9.44296 | A2 :  [  1 891]
cost :  5.41135 | A2 :  [  1 891]
cost :  4.09721 | A2 :  [  1 891]
cost :  3.37385 | A2 :  [  1 891]
cost :  3.42691 | A2 :  [  1 891]
cost :  4.05197 | A2 :  [  1 891]
cost :  2.9199 | A2 :  [  1 891]
cost :  2.13852 | A2 :  [  1 891]
cost :  1.95891 | A2 :  [  1 891]
cost :  1.99622 | A2 :  [  1 891]
cost :  1.77542 | A2 :  [  1 891]
cost :  2.3679 | A2 :  [  1 891]
cost :  2.0052 | A2 :  [  1 891]
cost :  1.55312 | A2 :  [  1 891]
cost :  1.47187 | A2 :  [  1 891]
cost :  1.46762 | A2 :  [  1 891]
cost :  1.49459 | A2 :  [  1 891]
cost :  1.33229 | A2 :  [  1 891]
cost :  1.39278 | A2 :  [  1 891]
cost :  1.28307 | A2 :  [  1 891]
cost :  2.47505 | A2 :  [  1 891]
cost :  1.8015 | A2 :  [  1 891]
cost :  1.19324 | A2 :  [  1 891]
cost :  1.13675 | A2 :  [  1 891]
cost :  3.13718 | A2 :  [  1 891]
cost :  1.15231 | A2 :  [  1 891]
cost :  1.28036 | A2 :  [  1 891]
cost :  1.05294 | 

In [47]:
A2.shape

(1, 891)

In [48]:
Y.shape

(1, 891)

In [72]:
A2_bool = A2 > 0.5
Y_prediction_training = A2_bool.astype(int)
Y_int = Y.astype(int)

In [73]:
Y_int

array([[0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1,
        1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1,
        1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1,
        1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0,
        1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0,
        0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
        0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0,
        1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0,
        1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
        0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0,
        0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
        1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 

In [74]:
Y_prediction_training

array([[0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1,
        1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1,
        1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0,
        1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0,
        1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1,
        1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
        0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0,
        0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0,
        1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0,
        0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1,
        0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0,
        1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 

In [75]:
accuracy = (Y_prediction_training == Y_int).mean()
accuracy

0.80695847362514028

### Awesome

80.6% accuracy isn't bad on training dataset. That too, with just 3000 epochs!
People got near 85% with 40000 epochs. So, it's fine. This is good enough.

Let's try now with dummies wala data.

In [76]:
### Tensorflow model
def model_for_dummies_data(learning_rate, X_arg, Y_arg, num_of_epochs):
    # 1. Placeholders to hold data
    X = tf.placeholder(tf.float32, [1731,None])
    Y = tf.placeholder(tf.float32, [1, None])

    # 2. Model. 2 layers NN. So, W1, b1, W2, b2.
    # This is basically coding forward propagation formulaes
    W1 = tf.Variable(tf.random_normal((100,1731)))
    b1 = tf.Variable(tf.zeros((100,1)))
    Z1 = tf.matmul(W1,X) + b1
    A1 = tf.nn.relu(Z1)

    W2 = tf.Variable(tf.random_normal((1, 100)))
    b2 = tf.Variable(tf.zeros((1,1)))
    Z2 = tf.matmul(W2,A1) + b2
    A2 = tf.nn.sigmoid(Z2)

    # 3. Calculate cost
    cost = tf.nn.sigmoid_cross_entropy_with_logits(logits=Z2, labels=Y)
    cost_mean = tf.reduce_mean(cost)

    # 4. Optimizer (Gradient Descent / AdamOptimizer ) - Using this line, tensorflow automatically does backpropagation
    optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost_mean)
    
    # 5. initialize variabls
    session = tf.Session()
    init = tf.global_variables_initializer()
    session.run(init)
    
    # 6. Actual loop where learning happens
    for i in range(num_of_epochs):
        _, cost_mean_val, predicted_Y_shape = session.run([optimizer, cost_mean, tf.shape(A2)], feed_dict={X:X_arg, Y:Y_arg})
        if i % 100 == 0:
            print("cost : ",cost_mean_val,"| A2 : ",predicted_Y_shape)
            
    return session.run([W1,b1,W2,b2,A2,Y],feed_dict={X:X_arg, Y:Y_arg})

In [77]:
_,_,_,_,A2_dummies,Y_dummies = model(0.005, trainX_num.T, trainY_num.T, 3000)

cost :  771.551 | A2 :  [  1 891]
cost :  35.1499 | A2 :  [  1 891]
cost :  10.0435 | A2 :  [  1 891]
cost :  6.07549 | A2 :  [  1 891]
cost :  4.02721 | A2 :  [  1 891]
cost :  2.82459 | A2 :  [  1 891]
cost :  2.17197 | A2 :  [  1 891]
cost :  1.86025 | A2 :  [  1 891]
cost :  3.05591 | A2 :  [  1 891]
cost :  1.31164 | A2 :  [  1 891]
cost :  1.15255 | A2 :  [  1 891]
cost :  1.19615 | A2 :  [  1 891]
cost :  1.60564 | A2 :  [  1 891]
cost :  3.43758 | A2 :  [  1 891]
cost :  1.93545 | A2 :  [  1 891]
cost :  1.06232 | A2 :  [  1 891]
cost :  0.874119 | A2 :  [  1 891]
cost :  0.805396 | A2 :  [  1 891]
cost :  1.01414 | A2 :  [  1 891]
cost :  1.58733 | A2 :  [  1 891]
cost :  0.89306 | A2 :  [  1 891]
cost :  0.758746 | A2 :  [  1 891]
cost :  0.876668 | A2 :  [  1 891]
cost :  0.87613 | A2 :  [  1 891]
cost :  1.44315 | A2 :  [  1 891]
cost :  0.998441 | A2 :  [  1 891]
cost :  1.70832 | A2 :  [  1 891]
cost :  0.915267 | A2 :  [  1 891]
cost :  0.801776 | A2 :  [  1 891]
cost : 

In [78]:
A2_bool_dummies = A2_dummies > 0.5
Y_prediction_training_dummies = A2_bool_dummies.astype(int)
Y_dummies_int = Y_dummies.astype(int)

accuracy = (Y_prediction_training_dummies == Y_dummies_int).mean()
accuracy

0.80920314253647585

So, for when we use dummies data, accuracy is 80.92, and for when we replace with integers, it is 80.69. 
However, this marginal difference could also be due to different random seeds, generating different starting values of W1 and W2. 

# Prediction on Test Data

Let's use numerical wala data only now.
- Converting test data in the same form
- Pass it through the network to get the value of A2
- Concatenate this with the data and write that into csv
- Submit the csv

In [79]:
test_df

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
5,897,3,"Svensson, Mr. Johan Cervin",male,14.0,0,0,7538,9.2250,,S
6,898,3,"Connolly, Miss. Kate",female,30.0,0,0,330972,7.6292,,Q
7,899,2,"Caldwell, Mr. Albert Francis",male,26.0,1,1,248738,29.0000,,S
8,900,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",female,18.0,0,0,2657,7.2292,,C
9,901,3,"Davies, Mr. John Samuel",male,21.0,2,0,A/4 48871,24.1500,,S


In [80]:
test_df.isnull().any()

PassengerId    False
Pclass         False
Name           False
Sex            False
Age             True
SibSp          False
Parch          False
Ticket         False
Fare            True
Cabin           True
Embarked       False
dtype: bool

In [81]:
test_df['Age'] = test_df['Age'].fillna(test_df['Age'].mean())
test_df['Fare'] = test_df['Fare'].fillna(test_df['Fare'].mean())
test_df['Cabin'] = test_df['Cabin'].fillna('UNKNOWN')

In [82]:
test_df.isnull().any()

PassengerId    False
Pclass         False
Name           False
Sex            False
Age            False
SibSp          False
Parch          False
Ticket         False
Fare           False
Cabin          False
Embarked       False
dtype: bool

In [84]:
# Converting to numerical data
test_df_numerical = test_df.copy()
for col in categorical_cols:
    test_df_numerical[col] = test_df_numerical[col].astype('category')
    test_df_numerical[col] = test_df_numerical[col].cat.codes
test_df_numerical.shape

(418, 11)

In [85]:
test_df_numerical.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,206,1,34.5,0,0,152,7.8292,76,1
1,893,3,403,0,47.0,1,0,221,7.0,76,2
2,894,2,269,1,62.0,0,0,73,9.6875,76,1
3,895,3,408,1,27.0,0,0,147,8.6625,76,2
4,896,3,178,0,22.0,1,1,138,12.2875,76,2


In [106]:
import math
# Ref : https://stackoverflow.com/questions/32109319/how-to-implement-the-relu-function-in-numpy
# Ref : https://stackoverflow.com/questions/3985619/how-to-calculate-a-logistic-sigmoid-function-in-python
def predict(W1,b1,W2,b2,X):
    
    Z1 = np.dot(W1,X) + b1
    A1 = np.maximum(Z1, 0, Z1)
    
    Z2 = np.dot(W2,A1) + b2
    A2 = 1 / (1 + np.exp(-Z2))
    return A2

In [107]:
# Let's predict
X_test = test_df_numerical.as_matrix()
X_test.shape

(418, 11)

In [108]:
W1_tr.shape

(20, 11)

In [109]:
W2_tr.shape

(1, 20)

In [110]:
final_prediction = predict(W1_tr,b1_tr,W2_tr,b2_tr,X_test.T)

In [118]:
final_prediction_int = final_prediction > 0.5
final_prediction_int = final_prediction_int.astype(int)
final_prediction_int.shape

(1, 418)

In [124]:
final_survived_df = pd.DataFrame(data=final_prediction_int.T, columns=['Survived'])
final_survived_df

Unnamed: 0,Survived
0,0
1,1
2,0
3,1
4,1
5,0
6,0
7,1
8,1
9,0


In [115]:
test_df['PassengerId']

0       892
1       893
2       894
3       895
4       896
5       897
6       898
7       899
8       900
9       901
10      902
11      903
12      904
13      905
14      906
15      907
16      908
17      909
18      910
19      911
20      912
21      913
22      914
23      915
24      916
25      917
26      918
27      919
28      920
29      921
       ... 
388    1280
389    1281
390    1282
391    1283
392    1284
393    1285
394    1286
395    1287
396    1288
397    1289
398    1290
399    1291
400    1292
401    1293
402    1294
403    1295
404    1296
405    1297
406    1298
407    1299
408    1300
409    1301
410    1302
411    1303
412    1304
413    1305
414    1306
415    1307
416    1308
417    1309
Name: PassengerId, Length: 418, dtype: int64

In [125]:
final_df = pd.concat([test_df['PassengerId'], final_survived_df], axis=1)
final_df

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,1
4,896,1
5,897,0
6,898,0
7,899,1
8,900,1
9,901,0


In [127]:
# Exporting to a csv file
final_df.to_csv("output-prediction.csv", index=False)