In [5]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory
from hyperdash import monitor_cell
import os
print(os.listdir("../input"))
import tensorflow as tf
from sklearn.cross_validation import train_test_split


# Any results you write to the current directory are saved as output.

['test.csv', 'train.csv', 'gender_submission.csv']




In [6]:
def get_num_of_NaN_rows(df):
    return df.isnull().sum()

def fill_NaN_values_for_numerical_column(df, colname):
    df[colname] = df[colname].fillna(df[colname].mean())
    return df

def fill_NaN_values_for_categorical_column(df, colname, value):
    df[colname] = df[colname].fillna(value)
    return df

In [7]:
# Let's make a helper method from this now.
def find_categorical_columns(df):
    all_cols = df.columns
    numeric_cols = df._get_numeric_data().columns
    return set(all_cols) - set(numeric_cols)

In [8]:
# Let's make helper function here also
def convert_categorical_column_to_integer_values(df):
    df_numerical = df.copy()
    for col in find_categorical_columns(df):
        df_numerical[col] = df_numerical[col].astype('category')
        df_numerical[col] = df_numerical[col].cat.codes
    return df_numerical

In [9]:
# First, let's list our helper functions we could make from logic used above.
def convert_sigmoid_output_to_boolean_array(array, threshold):
    array = array > threshold
    return array

def convert_boolean_array_to_binary_array(array):
    array_binary = array.astype(int)
    return array_binary

In [10]:
### Tensorflow model
def model_generic(learning_rate, X_arg, Y_arg, X_dev, Y_dev, num_of_epochs, hidden_units, threshold):
    # 1. Placeholders to hold data
    X = tf.placeholder(tf.float32, [X_arg.shape[0],None])
    Y = tf.placeholder(tf.float32, [1, None])

    # 2. Model. 2 layers NN. So, W1, b1, W2, b2.
    # This is basically coding forward propagation formulaes
    W1 = tf.Variable(tf.random_normal((hidden_units,X_arg.shape[0])))
    b1 = tf.Variable(tf.zeros((hidden_units,1)))
    Z1 = tf.matmul(W1,X) + b1
    A1 = tf.nn.relu(Z1)

    W2 = tf.Variable(tf.random_normal((1, hidden_units)))
    b2 = tf.Variable(tf.zeros((1,1)))
    Z2 = tf.matmul(W2,A1) + b2
    A2 = tf.nn.sigmoid(Z2)

    # 3. Calculate cost
    cost = tf.nn.sigmoid_cross_entropy_with_logits(logits=Z2, labels=Y)
    cost_mean = tf.reduce_mean(cost)

    # 4. Optimizer (Gradient Descent / AdamOptimizer ) - Using this line, tensorflow automatically does backpropagation
    optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost_mean)
    
    # 5. Accuracy methods
    predicted_class = tf.greater(A2,threshold)
    prediction_arr = tf.equal(predicted_class, tf.equal(Y,1.0))
    accuracy = tf.reduce_mean(tf.cast(prediction_arr, tf.float32))
    
    # 5. initialize variabls
    session = tf.Session()
    tf.set_random_seed(1)
    init = tf.global_variables_initializer()
    session.run(init)
    
    # 6. Actual loop where learning happens
    for i in range(num_of_epochs):
        _, cost_mean_val, accuracy_val = session.run([optimizer, cost_mean, accuracy], feed_dict={X:X_arg, Y:Y_arg})
        if i % 5000 == 0 or i==(num_of_epochs-1):
            print("i:",i,", cost : ",cost_mean_val,", training accuracy : ",accuracy_val)
            
    return session.run([W1,b1,W2,b2,A2,Y,accuracy],feed_dict={X:X_dev, Y:Y_dev})

In [11]:
import math
# Ref : https://stackoverflow.com/questions/32109319/how-to-implement-the-relu-function-in-numpy
# Ref : https://stackoverflow.com/questions/3985619/how-to-calculate-a-logistic-sigmoid-function-in-python
def predict(W1,b1,W2,b2,X):
    
    Z1 = np.dot(W1,X) + b1
    A1 = np.maximum(Z1, 0, Z1)
    
    Z2 = np.dot(W2,A1) + b2
    A2 = 1 / (1 + np.exp(-Z2))
    return A2

In [12]:
# helper exercise which does the whole thing for any training dataframe given 
def execute_steps_for_titanic(columns_to_use, output_file_name, learning_rate=0.01, num_of_epochs=3000, hidden_units=50, threshold_for_output=0.5, ):
    # read data
    training_df_orig = pd.read_csv("../input/train.csv")
    testing_df_orig = pd.read_csv("../input/test.csv")
    # get X and Y separated
    train_df_Y = training_df_orig['Survived']
    train_df_X = training_df_orig[columns_to_use]
    test_df_X = testing_df_orig[columns_to_use]
    # fix missing data
    categorical_columns = find_categorical_columns(train_df_X)
    replace_values_dict = {'Embarked':'S', 'Cabin':'UNKNOWN'}
    for col in columns_to_use:
        num_of_NaN_rows = get_num_of_NaN_rows(train_df_X)[col]
        num_of_NaN_rows_test = get_num_of_NaN_rows(test_df_X)[col]
        if(num_of_NaN_rows > 0):
            print("Filling NaN values for column:",col)
            if col not in categorical_columns:
                train_df_X[col] = train_df_X[col].fillna(train_df_X[col].mean())
            else:
                train_df_X[col] = train_df_X[col].fillna(replace_values_dict[col])
        if(num_of_NaN_rows_test > 0):
            print("Filling NaN values for column:",col," in test data")
            if col not in categorical_columns:
                test_df_X[col] = test_df_X[col].fillna(test_df_X[col].mean())
            else:
                test_df_X[col] = test_df_X[col].fillna(replace_values_dict[col])
    print("Fixed NaN values in training and testing data.")
    # convert categorical to numerical data
    train_df_X_num = convert_categorical_column_to_integer_values(train_df_X)
    test_df_X_num = convert_categorical_column_to_integer_values(test_df_X)
    # Get numpy arrays for this data
    train_X = train_df_X_num.as_matrix()
    test_X = test_df_X_num.as_matrix()
    train_Y = train_df_Y.as_matrix()
    # fix rank-1 array created
    train_Y = train_Y[:,np.newaxis]
    # call model and get values 
    train_X_tr, train_X_dev, train_Y_tr, train_Y_dev = train_test_split(train_X, train_Y, test_size=0.3)
    W1,b1,W2,b2,A2,Y,final_tr_dev_accuracy = model_generic(learning_rate, train_X_tr.T, train_Y_tr.T, train_X_dev.T, train_Y_dev.T, num_of_epochs, hidden_units, threshold_for_output)
    print("Final training accuracy : ",final_tr_dev_accuracy)
    # get prediction and save it to output file
    prediction = predict(W1,b1,W2,b2,test_X.T)
    # if prediction value > threshold, then set as True, else as False
    prediction = prediction > threshold_for_output
    # Convert the True/False array to a 0 , 1 array
    prediction = prediction.astype(int)
    # Convert back to dataframe and give the column name as 'Survived'
    prediction_df = pd.DataFrame(data=prediction.T, columns=['Survived'])
    # Make a final data frame of the required output and output to csv
    final_df = pd.concat([testing_df_orig['PassengerId'], prediction_df], axis=1)
    final_file_name = output_file_name+"_tr_acc_"+"{0:.2f}".format(final_tr_dev_accuracy)+"_prediction.csv"
    final_df.to_csv(final_file_name, index=False)
    print("Done.")
    return final_file_name, final_tr_dev_accuracy

In [13]:
%%monitor_cell "Titanic all variations"

columns_to_use = ['Pclass','Sex','Age','SibSp','Parch','Ticket','Fare','Cabin','Embarked']
learning_rates = [0.001, 0.005, 0.01]
num_of_epochs_arr = [5000, 10000, 50000]
hidden_units_arr = [3, 10, 50]

for learning_rate in learning_rates:
    for num_of_epochs in num_of_epochs_arr:
        for hidden_units in hidden_units_arr:
            filename, accuracy_val = execute_steps_for_titanic(columns_to_use, "dev", learning_rate=learning_rate, num_of_epochs=num_of_epochs, hidden_units=hidden_units, threshold_for_output=0.5)
            print("\n","="*50)
            print("[lr:",learning_rate,"][epoch:",num_of_epochs,"][hidden:",hidden_units,"][file:",filename,"] ACCURACY : ",accuracy_val)
            print("="*50,"\n")

Filling NaN values for column: Age
Filling NaN values for column: Age  in test data
Filling NaN values for column: Fare  in test data
Filling NaN values for column: Cabin
Filling NaN values for column: Cabin  in test data
Filling NaN values for column: Embarked
Fixed NaN values in training and testing data.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


i: 0 , cost :  45.7819 , training accuracy :  0.635634
i: 4999 , cost :  0.60768 , training accuracy :  0.688604
Final training accuracy :  0.66791
Done.

[lr: 0.001 ][epoch: 5000 ][hidden: 3 ][file: dev_tr_acc_0.67_prediction.csv ] ACCURACY :  0.66791

Filling NaN values for column: Age
Filling NaN values for column: Age  in test data
Filling NaN values for column: Fare  in test data
Filling NaN values for column: Cabin
Filling NaN values for column: Cabin  in test data
Filling NaN values for column: Embarked
Fixed NaN values in training and testing data.
i: 0 , cost :  329.953 , training accuracy :  0.614767
i: 4999 , cost :  0.624247 , training accuracy :  0.796148
Final training accuracy :  0.772388
Done.

[lr: 0.001 ][epoch: 5000 ][hidden: 10 ][file: dev_tr_acc_0.77_prediction.csv ] ACCURACY :  0.772388

Filling NaN values for column: Age
Filling NaN values for column: Age  in test data
Filling NaN values for column: Fare  in test data
Filling NaN values for column: Cabin
Filling 

  # Remove the CWD from sys.path while we load stuff.


In [14]:
%%monitor_cell "Titanic all variations - Name also included"

columns_to_use = ['Name','Pclass','Sex','Age','SibSp','Parch','Ticket','Fare','Cabin','Embarked']
learning_rates = [0.005, 0.01]
num_of_epochs_arr = [5000, 10000]
hidden_units_arr = [3, 5, 10]

for learning_rate in learning_rates:
    for num_of_epochs in num_of_epochs_arr:
        for hidden_units in hidden_units_arr:
            filename, accuracy_val = execute_steps_for_titanic(columns_to_use, "dev", learning_rate=learning_rate, num_of_epochs=num_of_epochs, hidden_units=hidden_units, threshold_for_output=0.5)
            print("\n","="*50)
            print("[lr:",learning_rate,"][epoch:",num_of_epochs,"][hidden:",hidden_units,"][file:",filename,"] ACCURACY : ",accuracy_val)
            print("="*50,"\n")

Filling NaN values for column: Age
Filling NaN values for column: Age  in test data
Filling NaN values for column: Fare  in test data


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Filling NaN values for column: Cabin
Filling NaN values for column: Cabin  in test data
Filling NaN values for column: Embarked
Fixed NaN values in training and testing data.
i: 0 , cost :  953.734 , training accuracy :  0.386838


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


i: 4999 , cost :  0.445757 , training accuracy :  0.81862
Final training accuracy :  0.746269
Done.

[lr: 0.005 ][epoch: 5000 ][hidden: 3 ][file: dev_tr_acc_0.75_prediction.csv ] ACCURACY :  0.746269

Filling NaN values for column: Age
Filling NaN values for column: Age  in test data
Filling NaN values for column: Fare  in test data
Filling NaN values for column: Cabin
Filling NaN values for column: Cabin  in test data
Filling NaN values for column: Embarked
Fixed NaN values in training and testing data.
i: 0 , cost :  165.126 , training accuracy :  0.627608
i: 4999 , cost :  0.42519 , training accuracy :  0.82183
Final training accuracy :  0.753731
Done.

[lr: 0.005 ][epoch: 5000 ][hidden: 5 ][file: dev_tr_acc_0.75_prediction.csv ] ACCURACY :  0.753731

Filling NaN values for column: Age
Filling NaN values for column: Age  in test data
Filling NaN values for column: Fare  in test data
Filling NaN values for column: Cabin
Filling NaN values for column: Cabin  in test data
Filling NaN v