# Notebook for generating the submission csv.
-------------------------------------------------------------------------------------------------------------------
# Technology used: Tensorflow core

I start with the usual utility cells

In [11]:
# packages used for processing:
import cPickle as pickle # for pickling the processed data
import matplotlib.pyplot as plt # for visualization
import numpy as np # numerical computations

# for operating system related stuff
import os
import sys # for memory usage of objects
from subprocess import check_output

# pandas for extracting data from csv file
import pandas as pd

# the boss of deep learning frameworks
import tensorflow as tf

# to plot the images inline
%matplotlib inline

In [2]:
# apply the seaborn makeup on the plots drawn using matplotlib
import seaborn as sns
sns.set(color_codes=True)

In [3]:
# Input data files are available in the "../Data/" directory.

def exec_command(cmd):
    '''
        function to execute a shell command and see it's 
        output in the python console
        @params
        cmd = the command to be executed along with the arguments
              ex: ['ls', '../input']
    '''
    print(check_output(cmd).decode("utf8"))

In [4]:
# check the structure of the project directory
exec_command(['ls', '..'])

Data
LICENSE
Models
README.md
Scripts



In [5]:
''' Set the constants for the script '''

# various paths of the files
data_path = "../Data" # the data path
base_model_path = "../Models"

data_files = {
    "train": os.path.join(data_path, "train.csv"),
    "test": os.path.join(data_path, "test.csv")
}

base_model_path = '../Models'

plug_and_play_data_file_path = os.path.join(data_path, "plug_and_play.pickle")

# constants:
(train_size, dev_size, test_size) = (0.9, 0.05, 0.05) # values are unit ratios
no_of_features = 57
no_of_itreations = 10000 
batch_size = 512
checkpoint_factor = 50

In [6]:
# function to unpickle the given file and load the obj back into the python environment
def unPickleIt(pickle_path): # might throw the file not found exception
    '''
        function to unpickle the object from the given path
        @param
        pickle_path => the path where the pickle file is located
        @return => the object extracted from the saved path
    '''

    with open(pickle_path, 'rb') as dumped_pickle:
        obj = pickle.load(dumped_pickle)

    return obj # return the unpickled object

# Load the data from the test.csv file to generate predictions from them

In [7]:
# load the means and variances from the plug_and_play file
dat_dict = unPickleIt(plug_and_play_data_file_path)
means = dat_dict['means']; variances = dat_dict['variances']

In [8]:
# check the shapes of theses two vals
print means.shape, variances.shape

(57, 1) (57, 1)


In [9]:
# you can delete the dat_dict now. To free up resources
del dat_dict

In [12]:
# load the data from the test.csv file
raw_data = pd.read_csv(data_files['test'])

In [14]:
# print a few rows of the raw_data
raw_data.head(10)

Unnamed: 0,id,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
0,0,0,1,8,1,0,0,1,0,0,...,1,1,1,12,0,1,1,0,0,1
1,1,4,2,5,1,0,0,0,0,1,...,2,0,3,10,0,0,1,1,0,1
2,2,5,1,3,0,0,0,0,0,1,...,4,0,2,4,0,0,0,0,0,0
3,3,0,1,6,0,0,1,0,0,0,...,5,1,0,5,1,0,1,0,0,0
4,4,5,1,7,0,0,0,0,0,1,...,4,0,0,4,0,1,1,0,0,1
5,5,0,1,6,0,0,1,0,0,0,...,8,1,4,9,1,0,1,0,1,0
6,6,0,1,3,0,0,0,1,0,0,...,2,0,4,6,1,1,0,0,0,0
7,8,0,1,0,0,0,1,0,0,0,...,3,1,4,9,0,1,0,0,0,0
8,10,0,1,7,0,0,0,1,0,0,...,5,1,4,6,0,0,1,0,0,0
9,11,1,1,6,0,0,0,0,0,1,...,6,1,6,10,0,1,1,0,0,0


## Note that the target column is missing! since these are the vals for which predictions are to be given

In [15]:
# check the number of test examples for which predictions are to be generated.
n_test_examples = raw_data['id'].count()
print "Total test examples to be predicted: " + str(n_test_examples)

Total test examples to be predicted: 892816


In [16]:
# transform this new data to normalize it using the earlier means and variances
def normalize_data_frame(data, means, variances):
    '''
        function to normalize the pandas dataframe and convert it into a numpy array
        @param
        data => the pandas dataframe
        means => the means array for mean cancellation
        variances => the variances array for variance correction
        @return => features array
    '''
    
    # create an empty data structure to hold all the data
    features = np.ndarray(shape = (len(data.columns) - 1, data.id.count()))
    
    # iterate over all the columns and insert their slices into the features array after normalizing them
    count = 0; # start the counter from 0 and perform the required stuff
    for column in data.columns[1:]:
        feature_slice = np.array(data[column]).reshape(1, -1) # carve out the feature slice
        mean = means[count]
        variance = variances[count]
        
        feature_slice = feature_slice - mean # mean cancellation
        feature_slice = feature_slice / variance # variance normalization
        
        # add the slice to the features vector
        features[count, :] = feature_slice
        
        # do not forget to increment the counter
        count += 1
    
    return features

In [17]:
test_data = normalize_data_frame(raw_data, means, variances)

In [19]:
print "Shape of test data: " + str(test_data.shape)

Shape of test data: (57, 892816)


In [20]:
test_data[:, :2]

array([[ -4.82890952e-01,   5.33519177e-01],
       [ -8.12666095e-01,   1.45138915e+00],
       [  4.90664154e-01,   7.91116329e-02],
       [  2.39651111e+00,   2.39651111e+00],
       [ -2.22114198e-01,  -2.22114198e-01],
       [ -1.64946017e+00,  -1.64946017e+00],
       [  3.89054769e+00,  -1.34595216e+00],
       [ -1.19605782e+00,  -1.19605782e+00],
       [ -1.22744923e+00,   5.39653656e+00],
       [ -1.00037143e+00,  -1.00037143e+00],
       [ -1.00169302e+00,  -1.00169302e+00],
       [ -1.00952689e+00,  -1.00952689e+00],
       [ -1.00094678e+00,  -1.00094678e+00],
       [ -7.65382390e-01,  -7.65382390e-01],
       [  3.73781082e-01,  -1.82904874e-01],
       [  1.51326113e+00,   1.51326113e+00],
       [ -1.13775966e+00,  -1.13775966e+00],
       [ -1.18125780e+00,  -1.18125780e+00],
       [ -1.34147559e+00,   3.49304620e+00],
       [ -8.51647113e-01,   3.72121314e-01],
       [  9.40614751e-02,   3.49813546e-01],
       [ -2.05984213e-01,  -6.82824094e-01],
       [  

# The data has been properly set up. I can now proceed further with the predictions generation.

In [21]:
layer_dims = [512, 512, 512, 256, 1] # the num_units in each layer of the feed_forward neural network

In [22]:
# the tensorflow computation graph (THE MAIN NEURAL NETWORK):

model1 = tf.Graph()

with model1.as_default():
    # scoped as Inputs
    with tf.variable_scope("Input"):
        
        # define the placeholders for the input data
        input_X = tf.placeholder(tf.float32, shape=(None, no_of_features), name="Input_features") # placeholder for feeding in input data batch
        labels_Y = tf.placeholder(tf.float32, shape=(None, 1), name="Ideal_labels") # placeholder for the labels
    
    # scoped as model:
    with tf.variable_scope("Deep_Neural_Network"):
        
        # define the layers for the neural network.
        ''' This is a plain and simple neural network with relu activations '''
        # layer 1 => 
        lay1 = tf.layers.dense(input_X, layer_dims[0], activation=tf.nn.relu, name="layer_1")
        # layer 2 =>
        lay2 = tf.layers.dense(lay1, layer_dims[1], activation=tf.nn.relu, name="layer_2")
        # layer 3 =>
        lay3 = tf.layers.dense(lay2, layer_dims[2], activation=tf.nn.relu, name="layer_3")
        # layer 4 =>
        lay4 = tf.layers.dense(lay3, layer_dims[3], activation=tf.nn.relu, name="layer_4")
        # layer 5 =>
        # the last layer has activation sigmoid since it is going to output probability.
        lay5 = tf.layers.dense(lay4, layer_dims[4], name="output") # the activation is linear
        
        
        ''' Separately record all the activations as histograms '''
        # recording the summaries to visualize separately
        lay1_summary = tf.summary.histogram("lay1_summary", lay1)
        lay2_summary = tf.summary.histogram("lay2_summary", lay2)
        lay3_summary = tf.summary.histogram("lay3_summary", lay3)
        lay4_summary = tf.summary.histogram("lay4_summary", lay4)
        output_summary = tf.summary.histogram("output_summary", lay5)
        
    # scoped as predictions
    with tf.variable_scope("Prediction"):
        prediction = tf.nn.sigmoid(lay5, name="sigmoid") # apply sigmoid to the linear activation of the output
        
    # scoped as loss
    with tf.variable_scope("Loss"):
        
        # define the loss function.
        loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=lay5, labels=labels_Y), name="loss")
        # we use the softmanx_cross_entropy_with_logits function for this.
        
        # record the loss summary:
        tf.summary.scalar("Loss", loss)
        
    # scoped as train_step
    with tf.variable_scope("Train_Step"):
    
        # define the optimizer and the train_step:
        optimizer = tf.train.AdamOptimizer(learning_rate=1e-6) # use the default learning rate
        train_step = optimizer.minimize(loss, name="train_step")
        
    # scoped as init operation
    with tf.variable_scope("Init"):
        init_op = tf.global_variables_initializer()
    
    # scoped as summaries
    with tf.variable_scope("Summary"):
        all_summaries = tf.summary.merge_all()

In [24]:
def generate_predictions(dataX, exec_graph, model_name):
    '''
        Function to run the trained model and generate predictions for the given data
        @param 
        dataX => The data to be used for accuracy calculation
        exec_graph => the Computation graph to be used
        model_name => the model to restore the weights from
        @return => predictions array returned
    '''
    
    # the number of examples in the dataset
    no_of_examples = dataX.shape[-1]
    
    with tf.Session(graph=exec_graph) as sess:
        
        # The saver object for saving and loading the model
        saver = tf.train.Saver(max_to_keep=2)
        
        # the model must exist and you must be able to restore the weights
        model_path = os.path.join(base_model_path, model_name)
        assert os.path.isfile(os.path.join(model_path, "checkpoint")), "Model doesn't exist"
        
        saver.restore(sess, tf.train.latest_checkpoint(model_path))
        
        # compute the predictions given out by model
        preds = sess.run(prediction, feed_dict={input_X: dataX.T})
        
    # return the so calculated accuracy:
    return preds

In [None]:
''' 
    WARNING! WARNING! WARNING!
    Keep an eye on the htop meter while executing this cell. The machine might freeze momentarily if it
    is a low end machine.
'''

# get the predictions for the test_data.
model_name = os.path.join(base_model_path, "Model1")
predictions = generate_predictions(test_data, model1, model_name=model_name)

INFO:tensorflow:Restoring parameters from ../Models/../Models/Model1/Model1-10000


In [27]:
# print the shape of the generated predictions
print "The generated predictions have the shape: " + str(predictions.shape)

The generated predictions have the shape: (892816, 1)


In [28]:
# lets quickly write the function to generate the subimssion csv file
def generate_submission_file(preds, save_path, model_name):
    '''
        function to generate the submission file. 
        @param
        preds => the predictions to be written to the file
        model_name => the model used for this generation 
        save_path => the path where the file needs to be saved
        @return => None (check the save path where the file is saved)
    '''
    save_file = save_path + '_' + model_name
    
    with open(save_file, 'w') as submission:
        # write the header to the file
        submission.write("id,target\n")
        for count in range(preds.shape[0]):
            line = str(count) + ',' + str(preds[count, 0]) + '\n'
            submission.write(line) # write the line to the file
    
    # print a feedback statement to notify the required file generation
    print "The file has been generated at: " + save_file

In [30]:
# use the above function to generate the submission file
generate_submission_file(predictions, os.path.join(data_path, "submission"), "Model1")

The file has been generated at: ../Data/submission_Model1
