# In this notebook, I will proceed with the pickled data and start the iterative machine learning process (conceptualize -> code -> experiment)
-------------------------------------------------------------------------------------------------------------------
# Technology used: Tensorflow 

# Model 1

As usual, I start with the utility cells

In [1]:
# packages used for processing:
import cPickle as pickle # for pickling the processed data
import matplotlib.pyplot as plt # for visualization
import numpy as np # numerical computations

# for operating system related stuff
import os
import sys # for memory usage of objects
from subprocess import check_output

# the boss of tensorflow frameworks
import tensorflow as tf

# to plot the images inline
%matplotlib inline

In [2]:
# apply the seaborn makeup on the plots drawn using matplotlib
import seaborn as sns
sns.set(color_codes=True)

In [3]:
# Input data files are available in the "../Data/" directory.

def exec_command(cmd):
    '''
        Run a shell command and echo its standard output in the python console.
        @params
        cmd = the command to be executed along with the arguments, given as a list
              ex: ['ls', '../input']
        @return => None, the decoded output is only printed
    '''
    raw_output = check_output(cmd) # bytes written by the command to stdout
    decoded_output = raw_output.decode("utf8")
    print(decoded_output)

In [7]:
# check the structure of the project directory
# (lists the parent directory, which holds the Data and Models folders used below)
exec_command(['ls', '..'])

Data
LICENSE
Models
README.md
Scripts



In [65]:
# Set the constants for the script

# various paths of the files
data_path = "../Data" # the data path
base_model_path = "../Models" # root directory for the checkpoints and the tensorboard logs

data_files = {
    "train": os.path.join(data_path, "train.csv"),
    "test": os.path.join(data_path, "test.csv")
}

plug_and_play_data_file_path = os.path.join(data_path, "plug_and_play.pickle")

# constants:
(train_size, dev_size, test_size) = (0.9, 0.05, 0.05) # values are unit ratios
# the three ratios must cover the whole dataset (compared with a tolerance because they are floats)
assert abs((train_size + dev_size + test_size) - 1.0) < 1e-9, "The split ratios must sum up to 1"
no_of_features = 57
# NOTE: the name below is misspelled ("itreations"), but it is kept as is because
# other cells of the notebook refer to it by exactly this name.
no_of_itreations = 10000
batch_size = 512
checkpoint_factor = 50 # summaries / checkpoints are produced every checkpoint_factor iterations

In [9]:
# helper to read a pickle file from disk and bring the stored object back into the python environment
def unPickleIt(pickle_path): # might throw the file not found exception
    '''
        Rebuild the object that was pickled into the file at the given path.
        NOTE: unpickling can execute arbitrary code, so use this only on files you trust.
        @param
        pickle_path => the location of the pickle file on disk
        @return => the object stored in that file
    '''

    # the file is closed automatically when the block is left
    with open(pickle_path, 'rb') as pickle_file:
        return pickle.load(pickle_file)

# Load in the data and create the train / dev / test splits

In [10]:
# load the preprocessed data; the cells below read the keys
# 'features', 'labels', 'means' and 'variances' from this dictionary
data_dict = unPickleIt(plug_and_play_data_file_path)

In [12]:
# pull the feature matrix and the label row out of the loaded dictionary
X = data_dict['features']
Y = data_dict['labels']

In [13]:
X.shape, Y.shape # check if the shapes are compatible: both must have the same size along the last (examples) axis

((57, 595212), (1, 595212))

In [14]:
# keep the per-feature statistics that were stored alongside the data
means = data_dict['means']
variances = data_dict['variances']

In [27]:
# function to split the data into train, dev and test sets
def train_dev_test_split_data(X, Y, train_ratio=None, dev_ratio=None):
    '''
        function to split the X and Y arrays into train, dev and test sets.
        The examples are expected along the last axis of both arrays. The split is
        contiguous: the data is cut in the order in which it is stored, no shuffling
        is done here. Whatever remains after the train and dev parts becomes the test set.
        @param
        X => the input features to train on
        Y => the ideal labels for the given inputs
        train_ratio => fraction of the examples used for training
                       (optional, falls back on the notebook constant train_size)
        dev_ratio => fraction of the examples used for the dev set
                     (optional, falls back on the notebook constant dev_size)
        @return => train_X, train_Y, dev_X, dev_Y, test_X, test_Y: the names suggest meanings
        @raise ValueError => if X and Y don't hold the same number of examples
    '''
    # use the notebook level constants unless explicit ratios are given
    if train_ratio is None:
        train_ratio = train_size
    if dev_ratio is None:
        dev_ratio = dev_size

    if X.shape[-1] != Y.shape[-1]:
        raise ValueError("X has %d examples but Y has %d" % (X.shape[-1], Y.shape[-1]))

    m_examples = X.shape[-1] # total number of examples to train on

    # first partition point (adding 0.5 before truncating rounds to the nearest integer)
    train_dev_partition_point = int((m_examples * train_ratio) + 0.5)

    # second partition point
    dev_test_partition_point = train_dev_partition_point + int((m_examples * dev_ratio) + 0.5)

    # perform the actual split of the data
    # Training set splitting:
    train_X = X[:, : train_dev_partition_point]
    train_Y = Y[:, : train_dev_partition_point]

    # dev set splitting
    dev_X = X[:, train_dev_partition_point: dev_test_partition_point]
    dev_Y = Y[:, train_dev_partition_point: dev_test_partition_point]

    # test set splitting (everything that is left)
    test_X = X[:, dev_test_partition_point:]
    test_Y = Y[:, dev_test_partition_point:]

    # return the so formed splits
    return train_X, train_Y, dev_X, dev_Y, test_X, test_Y

In [28]:
# cut the full dataset into the three splits (the function reads the ratios from the constants cell)
train_X, train_Y, dev_X, dev_Y, test_X, test_Y = train_dev_test_split_data(X, Y)

In [30]:
# report the shape of every split, one aligned line per array
print("\n".join(
    caption + str(split.shape)
    for caption, split in (
        ("Training X shape: ", train_X),
        ("Training Y shape: ", train_Y),
        ("Dev X shape     : ", dev_X),
        ("Dev Y shape     : ", dev_Y),
        ("Test X shape    : ", test_X),
        ("Test Y shape    : ", test_Y),
    )
))

Training X shape: (57, 535691)
Training Y shape: (1, 535691)
Dev X shape     : (57, 29761)
Dev Y shape     : (1, 29761)
Test X shape    : (57, 29760)
Test Y shape    : (1, 29760)


In [40]:
# Make sure that no Example has been left out.
# Only the number of examples (last axis) is compared, so the widths of the splits are
# simply added up; stacking the splits back together would copy the whole dataset
# just to read one number.
assert X.shape[-1] == train_X.shape[-1] + dev_X.shape[-1] + test_X.shape[-1], "Examples have been left out"
assert Y.shape[-1] == train_Y.shape[-1] + dev_Y.shape[-1] + test_Y.shape[-1], "Labels have been left out"

# If both the above asserts are successful, we can go ahead and print the following statement
print("Both the assertions pass!!")

Both the assertions pass!!


# Cool! So now let's get on to the part where we build the TensorFlow graph
-------------------------------------------------------------------------------------------------------------------
## I am going to keep the graph scoped and in a single cell, so that I can port it into the production graph file

In [44]:
layer_dims = [512, 512, 512, 256, 1] # the num_units in each layer of the feed_forward neural network (the last entry is the single output unit)

In [52]:
train_Y.shape # labels are stored as one row with the examples along the last axis

(1, 535691)

In [86]:
# the tensorflow computation graph (THE MAIN NEURAL NETWORK):
# A feed-forward binary classifier: four relu layers followed by a single linear output unit.
# The names bound here (input_X, labels_Y, prediction, loss, train_step, init_op, all_summaries)
# are used directly by the training and evaluation functions further down.

model1 = tf.Graph()

with model1.as_default():
    # scoped as Inputs
    with tf.variable_scope("Input"):
        
        # define the placeholders for the input data
        # both placeholders are batch-major: one example per row, any batch size
        input_X = tf.placeholder(tf.float32, shape=(None, no_of_features), name="Input_features") # placeholder for feeding in input data batch
        labels_Y = tf.placeholder(tf.float32, shape=(None, 1), name="Ideal_labels") # placeholder for the labels
    
    # scoped as model:
    with tf.variable_scope("Deep_Neural_Network"):
        
        # define the layers for the neural network.
        ''' This is a plain and simple neural network with relu activations '''
        # layer 1 => 
        lay1 = tf.layers.dense(input_X, layer_dims[0], activation=tf.nn.relu, name="layer_1")
        # layer 2 =>
        lay2 = tf.layers.dense(lay1, layer_dims[1], activation=tf.nn.relu, name="layer_2")
        # layer 3 =>
        lay3 = tf.layers.dense(lay2, layer_dims[2], activation=tf.nn.relu, name="layer_3")
        # layer 4 =>
        lay4 = tf.layers.dense(lay3, layer_dims[3], activation=tf.nn.relu, name="layer_4")
        # layer 5 =>
        # the output layer is kept linear (it produces the logit); the sigmoid that turns it
        # into a probability is applied separately in the "Prediction" scope and inside the loss.
        lay5 = tf.layers.dense(lay4, layer_dims[4], name="output") # the activation is linear
        
        
        ''' Separately record all the activations as histograms '''
        # recording the summaries to visualize separately
        lay1_summary = tf.summary.histogram("lay1_summary", lay1)
        lay2_summary = tf.summary.histogram("lay2_summary", lay2)
        lay3_summary = tf.summary.histogram("lay3_summary", lay3)
        lay4_summary = tf.summary.histogram("lay4_summary", lay4)
        output_summary = tf.summary.histogram("output_summary", lay5)
        
    # scoped as predictions
    with tf.variable_scope("Prediction"):
        prediction = tf.nn.sigmoid(lay5, name="sigmoid") # apply sigmoid to the linear activation of the output
        
    # scoped as loss
    with tf.variable_scope("Loss"):
        
        # define the loss function.
        loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=lay5, labels=labels_Y), name="loss")
        # sigmoid_cross_entropy_with_logits is fed the raw logits (lay5), not the sigmoid output;
        # the per-example losses are averaged over the batch.
        
        # record the loss summary:
        tf.summary.scalar("Loss", loss)
        
    # scoped as train_step
    with tf.variable_scope("Train_Step"):
    
        # define the optimizer and the train_step:
        optimizer = tf.train.AdamOptimizer(learning_rate=1e-6) # Adam with an explicitly chosen learning rate of 1e-6 (not the optimizer's default)
        train_step = optimizer.minimize(loss, name="train_step")
        
    # scoped as init operation
    # created after minimize() so that the variables added by the optimizer are initialized as well
    with tf.variable_scope("Init"):
        init_op = tf.global_variables_initializer()
    
    # scoped as summaries
    # merges every summary registered above (the five histograms and the loss scalar) into one op
    with tf.variable_scope("Summary"):
        all_summaries = tf.summary.merge_all()

# The graph has been defined. Now, run it inside a session and see how it trains.

In [87]:
model_name = "Model1" # used both as the sub-directory under base_model_path and as the checkpoint file prefix

In [88]:
# function to execute the session and train the model:
def execute_graph(dataX, dataY, exec_graph, model_name, no_of_iterations):
    '''
        function to start and execute the session with training.
        If a checkpoint already exists in the model directory, the weights are restored
        from it, otherwise they are freshly initialized. Every checkpoint_factor
        iterations the summaries are written for tensorboard, the current cost is
        printed and the model is saved.
        NOTE: besides its arguments, this function uses the notebook level names
        init_op, train_step, loss, all_summaries, input_X and labels_Y (parts of the graph)
        and base_model_path, batch_size and checkpoint_factor (constants).
        NOTE: the iteration counter starts from 0 on every call, so after restoring a
        model the summary steps and the checkpoint suffixes start over as well.
        @param 
        dataX, dataY => the data to train on (examples along the last axis)
        exec_graph => the computation graph to be trained
        model_name => the name of the model where the files will be saved
        no_of_iterations => no of iterations (mini-batch updates) for which the model needs to be trained
        @return => Nothing, this function has a side effect
    '''
    assert dataX.shape[-1] == dataY.shape[-1], "The Dimensions of input X and labels Y don't match"

    # the number of examples in the dataset
    no_of_examples = dataX.shape[-1]
    assert no_of_examples > 0, "There are no examples to train on"

    # logs and checkpoints live in the same directory; the checkpoint prefix is the model name
    model_path = os.path.join(base_model_path, model_name)
    model_file = os.path.join(model_path, model_name) # the name of the model is same as dir

    with tf.Session(graph=exec_graph) as sess:
        # create the tensorboard writer for collecting summaries:
        tensorboard_writer = tf.summary.FileWriter(logdir=model_path, graph=sess.graph, filename_suffix=".bot")

        try:
            # The saver object for saving and loading the model
            saver = tf.train.Saver(max_to_keep=2)

            # check if the model has been saved.
            if os.path.isfile(os.path.join(model_path, "checkpoint")):
                # the model exists and you can restore the weights
                saver.restore(sess, tf.train.latest_checkpoint(model_path))
            else:
                # no saved model found. so, run the global variables initializer:
                sess.run(init_op)

            print("Starting the training ...")
            print("===============================================================================================")

            batch_index = 0 # initialize it to 0
            # start the training (driven by the argument, not by the notebook constant):
            for iteration in range(no_of_iterations):

                # fetch the input and create the batch:
                # the slice is clipped at the end of the data, so the last batch of a pass may be smaller
                start = batch_index
                end = start + batch_size
                inp_X = dataX[:, start: end].T # extract the input features (one example per row)
                inp_Y = dataY[:, start: end].T # extract the labels

                # feed the input to the graph and get the output:
                _, cost = sess.run((train_step, loss), feed_dict={input_X: inp_X, labels_Y: inp_Y})

                # checkpoint the model at certain times
                if (iteration + 1) % checkpoint_factor == 0:
                    # compute the summary:
                    summary = sess.run(all_summaries, feed_dict={input_X: inp_X, labels_Y: inp_Y})

                    # accumulate the summary
                    tensorboard_writer.add_summary(summary, (iteration + 1))

                    # print the cost at this point
                    print("Iteration: " + str(iteration + 1) + " Current cost: " + str(cost))

                    # save the model trained so far:
                    saver.save(sess, model_file, global_step = (iteration + 1))

                # increment the batch_index (wraps around once the data is exhausted)
                batch_index = (batch_index + batch_size) % no_of_examples

            print("===============================================================================================")
            print("Training complete")
        finally:
            # flush the pending summaries and release the event file even if the training fails
            tensorboard_writer.close()

In [89]:
# use the above defined method to start the training on the training split;
# the iteration count comes from the constants cell (the constant's name is misspelled there)
execute_graph(train_X, train_Y, model1, model_name, no_of_itreations)

Starting the training ...
Iteration: 50 Current cost: 0.848867
Iteration: 100 Current cost: 0.722478
Iteration: 150 Current cost: 0.590394
Iteration: 200 Current cost: 0.533269
Iteration: 250 Current cost: 0.472457
Iteration: 300 Current cost: 0.448901
Iteration: 350 Current cost: 0.414152
Iteration: 400 Current cost: 0.362162
Iteration: 450 Current cost: 0.360992
Iteration: 500 Current cost: 0.328143
Iteration: 550 Current cost: 0.316141
Iteration: 600 Current cost: 0.288144
Iteration: 650 Current cost: 0.275934
Iteration: 700 Current cost: 0.332201
Iteration: 750 Current cost: 0.323909
Iteration: 800 Current cost: 0.214473
Iteration: 850 Current cost: 0.215616
Iteration: 900 Current cost: 0.229044
Iteration: 950 Current cost: 0.217725
Iteration: 1000 Current cost: 0.231152
Iteration: 1050 Current cost: 0.236267
Iteration: 1100 Current cost: 0.217161
Iteration: 1150 Current cost: 0.203768
Iteration: 1200 Current cost: 0.187398
Iteration: 1250 Current cost: 0.244081
Iteration: 1300 Cur

# Calculate the accuracy on the dev set

In [97]:
def calc_accuracy(dataX, dataY, exec_graph, model_name, threshold = 0.5):
    '''
        Function to run the trained model and calculate its accuracy on the given inputs.
        The weights are restored from the latest checkpoint of the model, the predicted
        probabilities are cut at the threshold and compared with the labels.
        NOTE: besides its arguments, this function uses the notebook level names
        prediction and input_X (parts of the graph) and base_model_path.
        @param 
        dataX, dataY => The data to be used for accuracy calculation (examples along the last axis)
        exec_graph => the Computation graph to be used
        model_name => the model to restore the weights from
        threshold => predictions >= threshold count as class 1 (by default it is 0.5)
        @return => the accuracy on the given data, in percent
    '''
    assert dataX.shape[-1] == dataY.shape[-1], "The Dimensions of input X and labels Y don't match"

    # the number of examples in the dataset
    no_of_examples = dataX.shape[-1]
    assert no_of_examples > 0, "There are no examples to calculate the accuracy on"

    with tf.Session(graph=exec_graph) as sess:

        # The saver object for loading the model
        saver = tf.train.Saver(max_to_keep=2)

        # the model must exist and you must be able to restore the weights
        model_path = os.path.join(base_model_path, model_name)
        assert os.path.isfile(os.path.join(model_path, "checkpoint")), "Model doesn't exist"

        saver.restore(sess, tf.train.latest_checkpoint(model_path))

        # compute the predictions given out by model
        # (only the features are fed: the prediction op doesn't depend on the labels)
        preds = sess.run(prediction, feed_dict={input_X: dataX.T})

        # turn the probabilities into hard 0 / 1 decisions
        encoded_preds = (preds >= threshold).astype(np.float32)

        # calculate the accuracy in percentage:
        correct = np.sum((encoded_preds == dataY.T).astype(np.int32))
        accuracy = (float(correct) / no_of_examples) * 100 # for percentage

    # return the so calculated accuracy:
    return accuracy

In [98]:
# accuracy of the restored model on the training split
print("Train_Set Accuracy: " + str(calc_accuracy(train_X, train_Y, model1, model_name)))

INFO:tensorflow:Restoring parameters from ../Models/Model1/Model1-10000
Train_Set Accuracy: 96.3577883519


In [99]:
# accuracy of the restored model on the held-out dev split
print("Dev Set Accuracy: " + str(calc_accuracy(dev_X, dev_Y, model1, model_name)))

INFO:tensorflow:Restoring parameters from ../Models/Model1/Model1-10000
Dev Set Accuracy: 96.3038876382


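# Model 1 works pretty well, but I will experiment more with this model to see what else I can do to get better accuracy. (Caveat: the train and dev accuracies are almost identical, so the share of positive labels should be checked first. If the classes are heavily imbalanced, a model that always predicts the majority class would score about the same, and accuracy alone would not be a meaningful metric.)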