CogniFrame Assignment : MNIST Digit Classification Model
=============

Task : 

1. Construct a classification model on the MNIST data set in the form of a deep neural network of your choice.

2. Perform and compare two or more methods of hyperparameter optimization on this model, and comment on the comparison.


Link : https://www.kaggle.com/c/digit-recognizer

In [1]:
from __future__ import print_function
import matplotlib.pyplot as plt
import numpy as np
from IPython.display import display, Image
from datetime import datetime as dt
from scipy import ndimage

# Config the matplotlib backend as plotting inline in IPython
%matplotlib inline

# 1 :  Data  Load

MNIST Data as downloaded from "http://yann.lecun.com/exdb/mnist/".

In [2]:
from mnist import MNIST
mndata = MNIST('./data')
images, labels = mndata.load_training()


In [3]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import gzip
import os
import sys
import time

from six.moves import urllib
from six.moves import xrange  # pylint: disable=redefined-builtin
from scipy.misc import imsave
import tensorflow as tf
import numpy as np
import csv

SOURCE_URL = 'http://yann.lecun.com/exdb/mnist/'
WORK_DIRECTORY = 'data'
IMAGE_SIZE = 28
NUM_CHANNELS = 1
PIXEL_DEPTH = 255
NUM_LABELS = 10

def maybe_download(filename):
  """Download the data from Yann's website, unless it's already here."""
  if not tf.gfile.Exists(WORK_DIRECTORY):
    tf.gfile.MakeDirs(WORK_DIRECTORY)
  filepath = os.path.join(WORK_DIRECTORY, filename)
  if not tf.gfile.Exists(filepath):
    filepath, _ = urllib.request.urlretrieve(SOURCE_URL + filename, filepath)
    with tf.gfile.GFile(filepath) as f:
      size = f.Size()
    print('Successfully downloaded', filename, size, 'bytes.')
  return filepath


def extract_data(filename, num_images):
  """Extract the images into a 4D tensor [image index, y, x, channels].
  Values are rescaled from [0, 255] down to [-0.5, 0.5].
  """
  print('Extracting', filename)
  with gzip.open(filename) as bytestream:
    bytestream.read(16)
    buf = bytestream.read(IMAGE_SIZE * IMAGE_SIZE * num_images)
    data = np.frombuffer(buf, dtype=np.uint8).astype(np.float32)
    #data = (data - (PIXEL_DEPTH / 2.0)) / PIXEL_DEPTH
    data = data.reshape(num_images, IMAGE_SIZE, IMAGE_SIZE, 1)
    return data


def extract_labels(filename, num_images):
  """Extract the labels into a vector of int64 label IDs."""
  print('Extracting', filename)
  with gzip.open(filename) as bytestream:
    bytestream.read(8)
    buf = bytestream.read(1 * num_images)
    labels = np.frombuffer(buf, dtype=np.uint8).astype(np.int64)
  return labels

train_data_filename = maybe_download('train-images-idx3-ubyte.gz')
train_labels_filename = maybe_download('train-labels-idx1-ubyte.gz')
test_data_filename = maybe_download('t10k-images-idx3-ubyte.gz')
test_labels_filename = maybe_download('t10k-labels-idx1-ubyte.gz')

# Extract it into np arrays.
X_train = extract_data(train_data_filename, 60000)
y_train = extract_labels(train_labels_filename, 60000)
X_test = extract_data(test_data_filename, 10000)
y_test = extract_labels(test_labels_filename, 10000)

print('Completed', dt.now())

Extracting data/train-images-idx3-ubyte.gz
Extracting data/train-labels-idx1-ubyte.gz
Extracting data/t10k-images-idx3-ubyte.gz
Extracting data/t10k-labels-idx1-ubyte.gz
Completed 2018-05-24 15:49:23.856543


In [4]:
X_valid = X_test
y_valid = y_test
X_train.shape
y_train.shape

(60000,)

In [5]:
X_train[0,5:7,:,0]

array([[  0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
          0.,   3.,  18.,  18.,  18., 126., 136., 175.,  26., 166., 255.,
        247., 127.,   0.,   0.,   0.,   0.],
       [  0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,  30.,  36.,  94.,
        154., 170., 253., 253., 253., 253., 253., 225., 172., 253., 242.,
        195.,  64.,   0.,   0.,   0.,   0.]], dtype=float32)

# 3.0 Simple Decision Tree

## 3.3 Evaluation metrics

In [12]:
# list(set(tr['label']))

In [6]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

#Defining the confusion matrix function
def plot_confusion_matrix(cm, classes, title='Confusion matrix', cmap=plt.cm.Blues):   
    import itertools
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)
    
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j], horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
                
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()

def eval_classification(y_test, ypred_test ):
    print('Accuracy is ', accuracy_score(y_test, ypred_test) )
    print('Confusion report is')
    print(classification_report(y_test, ypred_test) )    
    conf_matrix = confusion_matrix(y_test, ypred_test)
    plot_confusion_matrix(conf_matrix, classes = list(set(tr['label'])), title = 'Confusion Matrix')
    return(conf_matrix)

### 3.3.1 GridSearchCV for optimal parameter search

## 3.4 Simple Decision tree

In [7]:
def simple_tree(X_train, y_train, X_test, y_test, \
            max_depth = None, class_weight = None):
    from sklearn import tree
    clf = tree.DecisionTreeClassifier( max_depth =   max_depth,\
                        class_weight = class_weight )
    clf.fit(X_train, y_train)
    ypred_test = clf.predict(X_test)

    eval_classification(y_test, ypred_test )
    return clf
    #import pydotplus 
    #from IPython.display import Image  
    #dot_data = tree.export_graphviz( clf, feature_names=feature_cols,out_file=None)  
    #graph = pydotplus.graph_from_dot_data(dot_data)  
    #return Image(graph.create_png())

# clf = simple_tree(X_train, y_train, X_test, y_test, 15, 'balanced' )

## 3.5 Simple Random Forest

In [16]:
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(n_estimators = 50, max_depth = 20, random_state= 40)
#forest.fit(X_train, y_train)
#eval_classification(y_test,  forest.predict(X_test))


# 4.0 Data Pre-Processing

### 4.0.1 Scale 

In [7]:
X_train.shape
raw = (X_train / 255.0)

In [11]:
raw.shape
raw[0,5:7,10:28,0]

array([[0.        , 0.        , 0.01176471, 0.07058824, 0.07058824,
        0.07058824, 0.49411765, 0.53333336, 0.6862745 , 0.10196079,
        0.6509804 , 1.        , 0.96862745, 0.49803922, 0.        ,
        0.        , 0.        , 0.        ],
       [0.36862746, 0.6039216 , 0.6666667 , 0.99215686, 0.99215686,
        0.99215686, 0.99215686, 0.99215686, 0.88235295, 0.6745098 ,
        0.99215686, 0.9490196 , 0.7647059 , 0.2509804 , 0.        ,
        0.        , 0.        , 0.        ]], dtype=float32)

In [9]:
#3/255.0

In [10]:
X_train[0,5:7,10:28,0]

array([[  0.,   0.,   3.,  18.,  18.,  18., 126., 136., 175.,  26., 166.,
        255., 247., 127.,   0.,   0.,   0.,   0.],
       [ 94., 154., 170., 253., 253., 253., 253., 253., 225., 172., 253.,
        242., 195.,  64.,   0.,   0.,   0.,   0.]], dtype=float32)

In [12]:
## #  scale the pixels  #  3. Scale 0-255 to (-0.5  to 0.5)
def scale_img(tr):
    #return ( (tr - 255.0) / 2.0 ) / 255.0
    return  tr / 255.0

X_train = scale_img(X_train)
X_test  = scale_img(X_test)
X_valid = scale_img(X_valid)

In [27]:
#x_train.max(axis=1).sort_values(ascending=False)[0:5] 

### 4.0.2 Reformat Image to fit ConvNets

Convert image into 3d array (image index, x, y) of floating point values, normalized to have approximately zero mean and standard deviation ~0.5 to make training easier down the road.


Reformat into a TensorFlow-friendly shape:

    convolutions need the image data formatted as a cube (width by height by #channels)
    labels as float 1-hot encodings.



In [13]:
print('Training Data Shape :: ',X_train.shape, '. Train Label Shape ', y_train.shape )
print('To be converted into TensofrFlow friendly shape')

Training Data Shape ::  (60000, 28, 28, 1) . Train Label Shape  (60000,)
To be converted into TensofrFlow friendly shape


In [14]:
#labels[:,None])     for   (28000,)
y_test[:,None]

array([[7],
       [2],
       [1],
       ...,
       [4],
       [5],
       [6]])

In [15]:
num_labels = 10
import numpy as np

#trs = X_train.as_matrix().reshape((-1, 28, 28, channel_in)).astype(np.float32)
#trs.shape
#labels = (np.arange(num_labels) == labels.as[:,None]).astype(np.float32)
#np.arange(num_labels)
( np.arange(num_labels) == y_test[:,None] ).astype(np.float16)

array([[0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float16)

In [35]:
print('Training set', X_train.shape, y_train.shape)
ab = ('a','b')
if 'a' in ab:  print('a is in ab')
if 'c' in ab:  print('c is in ab')    

Training set (60000, 28, 28, 1) (60000,)
a is in ab


In [16]:
image_size = 28
num_labels = 10
channel_in = 1 # grayscale

def reformat(dataset, labels, limit_to = ('dataset','labels') ):
  import numpy as np
  if 'dataset' in limit_to:
      dataset = dataset.as_matrix().reshape(
            (-1, image_size, image_size, channel_in)).astype(np.float32)
  if 'labels' in limit_to:
      #labels = (np.arange(num_labels) == labels.as_matrix()[:,:]).astype(np.float32)
      labels = (np.arange(num_labels) == labels[:,None]).astype(np.float32)
  return dataset, labels
train_dataset, train_labels = reformat(X_train, y_train, ('labels') )
valid_dataset, valid_labels = reformat(X_valid, y_valid, ('labels') )
test_dataset, test_labels = reformat(X_test, y_test, ('labels'))
print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
#print('Test set', test_dataset.shape, test_labels.shape)
print('Test set', test_dataset.shape)

Training set (60000, 28, 28, 1) (60000, 10)
Validation set (10000, 28, 28, 1) (10000, 10)
Test set (10000, 28, 28, 1)


### 4.0.2 Extend data with rotation, stretch, shift and zoom

In [21]:
def augment_data(dataset, dataset_labels, augementation_factor=1, use_random_rotation=True, use_random_shear=True, use_random_shift=True, use_random_zoom=True):
    import tensorflow as  tf
    import numpy as np
    augmented_image = []
    augmented_image_labels = []

    for num in range (0, dataset.shape[0]):

        for i in range(0, augementation_factor):
            # original image:
            augmented_image.append(dataset[num])
            augmented_image_labels.append(dataset_labels[num])

            if use_random_rotation:
                augmented_image.append(tf.contrib.keras.preprocessing.image.random_rotation(dataset[num], 20, row_axis=0, col_axis=1, channel_axis=2))
                augmented_image_labels.append(dataset_labels[num])

            if use_random_shear:
                augmented_image.append(tf.contrib.keras.preprocessing.image.random_shear(dataset[num], 0.2, row_axis=0, col_axis=1, channel_axis=2))
                augmented_image_labels.append(dataset_labels[num])

            if use_random_shift:
                augmented_image.append(tf.contrib.keras.preprocessing.image.random_shift(dataset[num], 0.2, 0.2, row_axis=0, col_axis=1, channel_axis=2))
                augmented_image_labels.append(dataset_labels[num])

            #if use_random_zoom:
            #    augmented_image.append(tf.contrib.keras.preprocessing.image.random_zoom(dataset[num], 0.9, row_axis=0, col_axis=1, channel_axis=2))
            #    augmented_image_labels.append(dataset_labels[num])

    return np.array(augmented_image), np.array(augmented_image_labels)

print('From Training set', train_dataset.shape, train_labels.shape)
#train_dataset,train_labels  = augment_data(train_dataset, train_labels)
print('To Training augmentation set', train_dataset.shape, train_labels.shape)
print('TEST WITH DATA AUGMENTATION FAILED. THE accuracy in fact decreased.')

From Training set (39900, 28, 28, 1) (39900, 10)
To Training augmentation set (159600, 28, 28, 1) (159600, 10)
TEST WITH DATA AUGMENTATION FAILED. THE accuracy in fact decreased.


## 5.0 TensforFlow Models

### 5.0.1 Tensorflow evaluation metric

### 5.0.2 TensorFlow Model : 2 Conv Layer + 1 FC layer
Two convolutional layers, followed by one fully connected layer. Convolutional networks are more expensive computationally, so we'll limit its depth and number of fully connected nodes.


In [45]:
channel_in = 1

In [None]:
import tensorflow as tf
import datetime
def accuracy(predictions, labels):
  #predictions = tf.argmax(predictions,1)
  #labels = tf.argmax(labels, 1)
  #correct_prediction =  tf.equal(predictions, labels )
  #accuracy = tf.reduce_mean( tf.cast(correct_prediction, tf.float32))
  return (100.0 * np.sum(np.argmax(predictions, 1) == np.argmax(labels, 1)) / predictions.shape[0])
  # gives [True, False, True, True,...] based on whethe predictions and lables is equal or not
  print(datetime.datetime.now())

def GetOptimiser(tf, gradient_optimizer, loss, rate):    
    #print('gradient_optimizer is :: ',gradient_optimizer)
    if gradient_optimizer == 'GradientDescent':
        print('SELECTED OPTIMISER :: Gradient Descent optimisier with default rate - 0.05')
        return tf.train.GradientDescentOptimizer(0.05).minimize(loss)        
    elif  gradient_optimizer == 'Adam':
        print('SELECTED OPTIMISER :: Adam with learning rate - ', rate)
        return tf.train.AdamOptimizer(rate).minimize(loss)
    else:
        print('SELECTED OPTIMISER :: Invalid optimiser selected. Valid Optimiser : {GradientDescent | Adam}')
        return False

def InitNRunTfModel(num_steps,  batch_size, patch_size, depth,  num_hidden,\
                report_interval_steps,  random_seed, gradient_optimizer, learning_rate, beta\
                   ,learning_decay_impl):
    import tensorflow as tf
    import random
    print('patch size ', patch_size)
    graph = tf.Graph()
    with graph.as_default():
      # Model.
      def conv2d(data, weights, biases, strides, padding = 'SAME'):
        conv = tf.nn.conv2d(data, weights, strides, padding='SAME') # sweep a 2-D filter over a batch of images, with tf.nn.conv2d  
        return tf.nn.relu(tf.add(conv , biases))                
    
      def maxpool2d(data , k=2):
        return tf.nn.max_pool(data, ksize = [1,k,k,1], strides = [1,k,k,1], padding = 'SAME')
      
      def weight_var(shape):
        return tf.Variable(tf.truncated_normal(shape, stddev=0.1))
      
      def bias_var(shape):
        return tf.Variable(tf.constant(1.0, shape=shape))
      
      # conv layer = Conv + Max pool  
      def conv_layer(data, weights, biases, strides, padding):
            conv = tf.nn.conv2d(data, weights, strides, padding='SAME') # sweep a 2-D filter over a batch of images, with tf.nn.conv2d  
            actv_out = tf.nn.relu(tf.add(conv , biases))
            k = 2
            return tf.nn.max_pool(actv_out, ksize = [1,k,k,1], strides = [1,k,k,1], padding = 'SAME')
               
      def fc_layer(data, channel_in, channel_out, name):
        with tf.name_scope(name):
            fc_weights = tf.Variable( tf.truncated_normal([channel_in, channel_out] ) ,name = "fc_weights")
            fc_biases = tf.Variable(tf.constant(1.0,shape=[channel_out]), name ="fc_biases")
            return tf.nn.relu(tf.add(tf.matmul(data, fc_weights), fc_biases))
        
      # Input data.
      tf_train_dataset = tf.placeholder(
        tf.float32, shape=(batch_size, image_size, image_size, channel_in))
      tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
      tf_valid_dataset = tf.constant(valid_dataset)
      tf_test_dataset = tf.constant(test_dataset)
      
        # Variables.
      channel_out = { 1 : depth , 2 : depth*4 , 3 : num_hidden , 4 : num_labels }
      weights = {11 : weight_var([ 1, 1, channel_in, channel_out[1] ]),
                 13 : weight_var([ 3, 3, channel_in, channel_out[1] ]),
                 15 : weight_var([ 5, 5, channel_in, channel_out[1] ]),
                 17 : weight_var([ 5, 5, channel_in, channel_out[1] ]),                 
                 21 : weight_var([ 1 , 1, channel_out[1]*4 ,channel_out[2] ]),
                 23 : weight_var([ 3 , 3, channel_out[1]*4 ,channel_out[2] ]),
                 25 : weight_var([ 5 , 5, channel_out[1]*4 ,channel_out[2] ]),
                 27 : weight_var([ 7 , 7, channel_out[1]*4 ,channel_out[2] ]),
                 
                 201 : weight_var([patch_size, patch_size, channel_out[2]*4, 256 ]),
                 3 : weight_var([256, num_hidden]),
                 4 : weight_var([num_hidden, num_labels]),
                }
      biases = {1 : bias_var( [channel_in] ),
                2 : bias_var( [channel_out[1]*4] ),                
                201: bias_var( [256] ),
                3 : bias_var( [num_hidden] ),
                4 : bias_var( [num_labels] ),
               }          
         
    # Image -> Conv+MaxPool -> Conv+MaxPool -> FC-> O/p
      data = tf_train_dataset
      def model(data):       
            print('Using  Vanilla With no max pool. Drop out not supported yet')            
            strides = [1, 2, 2, 1]   # A stride of sliding windows for each dimension. Here two dimension hence 4 stride param                 
            conv1x1_1 = conv_layer(data, weights[11], biases[1], strides,'SAME'  )
            conv3x3_1 = conv_layer(data, weights[13], biases[1], strides,'SAME'  )
            conv5x5_1 = conv_layer(data, weights[15], biases[1], strides,'SAME'  )
            conv7x7_1 = conv_layer(data, weights[17], biases[1], strides,'SAME'  )
            conv1 = tf.concat([conv1x1_1, conv3x3_1, conv5x5_1, conv7x7_1 ],3)
          
            conv1x1_2 = conv_layer(conv1, weights[21], biases[2], strides,'SAME'  )
            conv3x3_2 = conv_layer(conv1, weights[23], biases[2], strides,'SAME'  )
            conv5x5_2 = conv_layer(conv1, weights[25], biases[2], strides,'SAME'  )
            conv7x7_2 = conv_layer(conv1, weights[27], biases[2], strides,'SAME'  )
            conv2 = tf.concat([conv1x1_2, conv3x3_2, conv5x5_2, conv7x7_2 ],3)
            
            conv3 = conv_layer(conv2, weights[201], biases[201], strides,'SAME'  )
            maxpool2 = conv3
            shape = maxpool2.get_shape().as_list()
            reshape = tf.reshape(maxpool2, [shape[0],shape[1] * shape[2] * shape[3] ])
            fc = tf.nn.relu(tf.nn.bias_add(tf.matmul(reshape, weights[3]) , biases[3]))
            # adding dropout
            #keep_prob = tf.placeholder(tf.float32)            
            #fc = tf.nn.dropout(fc,  keep_prob)
            return tf.nn.bias_add(tf.matmul(fc, weights[4]), biases[4] )        
              # Training computation.
      logits = model(tf_train_dataset)
      loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(labels=tf_train_labels, logits=logits))      
                # Optimizer.
      # l2 regularisation
      if(beta != 0):
          print('L2 reg invoked')
          #regularizer = 0.000
          #for k,v in weights.items():
          #      v = tf.Print(v,[v]) 
          regularizer = tf.nn.l2_loss(weights[11]) + tf.nn.l2_loss(weights[13]) + \
                        tf.nn.l2_loss(weights[15]) + tf.nn.l2_loss(weights[17]) + \
                        tf.nn.l2_loss(weights[21]) + tf.nn.l2_loss(weights[23]) + \
                        tf.nn.l2_loss(weights[25]) + tf.nn.l2_loss(weights[27]) + \
                        tf.nn.l2_loss(weights[3]) + tf.nn.l2_loss(weights[4]) 
          loss = tf.reduce_mean(loss + beta* regularizer )
      else:
          print('NO L2 reg invoked. To use it, set flag => l2_reg=True')
        
      if(learning_decay_impl == True):
          print('''Decaying learning rate Implemented''')
          global_step = tf.Variable(0)  # count the number of steps taken.
          start_learning_rate = 0.5
          learning_rate = tf.train.exponential_decay(start_learning_rate, global_step, 100000, 0.96, staircase=True)
          #optimizer = tf.train.AdamOptimizer(learning_rate).minimize(loss, global_step=global_step)
          optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)
      else:
        print('''NO Decaying learning rate Implemented''')
        optimizer = GetOptimiser(tf, gradient_optimizer, loss, rate = learning_rate)
      #optimizer = tf.train.GradientDescentOptimizer(0.05).minimize(loss)
      #optimizer = tf.train.AdamOptimizer(0.0001).minimize(loss)
              # Predictions for the training, validation, and test data.
      train_prediction = tf.nn.softmax(logits)
      valid_prediction = tf.nn.softmax( model(tf_valid_dataset))
      test_prediction = tf.nn.softmax( model(tf_test_dataset))
    #print('TensorFlow Graph Initialisation completed at ',datetime.datetime.now())
    
#def RunTfModel(graph, num_steps, batch_size, report_interval_steps = 50, random_seed = 12  ):        
    ################
    ### Run TF model
    ###############
    start_time = datetime.datetime.now()
    with tf.Session(graph=graph) as session:
      #tf.global_variables_initializer().run()
      tf.initialize_all_variables().run()
      print('Initialized')
      for step in range(num_steps):
        random.seed(random_seed*step)    
        offset = random.randint( 1, train_labels.shape[0] - batch_size )        
        #offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
        batch_data = train_dataset[offset:(offset + batch_size), :, :, :]
        batch_labels = train_labels[offset:(offset + batch_size), :]
        feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
        _, l, predictions = session.run([optimizer, loss, train_prediction], feed_dict=feed_dict)        
        if (step % report_interval_steps == 0):
            print('Step %4d :: Minibatch ==> Loss: %f .Accuracy: %.1f%% .Validation Acc.: %.1f%% DataOffset:%7d'\
                    % (step, l, accuracy(predictions, batch_labels), accuracy(valid_prediction.eval(), valid_labels), offset ))
            print('Time ', datetime.datetime.now() )
      #print('Test accuracy: %.1f%%' % accuracy(test_prediction.eval(), test_labels))
      test_prediction_0n9_percent =  test_prediction.eval()
      res = pd.DataFrame(np.argmax(test_prediction_0n9_percent,1))
      res.index += 1
      res.to_csv('results.csv')     
      #tf.reset_default_graph()
      session.close()    
    tf.reset_default_graph()    
    print('Total Time taken', datetime.datetime.now() - start_time)
    
def RunNN(num_steps = 101, batch_size = 16, patch_size = 5, depth = 16, num_hidden = 64 \
         , report_interval_steps = 50, random_seed = 12, gradient_optimizer = 'None'\
         , learning_rate = 0.0001, beta = 0, learning_decay_impl = False):        
    InitNRunTfModel( num_steps = num_steps, batch_size = batch_size, patch_size = patch_size, \
            depth = depth, num_hidden = num_hidden , report_interval_steps = report_interval_steps,\
            random_seed = random_seed, gradient_optimizer= gradient_optimizer,\
            learning_rate = learning_rate, beta = beta, learning_decay_impl = learning_decay_impl)
    #RunTfModel(graph, num_steps = num_steps , batch_size = batch_size ,report_interval_steps = report_interval_steps, random_seed = random_seed  )
    
print('NN Function Init Completed', datetime.datetime.now()) 

In [26]:
# to do : implement elu to relu
#change plain plus to tf.add functions # no significant imporvement in performance
# reduce learning rate to 0.0001 because learning seems to plateau at 96 %. Go to 0.00001 if needed.
#st = datetime.datetime.now()
st = datetime.datetime.now()
#RunNN( num_steps = 10, batch_size = 20, patch_size = 5, depth = 16, num_hidden = 64 \
#            , report_interval_steps = 5 , random_seed = 12, gradient_optimizer = 'Adam'\
#      , learning_rate = 0.001, beta = 0.01, learning_decay_impl = True  )
print('Total time taken',datetime.datetime.now()- st)

patch size  5
Using  Vanilla With no max pool. Drop out not supported yet
Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See @{tf.nn.softmax_cross_entropy_with_logits_v2}.

L2 reg invoked
Decaying learning rate Implemented
Using  Vanilla With no max pool. Drop out not supported yet
Using  Vanilla With no max pool. Drop out not supported yet
Instructions for updating:
Use `tf.global_variables_initializer` instead.
Initialized
Step    0 :: Minibatch ==> Loss: 24.527287 .Accuracy: 15.0% .Validation Acc.: 10.3% DataOffset:  25248
Step    5 :: Minibatch ==> Loss: 3512.427979 .Accuracy: 5.0% .Validation Acc.: 10.0% DataOffset:  20173
Total Time taken 0:01:45.120737
Total time taken 0:02:39.045693


### 5.0.3 Run Vanilla ConvNet
Test across diff patch size

In [38]:
st = datetime.datetime.now()
#RunNN( num_steps = 120000, batch_size = 20, patch_size = 5, depth = 16, num_hidden = 64 \
#            , report_interval_steps = 5000 , random_seed = 12, gradient_optimizer = 'Adam'\
#      , learning_rate = 0.001, learning_decay_impl = False ,beta = 0 )
print('Total time taken',datetime.datetime.now()- st)
#python operator plus = 59s, 54s, 57s - 95.7%

patch size  5
Using  Vanilla With no max pool. Drop out not supported yet
L2 reg invoked
NO Decaying learning rate Implemented
SELECTED OPTIMISER :: Adam with learning rate -  0.001
Using  Vanilla With no max pool. Drop out not supported yet
Using  Vanilla With no max pool. Drop out not supported yet
Initialized
Step    0 :: Minibatch ==> Loss: 24.425711 .Accuracy: 15.0% .Validation Acc.: 9.9% DataOffset:  25248
Time  2018-05-21 02:58:12.118533
Step 5000 :: Minibatch ==> Loss: 0.840437 .Accuracy: 80.0% .Validation Acc.: 91.8% DataOffset:  27308
Time  2018-05-21 03:09:59.419559
Step 10000 :: Minibatch ==> Loss: 0.403290 .Accuracy: 85.0% .Validation Acc.: 93.1% DataOffset:   5118
Time  2018-05-21 03:21:28.593204
Step 15000 :: Minibatch ==> Loss: 0.168340 .Accuracy: 100.0% .Validation Acc.: 93.3% DataOffset:   8704
Time  2018-05-21 03:33:00.676229
Step 20000 :: Minibatch ==> Loss: 0.432823 .Accuracy: 90.0% .Validation Acc.: 94.9% DataOffset:  19073
Time  2018-05-21 03:44:16.294486
Step 25

In [32]:
st = datetime.datetime.now()
#RunNN( num_steps = 50000, batch_size = 20, patch_size = 5, depth = 16, num_hidden = 64 \
#            , report_interval_steps = 3000 , random_seed = 12, gradient_optimizer = 'Adam'\
#      , learning_rate = 0.001, learning_decay_impl = False,l2_reg = True )
print('Total time taken',datetime.datetime.now()- st)
#python operator plus = 59s, 54s, 57s - 95.7%
# python two plus operator replaced with tf.nn.bias_add : 61s, 57s, 57s - 95.7%

patch size  5
Using  Vanilla With no max pool. Drop out not supported yet
L2 reg invoked
NO Decaying learning rate Implemented
SELECTED OPTIMISER :: Adam with learning rate -  0.001
Using  Vanilla With no max pool. Drop out not supported yet
Using  Vanilla With no max pool. Drop out not supported yet
Initialized
Step    0 :: Minibatch ==> Loss: 262.676086 .Accuracy: 10.0% .Validation Acc.: 10.3% DataOffset:  25248
Step 3000 :: Minibatch ==> Loss: 1.521674 .Accuracy: 90.0% .Validation Acc.: 91.7% DataOffset:  18896
Step 6000 :: Minibatch ==> Loss: 0.666630 .Accuracy: 100.0% .Validation Acc.: 94.7% DataOffset:  28157
Step 9000 :: Minibatch ==> Loss: 0.355961 .Accuracy: 95.0% .Validation Acc.: 95.5% DataOffset:  20336
Step 12000 :: Minibatch ==> Loss: 0.218071 .Accuracy: 100.0% .Validation Acc.: 96.2% DataOffset:  33444
Step 15000 :: Minibatch ==> Loss: 0.145269 .Accuracy: 100.0% .Validation Acc.: 95.2% DataOffset:   8704
Step 18000 :: Minibatch ==> Loss: 0.227120 .Accuracy: 95.0% .Valida

In [46]:
num_channels = 1
import tensorflow as tf
import datetime
def accuracy(predictions, labels):
  return (100.0 * np.sum(np.argmax(predictions, 1) == np.argmax(labels, 1)) / predictions.shape[0])
  # gives [True, False, True, True,...] based on whethe predictions and lables is equal or not
  #correct_prediction = tf.equal( tf.argmax(predictions, 1), tf.argmax(labels, 1)  )
  # cast converts bool to float [True, False, True, True] => [1,0,1,1]
  #return   tf.cast( (tf.reduce_mean(tf.cast(correct_prediction, tf.float32))), float)

print(datetime.datetime.now())

def GetOptimiser(tf, gradient_optimizer, loss, rate):    
    #print('gradient_optimizer is :: ',gradient_optimizer)
    if gradient_optimizer == 'GradientDescent':
        print('SELECTED OPTIMISER :: Gradient Descent optimisier with default rate - 0.05')
        return tf.train.GradientDescentOptimizer(0.05).minimize(loss)        
    elif  gradient_optimizer == 'Adam':
        print('SELECTED OPTIMISER :: Adam with learning rate - ', rate)
        return tf.train.AdamOptimizer(rate).minimize(loss)
    else:
        print('SELECTED OPTIMISER :: Invalid optimiser selected. Valid Optimiser : {GradientDescent | Adam}')
        return False

def InitNRunTfModel1(num_steps,  batch_size, patch_size, depth,  num_hidden,\
                report_interval_steps,  random_seed, gradient_optimizer, learning_rate, beta = 0.001):
    import tensorflow as tf
    import random
    print('patch size ', patch_size)
    graph = tf.Graph()
    with graph.as_default():
      # Input data.
      tf_train_dataset = tf.placeholder(
        tf.float32, shape=(batch_size, image_size, image_size, num_channels))
      tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
      tf_valid_dataset = tf.constant(valid_dataset)
      tf_test_dataset = tf.constant(test_dataset)
              # Variables.
      layer1_weights = tf.Variable(tf.truncated_normal([patch_size, patch_size, num_channels, depth], stddev=0.1))
      layer1_biases = tf.Variable(tf.zeros([depth]))
      layer2_weights = tf.Variable(tf.truncated_normal( [patch_size, patch_size, depth, depth], stddev=0.1))
      layer2_biases = tf.Variable(tf.constant(1.0, shape=[depth]))
      #layer3_weights = tf.Variable(tf.truncated_normal(
      #    [image_size // 4 * image_size // 4 * depth, num_hidden], stddev=0.1))
      #layer3_biases = tf.Variable(tf.constant(1.0, shape=[num_hidden]))
      layer3_weights = tf.Variable(tf.truncated_normal([64, num_hidden], stddev=0.1))
      layer3_biases = tf.Variable(tf.constant(1.0, shape=[num_hidden]))
      layer4_weights = tf.Variable(tf.truncated_normal(
          [num_hidden, num_labels], stddev=0.1))
      layer4_biases = tf.Variable(tf.constant(1.0, shape=[num_labels]))
               # Model.
      def conv2d(data, weights, biases, strides, padding = 'SAME'):
        conv = tf.nn.conv2d(data, weights, strides, padding='SAME') # sweep a 2-D filter over a batch of images, with tf.nn.conv2d  
        hidden = tf.nn.relu(tf.add(conv , biases))
        #hidden = tf.nn.elu(tf.add(conv , biases))
        return  hidden
      def maxpool2d(data , k=2):
        return tf.nn.max_pool(data, ksize = [1,k,k,1], strides = [1,k,k,1], padding = 'SAME')
      
      # Image -> Conv+MaxPool -> Conv+MaxPool -> FC-> O/p
      def model(data):       
            print('Using  Vanilla With no max pool. Drop out not supported yet')
            strides = [1, 2, 2, 1]   # A stride of sliding windows for each dimension. Here two dimension hence 4 stride param     
            conv1 = conv2d(data, layer1_weights, layer1_biases, strides, 'SAME')
            maxpool1 = maxpool2d(conv1, 2)
            conv2 = conv2d( maxpool1, layer2_weights, layer2_biases, strides, 'SAME')
            maxpool2 = maxpool2d(conv2, 2)
            #conv = tf.nn.conv2d(data, layer1_weights, strides, padding='SAME') # sweep a 2-D filter over a batch of images, with tf.nn.conv2d  
            #hidden = tf.nn.relu(tf.nn.bias_add(conv , layer1_biases))
            #conv = tf.nn.conv2d(hidden, layer2_weights, strides, padding='SAME')
            #hidden = tf.nn.relu(tf.nn.bias_add(conv , layer2_biases))           
            shape = maxpool2.get_shape().as_list()
            #reshape = tf.reshape(maxpool2, [shape[0], shape[1] * shape[2] * shape[3]])
            reshape = tf.reshape(maxpool2, [shape[0],shape[1] * shape[2] * shape[3] ])
            fc = tf.nn.relu(tf.nn.bias_add(tf.matmul(reshape, layer3_weights) , layer3_biases))
            return tf.nn.bias_add(tf.matmul(fc, layer4_weights) , layer4_biases)        
              # Training computation.
      logits = model(tf_train_dataset)
      loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(labels=tf_train_labels, logits=logits))      
                # Optimizer.
      print('L2 reg invoked')
      #regularizer = 0.000
      #for k,v in weights.items():
      #      v = tf.Print(v,[v]) 
      regularizer = tf.nn.l2_loss( layer1_weights ) + tf.nn.l2_loss(layer2_weights ) + \
                    tf.nn.l2_loss( layer3_weights ) + tf.nn.l2_loss(layer4_weights ) 
                    #tf.nn.l2_loss( layer1_weights ) + tf.nn.l2_loss(layer2_weights ) + \
      #regularizer = tf.Print(regularizer,[regularizer])      
      #loss_no_reg = loss
      loss = tf.reduce_mean(loss + beta* regularizer )
      #optimizer = GetOptimiser(tf, gradient_optimizer, loss, rate = learning_rate)
      #optimizer = tf.train.GradientDescentOptimizer(0.05).minimize(loss)
      optimizer = tf.train.AdamOptimizer(learning_rate).minimize(loss)
              # Predictions for the training, validation, and test data.
      train_prediction = tf.nn.softmax(logits)
      valid_prediction = tf.nn.softmax( model(tf_valid_dataset))
      test_prediction = tf.nn.softmax( model(tf_test_dataset))
    #print('TensorFlow Graph Initialisation completed at ',datetime.datetime.now())
    
#def RunTfModel(graph, num_steps, batch_size, report_interval_steps = 50, random_seed = 12  ):        
    ################
    ### Run TF model
    ###############
    start_time = datetime.datetime.now()
    with tf.Session(graph=graph) as session:
      #tf.global_variables_initializer().run()
      tf.initialize_all_variables().run()
      print('Initialized')
      for step in range(num_steps):
        random.seed(random_seed*step)    
        offset = random.randint( 1, train_labels.shape[0] - batch_size )        
        #offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
        batch_data = train_dataset[offset:(offset + batch_size), :, :, :]
        batch_labels = train_labels[offset:(offset + batch_size), :]
        feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
        _, l, predictions = session.run([optimizer, loss, train_prediction], feed_dict=feed_dict)        
        if (step % report_interval_steps == 0):            
            print('Step %4d :: Minibatch ==> Loss: %f .Accuracy: %.1f%% .Validation Acc.: %.1f%% DataOffset:%7d'\
                    % (step, l, accuracy(predictions, batch_labels), accuracy(valid_prediction.eval(), valid_labels), offset ))          
            print('Time ', datetime.datetime.now() )                        
      #print('Test accuracy: %.1f%%' % accuracy(test_prediction.eval(), test_labels))
      test_prediction_0n9_percent =  test_prediction.eval()
      res = pd.DataFrame(np.argmax(test_prediction_0n9_percent,1))
      res.index += 1
      res.to_csv('results.csv')     
      #tf.reset_default_graph()
      session.close()    
    tf.reset_default_graph()    
    print('Training Completed. Total Time taken', datetime.datetime.now() - start_time)
    
    
def RunNN1(num_steps = 101, batch_size = 16, patch_size = 5, depth = 16, num_hidden = 64 \
         , report_interval_steps = 50, random_seed = 12, gradient_optimizer = 'None'\
         , learning_rate = 0.0001,beta = 0.001):        
    InitNRunTfModel1( num_steps = num_steps, batch_size = batch_size, patch_size = patch_size, \
            depth = depth, num_hidden = num_hidden , report_interval_steps = report_interval_steps,\
            random_seed = random_seed, gradient_optimizer= gradient_optimizer, learning_rate = learning_rate, beta=0.001)
    #RunTfModel(graph, num_steps = num_steps , batch_size = batch_size ,report_interval_steps = report_interval_steps, random_seed = random_seed  )
    
print('NN Function Init Completed', datetime.datetime.now()) 

2018-05-23 22:26:39.871248
NN Function Init Completed 2018-05-23 22:26:39.872545


In [5]:
import skopt
from skopt import gp_minimize, forest_minimize
from skopt.space import Real, Categorical, Integer
#from skopt.plots import plot_convergence
#from skopt.plots import plot_objective, plot_evaluations
#from skopt.plots import plot_histogram, plot_objective_2D
from skopt.utils import use_named_args

In [3]:
# Defining HyperParamter Search Boundaries
dim_learning_rate = Real(low=1e-6, high=1e-2, prior='log-uniform', name='learning_rate')
dim_num_dense_layers = Integer(low=1, high=5, name='num_dense_layers')
dim_num_dense_nodes = Integer(low=5, high=512, name='num_dense_nodes')
dim_activation = Categorical(categories=['relu', 'sigmoid'], name='activation')


NameError: name 'Real' is not defined

In [47]:
st = datetime.datetime.now()
RunNN1( num_steps = 300000, batch_size = 40, patch_size = 5, depth = 16, num_hidden = 64 \
            , report_interval_steps = 10000 , random_seed = 12, gradient_optimizer = 'Adam'\
      , learning_rate = 0.001, beta = 0)
print('Total time taken',datetime.datetime.now()- st)


patch size  5
Using  Vanilla With no max pool. Drop out not supported yet
Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See @{tf.nn.softmax_cross_entropy_with_logits_v2}.

L2 reg invoked
Using  Vanilla With no max pool. Drop out not supported yet
Using  Vanilla With no max pool. Drop out not supported yet
Instructions for updating:
Use `tf.global_variables_initializer` instead.
Initialized
Step    0 :: Minibatch ==> Loss: 2.710370 .Accuracy: 5.0% .Validation Acc.: 9.8% DataOffset:  55341
Time  2018-05-23 22:27:07.741669
Step 10000 :: Minibatch ==> Loss: 0.151283 .Accuracy: 95.0% .Validation Acc.: 98.3% DataOffset:   5118
Time  2018-05-23 22:29:03.280594
Step 20000 :: Minibatch ==> Loss: 0.100629 .Accuracy: 100.0% .Validation Acc.: 98.3% DataOffset:  19073
Time  2018-05-23 22:31:52.301506
Step 30000 :: Minibatch ==> Loss: 0.105723 .Accuracy: 97.5% .Validation Acc.: 98.7% DataOffset:  58814
Time

KeyboardInterrupt: 

### Using Tflearn :
A simple python library built on top of tensorflow
