# M2177.003100 Deep Learning <br> Assignment #1 Part 3: Playing with Neural Networks by TensorFlow

Copyright (C) Data Science & AI Laboratory, Seoul National University. This material is for educational uses only. Some contents are based on the material provided by other paper/book authors and may be copyrighted by them. 

Previously in `Assignment2-1_Data_Curation.ipynb`, we created a pickle with formatted datasets for training, development and testing on the [notMNIST dataset](http://yaroslavvb.blogspot.com/2011/09/notmnist-dataset.html).

The goal of this assignment is to progressively train deeper and more accurate models using TensorFlow.

**Note**: certain details are missing or ambiguous on purpose, in order to test your knowledge on the related materials. However, if you really feel that something essential is missing and cannot proceed to the next step, then contact the teaching staff with clear description of your problem.

### Submitting your work:
<font color=red>**DO NOT clear the final outputs**</font> so that TAs can grade both your code and results.  
Once you have done **part 1 - 3**, run the *CollectSubmission.sh* script with your **Student number** as input argument. <br>
This will produce a compressed file called *[Your student number].tar.gz*. Please submit this file on ETL. &nbsp;&nbsp; (Usage: ./*CollectSubmission.sh* &nbsp; 20\*\*-\*\*\*\*\*)

In [2]:
## load modules in google colab environment
## run this cell only when using google colab
!pip install termcolor
from termcolor import colored

## print command in different color
def print_terminal_command(command) :
  print(colored('@ ' + command, 'green'))
  

print_terminal_command('pwd')
!pwd
print_terminal_command('ls -al')
!ls -al
print_terminal_command('mkdir -p data')
!mkdir -p data
print_terminal_command('ls -al')
!ls -al

# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import matplotlib.pyplot as plt
import numpy as np
import os
import sys
import tarfile
from IPython.display import display, Image
from scipy import ndimage
from sklearn.linear_model import LogisticRegression
from six.moves.urllib.request import urlretrieve
from six.moves import cPickle as pickle

# Config the matplotlib backend as plotting inline in IPython
%matplotlib inline
# PLEASE Comment this line on submission


url = 'https://commondatastorage.googleapis.com/books1000/'
last_percent_reported = None
data_root = './data' # Change me to store data elsewhere

def download_progress_hook(count, blockSize, totalSize):
    """A hook to report the progress of a download. This is mostly intended for users with
    slow internet connections. Reports every 5% change in download progress.
    """
    global last_percent_reported
    percent = int(count * blockSize * 100 / totalSize)

    if last_percent_reported != percent:
        if percent % 5 == 0:
            sys.stdout.write("%s%%" % percent)
            sys.stdout.flush()
        else:
            sys.stdout.write(".")
            sys.stdout.flush()

    last_percent_reported = percent
        
def maybe_download(filename, expected_bytes, force=False):
    """Download a file if not present, and make sure it's the right size."""
    dest_filename = os.path.join(data_root, filename)
    if force or not os.path.exists(dest_filename):
        print('Attempting to download:', filename) 
        filename, _ = urlretrieve(url + filename, dest_filename, reporthook=download_progress_hook)
        print('\nDownload Complete!')
    statinfo = os.stat(dest_filename)
    if statinfo.st_size == expected_bytes:
        print('Found and verified', dest_filename)
    else:
        raise Exception(
          'Failed to verify ' + dest_filename + '. Can you get to it with a browser?')
    return dest_filename

train_filename = maybe_download('notMNIST_large.tar.gz', 247336696)
test_filename = maybe_download('notMNIST_small.tar.gz', 8458043)

num_classes = 10
np.random.seed(133)

def maybe_extract(filename, force=False):
    root = os.path.splitext(os.path.splitext(filename)[0])[0]  # remove .tar.gz
    if os.path.isdir(root) and not force:
    # You may override by setting force=True.
        print('%s already present - Skipping extraction of %s.' % (root, filename))
    else:
        print('Extracting data for %s. This may take a while. Please wait.' % root)
        tar = tarfile.open(filename)
        sys.stdout.flush()
        tar.extractall(data_root)
        tar.close()
    data_folders = [
        os.path.join(root, d) for d in sorted(os.listdir(root))
        if os.path.isdir(os.path.join(root, d))]
    if len(data_folders) != num_classes:
        raise Exception(
          'Expected %d folders, one per class. Found %d instead.' % (
            num_classes, len(data_folders)))
    print(data_folders)
    return data_folders
  
train_folders = maybe_extract(train_filename)
test_folders = maybe_extract(test_filename)

image_size = 28  # Pixel width and height.
pixel_depth = 255.0  # Number of levels per pixel.

def load_letter(folder, min_num_images):
    """Load the data for a single letter label."""
    image_files = os.listdir(folder)
    dataset = np.ndarray(shape=(len(image_files), image_size, image_size),
                         dtype=np.float32)
    print(folder)
    num_images = 0
    for image in image_files:
        image_file = os.path.join(folder, image)
        try:
            image_data = (ndimage.imread(image_file).astype(float) - 
                        pixel_depth / 2) / pixel_depth
            if image_data.shape != (image_size, image_size):
                raise Exception('Unexpected image shape: %s' % str(image_data.shape))
            dataset[num_images, :, :] = image_data
            num_images = num_images + 1
        except IOError as e:
            print('Could not read:', image_file, ':', e, '- it\'s ok, skipping.')

    dataset = dataset[0:num_images, :, :]
    if num_images < min_num_images:
        raise Exception('Many fewer images than expected: %d < %d' %
                        (num_images, min_num_images))

    print('Full dataset tensor:', dataset.shape)
    print('Mean:', np.mean(dataset))
    print('Standard deviation:', np.std(dataset))
    return dataset
        
def maybe_pickle(data_folders, min_num_images_per_class, force=False):
    dataset_names = []
    for folder in data_folders:
        set_filename = folder + '.pickle'
        dataset_names.append(set_filename)
        if os.path.exists(set_filename) and not force:
          # You may override by setting force=True.
          print('%s already present - Skipping pickling.' % set_filename)
        else:
            print('Pickling %s.' % set_filename)
            dataset = load_letter(folder, min_num_images_per_class)
            try:
                with open(set_filename, 'wb') as f:
                    pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL)
            except Exception as e:
                print('Unable to save data to', set_filename, ':', e)

    return dataset_names

train_datasets = maybe_pickle(train_folders, 45000)
test_datasets = maybe_pickle(test_folders, 1800)

def make_arrays(nb_rows, img_size):
    if nb_rows:
        dataset = np.ndarray((nb_rows, img_size, img_size), dtype=np.float32)
        labels = np.ndarray(nb_rows, dtype=np.int32)
    else:
        dataset, labels = None, None
    return dataset, labels

def merge_datasets(pickle_files, train_size, valid_size=0):
    num_classes = len(pickle_files)
    valid_dataset, valid_labels = make_arrays(valid_size, image_size)
    train_dataset, train_labels = make_arrays(train_size, image_size)
    vsize_per_class = valid_size // num_classes
    tsize_per_class = train_size // num_classes

    start_v, start_t = 0, 0
    end_v, end_t = vsize_per_class, tsize_per_class
    end_l = vsize_per_class+tsize_per_class
    for label, pickle_file in enumerate(pickle_files):       
        try:
            with open(pickle_file, 'rb') as f:
                letter_set = pickle.load(f)
                # let's shuffle the letters to have random validation and training set
                np.random.shuffle(letter_set)
                if valid_dataset is not None:
                    valid_letter = letter_set[:vsize_per_class, :, :]
                    valid_dataset[start_v:end_v, :, :] = valid_letter
                    valid_labels[start_v:end_v] = label
                    start_v += vsize_per_class
                    end_v += vsize_per_class

                train_letter = letter_set[vsize_per_class:end_l, :, :]
                train_dataset[start_t:end_t, :, :] = train_letter
                train_labels[start_t:end_t] = label
                start_t += tsize_per_class
                end_t += tsize_per_class
        except Exception as e:
            print('Unable to process data from', pickle_file, ':', e)
            raise

    return valid_dataset, valid_labels, train_dataset, train_labels

            
train_size = 200000
valid_size = 10000
test_size = 10000

valid_dataset, valid_labels, train_dataset, train_labels = merge_datasets(
  train_datasets, train_size, valid_size)
_, _, test_dataset, test_labels = merge_datasets(test_datasets, test_size)

print('Training:', train_dataset.shape, train_labels.shape)
print('Validation:', valid_dataset.shape, valid_labels.shape)
print('Testing:', test_dataset.shape, test_labels.shape)

def randomize(dataset, labels):
    permutation = np.random.permutation(labels.shape[0])
    shuffled_dataset = dataset[permutation,:,:]
    shuffled_labels = labels[permutation]
    return shuffled_dataset, shuffled_labels
train_dataset, train_labels = randomize(train_dataset, train_labels)
test_dataset, test_labels = randomize(test_dataset, test_labels)
valid_dataset, valid_labels = randomize(valid_dataset, valid_labels)


pickle_file = os.path.join(data_root, 'notMNIST.pickle')

try:
    f = open(pickle_file, 'wb')
    save = {
        'train_dataset': train_dataset,
        'train_labels': train_labels,
        'valid_dataset': valid_dataset,
        'valid_labels': valid_labels,
        'test_dataset': test_dataset,
        'test_labels': test_labels,
    }
    pickle.dump(save, f, pickle.HIGHEST_PROTOCOL)
    f.close()
except Exception as e:
    print('Unable to save data to', pickle_file, ':', e)
    raise

[32m@ pwd[0m
/content
[32m@ ls -al[0m
total 16
drwxr-xr-x 1 root root 4096 Nov 29 18:21 .
drwxr-xr-x 1 root root 4096 Dec  3 22:06 ..
drwxr-xr-x 1 root root 4096 Nov 29 18:21 .config
drwxr-xr-x 2 root root 4096 Nov 29 18:21 sample_data
[32m@ mkdir -p data[0m
[32m@ ls -al[0m
total 20
drwxr-xr-x 1 root root 4096 Dec  3 22:11 .
drwxr-xr-x 1 root root 4096 Dec  3 22:06 ..
drwxr-xr-x 1 root root 4096 Nov 29 18:21 .config
drwxr-xr-x 2 root root 4096 Dec  3 22:11 data
drwxr-xr-x 2 root root 4096 Nov 29 18:21 sample_data
Attempting to download: notMNIST_large.tar.gz
0%....5%....10%....15%....20%....25%....30%....35%....40%....45%....50%....55%....60%....65%....70%....75%....80%....85%....90%....95%....100%
Download Complete!
Found and verified ./data/notMNIST_large.tar.gz
Attempting to download: notMNIST_small.tar.gz
0%....5%....10%....15%....20%....25%....30%....35%....40%....45%....50%....55%....60%....65%....70%....75%....80%....85%....90%....95%....100%
Download Complete!
Found and

`imread` is deprecated in SciPy 1.0.0.
Use ``matplotlib.pyplot.imread`` instead.


Could not read: ./data/notMNIST_large/A/Um9tYW5hIEJvbGQucGZi.png : cannot identify image file './data/notMNIST_large/A/Um9tYW5hIEJvbGQucGZi.png' - it's ok, skipping.
Could not read: ./data/notMNIST_large/A/SG90IE11c3RhcmQgQlROIFBvc3Rlci50dGY=.png : cannot identify image file './data/notMNIST_large/A/SG90IE11c3RhcmQgQlROIFBvc3Rlci50dGY=.png' - it's ok, skipping.
Could not read: ./data/notMNIST_large/A/RnJlaWdodERpc3BCb29rSXRhbGljLnR0Zg==.png : cannot identify image file './data/notMNIST_large/A/RnJlaWdodERpc3BCb29rSXRhbGljLnR0Zg==.png' - it's ok, skipping.
Full dataset tensor: (52909, 28, 28)
Mean: -0.12825006
Standard deviation: 0.44312054
Pickling ./data/notMNIST_large/B.pickle.
./data/notMNIST_large/B
Could not read: ./data/notMNIST_large/B/TmlraXNFRi1TZW1pQm9sZEl0YWxpYy5vdGY=.png : cannot identify image file './data/notMNIST_large/B/TmlraXNFRi1TZW1pQm9sZEl0YWxpYy5vdGY=.png' - it's ok, skipping.
Full dataset tensor: (52911, 28, 28)
Mean: -0.0075630425
Standard deviation: 0.45449144
P

## Load datasets

First reload the data we generated in `Assignment2-1_Data_Curation.ipynb`.

In [0]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import numpy as np
import tensorflow as tf
from six.moves import cPickle as pickle
from six.moves import range
import os

#configuration for gpu usage
conf = tf.ConfigProto()
conf.gpu_options.per_process_gpu_memory_fraction = 0.4
conf.gpu_options.allow_growth = True
os.environ['CUDA_VISIBLE_DEVICES']='0'

In [4]:
pickle_file = 'data/notMNIST.pickle'

with open(pickle_file, 'rb') as f:
    save = pickle.load(f)
    train_dataset = save['train_dataset']
    train_labels = save['train_labels']
    valid_dataset = save['valid_dataset']
    valid_labels = save['valid_labels']
    test_dataset = save['test_dataset']
    test_labels = save['test_labels']
    del save  # hint to help gc free up memory
    print('Training set', train_dataset.shape, train_labels.shape)
    print('Validation set', valid_dataset.shape, valid_labels.shape)
    print('Test set', test_dataset.shape, test_labels.shape)

Training set (200000, 28, 28) (200000,)
Validation set (10000, 28, 28) (10000,)
Test set (10000, 28, 28) (10000,)


Reformat into a shape that's more adapted to the models we're going to train:
- data as a flat matrix,
- labels as float 1-hot encodings.

In [5]:
image_size = 28
num_labels = 10

def reformat(dataset, labels):
    dataset = dataset.reshape((-1, image_size * image_size)).astype(np.float32)
    # Map 0 to [1.0, 0.0, 0.0 ...], 1 to [0.0, 1.0, 0.0 ...]
    labels = (np.arange(num_labels) == labels[:,None]).astype(np.float32)
    return dataset, labels
train_dataset, train_labels = reformat(train_dataset, train_labels)
valid_dataset, valid_labels = reformat(valid_dataset, valid_labels)
test_dataset, test_labels = reformat(test_dataset, test_labels)
print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)

Training set (200000, 784) (200000, 10)
Validation set (10000, 784) (10000, 10)
Test set (10000, 784) (10000, 10)


## TensorFlow tutorial: Fully Connected Network

We're first going to train a **fully connected network** with *1 hidden layer* with *1024 units* using stochastic gradient descent (SGD).

TensorFlow works like this:
* First you describe the computation that you want to see performed: what the inputs, the variables, and the operations look like. These get created as nodes over a computation graph. This description is all contained within the block below:

      with graph.as_default():
          ...

* Then you can run the operations on this graph as many times as you want by calling `session.run()`, providing it outputs to fetch from the graph that get returned. This runtime operation is all contained in the block below:

      with tf.Session(graph=graph) as session:
          ...

Let's load all the data into TensorFlow and build the computation graph corresponding to our training:

In [0]:
batch_size = 128
nn_hidden = 1024

graph = tf.Graph()
with graph.as_default():
    # Input data. For the training data, we use a placeholder that will be fed
    # at run time with a training minibatch.
    tf_dataset = tf.placeholder(tf.float32,
                                      shape=(None, image_size * image_size))
    tf_labels = tf.placeholder(tf.float32, shape=(None, num_labels))
    
    # Variables. 
    w1 = tf.Variable(tf.truncated_normal([image_size * image_size, nn_hidden]))
    b1 = tf.Variable(tf.zeros([nn_hidden]))
    w2 = tf.Variable(tf.truncated_normal([nn_hidden, num_labels]))
    b2 = tf.Variable(tf.zeros([num_labels]))
    
    # Training computation.
    hidden = tf.tanh(tf.matmul(tf_dataset, w1) + b1)
    logits = tf.matmul(hidden, w2) + b2
    
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits_v2(labels=tf_labels, logits=logits))
    
    # Optimizer.
    optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)
  
    # Predictions for the training, validation, and test data.
    prediction = tf.nn.softmax(logits)

Let's run this computation and iterate:

In [7]:
num_steps = 10000

def accuracy(predictions, labels):
    return (100.0 * np.sum(np.equal(np.argmax(predictions, 1), np.argmax(labels, 1)))
          / predictions.shape[0])

with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()
    print("Initialized")
    for step in range(num_steps):
        # Pick an offset within the training data, which has been randomized.
        # Note: we could use better randomization across epochs.
        offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
        # Generate a minibatch.
        batch_data = train_dataset[offset:(offset + batch_size), :]
        batch_labels = train_labels[offset:(offset + batch_size), :]
        # Prepare a dictionary telling the session where to feed the minibatch.
        # The key of the dictionary is the placeholder node of the graph to be fed,
        # and the value is the numpy array to feed to it.
        feed_dict_train={tf_dataset: batch_data, tf_labels: batch_labels}
        _, l, predictions = session.run([optimizer, loss, prediction], feed_dict=feed_dict_train)
        if (step % 1000 == 0):
            print("Minibatch loss at step %d: %f" % (step, l))
            print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
            valid_prediction = session.run(logits, feed_dict={tf_dataset: valid_dataset})
            print("Validation accuracy: %.1f%%" % accuracy(valid_prediction, valid_labels))
                  
    test_prediction = session.run(prediction, feed_dict={tf_dataset: test_dataset})
    print("Test accuracy: %.1f%%" % accuracy(test_prediction, test_labels))
    saver = tf.train.Saver()
    saver.save(session, "./model_checkpoints/my_model_final")

Initialized
Minibatch loss at step 0: 34.344711
Minibatch accuracy: 12.5%
Validation accuracy: 27.0%
Minibatch loss at step 1000: 3.208072
Minibatch accuracy: 77.3%
Validation accuracy: 78.9%
Minibatch loss at step 2000: 1.745303
Minibatch accuracy: 82.8%
Validation accuracy: 80.7%
Minibatch loss at step 3000: 1.150633
Minibatch accuracy: 78.9%
Validation accuracy: 81.8%
Minibatch loss at step 4000: 0.866558
Minibatch accuracy: 86.7%
Validation accuracy: 82.3%
Minibatch loss at step 5000: 1.392125
Minibatch accuracy: 81.2%
Validation accuracy: 78.6%
Minibatch loss at step 6000: 0.749648
Minibatch accuracy: 87.5%
Validation accuracy: 81.6%
Minibatch loss at step 7000: 0.611268
Minibatch accuracy: 88.3%
Validation accuracy: 81.6%
Minibatch loss at step 8000: 0.762344
Minibatch accuracy: 86.7%
Validation accuracy: 79.4%
Minibatch loss at step 9000: 0.984730
Minibatch accuracy: 85.2%
Validation accuracy: 81.4%
Test accuracy: 89.1%


So far, you have built the model in a naive way. However, TensorFlow provides a module named tf.layers for your convenience. 

From now on, build the same model as above using layers module.

In [0]:
graph_l=tf.Graph()
with graph_l.as_default():
    tf_dataset_l=tf.placeholder(tf.float32, shape=(None, image_size * image_size))
    tf_labels_l=tf.placeholder(tf.float32, shape=(None, num_labels))
    
    dense = tf.layers.dense(tf_dataset_l, nn_hidden, activation=tf.tanh)
    logits_l = tf.layers.dense(dense, num_labels)
    
    #Loss
    loss_l = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels=tf_labels_l, logits=logits_l))
    
    #Optimizer
    optimizer_l = tf.train.GradientDescentOptimizer(0.5).minimize(loss_l)
    
    #Predictions for the training
    prediction_l = tf.nn.softmax(logits_l)

In [9]:
with tf.Session(graph=graph_l, config=conf) as session_l:
    tf.global_variables_initializer().run()
    print("Initialized")
    for step in range(num_steps):
        offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
        batch_data = train_dataset[offset:(offset + batch_size), :]
        batch_labels = train_labels[offset:(offset + batch_size), :].astype(float)
        feed_dict_l = {tf_dataset_l: batch_data, tf_labels_l: batch_labels}
        _, l_l, predictions_l = session_l.run([optimizer_l, loss_l, prediction_l], feed_dict=feed_dict_l)
        if(step % 1000 == 0):
            print("Minibatch loss at step %d: %f" % (step, l_l))
            feed_dict_val_l = {tf_dataset_l: valid_dataset}
            valid_prediction_l = session_l.run(prediction_l, feed_dict={tf_dataset_l: valid_dataset, tf_labels_l: valid_labels})
            print("Validation accuracy: %.1f%%" % accuracy(valid_prediction_l, valid_labels))

    feed_dict_test_l = {tf_dataset_l: test_dataset}
    test_prediction_l = session_l.run(prediction_l, feed_dict=feed_dict_test_l)
    print("Test accuracy: %.1f%%" % accuracy(test_prediction_l, test_labels))
    saver = tf.train.Saver()
    saver.save(session_l, "./model_checkpoints/my_model_final_using_layers")

Initialized
Minibatch loss at step 0: 2.388520
Validation accuracy: 44.1%
Minibatch loss at step 1000: 0.584907
Validation accuracy: 83.7%
Minibatch loss at step 2000: 0.471201
Validation accuracy: 85.3%
Minibatch loss at step 3000: 0.424750
Validation accuracy: 85.7%
Minibatch loss at step 4000: 0.354131
Validation accuracy: 86.4%
Minibatch loss at step 5000: 0.482221
Validation accuracy: 86.7%
Minibatch loss at step 6000: 0.330496
Validation accuracy: 87.6%
Minibatch loss at step 7000: 0.418526
Validation accuracy: 87.3%
Minibatch loss at step 8000: 0.255093
Validation accuracy: 87.6%
Minibatch loss at step 9000: 0.427875
Validation accuracy: 88.0%
Test accuracy: 93.3%


---
Problem 1
-------

**Describe below** why there is a difference in an accuracy between the graph using layer module and the graph which is built in a naive way.





---

In [10]:
print('temp')

temp


Describe here

---
Problem 2
-------

Try to get the best performance you can using a multi-layer model! (It doesn't matter whether you implement it in a naive way or using layer module. HOWEVER, you CANNOT use other type of layers such as conv.) 

The best reported test accuracy using a deep network is [97.1%](http://yaroslavvb.blogspot.kr/2011/09/notmnist-dataset.html?showComment=1391023266211#c8758720086795711595). You may use techniques below.

1. Experiment with different hyperparameters: num_steps, learning rate, etc.
2. We used a fixed learning rate $\epsilon$ for gradient descent. Implement an annealing schedule for the gradient descent learning rate ([more info](http://cs231n.github.io/neural-networks-3/#anneal)). *Hint*. Try using `tf.train.exponential_decay`.    
3. We used a $\tanh$ activation function for our hidden layer. Experiment with other activation functions included in TensorFlow.
4. Extend the network to multiple hidden layers. Experiment with the layer sizes. Adding another hidden layer means you will need to adjust the code. 
5. Introduce and tune regularization method (e.g. L2 regularization) for your model. Remeber that L2 amounts to adding a penalty on the norm of the weights to the loss. In TensorFlow, you can compute the L2 loss for a tensor `t` using `nn.l2_loss(t)`. The right amount of regularization should imporve your validation / test accuracy.
6. Introduce Dropout on the hidden layer of the neural network. Remember: Dropout should only be introduced during training, not evaluation, otherwise your evaluation results would be stochastic as well. TensorFlow provides nn.dropout() for that, but you have to make sure it's only inserted during training.

**Evaluation:** You will get full credit if your best test accuracy exceeds 93%. Save your best perfoming model as my_model_final using saver. (Refer to the cell above) 

---

In [0]:
num_steps = 15000
nn_hdim = 1024
learning_rate=0.01
batch_size = 128
reg_lambda = 0.005

my_graph = tf.Graph()
with my_graph.as_default():
    input_data = tf.placeholder(tf.float32, shape=(None, image_size * image_size))
    input_labels = tf.placeholder(tf.float32, shape=(None, num_labels))
    is_training = tf.placeholder(tf.bool)
    
    
    # L2 regularizer
    l2_regularizer = tf.contrib.layers.l2_regularizer(scale=reg_lambda)
    hidden_layer = tf.layers.dense(inputs=input_data, units=nn_hdim, activation=tf.nn.elu, kernel_regularizer=l2_regularizer)
    hidden_layer = tf.layers.dropout(hidden_layer, training=is_training)
    output_layer = tf.layers.dense(inputs=hidden_layer, units=num_labels, kernel_regularizer=l2_regularizer)
    
    # logits
    logits = output_layer
    
    # loss
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels=input_labels, logits=logits))
    
    # optimizer
    optimizer = tf.train.AdamOptimizer(learning_rate).minimize(loss)
    
    # prediction
    prediction = tf.nn.softmax(logits)
    
    
def random_sample(data, labels, batch_size):
    random_index = np.random.permutation(len(data))[:batch_size]
    batch_data = data[random_index]
    batch_labels = labels[random_index]
    return batch_data, batch_labels
    

In [14]:
from itertools import product

nn_hdims = [1024, 2048]
learning_rates = [0.001, 0.01, 0.05]
batch_sizes = [128, 256, 512]
reg_lambdas = [0.001, 0.005, 0.01, 0.05]

result = {} # key: case, value: test accuracy
cases = list(product(nn_hdims, learning_rates, batch_sizes, reg_lambdas))

for j, case in enumerate(cases):
    nn_hdim, learning_rate, batch_size, reg_lambda = case


    with tf.Session(graph=my_graph, config=conf) as my_session:
        tf.global_variables_initializer().run()

        for i in range(num_steps):
            batch_data, batch_labels = random_sample(train_dataset, train_labels, batch_size)
            feed_dict = {
                input_data: batch_data,
                input_labels: batch_labels,
                is_training: True
            }
            train_loss, _, predictions = my_session.run([loss, optimizer, prediction], feed_dict=feed_dict)


            # print train performance(include validation performance) per 1000 steps
#             if i % 1000 == 0:
#                 train_accuracy = accuracy(predictions, batch_labels)
#                 print('step: %4d, train loss: %.4f, train accuracy: %.2f%%' % (i, train_loss, train_accuracy))
#                 valid_feed_dict = {
#                     input_data: valid_dataset,
#                     input_labels: valid_labels,
#                     is_training: False
#                 }
#                 valid_loss, valid_predictions = my_session.run([loss, prediction], feed_dict=valid_feed_dict)
#                 valid_accuracy = accuracy(valid_predictions, valid_labels)
#                 print('step: %4d, valid loss: %.4f, valid accuracy: %2.f%%' % (i, valid_loss, valid_accuracy))


        # test accuracy
        test_feed_dict = {
            input_data: test_dataset,
            input_labels: test_labels,
            is_training: False
        }
        test_predictions = my_session.run(prediction, feed_dict=test_feed_dict)
        test_accuracy = accuracy(test_predictions, test_labels)
        print(('case: ' + str(case)).ljust(32, ' '), 'total case: %d / %d, test accuracy: %.2f%%' % (j, len(cases), test_accuracy))
        result[case] = test_accuracy
     

case: (1024, 0.001, 128, 0.001)  total case: 0 / 72, test accuracy: 91.00%
case: (1024, 0.001, 128, 0.005)  total case: 1 / 72, test accuracy: 90.81%
case: (1024, 0.001, 128, 0.01)   total case: 2 / 72, test accuracy: 91.24%
case: (1024, 0.001, 128, 0.05)   total case: 3 / 72, test accuracy: 90.75%
case: (1024, 0.001, 256, 0.001)  total case: 4 / 72, test accuracy: 91.33%
case: (1024, 0.001, 256, 0.005)  total case: 5 / 72, test accuracy: 91.24%
case: (1024, 0.001, 256, 0.01)   total case: 6 / 72, test accuracy: 91.52%
case: (1024, 0.001, 256, 0.05)   total case: 7 / 72, test accuracy: 91.66%
case: (1024, 0.001, 512, 0.001)  total case: 8 / 72, test accuracy: 93.02%
case: (1024, 0.001, 512, 0.005)  total case: 9 / 72, test accuracy: 92.59%
case: (1024, 0.001, 512, 0.01)   total case: 10 / 72, test accuracy: 93.69%
case: (1024, 0.001, 512, 0.05)   total case: 11 / 72, test accuracy: 92.86%
case: (1024, 0.01, 128, 0.001)   total case: 12 / 72, test accuracy: 90.90%
case: (1024, 0.01, 128

In [32]:
temp = []
for key in result :
    temp.append((key, result[key]))
    
temp.sort(reverse=True, key=(lambda x: x[1]))


# extract good cases
cases = good_cases = [x[0] for x in temp if x[1] >= 93.5]
print('len(good_cases): ', len(good_cases))

len(good_cases):  7


In [34]:
num_steps = 20000

for case in cases:
    nn_hdim, learning_rate, batch_size, reg_lambda = case


    with tf.Session(graph=my_graph, config=conf) as my_session:
        tf.global_variables_initializer().run()

        for i in range(num_steps):
            batch_data, batch_labels = random_sample(train_dataset, train_labels, batch_size)
            feed_dict = {
                input_data: batch_data,
                input_labels: batch_labels,
                is_training: True
            }
            train_loss, _, predictions = my_session.run([loss, optimizer, prediction], feed_dict=feed_dict)


            # print train performance(include validation performance) per 1000 steps
#             if i % 1000 == 0:
#                 train_accuracy = accuracy(predictions, batch_labels)
#                 print('step: %4d, train loss: %.4f, train accuracy: %.2f%%' % (i, train_loss, train_accuracy))
#                 valid_feed_dict = {
#                     input_data: valid_dataset,
#                     input_labels: valid_labels,
#                     is_training: False
#                 }
#                 valid_loss, valid_predictions = my_session.run([loss, prediction], feed_dict=valid_feed_dict)
#                 valid_accuracy = accuracy(valid_predictions, valid_labels)
#                 print('step: %4d, valid loss: %.4f, valid accuracy: %2.f%%' % (i, valid_loss, valid_accuracy))


        # test accuracy
        test_feed_dict = {
            input_data: test_dataset,
            input_labels: test_labels,
            is_training: False
        }
        test_predictions = my_session.run(prediction, feed_dict=test_feed_dict)
        test_accuracy = accuracy(test_predictions, test_labels)
        print(('case: ' + str(case)).ljust(32, ' '), 'test accuracy: %.2f%%' % (test_accuracy))
        result[case] = test_accuracy
     

case: (2048, 0.01, 512, 0.001)   test accuracy: 93.82%
case: (2048, 0.001, 512, 0.05)   test accuracy: 94.06%
case: (1024, 0.001, 512, 0.01)   test accuracy: 93.81%
case: (1024, 0.01, 512, 0.01)    test accuracy: 93.29%
case: (1024, 0.05, 512, 0.01)    test accuracy: 93.54%
case: (2048, 0.001, 512, 0.01)   test accuracy: 93.79%
case: (2048, 0.01, 512, 0.005)   test accuracy: 93.85%
