In [1]:
!git clone https://github.com/argonne-lcf/ai-science-training-series.git
%cd ai-science-training-series/02_neural_networks_python/
%load_ext autoreload
%autoreload 2

Cloning into 'ai-science-training-series'...
remote: Enumerating objects: 1809, done.[K
remote: Counting objects: 100% (427/427), done.[K
remote: Compressing objects: 100% (158/158), done.[K
remote: Total 1809 (delta 308), reused 357 (delta 267), pack-reused 1382[K
Receiving objects: 100% (1809/1809), 202.41 MiB | 24.73 MiB/s, done.
Resolving deltas: 100% (891/891), done.
Checking out files: 100% (240/240), done.
/content/ai-science-training-series/02_neural_networks_python


In [2]:
%matplotlib inline

import tensorflow as tf

import numpy
import matplotlib.pyplot as plt

In [3]:
# repeating the data prep from the previous notebook
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
x_train = x_train.astype(numpy.float32)
x_test  = x_test.astype(numpy.float32)

x_train /= 255.
x_test  /= 255.

print(x_train.shape)
x_train = x_train.reshape(x_train.shape[0], numpy.prod(x_train[0,:,:].shape))
x_test = x_test.reshape(x_test.shape[0], numpy.prod(x_test[0,:,:].shape))

print(x_train.shape)
y_train = y_train.astype(numpy.int32)
y_test  = y_test.astype(numpy.int32)

print()
print('MNIST data loaded: train:',len(x_train),'test:',len(x_test))
print('X_train:', x_train.shape)
print('y_train:', y_train.shape)

# one-hot encoding:
nb_classes = 10
y_train_onehot = tf.keras.utils.to_categorical(y_train, nb_classes)
y_test_onehot = tf.keras.utils.to_categorical(y_test, nb_classes)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz
(60000, 28, 28)
(60000, 784)

MNIST data loaded: train: 60000 test: 10000
X_train: (60000, 784)
y_train: (60000,)


In [4]:
# Here we import an implementation of a two-layer neural network 
# this code is based on pieces of the first assignment from Stanford's CSE231n course, 
# hosted at https://github.com/cs231n/cs231n.github.io with the MIT license
from fc_net import TwoLayerNet

In [5]:
num_features = x_train.shape[1] # this is the number of pixels
# The weights are initialized from a normal distribution with standard deviation weight_scale
model = TwoLayerNet(input_dim=num_features, hidden_dim=100, num_classes=nb_classes, weight_scale=.01)

In [6]:
# here you can take a look if you want at the initial loss from an untrained network
loss, gradients = model.loss(x_train, y_train_onehot)

In [7]:
# a simple implementation of stochastic gradient descent
def sgd(model, gradients, learning_rate):
    for p, w in model.params.items():
        dw = gradients[p]
        new_weights = w - learning_rate * dw
        model.params[p] = new_weights
    return model

In [8]:
# one training step
def learn(model, x_train, y_train_onehot, learning_rate):
    loss, gradients = model.loss(x_train, y_train_onehot)
    model = sgd(model, gradients, learning_rate)
    return loss, model

In [9]:
def accuracy(model, x, true_values):
    scores = model.loss(x)
    predictions = numpy.argmax(scores, axis=1)
    N = predictions.shape[0]
    acc = (true_values == predictions).sum() / N
    return acc

In [None]:
# Here's an example training loop using this two-layer model. Can you do better? 
# Let's just try a parameter scan over hidden dimensions, and run epochs until it seems to stall
# Maybe up to 500 is good? (And maybe can be computed in reasonable time?)
for this_hidden_dim in range(1,501):
  model = TwoLayerNet(input_dim=num_features, hidden_dim=this_hidden_dim, num_classes=nb_classes, weight_scale=.01)
  learning_rate = 1.0 #start with an aggressive rate and we'll drop it
  num_examples = x_train.shape[0]
  batch_size = 10000
  num_batches = int(num_examples / batch_size)
  #run a bunch of epochs, but probably will stop before it gets to this (hopefully)
  num_epochs = 100
  losses = numpy.zeros(num_batches*num_epochs,)
  indices = numpy.arange(num_examples)
  #variables to check if we've stalled
  noincrease_cnt = 0
  noincrease_max = 10 #arbitrary max number of trials to accept in a row with no decrease in max loss
  last_loss = 100.0 #will hold lowest loss
  for i,epoch in enumerate(range(0, num_epochs)):
      # in each epoch, we loop over all of the training examples
      learning_rate=learning_rate*numpy.exp(-8*numpy.log(2)*i/num_epochs) #exponentially decay learning rate
      for j,step in enumerate(range(0, num_batches)):
          # grabbing the next batch
          offset = step * batch_size
          batch_range = range(offset, offset+batch_size)
          x_train_batch = x_train[batch_range, :]
          y_train_batch = y_train_onehot[batch_range,:]
          
          # feed the next batch in to do one sgd step
          loss, model = learn(model, x_train_batch, y_train_batch, learning_rate)
          losses[j] = loss

      acc = accuracy(model, x_train, y_train)
      # reshuffle the data so that we get a new set of batches
      numpy.random.shuffle(indices)
      x_train = x_train[indices,:]
      y_train = y_train[indices] # keep this shuffled the same way for use in accuracy calculation
      y_train_onehot = y_train_onehot[indices,:]
      #check if this loss is better than the best lost
      if i>1 and loss > last_loss: 
        noincrease_cnt = noincrease_cnt + 1
      else:
        last_loss = loss #store best loss
        noincrease_cnt = 0 #reset counter
      #if we havent improved on our best loss in an arbitrary number of trials, give up  
      if noincrease_cnt > noincrease_max: 
        print("dim %d, epoch %d, loss %.5f, accuracy %.2f" % (this_hidden_dim, epoch, loss, acc))
        break


dim 1, epoch 46, loss 1.92722, accuracy 0.23
dim 2, epoch 24, loss 1.52544, accuracy 0.48
dim 3, epoch 30, loss 1.16603, accuracy 0.61
dim 4, epoch 42, loss 0.82920, accuracy 0.74
dim 5, epoch 45, loss 0.98110, accuracy 0.67
dim 6, epoch 40, loss 1.10110, accuracy 0.64
dim 7, epoch 28, loss 0.65468, accuracy 0.81
dim 8, epoch 35, loss 0.65885, accuracy 0.81
dim 9, epoch 38, loss 0.80631, accuracy 0.77
dim 10, epoch 35, loss 0.78834, accuracy 0.77
dim 11, epoch 44, loss 0.51884, accuracy 0.85
dim 12, epoch 22, loss 0.50747, accuracy 0.86
dim 13, epoch 33, loss 0.73206, accuracy 0.80
dim 14, epoch 23, loss 0.54663, accuracy 0.85
dim 15, epoch 30, loss 0.51447, accuracy 0.85
dim 16, epoch 30, loss 0.49415, accuracy 0.86
dim 17, epoch 24, loss 0.55573, accuracy 0.85
dim 18, epoch 22, loss 0.48997, accuracy 0.87
dim 19, epoch 23, loss 0.59558, accuracy 0.83
dim 20, epoch 23, loss 0.47922, accuracy 0.87
dim 21, epoch 36, loss 0.51688, accuracy 0.86
dim 22, epoch 23, loss 0.49295, accuracy 0.

# Homework: improve the accuracy of this model. 

Update this notebook so that the accuracy is improved. How high can you get it? You could change things directly in the notebook, such as increasing the number of epochs, changing the learning weight, changing the width of the hidden layer, etc. If you're more ambitious, you could also try changing the model definition itself by checking out the associated Python files. For example, you could add more layers to the network. The current notebook has a training accuracy of about 43%, but will vary with randomness.

As can be seen from the above parameter search (which ended prematurely as Colab disconnected at about ~6 hours) accuracy by this method (chosen learning rate and decay, number of hidden layers, etc.) gets up to about 89 % for sufficiently many hidden dimensions, and around ~20 dimensions seems to be enough to achieve this.