In [1]:
# A bit of setup

import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

def rel_error(x, y):
  """ returns relative error """
  return np.max(np.abs(x - y) / (np.maximum(1e-8, np.abs(x) + np.abs(y))))

In [2]:
#load the Cifar10 data
from data_utils import load_CIFAR10
def get_CIFAR10_data(num_training=39000, num_validation=10000, num_test=10000):
    """
    Load the CIFAR-10 dataset from disk and perform preprocessing to prepare
    it for the two-layer neural net classifier. 
    """
    # Load the raw CIFAR-10 data
    cifar10_dir = 'cifar-10-batches-py'
    X_train, y_train, X_test, y_test = load_CIFAR10(cifar10_dir)
        
    # Subsample the data
    mask = range(num_training, num_training + num_validation)
    X_val = X_train[mask]
    y_val = y_train[mask]
    mask = range(num_training)
    X_train = X_train[mask]
    y_train = y_train[mask]
    mask = range(num_test)
    X_test = X_test[mask]
    y_test = y_test[mask]

    # Normalize the data: subtract the mean image
    mean_image = np.mean(X_train, axis=0)
    X_train -= mean_image
    X_val -= mean_image
    X_test -= mean_image

    # Reshape data to rows
    X_train = X_train.reshape(num_training, -1)
    X_val = X_val.reshape(num_validation, -1)
    X_test = X_test.reshape(num_test, -1)

    return X_train, y_train, X_val, y_val, X_test, y_test


# Invoke the above function to get our data.
X_train, y_train, X_val, y_val, X_test, y_test = get_CIFAR10_data()
print 'Train data shape: ', X_train.shape
print 'Train labels shape: ', y_train.shape
print 'Validation data shape: ', X_val.shape
print 'Validation labels shape: ', y_val.shape
print 'Test data shape: ', X_test.shape
print 'Test labels shape: ', y_test.shape

Train data shape:  (39000L, 3072L)
Train labels shape:  (39000L,)
Validation data shape:  (10000L, 3072L)
Validation labels shape:  (10000L,)
Test data shape:  (10000L, 3072L)
Test labels shape:  (10000L,)


In [6]:
# Build the network
# 1. initialize the network

def init_neural_model(input_size, layersNum, hidden_sizes, output_size):
    """
    Initialize the weights and biases for a two-layer fully connected neural
    network. The net has an input dimension of D, a hidden layer dimension of H,
    and performs classification over C classes. Weights are initialized to small
    random values and biases are initialized to zero.

    Inputs:
    - input_size: The dimension D of the input data
    - layersNum: number of fully connected layers (hidden + output)
    - hidden_size: Array containing the number of neurons H(i) in each hidden layer(i)
    - ouput_size: The number of classes C

    Returns:
    A dictionary mapping parameter names to arrays of parameter values. It has
    the following keys:
    - W1: First layer weights; has shape (D, H)
    - b1: First layer biases; has shape (H,)
    - W2: Second layer weights; has shape (H, C)
    - b2: Second layer biases; has shape (C,)
    """
    # initialize a model
    model = {}
    # initialize first hidden layer
    model['W0'] = 0.00001 * np.random.randn(input_size, hidden_sizes[0])
    model['b0'] = np.zeros(hidden_sizes[0])
    # initialize hidden layers
    for i in range(layersNum-2):
        model['W'+str(i+1)] =  0.00001 * np.random.randn(hidden_sizes[i], hidden_sizes[i+1])
        model['b'+str(i+1)] = np.zeros(hidden_sizes[i+1])
    # initialize output layer
    model['W'+str(layersNum-1)] = 0.00001 * np.random.randn(hidden_sizes[layersNum-2], output_size)
    model['b'+str(layersNum-1)] = np.zeros(output_size)
    print 'model.shape=', len(model)
    return model

input_size = 3072 #32*32*3
layersNum = 3
hidden_sizes = [50,50] #an array of size layersNum-1
number_of_classes = 10 #output_size
model = init_neural_model(input_size, layersNum, hidden_sizes, number_of_classes)

model.shape= 6


In [7]:
# The neuralnet function below takes the data and weights and computes the class scores, the loss, and the gradients
# on the parameters.
relu = lambda x: x * (x > 0)
def neural_net(X, model, y=None, reg=0.0):
    # unpack variables from the model dictionary
    W = []
    b = []
    #cells = [numpy.array(a) for a in [[0,1,2,3], [2,3,4]]]
    for i in range(layersNum):
        W.append(model['W'+str(i)])
        b.append(model['b'+str(i)])
    
    
    N, D = X.shape
    C = b[-1].shape[0]
    
    
# compute the forward pass
    scores = np.zeros((N, C))
    #############################################################################
    # TODO: Perform the forward pass, computing the class scores for the input. #
    # Store the result in the scores variable, which should be an array of      #
    # shape (N, C).                                                             #
    #############################################################################
    
    layer1 = X.dot(W[0])+b[0]
    activate1 = relu(layer1)
    layer2 = activate1.dot(W[1]) + b[1]
    activate2 = relu(layer2)
    layer3 = activate2.dot(W[2]) + b[2]
    
    scores = layer3 # no need to make two variables, but more readable this way. #relu(X.dot(W1)+b1).dot(W2) + b2
    # If the targets are not given then jump out, we're done
    #if y is None:
     #   return scores
    #############################################################################
    # Compute Softmax classifier loss with L2 regularization                    #
    # log and e cancel out, sum splits                                          #
    # softmax loss is -sum(correct scores) + log(sum(e^[all scores (for each    #
    #  respective example)]))                                                   #
    #############################################################################
    rows = np.sum(np.exp(scores), axis=1)
    scores -= np.max(scores, axis = 1).reshape(N, 1)
    normalize_scores = np.exp(scores) / rows.reshape(N, 1)
    loss = np.sum(- np.log(normalize_scores[range(N), y]))
    loss = loss / N  + 0.5 * reg * np.sum(W[0] * W[0]) + 0.5 * reg * np.sum(W[1] * W[1]) + 0.5 * reg * np.sum(W[2] * W[2])
    # compute the gradients
    grads = {}
    #############################################################################
    # Compute the backward pass, computing the derivatives of the weights       #
    # and biases. Store the results in the grads dictionary. For example,       #
    # grads['W1'] should store the gradient on W1, and be a matrix of same size #
    #############################################################################

    dactivate3 = 1.0

    # Gradient of np.log(np.sum(np.exp(layer3), axis=1))
    dlayer3 = (np.exp(layer3).T / rows).T
    # Gradient of -layer3[range(N), y]:
    ys = np.zeros(dlayer3.shape)
    ys[range(N), y] = 1
    dlayer3 -= ys
    # / N term
    dlayer3 /= N
    # Chain rule
    dlayer3 *= dactivate3
    
    
    dactivate2 = dlayer3.dot(W[2].T)
    
    # Gradient of np.log(np.sum(np.exp(layer2), axis=1))
    dlayer2 = (np.exp(layer2).T / rows).T
    # Gradient of -layer2[range(N), y]:
    ys = np.zeros(dlayer2.shape)
    ys[range(N), y] = 1
    dlayer2 -= ys
    # / N term
    dlayer2 /= N
    # Chain rule
    dlayer2 *= dactivate2 
    
    # Chain rule, element-wise multiplication works out
    dactivate1 = dlayer2.dot(W[1].T)
    
    # Relu gradient
    dlayer1 = dlayer2 * (layer1>=0)

    dW1 = X.T.dot(dlayer1)
    dW2 = layer1.T.dot(dlayer2)
    dW3 = layer2.T.dot(dlayer3)
    
    # Same as matrix multiplication with 1-vector, chain rule works out
    db1 = np.sum(dlayer1, axis=0)
    db2 = np.sum(dlayer2, axis=0)
    db3 = np.sum(dlayer3, axis=0)
    
    # Regularization
    dW1 += reg * W[0]
    dW2 += reg * W[1]
    dW3 += reg * W[2]
    
    # Store
    grads['W0'] = dW1
    grads['W1'] = dW2
    grads['W2'] = dW3
    grads['b0'] = db1
    grads['b1'] = db2
    grads['b2'] = db3    
        
    return loss, grads

In [None]:
    """
    Optimize the parameters of a model to minimize a loss function. We use
    training data X and y to compute the loss and gradients, and periodically
    check the accuracy on the validation set.
    Inputs:
    - X: Array of training data; each X[i] is a training sample.
    - y: Vector of training labels; y[i] gives the label for X[i].
    - X_val: Array of validation data
    - y_val: Vector of validation labels
    - model: Dictionary that maps parameter names to parameter values. Each
      parameter value is a numpy array.
    - loss_function: A function that can be called in the following ways:
      scores = loss_function(X, model, reg=reg)
      loss, grads = loss_function(X, model, y, reg=reg)
    - reg: Regularization strength. This will be passed to the loss function.
    - learning_rate: Initial learning rate to use.
    - momentum: Parameter to use for momentum updates.
    - learning_rate_decay: The learning rate is multiplied by this after each
      epoch.
    - update: The update rule to use. One of 'sgd', 'momentum', or 'rmsprop'.
    - sample_batches: If True, use a minibatch of data for each parameter update
      (stochastic gradient descent); if False, use the entire training set for
      each parameter update (gradient descent).
    - num_epochs: The number of epochs to take over the training data.
    - batch_size: The number of training samples to use at each iteration.
    - acc_frequency: If set to an integer, we compute the training and
      validation set error after every acc_frequency iterations.
    - verbose: If True, print status after each epoch.
    Returns a tuple of:
    - best_model: The model that got the highest validation accuracy during
      training.
    - loss_history: List containing the value of the loss function at each
      iteration.
    - train_acc_history: List storing the training set accuracy at each epoch.
    - val_acc_history: List storing the validation set accuracy at each epoch.
    """

In [None]:
step_cache = {} # for storing velocities in momentum update
def train(X, y, X_val, y_val, 
        model, loss_function, 
        reg=0.0,
        learning_rate=0.138, momentum=0.9, learning_rate_decay=0.0008,
        update='momentum', sample_batches=True,
        num_epochs=20, batch_size=32, acc_frequency=None,
        verbose=False):
    N = X.shape[0]

    if sample_batches:
      iterations_per_epoch = N / batch_size # using SGD
    else:
      iterations_per_epoch = 1 # using GD
    num_iters = num_epochs * iterations_per_epoch
    epoch = 0
    best_val_acc = 0.0
    best_model = {}
    loss_history = []
    train_acc_history = []
    val_acc_history = []
    
    for it in xrange(num_iters):
      if it % 10 == 0:  print 'starting iteration ', it

      # get batch of data
      if sample_batches:
        batch_mask = np.random.choice(N, batch_size)
        X_batch = X[batch_mask]
        y_batch = y[batch_mask]
      else:
        # no SGD used, full gradient descent
        X_batch = X
        y_batch = y

      # evaluate cost and gradient
      cost, grads = loss_function(X_batch, model, y_batch, reg)
      loss_history.append(cost)

      cache = 0

      # perform a parameter update
      for p in model:
        # compute the parameter step
        if update == 'sgd':
          dx = -learning_rate * grads[p]

        elif update == 'momentum':
          if not p in step_cache: 
            step_cache[p] = np.zeros(grads[p].shape)

          #####################################################################
          # Momentum                                                          #
          #####################################################################
          step_cache[p] = momentum * step_cache[p] - learning_rate * grads[p]
          dx = step_cache[p]

        elif update == 'rmsprop':
          decay_rate = 0.99 # you could also make this an option TODO
          if not p in step_cache: 
            step_cache[p] = np.zeros(grads[p].shape)
          dx = np.zeros_like(grads[p]) # you can remove this after
          #####################################################################
          # RMSProp                                                           #
          #####################################################################
          self.step_cache[p] = decay_rate * step_cache[p] + (1 - decay_rate) * grads[p]**2
          dx = - learning_rate * grads[p] / np.sqrt(self.step_cache[p] + 1e-8)

        else:
          raise ValueError('Unrecognized update type "%s"' % update)

        # update the parameters
        model[p] += dx

      # every epoch perform an evaluation on the validation set
      first_it = (it == 0)
      epoch_end = (it + 1) % iterations_per_epoch == 0
      acc_check = (acc_frequency is not None and it % acc_frequency == 0)
      if first_it or epoch_end or acc_check:
        if it > 0 and epoch_end:
          # decay the learning rate
          learning_rate *= learning_rate_decay
          epoch += 1

        # evaluate train accuracy
        if N > 1000:
          train_mask = np.random.choice(N, 1000)
          X_train_subset = X[train_mask]
          y_train_subset = y[train_mask]
        else:
          X_train_subset = X
          y_train_subset = y
        
        scores_train = loss_function(X_train_subset, model)
        y_pred_train = np.argmax(scores_train, )
        train_acc = np.mean(y_pred_train == y_train_subset)
        train_acc_history.append(train_acc)

        # evaluate val accuracy
        scores_val = loss_function(X_val, model)
        y_pred_val = np.argmax(scores_val, )
        val_acc = np.mean(y_pred_val ==  y_val)
        val_acc_history.append(val_acc)
        
        # keep track of the best model based on validation accuracy
        if val_acc > best_val_acc:
          # make a copy of the model
          best_val_acc = val_acc
          best_model = {}
          for p in model:
            best_model[p] = model[p].copy()

        # print progress if needed
        if verbose:
          print ('Finished epoch %d / %d: cost %f, train: %f, val %f, lr %e'
                 % (epoch, num_epochs, cost, train_acc, val_acc, learning_rate))

    if verbose:
      print 'finished optimization. best validation accuracy: %f' % (best_val_acc, )
    # return the best model and the training history statistics
    return best_model, loss_history, train_acc_history, val_acc_history


best_model, loss_history, train_acc, val_acc = train(X_train, y_train, X_val, y_val,
                                             model, neural_net,
                                             num_epochs=50, reg=1.0,
                                             momentum=0.9, learning_rate_decay = 0.00083,
                                             learning_rate=0.01, verbose=True, acc_frequency=400)

starting iteration  0




Finished epoch 0 / 50: cost inf, train: 0.106000, val 0.099000, lr 1.000000e-02
starting iteration  10
starting iteration  20
starting iteration  30
starting iteration  40
starting iteration  50
starting iteration  60
starting iteration  70
starting iteration  80
starting iteration  90
starting iteration  100
starting iteration  110
starting iteration  120
starting iteration  130
starting iteration  140
starting iteration  150
starting iteration  160
starting iteration  170
starting iteration  180
starting iteration  190
starting iteration  200
starting iteration  210
starting iteration  220
starting iteration  230
starting iteration  240
starting iteration  250
starting iteration  260
starting iteration  270
starting iteration  280
starting iteration  290
starting iteration  300
starting iteration  310
starting iteration  320
starting iteration  330
starting iteration  340
starting iteration  350
starting iteration  360
starting iteration  370
starting iteration  380
starting iteratio

In [None]:
# Plot the loss function and train / validation accuracies
plt.subplot(2, 1, 1)
plt.plot(loss_history)
plt.title('Loss history')
plt.xlabel('Iteration')
plt.ylabel('Loss')

plt.subplot(2, 1, 2)
plt.plot(train_acc)
plt.plot(val_acc)
plt.legend(['Training accuracy', 'Validation accuracy'], loc='lower right')
plt.xlabel('Epoch')
plt.ylabel('Clasification accuracy')

In [8]:
#parameters tuning
best_model = None # store the best model into this 
num_of_classes = 10
#################################################################################
# TODO: Tune hyperparameters using the validation set. Store your best trained  #
# model in best_model.                                                          #
#                                                                               #
# To help debug your network, it may help to use visualizations similar to the  #
# ones we used above; these visualizations will have significant qualitative    #
# differences from the ones we saw above for the poorly tuned network.          #
#                                                                               #
# Tweaking hyperparameters by hand can be fun, but you might find it useful to  #
# write code to sweep through possible combinations of hyperparameters          #
# automatically like we did on the previous assignment.                         #
#################################################################################
model = init_two_layer_model(input_size, layersNum, hidden_sizes,  num_of_classes) # input size, hidden size, number of classes

best_model, loss_history, train_acc, val_acc = train(X_train, y_train, X_val, y_val,
                                             model, neural_net,
                                             num_epochs=50, reg=0.8,
                                             momentum=0.9, learning_rate_decay = 0.0008,
                                             learning_rate=0.138, verbose=True)

#################################################################################
#                               END OF YOUR CODE                                #
#################################################################################

model.shape= 6
starting iteration  0
Finished epoch 0 / 25: cost 2.302591, train: 0.101000, val 0.099000, lr 1.000000e-04
starting iteration  10
starting iteration  20
starting iteration  30
starting iteration  40
starting iteration  50
starting iteration  60
starting iteration  70
starting iteration  80
starting iteration  90
starting iteration  100
starting iteration  110
starting iteration  120
starting iteration  130
starting iteration  140
starting iteration  150
starting iteration  160
starting iteration  170
starting iteration  180
starting iteration  190
starting iteration  200
starting iteration  210
starting iteration  220
starting iteration  230
starting iteration  240
starting iteration  250
starting iteration  260
starting iteration  270
starting iteration  280
starting iteration  290
starting iteration  300
starting iteration  310
starting iteration  320
starting iteration  330
starting iteration  340
starting iteration  350
starting iteration  360
starting iteration  370

In [11]:
#run on the test set
scores_test = neural_net(X_test, best_model)
print 'Test accuracy: ', np.mean(np.argmax(scores_test, ) == y_test)

Test accuracy:  0.1
