## Deep Learning - Neural Networks from Scratch

Member 1: Anish Batra, ab8166

Member 2: Prashant Mahajan,  prm349

In [34]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import scipy.misc
import glob
import sys
# you shouldn't need to make any more imports

## Data Loading

In [35]:
import torch
import torchvision
from torchvision.transforms import transforms

In [36]:
transform = transforms.Compose(
    [transforms.ToTensor(),
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                        download=True, transform=transform)
testset = torchvision.datasets.CIFAR10(root='./data', train=False,
                                       download=True, transform=transform)

Files already downloaded and verified
Files already downloaded and verified


## Data Preprocessing

In [37]:
# convert data to numpy arrays
train_images = (trainset.train_data.reshape(50000, 3, 32, 32)) / 255.0
train_labels = np.array(trainset.train_labels).reshape(50000, )

test_images = (testset.test_data.reshape(10000, 3, 32, 32)) / 255.0
test_labels = np.array(testset.test_labels).reshape(10000, )

print(train_images.shape)
print(train_labels.shape)

print(test_images.shape)
print(test_labels.shape)

(50000, 3, 32, 32)
(50000,)
(10000, 3, 32, 32)
(10000,)


## Model Architecture

In [39]:
class NeuralNetwork(object):
    """
    Abstraction of neural network.
    Stores parameters, activations, cached values. 
    Provides necessary functions for training and prediction. 
    """

    def __init__(self, layer_dimensions, drop_prob=0.0, reg_lambda=0.0):
        """
        Initializes the weights and biases for each layer
        :param layer_dimensions: (list) number of nodes in each layer
        :param drop_prob: drop probability for dropout layers. Only required in part 2 of the assignment
        :param reg_lambda: regularization parameter. Only required in part 2 of the assignment
        """
        np.random.seed(1)

        self.parameters = {}
        self.num_layers = len(layer_dimensions)
        self.drop_prob = drop_prob
        self.reg_lambda = reg_lambda
        self.linear_cache = {}
        self.activation_cache = {}
        self.dropout_mask = {}

        # init parameters
        for l in range(1, self.num_layers):
            self.parameters['W' + str(l)] = np.random.normal(
                0, 7.5e-02, (layer_dimensions[l - 1], layer_dimensions[l]))
            self.parameters['b' + str(l)] = np.zeros(layer_dimensions[l])

    def affineForward(self, A, W, b):
        """
        Forward pass for the affine layer.
        :param A: input matrix, shape (L, S), where L is the number of hidden units in the previous layer and S is
        the number of samples
        :returns: the affine product WA + b, along with the cache required for the backward pass
        """
        out = None
        m = A.shape[0]
        out = np.dot(np.reshape(A, [m, -1]), W) + b
        cache = (A, W, b)
        return out, cache

    def activationForward(self, A, activation="relu"):
        """
        Common interface to access all activation functions.
        :param A: input to the activation function
        :param prob: activation funciton to apply to A. Just "relu" for this assignment.
        :returns: activation(A)
        """
        if activation == 'relu':
            out = self.relu(A)
        return out

    def relu(self, X):
        out = np.maximum(X, 0)
        return out

    def dropout(self, A, prob):
        """
        :param A: Activation
        :param prob: drop prob
        :returns: tuple (A, M) 
            WHERE
            A is matrix after applying dropout
            M is dropout mask, used in the backward pass
        """
        M = np.random.rand(A.shape[0], A.shape[1])
        M = (M > prob) * 1.0
        M /= (1 - prob)
        A *= M

        return A, M

    def forwardPropagation(self, X):
        """
        Runs an input X through the neural network to compute activations
        for all layers. Returns the output computed at the last layer along
        with the cache required for backpropagation.
        :returns: (tuple) AL, cache
            WHERE 
            AL is activation of last layer
            cache is cached values for each layer that
                     are needed in further steps
        """
        size = X.shape[0]
        X = np.reshape(X, [size, -1])

        no_layers = self.num_layers

        for layer in range(1, no_layers - 1):
            Z, self.linear_cache[str(layer)] = self.affineForward(
                X, self.parameters['W' + str(layer)],
                self.parameters['b' + str(layer)])
            self.activation_cache[str(layer)] = Z.copy()
            A = self.activationForward(Z, activation='relu')

            if self.drop_prob > 0:
                A, self.dropout_mask[str(layer)] = self.dropout(
                    A, self.drop_prob)
            X = A.copy()

        AL, cache = self.affineForward(
            X, self.parameters['W' + str(no_layers - 1)],
            self.parameters['b' + str(no_layers - 1)])

        return AL, cache

    def softmax(self, Z):
        AL_normal = Z - np.max(Z, axis=1, keepdims=True)
        K = np.sum(np.exp(AL_normal), axis=1, keepdims=True)
        log_probability = AL_normal - np.log(K)
        A = np.exp(log_probability)
        return A, log_probability

    def costFunction(self, AL, y):
        """
        :param AL: Activation of last layer, shape (num_classes, S)
        :param y: labels, shape (S)
        :param alpha: regularization parameter
        :returns cost, dAL: A scalar denoting cost and the gradient of cost
        """
        # compute cost
        N = AL.shape[0]
        A, log_probability = self.softmax(AL)
        cost = -np.sum(log_probability[np.arange(N), y]) / N

        if self.reg_lambda > 0:
            # add regularization
            for l in range(self.num_layers - 1, 0, -1):
                cost += 0.5 * self.reg_lambda * (np.sum(
                    np.square(self.parameters['W' + str(l)])))

        # gradient of cost
        dAL = A.copy()
        dAL[np.arange(N), y] -= 1
        dAL /= N

        return cost, dAL

    def affineBackward(self, dA_prev, cache):
        """
        Backward pass for the affine layer.
        :param dA_prev: gradient from the next layer.
        :param cache: cache returned in affineForward
        :returns dA: gradient on the input to this layer
                 dW: gradient on the weights
                 db: gradient on the bias
        """
        X, W, b = cache
        dA, dW, db = None, None, None
        n = X.shape[0]
        m = np.reshape(X, [n, -1])
        dA = np.dot(dA_prev, W.T)
        dA = np.reshape(dA, X.shape)
        dW = np.dot(m.T, dA_prev)
        db = np.sum(dA_prev, axis=0)

        return dA, dW, db

    def activationBackward(self, dA, cache, activation="relu"):
        """
        Interface to call backward on activation functions.
        In this case, it's just relu. 
        """
        if activation == 'relu':
            out = self.relu_derivative(dA, cache)
        return out

    def relu_derivative(self, dx, activation_cache):
        dout, x = None, activation_cache
        relu_mask = (x >= 0)
        dout = dx * relu_mask
        return dout

    def dropout_backward(self, dA, cache):
        dA *= cache
        return dA

    def backPropagation(self, dAL, Y, cache):
        """
        Run backpropagation to compute gradients on all paramters in the model
        :param dAL: gradient on the last layer of the network. Returned by the cost function.
        :param Y: labels
        :param cache: cached values during forwardprop
        :returns gradients: dW and db for each weight/bias
        """
        gradients = {}

        dXL, dWL, dbL = self.affineBackward(dAL, cache)
        n = self.num_layers

        gradients['W' + str(n - 1)] = dWL
        gradients['b' + str(n - 1)] = dbL

        if self.reg_lambda > 0:
            gradients['W' + str(
                n - 1)] += self.reg_lambda * self.parameters['W' + str(n - 1)]

        for l in range(n - 2, 0, -1):
            reluBack = self.activationBackward(
                dXL, self.activation_cache[str(l)], activation='relu')

            if self.drop_prob > 0:
                reluBack = self.dropout_backward(reluBack,
                                                 self.dropout_mask[str(l)])

            dXL, dWL, dbL = self.affineBackward(reluBack,
                                                self.linear_cache[str(l)])

            if self.reg_lambda > 0:
                gradients['W' + str(
                    l)] = dWL + self.reg_lambda * self.parameters['W' + str(l)]
            else:
                gradients['W' + str(l)] = dWL
            gradients['b' + str(l)] = dbL

        return gradients

    def updateParameters(self, gradients, alpha):
        """
        :param gradients: gradients for each weight/bias
        :param alpha: step size for gradient descent 
        """
        for l in range(1, self.num_layers):
            self.parameters['W' + str(l)] = self.parameters[
                'W' + str(l)] - alpha * gradients['W' + str(l)]
            self.parameters['b' + str(l)] = self.parameters[
                'b' + str(l)] - alpha * gradients['b' + str(l)]

    def predict(self, X):
        """
        Make predictions for each sample
        """
        y_pred = []
        AL, cache = self.forwardPropagation(X)
        y_pred.append(np.argmax(AL, axis=1))
        return y_pred

    def train(self,
              X,
              y,
              iters=1000,
              alpha=0.0001,
              batch_size=100,
              print_every=100):
        """
        :param X: input samples, each column is a sample
        :param y: labels for input samples, y.shape[0] must equal X.shape[1]
        :param iters: number of training iterations
        :param alpha: step size for gradient descent
        :param batch_size: number of samples in a minibatch
        :param print_every: no. of iterations to print debug info after
        """

        # Split input data into training and validation set
        X_train, y_train, X_val, y_val = self.dataset_split(X, y, 0.9)

        for i in range(0, iters):
            # get minibatch
            X_batch, y_batch = self.get_batch(X_train, y_train, batch_size)

            # forward prop
            AL, cache = self.forwardPropagation(X_batch)

            # compute loss
            cost, dAL = self.costFunction(AL, y_batch)

            # compute gradients
            gradients = self.backPropagation(dAL, y_batch, cache)

            # update weights and biases based on gradient
            self.updateParameters(gradients, alpha)

            if i % print_every == 0:
                # print cost, train and validation set accuracies
                AL_val, cache = self.forwardPropagation(X_val)
                print(
                    "Iteration Number: %d | Cost: %.4f | Train Accuracy: %.4f | Validation Accuracy: %.4f"
                    % (i, cost, self.accuracy(np.argmax(AL, axis=1), y_batch),
                       self.accuracy(np.argmax(AL_val, axis=1), y_val)))

    def dataset_split(self, X, y, ratio=0.8):
        """
        function to split the training data into training and validation
        
        :param X: input samples
        :param y: labels
        :param ratio: ratio of training data from total data
        :returns: training data and validation data
        """
        split = int(ratio * X.shape[0])
        indices = np.random.permutation(X.shape[0])
        training_idx, val_idx = indices[:split], indices[split:]
        X_train, X_val = X[training_idx, ...], X[val_idx, ...]
        y_train, y_val = y[training_idx], y[val_idx]

        return X_train, y_train, X_val, y_val

    def get_batch(self, X, y, batch_size):
        """
        Return minibatch of samples and labels
        
        :param X, y: samples and corresponding labels
        :parma batch_size: minibatch size
        :returns: (tuple) X_batch, y_batch
        """
        num_train = X.shape[0]
        mask = np.random.choice(num_train, batch_size)
        X_batch = X[mask]
        y_batch = y[mask]

        return X_batch, y_batch

    def accuracy(self, y_pred, y_test):
        '''
        This function tests the accuracy of the trained model
        :param y_pred: Predicted class 
        :param y_test: Actual Class
        :returns: accuracy
        '''
        y_pred = np.hstack(y_pred)
        accuracy = np.mean(y_pred == y_test, axis=0)
        return accuracy

## Print architecture of the Neural Network

In [40]:
network = NeuralNetwork([32 * 32 * 3, 800, 400, 200, 100, 50, 10])

print("Number of layers: ", network.num_layers, "\n")

for i in range(1, network.num_layers):
    print("Weight", i, "shape: ", network.parameters['W' + str(i)].shape)
    print("  Bias", i, "shape: ", network.parameters['b' + str(i)].shape)

Number of layers:  7 

Weight 1 shape:  (3072, 800)
  Bias 1 shape:  (800,)
Weight 2 shape:  (800, 400)
  Bias 2 shape:  (400,)
Weight 3 shape:  (400, 200)
  Bias 3 shape:  (200,)
Weight 4 shape:  (200, 100)
  Bias 4 shape:  (100,)
Weight 5 shape:  (100, 50)
  Bias 5 shape:  (50,)
Weight 6 shape:  (50, 10)
  Bias 6 shape:  (10,)


## Train the Neural Network

- Learning rate: 0.01
- Batch size: 128
- Number of iterations: 20,000
- Print after every: 100 iterations

---

- Training on 45,000 images (90%)
- Validating on 5,000 images (10%)
- Testing on 10,000 images

In [41]:
network.train(train_images, train_labels, alpha=0.01, iters=20000, batch_size=128, print_every=100)

Iteration Number: 0 | Cost: 2.4179 | Train Accuracy: 0.1250 | Validation Accuracy: 0.1020
Iteration Number: 100 | Cost: 2.1229 | Train Accuracy: 0.2812 | Validation Accuracy: 0.2258
Iteration Number: 200 | Cost: 1.9579 | Train Accuracy: 0.2969 | Validation Accuracy: 0.2886
Iteration Number: 300 | Cost: 1.9613 | Train Accuracy: 0.3203 | Validation Accuracy: 0.3066
Iteration Number: 400 | Cost: 2.0452 | Train Accuracy: 0.2344 | Validation Accuracy: 0.3284
Iteration Number: 500 | Cost: 1.8203 | Train Accuracy: 0.3203 | Validation Accuracy: 0.3412
Iteration Number: 600 | Cost: 1.9407 | Train Accuracy: 0.3281 | Validation Accuracy: 0.3306
Iteration Number: 700 | Cost: 1.7641 | Train Accuracy: 0.3828 | Validation Accuracy: 0.3098
Iteration Number: 800 | Cost: 1.7331 | Train Accuracy: 0.4219 | Validation Accuracy: 0.3580
Iteration Number: 900 | Cost: 1.8094 | Train Accuracy: 0.3359 | Validation Accuracy: 0.3558
Iteration Number: 1000 | Cost: 1.5634 | Train Accuracy: 0.4609 | Validation Accura

Iteration Number: 8900 | Cost: 1.1343 | Train Accuracy: 0.6094 | Validation Accuracy: 0.4740
Iteration Number: 9000 | Cost: 1.3333 | Train Accuracy: 0.5234 | Validation Accuracy: 0.4942
Iteration Number: 9100 | Cost: 1.2425 | Train Accuracy: 0.5312 | Validation Accuracy: 0.4906
Iteration Number: 9200 | Cost: 1.2026 | Train Accuracy: 0.6016 | Validation Accuracy: 0.4854
Iteration Number: 9300 | Cost: 1.2227 | Train Accuracy: 0.6094 | Validation Accuracy: 0.4984
Iteration Number: 9400 | Cost: 1.2718 | Train Accuracy: 0.5703 | Validation Accuracy: 0.4986
Iteration Number: 9500 | Cost: 1.1551 | Train Accuracy: 0.5781 | Validation Accuracy: 0.4940
Iteration Number: 9600 | Cost: 1.2254 | Train Accuracy: 0.5625 | Validation Accuracy: 0.5066
Iteration Number: 9700 | Cost: 1.0973 | Train Accuracy: 0.5781 | Validation Accuracy: 0.4962
Iteration Number: 9800 | Cost: 1.0772 | Train Accuracy: 0.6328 | Validation Accuracy: 0.4930
Iteration Number: 9900 | Cost: 1.2813 | Train Accuracy: 0.5312 | Valid

Iteration Number: 17700 | Cost: 1.1588 | Train Accuracy: 0.5625 | Validation Accuracy: 0.4844
Iteration Number: 17800 | Cost: 0.7329 | Train Accuracy: 0.7656 | Validation Accuracy: 0.5066
Iteration Number: 17900 | Cost: 0.9517 | Train Accuracy: 0.6094 | Validation Accuracy: 0.5060
Iteration Number: 18000 | Cost: 0.7904 | Train Accuracy: 0.7344 | Validation Accuracy: 0.5042
Iteration Number: 18100 | Cost: 0.9333 | Train Accuracy: 0.5781 | Validation Accuracy: 0.4696
Iteration Number: 18200 | Cost: 0.8145 | Train Accuracy: 0.6953 | Validation Accuracy: 0.5060
Iteration Number: 18300 | Cost: 0.6872 | Train Accuracy: 0.7500 | Validation Accuracy: 0.5010
Iteration Number: 18400 | Cost: 0.7881 | Train Accuracy: 0.7344 | Validation Accuracy: 0.5086
Iteration Number: 18500 | Cost: 0.7920 | Train Accuracy: 0.6719 | Validation Accuracy: 0.5116
Iteration Number: 18600 | Cost: 0.6874 | Train Accuracy: 0.7656 | Validation Accuracy: 0.4982
Iteration Number: 18700 | Cost: 0.6918 | Train Accuracy: 0.7

## Test on the testing set

In [43]:
y_pred = network.predict(test_images)
print(network.accuracy(y_pred,test_labels))

0.5085


## Save the results

In [44]:
np.save('ans1-uni.npy', y_pred)

## Load and check the results saved

In [45]:
data = np.load('ans1-uni.npy')
print(data)

[[3 9 0 ... 5 5 7]]
