# COMS 4995_002 Deep Learning Assignment 1
Due on Monday, Oct 9, 11:59pm

This assignment can be done in groups of at most 3 students. Everyone must submit on Courseworks individually.

Write down the UNIs of your group (if applicable)

Member 1: Name, UNI

Member 2: Name, UNI

Member 3: Name, UNI

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import scipy.misc
import glob
import sys
# you shouldn't need to make any more imports

In [130]:
class NeuralNetwork(object):
    """
    Abstraction of neural network.
    Stores parameters, activations, cached values. 
    Provides necessary functions for training and prediction. 
    """
    def __init__(self, layer_dimensions, drop_prob=0.0, reg_lambda=0.0):
        """
        Initializes the weights and biases for each layer
        :param layer_dimensions: (list) number of nodes in each layer
        :param drop_prob: drop probability for dropout layers. Only required in part 2 of the assignment
        :param reg_lambda: regularization parameter. Only required in part 2 of the assignment
        """
        np.random.seed(1)
        eps = .01
        self.parameters = {'layer_dimensions':layer_dimensions, 'drop_prob':drop_prob, 'reg_lambda':reg_lambda} 
        self.num_layers = len(layer_dimensions)
        self.drop_prob = drop_prob
        self.reg_lambda = reg_lambda
        
        # init parameters
        for l in range(1, self.num_layers):
            self.parameters['W' + str(l)] = np.random.randn(layer_dimensions[l], layer_dimensions[l-1]) * eps
            self.parameters['b' + str(l)]= np.zeros((layer_dimensions[l], 1))
    
        # print all params:
        print(self.parameters.keys())
#     def gradientCheck(self, theta, x):
#         eps = 1e-7
#         J_pos = J(theta + eps, x)
#         J_neg = J(theta - eps, x)
#         numerical_deriv = (J_pos - J-neg)/(2*eps)
#         analytic_deriv = JDeriv(theta, x)
#         error = norm(numerical_deriv-analyic_deriv)/norm()
#         return error
        
    def affineForward(self, A_, W_, b_):
        """
        Forward pass for the affine layer.
        :param A: input matrix, shape (L, S), where L is the number of hidden units in the previous layer and S is
        the number of samples
        :returns: the affine product WA + b, along with the cache required for the backward pass
        """
        print("forward, shape W", W_.shape)
        print("forward, shape A", A_.shape)
        print("forward, shape b", b_.shape)
        return np.dot(W_, A_) + b_, [A_, W_, b_]
        

    def activationForward(self, A, activation="relu"):
        """
        Common interface to access all activation functions.
        :param A: input to the activation function
        :param prob: activation funciton to apply to A. Just "relu" for this assignment.
        :returns: activation(A)
        """ 
        
        if activation == "relu":
            return self.relu(A)
        if activation == "softmax":
            return self.softmax(A)
        
        return A

    def softmax(self, X):
        # softmax
        return np.exp(X)/np.sum(np.exp(X), axis=0)
    
    def relu(self, X):
        return np.maximum(0,X)
            
    def dropout(self, A, prob):
        """
        :param A: 
        :param prob: drop prob
        :returns: tuple (A, M) 
            WHERE
            A is matrix after applying dropout
            M is dropout mask, used in the backward pass
        """
        
        # make a mask, binary mask
        # where that mask is 0, drop those activation units
        # where its one, scale them up by 1/(1-p)
        # in forward we need to apply this mask

        return A, M

    def forwardPropagation(self, X):
        """
        Runs an input X through the neural network to compute activations
        for all layers. Returns the output computed at the last layer along
        with the cache required for backpropagation.
        :returns: (tuple) AL, cache
            WHERE 
            AL is activation of last layer
            cache is cached values for each layer that
                     are needed in further steps
        """
        cache = {}
        A = X
        
        for l in range(1, self.num_layers):
            print("forward, layer", l)
            print("A SHAPE", A.shape)
            layer_cache = {}
            Z, cache_l = self.affineForward(A, self.parameters["W" + str(l)], self.parameters["b" + str(l)])
            
            layer_cache["linear"] = cache_l
            if l != self.num_layers - 1:
                cache_a = self.activationForward(Z)
            else:
                cache_a = self.activationForward(Z, activation="softmax")
                
            layer_cache['activation'] = cache_a
            cache[str(l)] = layer_cache
            A = cache_a
        
        return A, cache
    
    def costFunction(self, AL, y):
        """
        :param AL: Activation of last layer, shape (num_classes, S)
        :param y: labels, shape (S)
        :param alpha: regularization parameter
        :returns cost, dAL: A scalar denoting cost and the gradient of cost
        """

        
        # compute loss
        #+ np.multiply((1-Y), np.log(1-preds))
        # one hot encode true labels
        # only look at how close we are to previous labels
        true_labels = one_hot(y)
        preds = np.multiply(AL, true_labels)
        print("na in AL?", np.sum(np.isnan(AL)), "max in AL", np.max(AL))
        print("na in preds?", np.sum(np.isnan(preds)))
        
        cost = -np.sum(np.multiply(true_labels, np.log(AL)) + np.multiply((1-true_labels), np.log(1-AL)))/ true_labels.shape[1]

        if self.reg_lambda > 0:
            # add regularization
            pass
        
        # gradient of cost
        dAL = np.multiply(true_labels - preds, true_labels)
        
        return cost, dAL

    def affineBackward(self, dA_prev, cache):
        """
        Backward pass for the affine layer.
        :param dA_prev: gradient from the next layer.
        :param cache: cache returned in affineForward
        :returns dA: gradient on the input to this layer
                 dW: gradient on the weights
                 db: gradient on the bias
        """
        A, W, b = cache['linear']
        

        # activation backwards
        dZ_l = self.activationBackward(dA_prev, cache)
        
        dA = np.dot(W.T, dZ_l)
        dW = (1/dA_prev.shape[0]) * np.dot(dZ_l, A.T)
        db = (1/dA_prev.shape[0]) * np.sum(dZ_l, axis=1, keepdims=True)
        print("affine backward, db shape", db.shape)
        return dA, dW, db

    def activationBackward(self, dA, cache, activation="relu"):
        """
        Interface to call backward on activation functions.
        In this case, it's just relu. 
        """
        
        #dzl = dAl*gl_p(zl)
        # only passed in the cached linear layer
        
        # zl is avation at layer l, inputted as cache
       
        A, W, b = cache['linear']
        print("backwards, W shape", W.shape)
        print("backwards, A shape", A.shape)
        print("backwards, b shape", W.shape)
        zl =  np.dot(W, A) + b
        # dA is derivative last layer, first time it will be cost
        return np.multiply(dA , self.relu_derivative(zl))
        
    def relu_derivative(self, dx):
        return 1.0 * (dx > 0)

    def dropout_backward(self, dA, cache):

        # in backwards we need to apply that mask to the derivatives
        # so cache here contains the dropout mask you used in forward prop
        return dA

    def backPropagation(self, dAL, Y, cache):
        """
        Run backpropagation to compute gradients on all paramters in the model
        :param dAL: gradient on the last layer of the network. Returned by the cost function.
        :param Y: labels
        :param cache: cached values during forwardprop
        :returns gradients: dW and db for each weight/bias
        """
        gradients = {}
        dA_prev = dAL
        for l in range(self.num_layers-1, 0, -1):
            cached_l = cache[str(l)]
            # [A, W, b] for layer
            print("backprop, layer:", l)
            
            # affine backwards
            dA, dW, db = self.affineBackward(dA_prev ,cached_l)
            gradients["dW" + str(l)] = dW
            gradients["db" + str(l)] = db
        
           
            if self.drop_prob > 0:
                #call dropout_backward
                dA_prev = self.dropout_backward(dAL, cache)
            else:
                dA_prev = dA
            
            
        if self.reg_lambda > 0:
            # add gradients from L2 regularization to each dW
            pass
        
        return gradients


    def updateParameters(self, gradients, alpha):
        """
        :param gradients: gradients for each weight/bias
        :param alpha: step size for gradient descent 
        """
        L = self.num_layers
        print("updating params")
        # update by adding alpha * gradient to the params
        for l in range(1,L):
            print("W" + str(l) +" old shape", self.parameters["W" + str(l)].shape)
            self.parameters["W" + str(l)] = self.parameters["W" + str(l)] - alpha * gradients["dW" + str(l)]
            print("W" + str(l) +" new shape", self.parameters["W" + str(l)].shape)

            print("b" + str(l) +" old shape", self.parameters["b" + str(l)].shape)
            self.parameters["b" + str(l)] = self.parameters["b" + str(l)] - alpha * gradients["db" + str(l)]
            print("b" + str(l) +" new shape", self.parameters["b" + str(l)].shape)
            
    def train(self, X, y, iters=1000, alpha=0.0001, batch_size=100, print_every=100):
        """
        :param X: input samples, each column is a sample
        :param y: labels for input samples, y.shape[0] must equal X.shape[1]
        :param iters: number of training iterations
        :param alpha: step size for gradient descent
        :param batch_size: number of samples in a minibatch
        :param print_every: no. of iterations to print debug info after
        """
        # we should get validation
        
        print("X train shape:", X_train.shape)
        print("y train shape:", y_train.shape)
        # get minibatch
#         batch_x, batch_y = self.get_batch(X, y, batch_size)
#         print("batch 1 x, shape:", batch_x.shape)
        
        for i in range(0, iters):
            # should we change alpha each step?
            
            
            # forward prop
            last_layer, cache = self.forwardPropagation(X)

            # compute loss
            cost, cost_deriv = self.costFunction(last_layer, y)

            # compute gradients
            gradients = self.backPropagation(cost_deriv, y, cache)
            
            # update weights and biases based on gradient
            self.updateParameters(gradients, alpha)
            
            if i % print_every == 0:
                # print cost, train and validation set accuracies
                print("COST", cost)
                
    def predict(self, X):
        """
        Make predictions for each sample
        """
        
        y_pred = self.forwardPropagation(x)
        
        return y_pred

    def get_batch(self, X, y, batch_size):
        """
        Return minibatch of samples and labels
        
        :param X, y: samples and corresponding labels
        :parma batch_size: minibatch size
        :returns: (tuple) X_batch, y_batch
        """
        # are minibatches random
        start_batch = np.random.randint(0, X.shape[0]-batch_size)
        print("batch start:" , start_batch, start_batch + batch_size)
        X_batch = X[:,start_batch]
        y_batch = y[:,start_batch]
        print("X batch shape:", X_batch.shape)
        print("y batch shape:", y_batch.shape)
        return X_batch, y_batch

In [121]:
# Helper functions, DO NOT modify this

def get_img_array(path):
    """
    Given path of image, returns it's numpy array
    """
    return scipy.misc.imread(path)

def get_files(folder):
    """
    Given path to folder, returns list of files in it
    """
    filenames = [file for file in glob.glob(folder+'*/*')]
    filenames.sort()
    return filenames

def get_label(filepath, label2id):
    """
    Files are assumed to be labeled as: /path/to/file/999_frog.png
    Returns label for a filepath
    """
    tokens = filepath.split('/')
    label = tokens[-1].split('_')[1][:-4]
    if label in label2id:
        return label2id[label]
    else:
        sys.exit("Invalid label: " + label)

In [4]:
# Functions to load data, DO NOT change these

def get_labels(folder, label2id):
    """
    Returns vector of labels extracted from filenames of all files in folder
    :param folder: path to data folder
    :param label2id: mapping of text labels to numeric ids. (Eg: automobile -> 0)
    """
    files = get_files(folder)
    y = []
    for f in files:
        y.append(get_label(f,label2id))
    return np.array(y)

def one_hot(y, num_classes=10):
    """
    Converts each label index in y to vector with one_hot encoding
    """
    y_one_hot = np.zeros((num_classes, y.shape[0]))
    y_one_hot[y, range(y.shape[0])] = 1
    return y_one_hot

def get_label_mapping(label_file):
    """
    Returns mappings of label to index and index to label
    The input file has list of labels, each on a separate line.
    """
    with open(label_file, 'r') as f:
        id2label = f.readlines()
        id2label = [l.strip() for l in id2label]
    label2id = {}
    count = 0
    for label in id2label:
        label2id[label] = count
        count += 1
    return id2label, label2id

def get_images(folder):
    """
    returns numpy array of all samples in folder
    each column is a sample resized to 30x30 and flattened
    """
    files = get_files(folder)
    images = []
    count = 0

    for f in files:
        count += 1
        if count % 10000 == 0:
            print("Loaded {}/{}".format(count,len(files)))
        img_arr = get_img_array(f)
        img_arr = img_arr.flatten() / 255.0
        images.append(img_arr)
    X = np.column_stack(images)

    return X

def get_train_data(data_root_path):
    """
    Return X and y
    """
    train_data_path = data_root_path + 'train'
    id2label, label2id = get_label_mapping(data_root_path+'labels.txt')
    print(label2id)
    X = get_images(train_data_path)
    y = get_labels(train_data_path, label2id)
    return X, y

def save_predictions(filename, y):
    """
    Dumps y into .npy file
    """
    np.save(filename, y)

In [33]:
# Load the data
data_root_path = 'cifar10-hw1/'
#X_train, y_train = get_train_data(data_root_path) # this may take a few minutes
#X_test = get_images(data_root_path + 'test')
print('Data loading done')

Data loading done


In [65]:
test = y_train[0:12]

## Part 1

#### Simple fully-connected deep neural network

In [131]:

layer_dimensions = [X_train.shape[0],8,5, 10]  # including the input and output layers
NN = NeuralNetwork(layer_dimensions)
NN.train(X_train, y_train, iters=100, alpha=.01, batch_size=100, print_every=100)

dict_keys(['b2', 'W3', 'W2', 'reg_lambda', 'W1', 'layer_dimensions', 'drop_prob', 'b3', 'b1'])
X train shape: (3072, 50000)
y train shape: (50000,)
forward, layer 1
A SHAPE (3072, 50000)
forward, shape W (8, 3072)
forward, shape A (3072, 50000)
forward, shape b (8, 1)
forward, layer 2
A SHAPE (8, 50000)
forward, shape W (5, 8)
forward, shape A (8, 50000)
forward, shape b (5, 1)
forward, layer 3
A SHAPE (5, 50000)
forward, shape W (10, 5)
forward, shape A (5, 50000)
forward, shape b (10, 1)
na in AL? 0 max in AL 0.100043173757
na in preds? 0
backprop, layer: 3
backwards, W shape (10, 5)
backwards, A shape (5, 50000)
backwards, b shape (10, 5)
affine backward, db shape (10, 1)
backprop, layer: 2
backwards, W shape (5, 8)
backwards, A shape (8, 50000)
backwards, b shape (5, 8)
affine backward, db shape (5, 1)
backprop, layer: 1
backwards, W shape (8, 3072)
backwards, A shape (3072, 50000)
backwards, b shape (8, 3072)
affine backward, db shape (8, 1)
updating params
W1 old shape (8, 3072)


affine backward, db shape (8, 1)
updating params
W1 old shape (8, 3072)
W1 new shape (8, 3072)
b1 old shape (8, 1)
b1 new shape (8, 1)
W2 old shape (5, 8)
W2 new shape (5, 8)
b2 old shape (5, 1)
b2 new shape (5, 1)
W3 old shape (10, 5)
W3 new shape (10, 5)
b3 old shape (10, 1)
b3 new shape (10, 1)
forward, layer 1
A SHAPE (3072, 50000)
forward, shape W (8, 3072)
forward, shape A (3072, 50000)
forward, shape b (8, 1)
forward, layer 2
A SHAPE (8, 50000)
forward, shape W (5, 8)
forward, shape A (8, 50000)
forward, shape b (5, 1)
forward, layer 3
A SHAPE (5, 50000)
forward, shape W (10, 5)
forward, shape A (5, 50000)
forward, shape b (10, 1)
na in AL? 0 max in AL 0.169613863788
na in preds? 0
backprop, layer: 3
backwards, W shape (10, 5)
backwards, A shape (5, 50000)
backwards, b shape (10, 5)
affine backward, db shape (10, 1)
backprop, layer: 2
backwards, W shape (5, 8)
backwards, A shape (8, 50000)
backwards, b shape (5, 8)
affine backward, db shape (5, 1)
backprop, layer: 1
backwards, W

KeyboardInterrupt: 

In [None]:
y_predicted = NN.predict(X_test)
save_predictions('ans1-uni', y_predicted)

In [None]:
# test if your numpy file has been saved correctly
loaded_y = np.load('ans1-uni.npy')
print(loaded_y.shape)
loaded_y[:10]

## Part 2: Regularizing the neural network
#### Add dropout and L2 regularization

In [None]:
NN2 = NeuralNetwork(layer_dimensions, drop_prob=0, reg_lambda=0)
NN2.train(X_train, y_train, iters=100, alpha=0.00001, batch_size=1000, print_every=10)

In [None]:
y_predicted2 = NN2.predict(X)
save_predictions(y_predicted, 'ans2-uni')

In [102]:
test = np.random.randint(10, size=(6))
test

array([4, 3, 2, 0, 2, 5])

In [103]:
T = one_hot(test)
T

array([[ 0.,  0.,  0.,  1.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  1.,  0.,  1.,  0.],
       [ 0.,  1.,  0.,  0.,  0.,  0.],
       [ 1.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  1.],
       [ 0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.]])



array([[ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.]])