# COMS 4995_002 Deep Learning Assignment 1
Due on Monday, Oct 9, 11:59pm

This assignment can be done in groups of at most 3 students. Everyone must submit on Courseworks individually.

Write down the UNIs of your group (if applicable)

Member 1: Name, UNI

Member 2: Name, UNI

Member 3: Name, UNI

In [86]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import scipy.misc
import glob
import sys
# you shouldn't need to make any more imports

In [199]:
class NeuralNetwork(object):
    """
    Abstraction of neural network.
    Stores parameters, activations, cached values. 
    Provides necessary functions for training and prediction. 
    """
    def __init__(self, layer_dimensions, drop_prob=0.0, reg_lambda=0.0):
        """
        Initializes the weights and biases for each layer
        :param layer_dimensions: (list) number of nodes in each layer
        :param drop_prob: drop probability for dropout layers. Only required in part 2 of the assignment
        :param reg_lambda: regularization parameter. Only required in part 2 of the assignment
        """
        np.random.seed(1)
        
        self.parameters = {}
        self.num_layers = len(layer_dimensions)
        self.drop_prob = drop_prob
        self.reg_lambda = reg_lambda
        
        # init parameters
        self.parameters = self.intializeParameters(layer_dimensions)
        
    def intializeParameters(self, layer_dimensions):
        eps = 0.01
        parameters = {}
        
        for l in range(self.num_layers - 1):
            parameters["W" + str(l+1)] = np.random.randn(layer_dimensions[l+1], layer_dimensions[l]) * eps
            parameters["b" + str(l+1)] = np.zeros((layer_dimensions[l+1],1))
        
        return parameters
    
    def affineForward(self, A, W, b):
        """
        Forward pass for the affine layer.
        :param A: input matrix, shape (L, S), where L is the number of hidden units in the previous layer and S is
        the number of samples
        :returns: the affine product WA + b, along with the cache required for the backward pass
        """
        return np.dot(W,A) + b
        

    def activationForward(self, A, activation="relu"):
        """
        Common interface to access all activation functions.
        :param A: input to the activation function
        :param prob: activation funciton to apply to A. Just "relu" for this assignment.
        :returns: activation(A)
        """ 
        return self.relu(A)

    def relu(self, X):
        return np.maximum(0,X)
            
    #def dropout(self, A, prob):
        """
        :param A: 
        :param prob: drop prob
        :returns: tuple (A, M) 
            WHERE
            A is matrix after applying dropout
            M is dropout mask, used in the backward pass
        """
        #return A, M

    def forwardPropagation(self, X):
        """
        Runs an input X through the neural network to compute activations
        for all layers. Returns the output computed at the last layer along
        with the cache required for backpropagation.
        :returns: (tuple) AL, cache
            WHERE 
            AL is activation of last layer
            cache is cached values for each layer that
                     are needed in further steps
        """
        #cache_A = []
        #cache_W = []
        #cache_Z = []
        cache = {}
        cache["A" + str(0)] = X
        
        for l in range(self.num_layers - 2):
            #print(l)
            A_prev = cache['A' + str(l)]
            Z = self.affineForward(A_prev, self.parameters["W" + str(l+1)], self.parameters["b" + str(l+1)])
            A = self.relu(Z)
            #print(cache.keys())
            cache["A" + str(l+1)] = A
            cache["Z" + str(l+1)] = Z
            
        Z = self.affineForward(A, self.parameters["W" + str(self.num_layers - 1)], self.parameters["b" + str(self.num_layers - 1)])
        cache["Z" + str(self.num_layers - 1)] = Z
        AL = self.softmax(Z)
        #return AL, cache_A, cache_W, cache_Z
        return AL, cache
    
    def softmax(self, Z):
        score = np.exp(Z - np.max(Z, axis=0))
        score /= np.sum(score, axis=0)
        return score
        #print(score.sum(axis=0))
        
        
    def costFunction(self, AL, y):
        """
        :param AL: Activation of last layer, shape (num_classes, S)
        :param y: labels, shape (S)
        :param alpha: regularization parameter
        :returns cost, dAL: A scalar denoting cost and the gradient of cost
        """
        # compute loss
        #print("hello")
        #print(y.shape)
        #print(len(y))
        #print(AL.shape)
        #cost = np.sum(-np.log(AL[y,range(len(y))])) / len(y)
        
        # gradient of cost 
        #dAL = AL
        #dAL[y,range(len(y))] -= 1
        #dAL[y,range(len(y))] /= len(y)
        
        #if self.reg_lambda > 0:
            # add regularization
            #pass
#         dAL = np.zeros((AL.shape[0], AL.shape[1]))
        y_one_hot = np.zeros((AL.shape[0], AL.shape[1]))
        
        cost = 0
        for i in range(len(y)):
            cost += -np.log(AL[y[i], i]) / len(y)
            y_one_hot[y[i], i] = 1
        
#         cost /= len(y)
        #if self.reg_lambda > 0:
        #pass
        
        dAL = (AL - y_one_hot) / len(y)
        
        return cost, dAL

    

    def affineBackward(self, dA_prev, cache, layer_num):
        """
        Backward pass for the affine layer.
        :param dA_prev: gradient from the next layer.
        :param cache: cache returned in affineForward
        :returns dA: gradient on the input to this layer
                 dW: gradient on the weights
                 db: gradient on the bias
        """
        dZ = self.activationBackward(dA_prev, cache, layer_num)
        
        W = self.parameters['W' + str(layer_num)]
        dA = np.dot(W.T, dZ)
        
        A = cache['A' + str(layer_num-1)]
        dw = np.dot(dZ, A.T)
        db = np.sum(dZ, axis = 1, keepdims = True)
        
        return dA, dw, db

    def activationBackward(self, dA, cache,  layer_num, activation="relu"):
        """
        Interface to call backward on activation functions.
        In this case, it's just relu. 
        """
        dZ = dA * self.relu_derivative(cache['Z' + str(layer_num)])
        return dZ
        
    def relu_derivative(self, cached_x):
        dx = 1.0 * (cached_x > 0)
        return dx

    def dropout_backward(self, dA, cache):
        return dA

    def backPropagation(self, dAL, Y, cache):
        """
        Run backpropagation to compute gradients on all paramters in the model
        :param dAL: gradient on the last layer of the network. Returned by the cost function.
        :param Y: labels
        :param cache: cached values during forwardprop
        :returns gradients: dW and db for each weight/bias
        """
        gradients = {}
        
        dZ = dAL  #dZ for the last layer is equal to the gradient of the softmax
        
        W = self.parameters['W' + str(self.num_layers - 1)]
        dA = np.dot(W.T, dZ)
        
        A = cache['A' + str(self.num_layers - 2)]
        dW = np.dot(dZ, A.T)
        db = np.sum(dZ, axis = 1, keepdims = True)
        
        gradients['dw' + str(self.num_layers - 1)] = dW
        gradients['db' + str(self.num_layers - 1)] = db
        
        for l in range(self.num_layers - 2, 0, -1):
            dA, dW, db = self.affineBackward(dA, cache, layer_num)
            gradients['dw' + str(l)] = dW
            gradients['db' + str(l)] = db
            

        #if self.drop_prob > 0:
            #call dropout_backward
          
            
        #if self.reg_lambda > 0:
            # add gradients from L2 regularization to each dW
        
        #print(gradients.keys())
        return gradients


    def updateParameters(self, gradients, alpha):
        """
        :param gradients: gradients for each weight/bias
        :param alpha: step size for gradient descent 
        """
        
        for l in range(self.num_layers-1):
            self.parameters["W" + str(l+1)] = self.parameters["W" + str(l+1)] - alpha * gradients["dw" + str(l+1)]
            self.parameters["b" + str(l+1)] = self.parameters["b" + str(l+1)] - alpha * gradients["db" + str(l+1)]
        

    def train(self, X, y, X_val, y_val, iters=1000, alpha=0.0001, batch_size=100, print_every=100):
        """
        :param X: input samples, each column is a sample
        :param y: labels for input samples, y.shape[0] must equal X.shape[1]
        :param iters: number of training iterations
        :param alpha: step size for gradient descent
        :param batch_size: number of samples in a minibatch
        :param print_every: no. of iterations to print debug info after
        """
        
        for i in range(0, iters):
            
            # get minibatch
            A_batch, y_batch = self.get_batch(X, y, batch_size)
            
            # forward prop
            AL, cache = self.forwardPropagation(A_batch)

            # compute loss
            cost, dAL = self.costFunction(AL, y_batch)
            
            # compute gradients
            gradients = self.backPropagation(dAL, y, cache)
            
            # update weights and biases based on gradient
            self.updateParameters(gradients, alpha)

            if i % print_every == 0:
                # print cost, train and validation set accuracies
                
                train_y_predict = self.predict(A_batch)
                train_accuracy = np.sum((train_y_predict == y_batch) * 1) / len(y_batch)
        
                print('%d iteration, training cost : %f:'%(i, cost))
                print('%d iteration, training acc is %f:'%(i, train_accuracy))
                
                AL, c = self.forwardPropagation(X_val)
                val_cost, _ = self.costFunction(AL, y_val)
                val_y_predict = self.predict(X_val)
                val_accuracy = np.sum((val_y_predict == y_val) * 1) / len(y_val)
                
                print('%d iteration, val cost : %f:'%(i, val_cost))
                print('%d iteration, val acc : %f:'%(i, val_accuracy)) 
                
                
    def predict(self, X):
        """
        Make predictions for each sample
        """
        AL,c = self.forwardPropagation(X)
        prediction = np.argmax(AL, axis=0)
        return prediction

    def get_batch(self, X, y, batch_size):
        """
        check
        Return minibatch of samples and labels
        
        :param X, y: samples and corresponding labels
        :parma batch_size: minibatch size
        :returns: (tuple) X_batch, y_batch
        
        """
        start_index = np.random.randint(0, X.shape[1] - batch_size)
        X_batch = X[:, start_index:(start_index + batch_size)]
        y_batch = y[start_index:(start_index + batch_size)]

        return X_batch, y_batch

In [194]:
# Helper functions, DO NOT modify this

def get_img_array(path):
    """
    Given path of image, returns it's numpy array
    """
    return scipy.misc.imread(path)

def get_files(folder):
    """
    Given path to folder, returns list of files in it
    """
    filenames = [file for file in glob.glob(folder+'*/*')]
    filenames.sort()
    return filenames

def get_label(filepath, label2id):
    """
    Files are assumed to be labeled as: /path/to/file/999_frog.png
    Returns label for a filepath
    """
    tokens = filepath.split('/')
    label = tokens[-1].split('_')[1][:-4]
    if label in label2id:
        return label2id[label]
    else:
        sys.exit("Invalid label: " + label)

In [195]:
# Functions to load data, DO NOT change these

def get_labels(folder, label2id):
    """
    Returns vector of labels extracted from filenames of all files in folder
    :param folder: path to data folder
    :param label2id: mapping of text labels to numeric ids. (Eg: automobile -> 0)
    """
    files = get_files(folder)
    y = []
    for f in files:
        y.append(get_label(f,label2id))
    return np.array(y)

def one_hot(y, num_classes=10):
    """
    Converts each label index in y to vector with one_hot encoding
    """
    y_one_hot = np.zeros((y.shape[0], num_classes))
    y_one_hot[y] = 1
    return y_one_hot.T

def get_label_mapping(label_file):
    """
    Returns mappings of label to index and index to label
    The input file has list of labels, each on a separate line.
    """
    with open(label_file, 'r') as f:
        id2label = f.readlines()
        id2label = [l.strip() for l in id2label]
    label2id = {}
    count = 0
    for label in id2label:
        label2id[label] = count
        count += 1
    return id2label, label2id

def get_images(folder):
    """
    returns numpy array of all samples in folder
    each column is a sample resized to 30x30 and flattened
    """
    files = get_files(folder)
    images = []
    count = 0
    
    for f in files:
        count += 1
        if count % 10000 == 0:
            print("Loaded {}/{}".format(count,len(files)))
        img_arr = get_img_array(f)
        img_arr = img_arr.flatten() / 255.0
        images.append(img_arr)
    X = np.column_stack(images)

    return X

def get_train_data(data_root_path):
    """
    Return X and y
    """
    train_data_path = data_root_path + 'train'
    id2label, label2id = get_label_mapping(data_root_path+'labels.txt')
    print(label2id)
    X = get_images(train_data_path)
    y = get_labels(train_data_path, label2id)
    return X, y

def save_predictions(filename, y):
    """
    Dumps y into .npy file
    """
    np.save(filename, y)

In [196]:
# Load the data
from sklearn.cross_validation import train_test_split
data_root_path = '/mnt/c/Users/bk262/Desktop/DeepLearning/HW1-data/cifar10-hw1/'
X_train, y_train = get_train_data(data_root_path) # this may take a few minutes
#print(np.matrix(X_train).shape)
#print(np.matrix(y_train).shape)
x_train, x_val, pred_train, pred_val = train_test_split(np.transpose(X_train), np.transpose(y_train), test_size=0.1, random_state=42)
X_train, y_train = np.transpose(x_train), np.transpose(pred_train)
X_val, y_val = np.transpose(x_val), np.transpose(pred_val)
#print(np.matrix(X_train).shape)
#print(np.matrix(y_train).shape)
#print(np.matrix(X_val).shape)
#print(np.matrix(y_val).shape)
X_test = get_images(data_root_path + 'test')
print('Data loading done')

{'deer': 4, 'bird': 2, 'frog': 6, 'dog': 5, 'truck': 9, 'horse': 7, 'automobile': 1, 'ship': 8, 'airplane': 0, 'cat': 3}
Loaded 10000/50000
Loaded 20000/50000
Loaded 30000/50000
Loaded 40000/50000
Loaded 50000/50000
Loaded 10000/10000
Data loading done


In [180]:
def softmax(Z):
    score = np.exp(Z - np.max(Z, axis=0))
    score /= np.sum(score, axis=0)
    print(score)
    print(score.sum(axis=0))

Z = np.arange(12).reshape(3,4)        
softmax(Z)

[[  3.29320439e-04   3.29320439e-04   3.29320439e-04   3.29320439e-04]
 [  1.79802867e-02   1.79802867e-02   1.79802867e-02   1.79802867e-02]
 [  9.81690393e-01   9.81690393e-01   9.81690393e-01   9.81690393e-01]]
[ 1.  1.  1.  1.]


## Part 1

#### Simple fully-connected deep neural network

In [203]:
#print(X_train.shape[0])
#layer_dimensions = [X_train.shape[0], 128, 32, 10]  # including the input and output layers
#NN = NeuralNetwork(layer_dimensions)
NN.train(X_train, y_train, X_val, y_val, iters=30000, alpha=0.003, batch_size=128, print_every=100)

0 iteration, training cost : 1.244908:
0 iteration, training acc is 0.539062:
0 iteration, val cost : 1.380914:
0 iteration, val acc : 0.516400:
100 iteration, training cost : 1.295691:
100 iteration, training acc is 0.570312:
100 iteration, val cost : 1.380815:
100 iteration, val acc : 0.512200:
200 iteration, training cost : 1.020526:
200 iteration, training acc is 0.632812:
200 iteration, val cost : 1.382156:
200 iteration, val acc : 0.517000:
300 iteration, training cost : 1.025069:
300 iteration, training acc is 0.664062:
300 iteration, val cost : 1.382376:
300 iteration, val acc : 0.515800:
400 iteration, training cost : 1.083851:
400 iteration, training acc is 0.679688:
400 iteration, val cost : 1.377816:
400 iteration, val acc : 0.514400:
500 iteration, training cost : 1.152719:
500 iteration, training acc is 0.570312:
500 iteration, val cost : 1.376301:
500 iteration, val acc : 0.512600:
600 iteration, training cost : 1.281372:
600 iteration, training acc is 0.578125:
600 iter

In [175]:
y_predicted = NN.predict(X_test)
save_predictions('ans1-uni', y_predicted)

(1000, 10000)
(300, 10000)
(30, 10000)


In [59]:
# test if your numpy file has been saved correctly
loaded_y = np.load('ans1-uni.npy')
print(loaded_y.shape)
loaded_y[:10]

FileNotFoundError: [Errno 2] No such file or directory: 'ans1-uni.npy'

## Part 2: Regularizing the neural network
#### Add dropout and L2 regularization

In [None]:
NN2 = NeuralNetwork(layer_dimensions, drop_prob=0, reg_lambda=0)
NN2.train(X_train, y_train, iters=1000, alpha=0.00001, batch_size=1000, print_every=10)

In [None]:
y_predicted2 = NN2.predict(X)
save_predictions(y_predicted, 'ans2-uni')