# COMS 4995_002 Deep Learning Assignment 1
Due on Monday, Oct 9, 11:59pm

This assignment can be done in groups of at most 3 students. Everyone must submit on Courseworks individually.

Write down the UNIs of your group (if applicable)

Member 1: Name, UNI

Member 2: Name, UNI

Member 3: Name, UNI

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import scipy.misc
import glob
import sys
# you shouldn't need to make any more imports

In [303]:
class NeuralNetwork(object):
    """
    Abstraction of neural network.
    Stores parameters, activations, cached values. 
    Provides necessary functions for training and prediction. 
    """
    def __init__(self, layer_dimensions, drop_prob=0.0, reg_lambda=0.0):
        """
        Initializes the weights and biases for each layer
        :param layer_dimensions: (list) number of nodes in each layer
        :param drop_prob: drop probability for dropout layers. Only required in part 2 of the assignment
        :param reg_lambda: regularization parameter. Only required in part 2 of the assignment
        """
        
        eps = .01
        self.parameters = {'layer_dimensions':layer_dimensions, 'drop_prob':drop_prob, 'reg_lambda':reg_lambda} 
        self.num_layers = len(layer_dimensions)
        self.drop_prob = drop_prob
        self.reg_lambda = reg_lambda
        
        # init parameters
        for l in range(1, self.num_layers):
            self.parameters['W' + str(l)] = np.random.normal(size=(layer_dimensions[l], layer_dimensions[l-1]))  * np.sqrt(2)/np.sqrt(layer_dimensions[l-1])
            self.parameters['b' + str(l)]= np.ones((layer_dimensions[l], 1)) * eps
    
        # print all params:
        print(self.parameters.keys())

        
#     def gradientCheck(self, theta, x):
#         eps = 1e-7
#         J_pos = J(theta + eps, x)
#         J_neg = J(theta - eps, x)
#         numerical_deriv = (J_pos - J-neg)/(2*eps)
#         analytic_deriv = JDeriv(theta, x)
#         error = norm(numerical_deriv-analyic_deriv)/norm()
#         return error
        
    def affineForward(self, A_, W_, b_):
        """
        Forward pass for the affine layer.
        :param A: input matrix, shape (L, S), where L is the number of hidden units in the previous layer and S is
        the number of samples
        :returns: the affine product WA + b, along with the cache required for the backward pass
        """
#         print("forward, shape W", W_.shape)
#         print("forward, shape A", A_.shape)
#         print("forward, shape b", b_.shape)
        return np.dot(W_, A_) + b_, [A_, W_, b_]
        

    def activationForward(self, A_, activation="relu"):
        """
        Common interface to access all activation functions.
        :param A: input to the activation function
        :param prob: activation funciton to apply to A. Just "relu" for this assignment.
        :returns: activation(A)
        """ 
        
        if activation == "relu":
            return self.relu(A_)
        if activation == "softmax":
            return self.softmax(A_)
        
        return A_

    def softmax(self, x):
        # softmax
        
        e_x = np.exp(x - np.max(x,axis=0))
        return e_x / e_x.sum( axis=0)
    
    def relu(self, X):
        return np.maximum(0,X)
            
    def dropout(self, A, prob):
        """
        :param A: 
        :param prob: drop prob
        :returns: tuple (A, M) 
            WHERE
            A is matrix after applying dropout
            M is dropout mask, used in the backward pass
        """
        
        # make a mask, binary mask
        # where that mask is 0, drop those activation units
        # where its one, scale them up by 1/(1-p)
        # in forward we need to apply this mask
        
        M = np.random.binomial([np.ones(A.shape)],1-prob)[0] * (1.0/(1-prob))
        A = np.multiply(A *(1/(1-prob)), M)
        
        return A, M

    def forwardPropagation(self, X, training=False):
        """
        Runs an input X through the neural network to compute activations
        for all layers. Returns the output computed at the last layer along
        with the cache required for backpropagation.
        :returns: (tuple) AL, cache
            WHERE 
            AL is activation of last layer
            cache is cached values for each layer that
                     are needed in further steps
        """
        cache = {}
        A = X
        
        for l in range(1, self.num_layers):
            
            layer_cache = {}
            layer_cache["linear"] = [A,  self.parameters["W" + str(l)], self.parameters["b" + str(l)]]
            Z, cache_l = self.affineForward(A, self.parameters["W" + str(l)], self.parameters["b" + str(l)])
            if l != self.num_layers - 1:
                cache_a = self.activationForward(Z)
                if self.drop_prob > 0:
                    cache_a, dropout_mask = self.dropout(cache_a, self.drop_prob)
                    layer_cache['dropout'] = dropout_mask
            else:
                cache_a = Z
            
            
            
            layer_cache['activation'] = cache_a
            cache[str(l)] = layer_cache
            
            A = cache_a
        
        return A, cache
    
    def costFunction(self, AL, y, reg):
        """
        :param AL: Activation of last layer, shape (num_classes, S)
        :param y: labels, shape (S)
        :param alpha: regularization parameter
        :returns cost, dAL: A scalar denoting cost and the gradient of cost
        """

        # softmax
        
        AL = self.softmax(AL)
        # compute loss
        #+ np.multiply((1-Y), np.log(1-preds))
        # one hot encode true labels
        # only look at how close we are to previous labels
        true_labels = one_hot(y)

        
        y_hat = AL[y, range(y.shape[0])]
        #print("preds", y_hat)
        cost = -np.sum(np.log(y_hat))/ AL.shape[1]
        
        if self.reg_lambda > 0:
            # add regularization
            pass
        
        # gradient of cost, should be same shape as input
        # 
        #dAL = -np.divide(true_labels, AL) 
        dAL = AL - true_labels
        return cost, dAL

    def affineBackward(self, dA_prev, cache):
        """
        Backward pass for the affine layer.
        :param dA_prev: gradient from the next layer.
        :param cache: cache returned in affineForward
        :returns dA: gradient on the input to this layer
                 dW: gradient on the weights
                 db: gradient on the bias
        """
        A, W, b = cache['linear']
        #output_last_layer = cache[str(layer-1)]['activation']

     
        dW = (1/A.shape[1]) * np.dot(dA_prev,A.T)
        db = (1/A.shape[1]) * np.sum(dA_prev, axis=1, keepdims=True)
        
        dA_prev = np.dot(W.T,dA_prev)
        return dA_prev, dW, db

    def activationBackward(self, dA, cache, activation="relu"):
        """
        Interface to call backward on activation functions.
        In this case, it's just relu. 
        """
        
        #dzl = dAl*gl_p(zl)
        # only passed in the cached linear layer
        
        # zl is avation at layer l, inputted as cache
       
        zl = cache['activation']
        
        # dA is derivative last layer, first time it will be cost
        return np.multiply(dA , self.relu_derivative(zl, dA))
        
        
    def batchnorm_forward(self, X_, gamma, beta):
        mu = np.mean(X_, axis=1)
        var = np.var(X_, axis=1)

        X_norm = (X_ - mu) / np.sqrt(var + 1e-8)
        out = gamma * X_norm + beta

        cache = (X_, X_norm, mu, var, gamma, beta)

        return out, cache, mu, var
    
    def batchnorm_backward(dout, cache):
        X, X_norm, mu, var, gamma, beta = cache

        N, D = X.shape

        X_mu = X - mu
        std_inv = 1. / np.sqrt(var + 1e-8)

        dX_norm = dout * gamma
        dvar = np.sum(dX_norm * X_mu, axis=0) * -.5 * std_inv**3
        dmu = np.sum(dX_norm * -std_inv, axis=0) + dvar * np.mean(-2. * X_mu, axis=0)

        dX = (dX_norm * std_inv) + (dvar * 2 * X_mu / N) + (dmu / N)
        dgamma = np.sum(dout * X_norm, axis=0)
        dbeta = np.sum(dout, axis=0)

        return dX, dgamma, dbeta
    
    
    def relu_derivative(self, cached_x, dx):
        """
        Inputs:
        dx: upstream derivative
        cached_x: input of relu

        Returns:
        dx: gradient with respect to cached_x
        """
        # this should act element-wise as well
        # if cached_x is greater than 0, then the deriv is 1.0
        # if not, then the deriv is 0
#         print("\n\nrelu deriv shape dX", dx.shape)
#         print("relu deriv, shape cached_x", cached_x.shape)
#         print("\n\n")
        
        return 1.0 * (cached_x > 0)

    def dropout_backward(self, dA, cache):

        # in backwards we need to apply that mask to the derivatives
        # so cache here contains the dropout mask you used in forward prop
        mask = cache['dropout']
        p = self.drop_prob
        return np.multiply((1/(1-p)) * dA, mask)

    def backPropagation(self, dAL, cache):
        """
        Run backpropagation to compute gradients on all paramters in the model
        :param dAL: gradient on the last layer of the network. Returned by the cost function.
        :param Y: labels
        :param cache: cached values during forwardprop
        :returns gradients: dW and db for each weight/bias
        """
        gradients = {}
        dA_prev = dAL
        
        for l in range(self.num_layers-1, 0, -1):
            cached_l = cache[str(l)]
            if l != self.num_layers - 1:
                dA_prev = self.activationBackward(dA_prev, cached_l)
    
            # affine backwards
            
            dA, dW, db = self.affineBackward(dA_prev ,cached_l)
            gradients["dW" + str(l)] = dW
            gradients["db" + str(l)] = db
#             print("shapes: dA", dA.shape)
#             print("shapes: dW", dW.shape)
#             print("shapes: db", db.shape)
#             print("\n\n")
            
            if l != self.num_layers - 1 and self.drop_prob > 0:
                #call dropout_backward
                dA_prev = self.dropout_backward(dA, cached_l)
            else:
                dA_prev = dA
            
            
        if self.reg_lambda > 0:
            # add gradients from L2 regularization to each dW
            pass
        
        return gradients


    def updateParameters(self, gradients, alpha):
        """
        :param gradients: gradients for each weight/bias
        :param alpha: step size for gradient descent 
        """
        for l in range(1, self.num_layers):
            self.parameters["W" + str(l)] = self.parameters["W" + str(l)] - alpha * gradients["dW" + str(l)]
            self.parameters["b" + str(l)] = self.parameters["b" + str(l)] - alpha * gradients["db" + str(l)]
            
    def train(self, X, y, iters=1000, alpha=0.0001, batch_size=100, print_every=100):
        """
        :param X: input samples, each column is a sample
        :param y: labels for input samples, y.shape[0] must equal X.shape[1]
        :param iters: number of training iterations
        :param alpha: step size for gradient descent
        :param batch_size: number of samples in a minibatch
        :param print_every: no. of iterations to print debug info after
        """
        # we should get validation
        validation_size = int(X.shape[1]*.1)
        orig_size = X.shape[1]
        validation_X = X[:,orig_size-validation_size:]
        validation_y = y[orig_size-validation_size:]
        
        X = X[:,0:orig_size-validation_size]
        y = y[0:orig_size-validation_size]

        print("X train shape:", X.shape)
        print("y train shape:", y.shape)
        print("X validation shape:", validation_X.shape)
        print("y validation shape:", validation_y.shape)
        
        
        for i in range(0, iters):
           
            start_batch = np.random.randint(X.shape[1] - 2*batch_size)
            #start_batch = 0
            batch_x, batch_y = self.get_batch(X, y, batch_size,start_batch)
            
            # forward prop
            last_layer, cache = self.forwardPropagation(batch_x, training=True)
            # compute loss
            cost, cost_deriv = self.costFunction(last_layer, batch_y, self.reg_lambda)

            # compute gradients
            gradients = self.backPropagation(cost_deriv, cache)

            # update weights and biases based on gradient
            self.updateParameters(gradients, alpha)

            if i % print_every == 0:
                # print cost, train and validation set accuracies
                if i % 500 == 0 and i != 0:
                    alpha /= 2
                print("\n************")
                print("start batch", start_batch, "iter", i)
                print("COST", cost)
                print("alpha", alpha)
                print("************")
                #preds = self.predict(batch_x)
                preds = np.argmax(last_layer, axis=0)
                print("train accuracy = ", np.mean(preds == batch_y))
                preds_y = self.predict(validation_X)
                #print(gradients)
                print("validation set accuracy = ", np.mean(preds_y == validation_y))
                # delta gradients
                print(gradients.keys())
                #print(cost_deriv)
                for d in range(self.num_layers-1, 0, -1):
                    print("abs gradients dW"+str(d),np.mean(np.abs(gradients["dW"+str(d)])))
                
    def predict(self, X_):
        """
        Make predictions for each sample
        """
        
        forward, _ = self.forwardPropagation(X_, training=False)
        preds_ = np.argmax(forward, axis=0)
        
        return preds_

    def get_batch(self, X_, y_, batch_size, start_batch):
        """
        Return minibatch of samples and labels
        
        :param X, y: samples and corresponding labels
        :parma batch_size: minibatch size
        :returns: (tuple) X_batch, y_batch
        """
        # are minibatches random
        #get random set
        #print("get batch shape", X_.shape, y_.shape)
        ix = np.random.choice(X_.shape[1], batch_size, replace=True)
        X_batch = X_[:,ix]
        y_batch = y_[ix]
        return X_batch, y_batch

In [248]:
# Helper functions, DO NOT modify this

def get_img_array(path):
    """
    Given path of image, returns it's numpy array
    """
    return scipy.misc.imread(path)

def get_files(folder):
    """
    Given path to folder, returns list of files in it
    """
    filenames = [file for file in glob.glob(folder+'*/*')]
    filenames.sort()
    return filenames

def get_label(filepath, label2id):
    """
    Files are assumed to be labeled as: /path/to/file/999_frog.png
    Returns label for a filepath
    """
    tokens = filepath.split('/')
    label = tokens[-1].split('_')[1][:-4]
    if label in label2id:
        return label2id[label]
    else:
        sys.exit("Invalid label: " + label)

In [4]:
# Functions to load data, DO NOT change these

def get_labels(folder, label2id):
    """
    Returns vector of labels extracted from filenames of all files in folder
    :param folder: path to data folder
    :param label2id: mapping of text labels to numeric ids. (Eg: automobile -> 0)
    """
    files = get_files(folder)
    y = []
    for f in files:
        y.append(get_label(f,label2id))
    return np.array(y)

def one_hot(y, num_classes=10):
    """
    Converts each label index in y to vector with one_hot encoding
    """
    y_one_hot = np.zeros((num_classes, y.shape[0]))
    y_one_hot[y, range(y.shape[0])] = 1
    return y_one_hot

def get_label_mapping(label_file):
    """
    Returns mappings of label to index and index to label
    The input file has list of labels, each on a separate line.
    """
    with open(label_file, 'r') as f:
        id2label = f.readlines()
        id2label = [l.strip() for l in id2label]
    label2id = {}
    count = 0
    for label in id2label:
        label2id[label] = count
        count += 1
    return id2label, label2id

def get_images(folder):
    """
    returns numpy array of all samples in folder
    each column is a sample resized to 30x30 and flattened
    """
    files = get_files(folder)
    images = []
    count = 0

    for f in files:
        count += 1
        if count % 10000 == 0:
            print("Loaded {}/{}".format(count,len(files)))
        img_arr = get_img_array(f)
        img_arr = img_arr.flatten() / 255.0
        images.append(img_arr)
    X = np.column_stack(images)

    return X

def get_train_data(data_root_path):
    """
    Return X and y
    """
    train_data_path = data_root_path + 'train'
    id2label, label2id = get_label_mapping(data_root_path+'labels.txt')
    print(label2id)
    X = get_images(train_data_path)
    y = get_labels(train_data_path, label2id)
    return X, y

def save_predictions(filename, y):
    """
    Dumps y into .npy file
    """
    np.save(filename, y)

In [5]:
# Load the data
data_root_path = 'cifar10-hw1/'
#X_train, y_train = get_train_data(data_root_path) # this may take a few minutes
#X_test = get_images(data_root_path + 'test')
print('Data loading done')

{'airplane': 0, 'automobile': 1, 'bird': 2, 'cat': 3, 'deer': 4, 'dog': 5, 'frog': 6, 'horse': 7, 'ship': 8, 'truck': 9}
Loaded 10000/50000
Loaded 20000/50000
Loaded 30000/50000
Loaded 40000/50000
Loaded 50000/50000
Loaded 10000/10000
Data loading done


## Part 1

#### Simple fully-connected deep neural network

In [203]:
# layer_dimensions = [X_train.shape[0],4000, 2000, 10]  # including the input and output layers
# NN = NeuralNetwork(layer_dimensions)
# NN.train(X_train, y_train, iters=4000, alpha=.0001, batch_size=500, print_every=10)

In [283]:
X_train.shape[1]*.1

5000.0

In [227]:
layer_dimensions = [X_train.shape[0],9200, 3072,10]  # including the input and output layers
NN = NeuralNetwork(layer_dimensions)
NN.train(X_train, y_train, iters=1000, alpha=.003, batch_size=200, print_every=20)

dict_keys(['layer_dimensions', 'drop_prob', 'reg_lambda', 'W1', 'b1', 'W2', 'b2', 'W3', 'b3'])
X train shape: (3072, 50000)
y train shape: (50000,)

************
start batch 8480 iter 0
COST 3.52787601698
alpha 0.003
************
train accuracy =  0.09
validation set accuracy =  0.099664742828
dict_keys(['dW3', 'db3', 'dW2', 'db2', 'dW1', 'db1'])
abs gradients dW3 0.128781521439
abs gradients dW2 0.00491939177519
abs gradients dW1 0.00131074271401

************
start batch 22661 iter 20
COST 3.65744249322
alpha 0.003
************
train accuracy =  0.14
validation set accuracy =  0.136564250552
dict_keys(['dW3', 'db3', 'dW2', 'db2', 'dW1', 'db1'])
abs gradients dW3 0.049895746763
abs gradients dW2 0.00228176705527
abs gradients dW1 0.000752546291412

************
start batch 10515 iter 40
COST 2.57193766292
alpha 0.003
************
train accuracy =  0.175
validation set accuracy =  0.184943133594
dict_keys(['dW3', 'db3', 'dW2', 'db2', 'dW1', 'db1'])
abs gradients dW3 0.0418304013458
abs


************
start batch 7783 iter 520
COST 1.87155607244
alpha 0.0015
************
train accuracy =  0.345
validation set accuracy =  0.351043965371
dict_keys(['dW3', 'db3', 'dW2', 'db2', 'dW1', 'db1'])
abs gradients dW3 0.00507625157315
abs gradients dW2 0.000506274937784
abs gradients dW1 0.000187447390971

************
start batch 18074 iter 540
COST 1.81319384949
alpha 0.0015
************
train accuracy =  0.39
validation set accuracy =  0.355521133933
dict_keys(['dW3', 'db3', 'dW2', 'db2', 'dW1', 'db1'])
abs gradients dW3 0.00694566226905
abs gradients dW2 0.000573969928157
abs gradients dW1 0.000202034127997

************
start batch 3268 iter 560
COST 1.82493477221
alpha 0.0015
************
train accuracy =  0.43
validation set accuracy =  0.360401459854
dict_keys(['dW3', 'db3', 'dW2', 'db2', 'dW1', 'db1'])
abs gradients dW3 0.0080371942189
abs gradients dW2 0.000668386334858
abs gradients dW1 0.000228785392087

************
start batch 25059 iter 580
COST 1.8706213298
alpha 0

In [None]:
layer_dimensions = [X_train.shape[0],500, 250, 10]  # including the input and output layers
NN = NeuralNetwork(layer_dimensions)
NN.train(X_train, y_train, iters=5000, alpha=.1, batch_size=500, print_every=20)

dict_keys(['layer_dimensions', 'drop_prob', 'reg_lambda', 'W1', 'b1', 'W2', 'b2', 'W3', 'b3'])
X train shape: (3072, 45000)
y train shape: (45000,)
X validation shape: (3072, 5000)
y validation shape: (5000,)

************
start batch 37893 iter 0
COST 2.53619345687
alpha 0.1
************
train accuracy =  0.098
validation set accuracy =  0.0908
dict_keys(['dW3', 'db3', 'dW2', 'db2', 'dW1', 'db1'])
abs gradients dW3 0.0162240591084
abs gradients dW2 0.00258269409548
abs gradients dW1 0.00285777948416

************
start batch 18483 iter 20
COST 2.15535130428
alpha 0.1
************
train accuracy =  0.23
validation set accuracy =  0.2314
dict_keys(['dW3', 'db3', 'dW2', 'db2', 'dW1', 'db1'])
abs gradients dW3 0.0020740246964
abs gradients dW2 0.000173384042265
abs gradients dW1 0.000225679688596

************
start batch 40861 iter 40
COST 2.11403493626
alpha 0.1
************
train accuracy =  0.242
validation set accuracy =  0.2262
dict_keys(['dW3', 'db3', 'dW2', 'db2', 'dW1', 'db1'])
a


************
start batch 337 iter 540
COST 1.71066825964
alpha 0.05
************
train accuracy =  0.408
validation set accuracy =  0.4136
dict_keys(['dW3', 'db3', 'dW2', 'db2', 'dW1', 'db1'])
abs gradients dW3 0.00749752104401
abs gradients dW2 0.000788757949279
abs gradients dW1 0.000920881718872

************
start batch 3396 iter 560
COST 1.61855566876
alpha 0.05
************
train accuracy =  0.454
validation set accuracy =  0.4226
dict_keys(['dW3', 'db3', 'dW2', 'db2', 'dW1', 'db1'])
abs gradients dW3 0.00359397814113
abs gradients dW2 0.000392289712283
abs gradients dW1 0.000400666309144

************
start batch 29919 iter 580
COST 1.5918352175
alpha 0.05
************
train accuracy =  0.43
validation set accuracy =  0.434
dict_keys(['dW3', 'db3', 'dW2', 'db2', 'dW1', 'db1'])
abs gradients dW3 0.00414341185986
abs gradients dW2 0.000466462207818
abs gradients dW1 0.000546904745702

************
start batch 36198 iter 600
COST 1.60760264586
alpha 0.05
************
train accurac

validation set accuracy =  0.4684
dict_keys(['dW3', 'db3', 'dW2', 'db2', 'dW1', 'db1'])
abs gradients dW3 0.00437068195747
abs gradients dW2 0.000571974791256
abs gradients dW1 0.000663230263086

************
start batch 20171 iter 1100
COST 1.45005340072
alpha 0.025
************
train accuracy =  0.472
validation set accuracy =  0.4658
dict_keys(['dW3', 'db3', 'dW2', 'db2', 'dW1', 'db1'])
abs gradients dW3 0.00507287303309
abs gradients dW2 0.000591533836426
abs gradients dW1 0.000650383509216

************
start batch 41345 iter 1120
COST 1.51596571915
alpha 0.025
************
train accuracy =  0.458
validation set accuracy =  0.471
dict_keys(['dW3', 'db3', 'dW2', 'db2', 'dW1', 'db1'])
abs gradients dW3 0.0040873864038
abs gradients dW2 0.000505322142283
abs gradients dW1 0.000545495983183

************
start batch 1407 iter 1140
COST 1.53851509285
alpha 0.025
************
train accuracy =  0.474
validation set accuracy =  0.4722
dict_keys(['dW3', 'db3', 'dW2', 'db2', 'dW1', 'db1'])


In [232]:
layer_dimensions = [X_train.shape[0],15000,10]  # including the input and output layers
NN = NeuralNetwork(layer_dimensions)
NN.train(X_train, y_train, iters=500, alpha=.001, batch_size=1000, print_every=20)

dict_keys(['layer_dimensions', 'drop_prob', 'reg_lambda', 'W1', 'b1', 'W2', 'b2'])
X train shape: (3072, 50000)
y train shape: (50000,)

************
start batch 28500 iter 0
COST 3.01135033467
alpha 0.001
************
train accuracy =  0.093
validation set accuracy =  0.103634618595
dict_keys(['dW2', 'db2', 'dW1', 'db1'])
abs gradients dW2 0.0963681152166
abs gradients dW1 0.00134736513366

************
start batch 9773 iter 20
COST 2.25621003281
alpha 0.001
************
train accuracy =  0.178
validation set accuracy =  0.114567684861
dict_keys(['dW2', 'db2', 'dW1', 'db1'])
abs gradients dW2 0.0210005148369
abs gradients dW1 0.000258961250303

************
start batch 10525 iter 40
COST 2.22197459286
alpha 0.001
************
train accuracy =  0.196
validation set accuracy =  0.185361375396
dict_keys(['dW2', 'db2', 'dW1', 'db1'])
abs gradients dW2 0.0290820322303
abs gradients dW1 0.000337921938262

************
start batch 6732 iter 60
COST 2.14608041935
alpha 0.001
************
trai

KeyboardInterrupt: 

In [230]:
layer_dimensions = [X_train.shape[0],1000,1000, 1000, 10]  # including the input and output layers
NN = NeuralNetwork(layer_dimensions)
NN.train(X_train, y_train, iters=500, alpha=.001, batch_size=500, print_every=20)

dict_keys(['layer_dimensions', 'drop_prob', 'reg_lambda', 'W1', 'b1', 'W2', 'b2', 'W3', 'b3', 'W4', 'b4'])
X train shape: (3072, 50000)
y train shape: (50000,)

************
start batch 32716 iter 0
COST 2.88244270501
alpha 0.001
************
train accuracy =  0.116
validation set accuracy =  0.0926456945264
dict_keys(['dW4', 'db4', 'dW3', 'db3', 'dW2', 'db2', 'dW1', 'db1'])
abs gradients dW4 0.090012896713
abs gradients dW3 0.00547038320724
abs gradients dW2 0.00462144371347
abs gradients dW1 0.00228296577998

************
start batch 28349 iter 20
COST 2.3000239672
alpha 0.001
************
train accuracy =  0.12
validation set accuracy =  0.123450282534
dict_keys(['dW4', 'db4', 'dW3', 'db3', 'dW2', 'db2', 'dW1', 'db1'])
abs gradients dW4 0.0110399230317
abs gradients dW3 0.000841760685675
abs gradients dW2 0.00078283129649
abs gradients dW1 0.000497942309516

************
start batch 32631 iter 40
COST 2.25945748714
alpha 0.001
************
train accuracy =  0.148
validation set accu


************
start batch 35663 iter 460
COST 2.03534834372
alpha 0.0005
************
train accuracy =  0.278
validation set accuracy =  0.277473222569
dict_keys(['dW4', 'db4', 'dW3', 'db3', 'dW2', 'db2', 'dW1', 'db1'])
abs gradients dW4 0.0112781567172
abs gradients dW3 0.00077219720464
abs gradients dW2 0.000683219187827
abs gradients dW1 0.000393779836782

************
start batch 21732 iter 480
COST 2.03627004754
alpha 0.0005
************
train accuracy =  0.27
validation set accuracy =  0.287635995614
dict_keys(['dW4', 'db4', 'dW3', 'db3', 'dW2', 'db2', 'dW1', 'db1'])
abs gradients dW4 0.0168780128111
abs gradients dW3 0.00123005060488
abs gradients dW2 0.00102405247794
abs gradients dW1 0.000568675994454


In [None]:
y_predicted = NN.predict(X_test)
save_predictions('ans1-uni', y_predicted)

In [None]:
# test if your numpy file has been saved correctly
loaded_y = np.load('ans1-uni.npy')
print(loaded_y.shape)
loaded_y[:10]

## Part 2: Regularizing the neural network
#### Add dropout and L2 regularization

In [267]:

NN2 = NeuralNetwork(layer_dimensions, drop_prob=.25, reg_lambda=0)

NN2.train(X_train, y_train, iters=100, alpha=0.05, batch_size=1000, print_every=1)

dict_keys(['layer_dimensions', 'drop_prob', 'reg_lambda', 'W1', 'b1', 'W2', 'b2', 'W3', 'b3'])
X train shape: (3072, 50000)
y train shape: (50000,)

************
start batch 15097 iter 0
COST 4.38227552561
alpha 0.05
************
train accuracy =  0.092
validation set accuracy =  0.0996912034719
dict_keys(['dW3', 'db3', 'dW2', 'db2', 'dW1', 'db1'])
abs gradients dW3 0.160780010789
abs gradients dW2 0.00379331624844
abs gradients dW1 0.00142146931977

************
start batch 14784 iter 1
COST 109.196056338
alpha 0.05
************
train accuracy =  0.105
validation set accuracy =  0.100859622767
dict_keys(['dW3', 'db3', 'dW2', 'db2', 'dW1', 'db1'])
abs gradients dW3 0.326701860722
abs gradients dW2 0.0116309957645
abs gradients dW1 0.00558992890991

************
start batch 35837 iter 2
COST 61.6437142745
alpha 0.05
************
train accuracy =  0.095
validation set accuracy =  0.0999207144049
dict_keys(['dW3', 'db3', 'dW2', 'db2', 'dW1', 'db1'])
abs gradients dW3 0.131902462972
abs gr

KeyboardInterrupt: 

In [None]:
y_predicted2 = NN2.predict(X)
save_predictions(y_predicted, 'ans2-uni')

In [256]:
layer_dimensions

[3072, 3072, 3072, 10]

In [None]:
T = one_hot(test)
T

In [25]:
np.random.randint(10)

3