In [1]:
from scipy.special import expit
import os
import gzip
import pickle
import wget
import random
import numpy as np
from scipy.special import softmax
from sklearn.utils import shuffle
import matplotlib.pyplot as plt
from scipy.interpolate import make_interp_spline, BSpline



In [2]:
def plotThis(x, label ):
    xnew = np.linspace(np.arange(x.shape[0]).min(),np.arange(x.shape[0]).max(),300)
    spl = make_interp_spline(np.arange(x.shape[0]),x, k=3)
    power_smooth = spl(xnew)
    xx = plt.plot(xnew,power_smooth, label = label)

def vectorized_result(y):
    e = np.zeros((10, 1))
    e[y] = 1.0
    return e

def load_mnist():
    if not os.path.exists(os.path.join(os.curdir, 'data')):
        os.mkdir(os.path.join(os.curdir, 'data'))
        wget.download('http://deeplearning.net/data/mnist/mnist.pkl.gz', out='data')

    data_file = gzip.open(os.path.join(os.curdir, 'data', 'mnist.pkl.gz'), 'rb')
    training_data, validation_data, test_data = pickle.load(data_file, encoding='latin1')
    data_file.close()

    training_inputs = np.squeeze(np.asarray([np.reshape(x, (784, 1)) for x in training_data[0]]))
    training_results = np.squeeze(np.asarray([vectorized_result(y) for y in training_data[1]]))

    validation_inputs = np.squeeze(np.asarray([np.reshape(x, (784, 1)) for x in validation_data[0]]))
    validation_results = np.asarray(validation_data[1])
    
    test_inputs = np.squeeze(np.asarray([np.reshape(x, (784, 1)) for x in test_data[0]]))
    test_results = np.squeeze(np.asarray(test_data[1]))
    return training_inputs, training_results, validation_inputs, validation_results, test_inputs, test_results

training_inputs, training_results, validation_inputs, validation_results, test_inputs, test_results= load_mnist()

In [14]:
np.random.seed(5)
def softmaxx(z):
        return softmax(z, axis=1)

def sigmoid(z):
    return expit(z)

def sigmoid_prime(z):
    return z*(1-z)

def ReLU(z):
    return np.maximum(z, 0)

def ReLU_prime(x):
    return np.where(x > 0, 1.0, 0.0)

def tanh(x):
    return np.tanh(x)

def tanh_prime(x):
    return 1.0 - x**2

    
class NN(object):
    def __init__(self, network_units, activation_fn, lr, batch_size, initialization):
        self.network_units = network_units
        if activation_fn =="ReLU":
            self.activation_fn = ReLU
            self.activation_p = ReLU_prime
        elif activation_fn == "sigmoid":
            self.activation_fn = sigmoid
            self.activation_p = sigmoid_prime
        elif activation_fn == "tanh":
            self.activation_fn = tanh
            self.activation_p = tanh_prime
        self.parameters = {}
        self.gradients = {}
        self.initialization = initialization
        self.initialize_weights()
        self.lr = lr
        self.batch_size = batch_size
    def initialize_weights(self):
        
        for i in range(len(self.network_units)-1):
            
            self.parameters["b"+str(i)] = np.zeros((self.network_units[i+1]))
            
            if self.initialization == "glorot":
                self.parameters["w"+str(i)] = np.random.uniform( 
                                -np.sqrt(6./(self.network_units[i]+self.network_units[i+1])), 
                                np.sqrt(6./(self.network_units[i]+self.network_units[i+1])),
                                size=(self.network_units[i],self.network_units[i+1])
                                                              )
            elif self.initialization == "normal":
                self.parameters["w"+str(i)] = np.random.normal(scale=1, size=(self.network_units[i],self.network_units[i+1]))
            
            elif self.initialization == "zero":
                self.parameters["w"+str(i)] = np.zeros(shape=(self.network_units[i],self.network_units[i+1]))
    
            self.gradients["b"+str(i)] = np.zeros((self.network_units[i+1]))
            self.gradients["w"+str(i)] = np.zeros((self.network_units[i], self.network_units[i+1]))

            
    def num_parameters(self):
        pars = 0.0
        for i in range(len(self.network_units)-1):
            pars += self.network_units[i]
            pars += (self.network_units[i] * self.network_units[i+1])
        print("we have ", pars/1000000, " parameters")
    
    def forward(self, x):
        self.x = x
        self.h1 = self.activation_fn(np.dot(x, self.parameters["w0"]) + self.parameters["b0"])
        self.h2 = self.activation_fn(np.dot(self.h1, self.parameters["w1"]) + self.parameters["b1"])
        self.out = expit(np.dot(self.h2, self.parameters["w2"]) + self.parameters["b2"])
        return self.out
    
    def loss(self, y):
        self.y = y
        l = -y*np.log(self.out+1e-8)-(1-y)*np.log(1-self.out+1e-8)
        m_b_l = np.sum(l, axis=1)
        self.lossy = np.mean(m_b_l)
        return self.lossy
    
    def backward(self):
        
        
        dout = (-self.y/(self.out+1e-8)) + (1-self.y) * (1/(1-self.out+1e-8))
        del2 = self.out*(1-self.out)*dout
        doutdw2 = np.dot(self.h2.T, del2)/len(self.x)
        doutdb2 = np.mean(del2, axis=0)
        
        
        douth2 = np.dot(del2, self.parameters["w2"].T)
        del1 = self.activation_p(self.h2)*douth2
        doutdw1 = np.dot(self.h1.T, del1)/len(self.x)
        doutdb1 = np.mean(del1, axis=0)
        
        douth1 = np.dot(del1, self.parameters["w1"].T)
        del0 = self.activation_p(self.h1)*douth1
        doutdw0 = np.dot(self.x.T, del0)/len(self.x)
        doutdb0 = np.mean(del0, axis=0)
        
        self.gradients["w2"] = doutdw2
        self.gradients["b2"] = doutdb2
        self.gradients["w1"] = doutdw1
        self.gradients["b1"] = doutdb1
        self.gradients["w0"] = doutdw0
        self.gradients["b0"] = doutdb0
        
    def grad_check(self, single_training_example, single_training_result):
        output = self.forward(single_training_example)
        loss = self.loss(single_training_result)
        self.backward()
        epsilon = 1e-4
        
        #finite_difference:
        numerical_gradient = 0.0
        max_difference = 0
        for i in range(10):
            #do left
            self.parameters["w0"][0][i] = self.parameters["w0"][0][i] + epsilon
            self.forward(single_training_example)
            left_loss = self.loss(single_training_result)
            self.parameters["w0"][0][i] = self.parameters["w0"][0][i] - (2 * epsilon)
            self.forward(single_training_example)
            right_loss = self.loss(single_training_result)
            self.parameters["w0"][0][i] = self.parameters["w0"][0][i] + epsilon

            numerical_gradient = (left_loss - right_loss)/ (2*epsilon)
            print("numerical_gradient", numerical_gradient)
            
            analytic_gradient = self.gradients["w0"][0][i] 
            print("analytic gradient", analytic_gradient)
            current_difference = np.abs(numerical_gradient - analytic_gradient)
            print("difference: ",current_difference)
            if current_difference > max_difference:
                max_difference = current_difference
        #only once we are done with with this for loop for our N then we append Max_difference to our list of Max differences
        
        
                

    def update(self):
        for ind in ["w0", "w1", "w2", "b0", "b1", "b2"]:
            self.parameters[ind] -= self.lr * self.gradients[ind]

    def train(self,training_inputs, training_results, epochs, validation_inputs, validation_targets):
        loss = np.zeros((epochs))
        accuracy = np.zeros((epochs))
        for epoch in range(epochs):
            shuffle(training_inputs, training_results, random_state=0)
            mini_batches = [training_inputs[k:k+self.batch_size] for k in range(0, training_inputs.shape[0], self.batch_size)]
            mini_batches_results = [training_results[k:k+self.batch_size] for k in range(0, training_results.shape[0], self.batch_size)]
            #print(mini_batches_results[0])
            #break
            current_loss=0
            for mini_batch_index in range(len(mini_batches)):
                self.forward(mini_batches[mini_batch_index])
                current_loss += self.loss(mini_batches_results[mini_batch_index])
                self.backward()
                self.update()
            current_loss /= len(mini_batches)
            loss[epoch] = current_loss
            print("loss ", current_loss)
            currentAccuray = self.test(validation_inputs, validation_targets)
            accuracy[epoch] = currentAccuray
        self.grad_check(np.asarray([training_inputs[0]]), training_results[0])
        return loss, accuracy 
    def test(self, validation_inputs, validation_targets):
        results = self.forward(validation_inputs)
        labels = np.argmax(results, axis=1)
        accuracy = np.sum(labels == validation_targets)/100
        print("validation accuracy", accuracy)
        return accuracy

In [15]:
myNet = NN([784,500,500,10], "ReLU", lr=0.01, batch_size=32, initialization="glorot")
glorot_relu_loss, glorot_relu_accuracy = myNet.train(training_inputs, training_results, 1, validation_inputs, validation_results)

loss  1.169024070649404
validation accuracy 91.51
numerical_gradient 0.0
analytic gradient 0.0
difference:  0.0
numerical_gradient 0.0
analytic gradient 0.0
difference:  0.0
numerical_gradient 0.0
analytic gradient 0.0
difference:  0.0
numerical_gradient 0.0
analytic gradient 0.0
difference:  0.0
numerical_gradient 0.0
analytic gradient 0.0
difference:  0.0
numerical_gradient 0.0
analytic gradient 0.0
difference:  0.0
numerical_gradient 0.0
analytic gradient 0.0
difference:  0.0
numerical_gradient 0.0
analytic gradient 0.0
difference:  0.0
numerical_gradient 0.0
analytic gradient 0.0
difference:  0.0
numerical_gradient 0.0
analytic gradient 0.0
difference:  0.0


-0.03780831117196146
-0.03770831117196146


In [None]:
myNet = NN([784,500,500,10], "ReLU", lr=0.01, batch_size=32, initialization="normal")
normal_relu_loss, normal_relu_accuracy = myNet.train(training_inputs, training_results, 10, validation_inputs, validation_results)


In [None]:
myNet = NN([784,500,500,10], "ReLU", lr=0.01, batch_size=32, initialization="zero")
zero_relu_loss, zero_relu_accuracy = myNet.train(training_inputs, training_results, 10, validation_inputs, validation_results)

In [None]:
%matplotlib notebook
plotThis(glorot_relu_loss, "glorot relu loss")
plotThis(zero_relu_loss, "zero relu loss")
plotThis(normal_relu_loss, "normal relu loss")

plt.title("training loss")
plt.legend()
plt.show()


In [None]:
%matplotlib notebook
plotThis(glorot_relu_accuracy, "glorot relu accuracy")
plotThis(zero_relu_accuracy, "zero relu accuracy")
plotThis(normal_relu_accuracy, "normal relu accuracy")

plt.title("evaluation accuracy")
plt.legend()
plt.show()


In [None]:
myNet = NN([784,500,500,10], "tanh", lr=0.01, batch_size=32, initialization="glorot")
glorot_tanh_loss, glorot_tanh_accuracy = myNet.train(training_inputs, training_results, 10, validation_inputs, validation_results)

In [None]:
myNet = NN([784,500,500,10], "tanh", lr=0.01, batch_size=32, initialization="normal")
normal_tanh_loss, normal_tanh_accuracy = myNet.train(training_inputs, training_results, 10, validation_inputs, validation_results)

In [None]:
myNet = NN([784,500,500,10], "tanh", lr=0.01, batch_size=32, initialization="zero")
zero_tanh_loss, zero_tanh_accuracy = myNet.train(training_inputs, training_results, 10, validation_inputs, validation_results)

In [None]:
%matplotlib notebook
plotThis(glorot_tanh_loss, "glorot tanh loss")
plotThis(zero_tanh_loss, "zero tanh loss")
plotThis(normal_tanh_loss, "normal tanh loss")

plt.title("training loss")
plt.legend()
plt.show()


In [None]:
%matplotlib notebook
plotThis(glorot_tanh_accuracy, "glorot tanh accuracy")
plotThis(zero_tanh_accuracy, "zero tanh accuracy")
plotThis(normal_tanh_accuracy, "normal tanh accuracy")

plt.title("evaluation accuracy")
plt.legend()
plt.show()


In [None]:
myNet = NN([784,500,500,10], "sigmoid", lr=0.01, batch_size=32, initialization="glorot")
glorot_sigmoid_loss, glorot_sigmoid_accuracy = myNet.train(training_inputs, training_results, 10, validation_inputs, validation_results)

In [None]:
myNet = NN([784,500,500,10], "sigmoid", lr=0.01, batch_size=32, initialization="normal")
normal_sigmoid_loss, normal_sigmoid_accuracy = myNet.train(training_inputs, training_results, 10, validation_inputs, validation_results)

In [None]:
myNet = NN([784,500,500,10], "sigmoid", lr=0.01, batch_size=32, initialization="zero")
zero_sigmoid_loss, zero_sigmoid_accuracy = myNet.train(training_inputs, training_results, 10, validation_inputs, validation_results)

In [None]:
%matplotlib notebook
plotThis(glorot_sigmoid_loss, "glorot sigmoid loss")
plotThis(zero_sigmoid_loss, "zero sigmoid loss")
plotThis(normal_sigmoid_loss, "normal sigmoid loss")

plt.title("training loss")
plt.legend()
plt.show()


In [None]:
%matplotlib notebook
plotThis(glorot_sigmoid_accuracy, "glorot sigmoid accuracy")
plotThis(zero_sigmoid_accuracy, "zero sigmoid accuracy")
plotThis(normal_sigmoid_accuracy, "normal sigmoid accuracy")

plt.title("evaluation accuracy")
plt.legend()
plt.show()
