In [None]:
import numpy as np
import pickle
import matplotlib.pyplot as plt

In [None]:
# Hyper-parameters shared by every experiment below. Later cells mutate this
# dict in place and are expected to put the defaults back when they finish.
config = {
    # Layer widths from input to output: the first entry is the input size, the
    # last entry is the output size, and every entry in between is one hidden layer.
    'layer_specs': [784, 50, 10],
    # Activation used after every hidden layer: 'sigmoid', 'tanh' or 'ReLU'.
    'activation': 'tanh',
    # Number of training samples accumulated before each weight update.
    'batch_size': 32,
    # Maximum number of passes over the training set.
    'epochs': 50,
    # Whether training may stop early when the validation loss stops improving.
    'early_stop': True,
    # Number of consecutive epochs without validation improvement that triggers the stop.
    'early_stop_epoch': 5,
    # L2 regularization constant.
    'L2_penalty': 0.0,
    # Whether the weight update uses momentum.
    'momentum': True,
    # The constant 'gamma' in the momentum expression.
    'momentum_gamma': 0.9,
    # Step size of the gradient-descent update.
    'learning_rate': 0.001,
}


In [None]:
def softmax(x):
    """Return the softmax of `x` along its last axis.

    The maximum is subtracted before exponentiating so large inputs do not
    overflow. A 1-D input yields a 1-D probability vector; an input with more
    dimensions is normalized independently along its last axis.
    """
    x = np.asarray(x)
    # Shifting by the max leaves the result unchanged but keeps exp() bounded by 1.
    exps = np.exp(x - np.max(x, axis=-1, keepdims=True))
    return exps / np.sum(exps, axis=-1, keepdims=True)

In [None]:
def load_data(fname):
    """Load a pickled dataset and split it into images and one-hot labels.

    The pickle is indexed as a 2-D array: columns 0..783 are used as the image
    and column 784 as the digit label (0-9).

    Returns:
        (images, labels) where `images` is `data[:, :784]` and `labels` is a
        float array of shape (n_samples, 10) with a single 1 per row.
    """
    # NOTE(security): pickle.load can execute arbitrary code; only open trusted files.
    with open(fname, "rb") as handle:  # close the file even if unpickling fails
        data = pickle.load(handle)

    images = data[:, :784]
    digits = data[:, 784].astype(int)

    # Encode labels (0-9) to a one-hot encoding in a single indexed assignment.
    one_hot = np.zeros((digits.shape[0], 10))
    one_hot[np.arange(digits.shape[0]), digits] = 1

    return images, one_hot

In [None]:
class Activation:
    """Element-wise activation layer ('sigmoid', 'tanh' or 'ReLU').

    The layer has no trainable parameters, so `update`, `save_weights` and
    `restore_best_weights` are no-ops that exist only so the network can treat
    every entry of its layer list uniformly.
    """

    def __init__(self, activation_type = "sigmoid"):
        self.activation_type = activation_type
        # Input of the most recent forward pass; the gradient is evaluated at this point.
        self.x = None

    def update(self, config):
        pass

    def save_weights(self):
        pass

    def restore_best_weights(self):
        pass

    def forward_pass(self, a):
        """Apply the configured activation to `a` and remember `a` for the backward pass.

        Raises:
            ValueError: if `activation_type` is not one of the supported names.
        """
        if self.activation_type == "sigmoid":
            return self.sigmoid(a)

        elif self.activation_type == "tanh":
            return self.tanh(a)

        elif self.activation_type == "ReLU":
            return self.ReLU(a)

        # Fail loudly instead of returning None and breaking the next layer.
        raise ValueError("Unsupported activation type: " + repr(self.activation_type))

    def backward_pass(self, delta):
        """Scale the incoming `delta` by the activation's derivative at the stored input.

        Raises:
            ValueError: if `activation_type` is not one of the supported names.
        """
        if self.activation_type == "sigmoid":
            grad = self.grad_sigmoid()

        elif self.activation_type == "tanh":
            grad = self.grad_tanh()

        elif self.activation_type == "ReLU":
            grad = self.grad_ReLU()

        else:
            raise ValueError("Unsupported activation type: " + repr(self.activation_type))

        return grad * delta

    def sigmoid(self, x):
        # Clip before storing so exp() cannot overflow; the clipped value is what
        # the gradient is later evaluated at.
        self.x = np.clip(x, -100, 100)

        return 1.0 / (1.0 + np.exp(-self.x))

    def tanh(self, x):
        self.x = x
        return np.tanh(x)

    def ReLU(self, x):
        self.x = x
        return x * (x > 0)

    def grad_sigmoid(self):
        # Evaluate the sigmoid once; s * (1 - s) is its derivative.
        s = self.sigmoid(self.x)
        return s * (1 - s)

    def grad_tanh(self):
        return 1 - np.power(np.tanh(self.x), 2)

    def grad_ReLU(self):
        # Boolean mask; multiplying by delta in backward_pass turns it into 0/1 weights.
        return self.x > 0


class Layer():
    """Fully connected layer computing `w.T @ x + b` with accumulated gradients.

    `backward_pass` adds each sample's contribution to `d_w` / `d_b`; `update`
    applies the accumulated values and clears them. The stored values are added
    to the parameters, so they act as the descent direction.
    """

    def __init__(self, in_units, out_units):
        # np.random.seed(42)
        self.w = np.random.randn(in_units, out_units)         # Weight matrix
        self.b = np.zeros((1, out_units)).astype(np.float32)  # Bias
        self.x = None    # Input of the most recent forward pass
        self.a = None    # Output of the most recent forward pass (without activation)
        self.d_x = None  # Gradient w.r.t. x from the most recent backward pass
        self.d_w = np.zeros((in_units, out_units)).astype(np.float32)  # Accumulated gradient w.r.t. w
        self.d_b = np.zeros((1, out_units)).astype(np.float32)         # Accumulated gradient w.r.t. b

        # Snapshot of the best parameters seen so far (set by save_weights).
        self.best_w = None
        self.best_b = None

        # Momentum velocities.
        self.v_w = np.zeros((in_units, out_units)).astype(np.float32)
        self.v_b = np.zeros((1, out_units)).astype(np.float32)

    def update(self, config):
        """Apply the accumulated gradients (optionally with momentum) and clear them.

        Reads `momentum`, `momentum_gamma` and `learning_rate` from `config`.
        """
        if config['momentum']:
            self.v_w = config['learning_rate'] * self.d_w + config['momentum_gamma'] * self.v_w
            self.w += self.v_w

            self.v_b = config['learning_rate'] * self.d_b + config['momentum_gamma'] * self.v_b
            self.b += self.v_b
        else:
            self.w += config['learning_rate'] * self.d_w
            self.b += config['learning_rate'] * self.d_b
        self.d_w *= 0
        self.d_b *= 0

    def save_weights(self):
        """Snapshot the current parameters as the best seen so far."""
        # Copy: `update` mutates w and b in place, so a bare reference would
        # silently track the live parameters instead of preserving the snapshot.
        self.best_w = self.w.copy()
        self.best_b = self.b.copy()

    def restore_best_weights(self):
        """Reset the parameters to the last snapshot; do nothing if none was taken."""
        if self.best_w is None or self.best_b is None:
            return
        # Copy again so later in-place updates cannot corrupt the snapshot.
        self.w = self.best_w.copy()
        self.b = self.best_b.copy()

    def forward_pass(self, x):
        """Store `x` and return `w.T @ x + b`.

        Assumes `x` is a 1-D vector of length `in_units` -- TODO confirm against callers.
        """
        self.x = x
        self.a = np.matmul(self.w.T, self.x)
        self.a += self.b[0,:]

        return self.a

    def backward_pass(self, delta):
        """Accumulate this sample's gradients and return the delta for the previous layer.

        NOTE: the L2 term reads the module-level `config['L2_penalty']`, not an
        argument, so the global `config` must exist when this runs.
        """
        # delta is a (1, out_units) row; np.array([x]).T is an (in_units, 1) column,
        # so their broadcast product is the outer product.
        self.d_w += delta * np.array([self.x]).T - config['L2_penalty'] * self.w
        self.d_x = np.matmul(delta, self.w.T)
        self.d_b += delta - config['L2_penalty'] * self.b

        return self.d_x

      
class Neuralnetwork():
    """Feed-forward classifier: Layer objects separated by Activation objects.

    `config['layer_specs']` gives the width of every layer. An Activation of type
    `config['activation']` follows every Layer except the last; the output of
    the last Layer goes through `softmax` in `forward_pass`.
    """

    def __init__(self, config):
        self.layers = []
        self.x = None        # Input of the most recent forward pass
        self.y = None        # Softmax output of the most recent forward pass
        self.targets = None  # Targets passed to the most recent forward pass

        n_affine = len(config['layer_specs']) - 1
        for i in range(n_affine):
            self.layers.append( Layer(config['layer_specs'][i], config['layer_specs'][i+1]) )

            # No activation after the final Layer.
            if i < n_affine - 1:
                self.layers.append(Activation(config['activation']))

    def loss_func(self, logits, targets):
        """Return `-log(dot(logits, targets))`, the cross-entropy for a one-hot target.

        Despite its name, `logits` receives the softmax output (see forward_pass).
        The probability is clamped to the smallest positive normal float so that
        an underflow to exactly 0 yields a large finite loss rather than `inf`.
        """
        prob_true = np.dot(logits, targets)
        return - np.log(np.maximum(prob_true, np.finfo(np.float64).tiny))

    def update(self, config):
        for layer in self.layers:
            layer.update(config)

    def save_weights(self):
        for layer in self.layers:
            layer.save_weights()

    def restore_best_weights(self):
        for layer in self.layers:
            layer.restore_best_weights()

    def forward_pass(self, x, targets=None):
        """Run one sample through the network.

        Only `x[0]` is used, so callers pass a single sample wrapped in a
        sequence (or a whole dataset, of which only the first row is evaluated).

        Returns:
            (loss, y): `y` is the softmax output; `loss` is None when no targets
            are given.
        """
        x = x[0]

        self.x = x
        self.targets = targets

        # Propagate the data
        for layer in self.layers:
            x = layer.forward_pass(x)
        self.y = softmax(x)

        # Calculate the loss
        if self.targets is None:
            loss = None
        else:
            loss = self.loss_func(self.y, self.targets)

        return loss, self.y

    def backward_pass(self):
        """Back-propagate the error of the most recent forward pass through all layers."""
        # Wrapped in a list so delta is a (1, n_out) row, as Layer.backward_pass expects.
        delta = np.array([self.targets - self.y])

        for layer in reversed(self.layers):
            delta = layer.backward_pass(delta)

In [None]:
def get_random_samples(x, y, percentage):
    """Draw a random `percentage` percent of the (x, y) pairs without replacement.

    The same shuffled indices are applied to both sequences, so pairs stay
    aligned. The sample count is `len(x) * percentage / 100` truncated to an int.

    Returns:
        (randomX, randomY) as NumPy arrays.
    """
    keep = int(len(x) * percentage / 100)
    chosen = np.random.permutation(len(x))[:keep]

    return (
        np.array([x[idx] for idx in chosen]),
        np.array([y[idx] for idx in chosen]),
    )

def accuracy(predicts, actuals):
    """Return the fraction of correct predictions.

    For 2-D (or higher) input -- one row of class scores per sample against
    one-hot targets -- a sample is correct when the arg-max class of the
    prediction equals the arg-max class of the target. Comparing rounded
    probabilities element by element would credit every correctly predicted
    zero and heavily inflate the score.

    For 1-D input the predictions are rounded and compared element-wise.
    Empty input yields 0.0.
    """
    predicts = np.asarray(predicts)
    actuals = np.asarray(actuals)

    if predicts.size == 0:
        return 0.0

    if predicts.ndim < 2:
        # np.round replaces np.round_, which was removed in NumPy 2.0.
        correct = np.equal(actuals, np.round(predicts))
    else:
        correct = np.argmax(predicts, axis=-1) == np.argmax(actuals, axis=-1)

    return np.sum(correct) * 1.0 / correct.size

def test(model, X_test, y_test, config):
    """Evaluate `model` on a labelled set, one sample at a time.

    `config` is accepted for signature symmetry with `trainer` but is not read.

    Returns:
        (acc, loss): accuracy of the collected predictions against `y_test`,
        and the mean per-sample loss.
    """
    outputs = []
    total_loss = 0

    for sample, target in zip(X_test, y_test):
        # forward_pass expects the sample wrapped in a sequence.
        sample_loss, probs = model.forward_pass([sample], target)
        total_loss += sample_loss
        outputs.append(probs)

    mean_loss = total_loss / len(X_test)

    return accuracy(outputs, y_test), mean_loss

def trainer(model, X_train, y_train, X_valid, y_valid, X_test, y_test, config):
    """Train `model` with mini-batch gradient descent and optional early stopping.

    Each epoch runs every training sample through forward/backward, applying an
    update every `config['batch_size']` samples, then measures the validation
    loss. Whenever that loss improves, the weights are checkpointed. With
    `config['early_stop']` set, training stops after `config['early_stop_epoch']`
    consecutive epochs without improvement. The checkpointed weights are
    restored before returning.

    Returns:
        (best_epoch, min_loss, train_accs, valid_accs, test_accs). The three
        lists hold one entry per completed epoch; the epoch that triggers early
        stopping is not recorded.
    """
    min_loss = 99999999
    best_epoch = 0
    early_stop_count = 0

    train_accs = []
    valid_accs = []
    test_accs = []

    for i in range(1, config['epochs'] + 1):
        train_predictions = []
        train_loss = 0

        valid_predictions = []
        valid_loss = 0

        samples = 0
        for x, y in zip(X_train, y_train):
            l, p = model.forward_pass([x], y)
            model.backward_pass()

            samples += 1
            if samples == config['batch_size']:
                model.update(config)
                samples = 0

            train_predictions.append(p)
            train_loss += l

        # Apply the trailing partial batch. Otherwise its accumulated gradients
        # leak into the first batch of the next epoch and are discarded entirely
        # after the final epoch.
        if samples > 0:
            model.update(config)

        # Only used by the optional logging below.
        train_loss = train_loss / len(X_train)

        for x, y in zip(X_valid, y_valid):
            l, p = model.forward_pass([x], y)

            valid_predictions.append(p)
            valid_loss += l

        valid_loss = valid_loss / len(X_valid)

        if valid_loss >= min_loss:
            early_stop_count += 1

            if config['early_stop'] and early_stop_count == config['early_stop_epoch']:
                break
        else:
            # New best validation loss: reset patience and checkpoint the weights.
            early_stop_count = 0
            min_loss = valid_loss
            model.save_weights()
            best_epoch = i

        train_accuracy = accuracy(train_predictions, y_train)
        valid_accuracy = accuracy(valid_predictions, y_valid)

        train_accs.append(train_accuracy)
        valid_accs.append(valid_accuracy)

        test_accuracy, test_loss = test(model, X_test, y_test, config)

        test_accs.append(test_accuracy)

        # print("Epoch " + str(i) + "(Train): Acc = " + str(train_accuracy) + ", Loss = " + str(train_loss))
        # print("Epoch " + str(i) + "(Valid): Acc = " + str(valid_accuracy) + ", Loss = " + str(valid_loss))

    model.restore_best_weights()
    return best_epoch, min_loss, train_accs, valid_accs, test_accs

In [None]:
def est_gradient_given_b(eps, model, X_train, y_train, l_index, b_index):
    """Estimate the descent direction for one bias by a central difference.

    The bias `model.layers[l_index].b[0, b_index]` is shifted by +eps and -eps,
    the loss is evaluated at both points, and the bias is then put back.

    Returns:
        `-(loss(b + eps) - loss(b - eps)) / (2 * eps)`, i.e. the negated
        numerical derivative of the loss.
    """
    layer = model.layers[l_index]
    original = layer.b[0, b_index]

    layer.b[0, b_index] += eps
    loss_plus = model.forward_pass(X_train, y_train)[0]

    # Step from +eps straight to -eps.
    layer.b[0, b_index] -= (2.0 * eps)
    loss_minus = model.forward_pass(X_train, y_train)[0]

    # Restore from the saved value rather than adding eps back, avoiding round-off drift.
    layer.b[0, b_index] = original

    return -(loss_plus - loss_minus) / (2.0 * eps)

def est_gradient_given_w(eps, model, X_train, y_train, l_index, w_index):
    """Estimate the descent direction for one weight by a central difference.

    `model.layers[l_index].w[w_index]` is shifted by +eps and -eps, the loss is
    evaluated at both points, and the weight is then restored. `w_index` is
    normally a `(row, col)` tuple selecting a single weight.

    Returns:
        `-(loss(w + eps) - loss(w - eps)) / (2 * eps)`, i.e. the negated
        numerical derivative of the loss.
    """
    layer = model.layers[l_index]
    # Copy the original value. If `w_index` selects a row or slice, plain
    # indexing returns a view, and "restoring" from that view would leave the
    # weights shifted by -eps.
    original = np.copy(layer.w[w_index])

    layer.w[w_index] += eps
    loss1 = model.forward_pass(X_train, y_train)[0]
    layer.w[w_index] -= (2.0 * eps)
    loss2 = model.forward_pass(X_train, y_train)[0]

    layer.w[w_index] = original

    return -(loss1 - loss2) / (2.0 * eps)

In [None]:
def calc_gradient_given_b(model, X_train, y_train, l_index, b_index):
    """Return the back-propagated value for one bias from a single forward/backward pass.

    The layer's `d_b` accumulates across backward passes until an update clears
    it, so this returns only the amount added by this call's pass. Reading the
    accumulator directly would give 2x, 3x, ... the gradient on repeated calls.
    The accumulators themselves are left incremented.
    """
    layer = model.layers[l_index]
    before = np.copy(layer.d_b[0, b_index])

    model.forward_pass(X_train, y_train)
    model.backward_pass()

    return layer.d_b[0, b_index] - before

def calc_gradient_given_w(model, X_train, y_train, l_index, w_index):
    """Return the back-propagated value for one weight from a single forward/backward pass.

    The layer's `d_w` accumulates across backward passes until an update clears
    it, so this returns only the amount added by this call's pass. Reading the
    accumulator directly would give 2x, 3x, ... the gradient on repeated calls.
    The accumulators themselves are left incremented.
    """
    layer = model.layers[l_index]
    before = np.copy(layer.d_w[w_index])

    model.forward_pass(X_train, y_train)
    model.backward_pass()

    return layer.d_w[w_index] - before

In [None]:
def compare_gradients(eps, model, X_train, y_train, weights, biases):
    """Collect numerical and back-propagated gradients for selected parameters.

    `weights` and `biases` are sequences of `(layer_index, parameter_index)`
    entries. Weights are processed first, then biases, and both result lists
    follow that order.

    Returns:
        (est_grads, calc_grads): the finite-difference estimates and the
        corresponding back-propagated values.
    """
    est_grads, calc_grads = [], []

    for spec in weights:
        layer_idx, param_idx = spec[0], spec[1]
        est_grads.append(est_gradient_given_w(eps, model, X_train, y_train, layer_idx, param_idx))
        calc_grads.append(calc_gradient_given_w(model, X_train, y_train, layer_idx, param_idx))

    for spec in biases:
        layer_idx, param_idx = spec[0], spec[1]
        est_grads.append(est_gradient_given_b(eps, model, X_train, y_train, layer_idx, param_idx))
        calc_grads.append(calc_gradient_given_b(model, X_train, y_train, layer_idx, param_idx))

    return est_grads, calc_grads

In [None]:
# if __name__ == "__main__":
#     train_data_fname = 'MNIST_train.pkl'
#     valid_data_fname = 'MNIST_valid.pkl'
#     test_data_fname = 'MNIST_test.pkl'

#     ### Train the network ###
#     model = Neuralnetwork(config)
#     X_train, y_train = load_data(train_data_fname)
#     X_valid, y_valid = load_data(valid_data_fname)
#     X_test, y_test = load_data(test_data_fname)
#     trainer(model, X_train, y_train, X_valid, y_valid, X_test, y_test, config)
#     test_acc, test_loss = test(model, X_test, y_test, config)

In [None]:
# # part b

# ### To compare approximated gradient to backprop gradient ###

# 1) Uncomment the below code block
# 2) Change config['layer_specs'] to [784, 100, 100, 10]
# 3) Comment-out 'main' 
# 4) Rerun file

# train_data_fname = 'MNIST_train.pkl'
# valid_data_fname = 'MNIST_valid.pkl'
# test_data_fname = 'MNIST_test.pkl'
    
# X_train, y_train = load_data(train_data_fname)
# X_valid, y_valid = load_data(valid_data_fname)
# X_test, y_test = load_data(test_data_fname)

# toy_model = Neuralnetwork(config)

# eps = 0.01         # epsilon used to approximate gradient
    
# out_b = (4, 3)     # output bias weight 
# hid_b = (2, 82)    # hidden bias weight
    
# # 2 hidden-to-hidden weights (with layer_specs [784, 100, 100, 10], model.layers[2] is the Layer between the two hidden layers)
# hid_w1 = (2, (62,4))
# hid_w2 = (2, (36,7))
    
# # 2 input to hidden weights
# in_w1 = (0, (539,21))
# in_w2 = (0, (420,99))
    
# weights = [hid_w1, hid_w2, in_w1, in_w2]
# biases  = [out_b, hid_b]
    
# grads = compare_gradients(eps, toy_model, X_train, y_train[0], weights, biases)

# est_grads = grads[0]
# calc_grads = grads[1]
   
# ### Finish comparisons ###


In [None]:
# part c
# Estimate a good epoch count with early stopping over 10 random 90% subsets,
# then retrain on the full training set for that many epochs and plot accuracy.

train_data_fname = 'data/MNIST_train.pkl'
valid_data_fname = 'data/MNIST_valid.pkl'
test_data_fname = 'data/MNIST_test.pkl'

X_train, y_train = load_data(train_data_fname)
X_valid, y_valid = load_data(valid_data_fname)
X_test, y_test = load_data(test_data_fname)

epochs = []
losses = []

### Train the network ###
for i in range(10):
    model = Neuralnetwork(config)
    X_t, y_t = get_random_samples(X_train, y_train, 90)

    e, l, train_accs, valid_accs, test_accs = trainer(model, X_t, y_t, X_valid, y_valid, X_test, y_test, config)
    epochs.append(e)
    losses.append(l)

print(np.mean(epochs))
print(np.mean(losses))

average_epoch = np.mean(epochs)

# np.round replaces np.round_, which was removed in NumPy 2.0.
config['epochs'] = int(np.round(average_epoch))
config['early_stop'] = False

best_model = Neuralnetwork(config)
e, l, tr_acc, v_acc, te_acc = trainer(best_model, X_train, y_train, X_valid, y_valid, X_test, y_test, config)

# Stop at len(...) rather than len(...) + 1: the old bound indexed one past the
# end whenever the number of epochs was a multiple of 5.
print("training accuracy")
for epoch_idx in range(0, len(tr_acc), 5):
    print("Epoch" + str(epoch_idx) + ": " + str(tr_acc[epoch_idx]))

print("testing accuracy")
for epoch_idx in range(0, len(te_acc), 5):
    print("Epoch" + str(epoch_idx) + ": " + str(te_acc[epoch_idx]))

# Start a fresh figure so curves from earlier plots are not drawn into this one,
# and size the x-axis from the curves themselves.
plt.figure()
plt.plot(range(len(tr_acc)), tr_acc, 'b--', label="training accuracy")
plt.plot(range(len(te_acc)), te_acc, 'r-', label="testing accuracy")

plt.grid(True)

plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.title("Training vs. Testing Accuracy For Best Model")
plt.legend(loc="lower right")

plt.savefig("images/c.png")

In [None]:
# part d, where L2_penalty = 0.001
# Train 10% longer than the part-c epoch count with L2 regularization enabled.

config['early_stop'] = False

# np.round replaces np.round_, which was removed in NumPy 2.0.
config['epochs'] = int(np.round(average_epoch))
config['epochs'] += int(np.round(0.10 * config['epochs']))

config['L2_penalty'] = 0.001

best_model = Neuralnetwork(config)
e, l, tr_acc, v_acc, te_acc = trainer(best_model, X_train, y_train, X_valid, y_valid, X_test, y_test, config)

# Stop at len(...) rather than len(...) + 1: the old bound indexed one past the
# end whenever the number of epochs was a multiple of 5.
print("training accuracy")
for epoch_idx in range(0, len(tr_acc), 5):
    print("Epoch" + str(epoch_idx) + ": " + str(tr_acc[epoch_idx]))

print("testing accuracy")
for epoch_idx in range(0, len(te_acc), 5):
    print("Epoch" + str(epoch_idx) + ": " + str(te_acc[epoch_idx]))

# Start a fresh figure so curves from earlier plots are not drawn into this one.
plt.figure()
plt.plot(range(len(tr_acc)), tr_acc, 'b--', label="training accuracy")
plt.plot(range(len(te_acc)), te_acc, 'r-', label="testing accuracy")

plt.grid(True)

plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.title("Training vs. Testing Accuracy For Model With Regularization Factor = 0.001")
plt.legend(loc="lower right")

plt.savefig("images/d1.png")

# Put the shared config back to its defaults for the following cells.
config['epochs'] = int(np.round(average_epoch))
config['L2_penalty'] = 0.000

In [None]:
# part d, where L2_penalty = 0.0001
# Train 10% longer than the part-c epoch count with weaker L2 regularization.

config['early_stop'] = False

# np.round replaces np.round_, which was removed in NumPy 2.0.
config['epochs'] = int(np.round(average_epoch))
config['epochs'] += int(np.round(0.10 * config['epochs']))

config['L2_penalty'] = 0.0001

best_model = Neuralnetwork(config)
e, l, tr_acc, v_acc, te_acc = trainer(best_model, X_train, y_train, X_valid, y_valid, X_test, y_test, config)

# Stop at len(...) rather than len(...) + 1: the old bound indexed one past the
# end whenever the number of epochs was a multiple of 5.
print("training accuracy")
for epoch_idx in range(0, len(tr_acc), 5):
    print("Epoch" + str(epoch_idx) + ": " + str(tr_acc[epoch_idx]))

print("testing accuracy")
for epoch_idx in range(0, len(te_acc), 5):
    print("Epoch" + str(epoch_idx) + ": " + str(te_acc[epoch_idx]))

# Start a fresh figure so curves from earlier plots are not drawn into this one.
plt.figure()
plt.plot(range(len(tr_acc)), tr_acc, 'b--', label="training accuracy")
plt.plot(range(len(te_acc)), te_acc, 'r-', label="testing accuracy")

plt.grid(True)

plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.title("Training vs. Testing Accuracy For Model With Regularization Factor = 0.0001")
plt.legend(loc="lower right")

plt.savefig("images/d2.png")

# Put the shared config back to its defaults for the following cells.
config['epochs'] = int(np.round(average_epoch))
config['L2_penalty'] = 0.000

In [None]:
# part e
# Repeat the experiment with sigmoid and ReLU hidden activations.

config['early_stop'] = False

# (banner, activation name, plot title, output file) for each run.
activation_runs = (
    ("sigmoid activation", 'sigmoid', "Training vs. Testing Accuracy For Sigmoid Model", "images/e1.png"),
    ("relu activation", 'ReLU', "Training vs. Testing Accuracy For ReLU Model", "images/e2.png"),
)

for banner, activation_name, plot_title, out_path in activation_runs:
    print(banner)
    config['activation'] = activation_name

    best_model = Neuralnetwork(config)
    e, l, tr_acc, v_acc, te_acc = trainer(best_model, X_train, y_train, X_valid, y_valid, X_test, y_test, config)

    # Stop at len(...) rather than len(...) + 1: the old bound indexed one past
    # the end whenever the number of epochs was a multiple of 5.
    print("training accuracy")
    for epoch_idx in range(0, len(tr_acc), 5):
        print("Epoch" + str(epoch_idx) + ": " + str(tr_acc[epoch_idx]))

    print("testing accuracy")
    for epoch_idx in range(0, len(te_acc), 5):
        print("Epoch" + str(epoch_idx) + ": " + str(te_acc[epoch_idx]))

    # One figure per run; without this the second run's curves are drawn on top
    # of the first and end up in the same saved image.
    plt.figure()
    plt.plot(range(len(tr_acc)), tr_acc, 'b--', label="training accuracy")
    plt.plot(range(len(te_acc)), te_acc, 'r-', label="testing accuracy")

    plt.grid(True)

    plt.xlabel("Epoch")
    plt.ylabel("Accuracy")
    plt.title(plot_title)
    plt.legend(loc="lower right")

    plt.savefig(out_path)

plt.show()

# Put the shared config back to its default for the following cells.
config['activation'] = 'tanh'

In [None]:
# part f.a
# Repeat the experiment with half and with double the number of hidden units.

config['early_stop'] = False

# (banner, layer sizes, plot title, output file) for each run.
width_runs = (
    ("half units", [784, 25, 10], "Training vs. Testing Accuracy For Model With Halved Hidden Units", "images/f1.png"),
    ("double units", [784, 100, 10], "Training vs. Testing Accuracy For Model With Doubled Hidden Units", "images/f2.png"),
)

for banner, layer_sizes, plot_title, out_path in width_runs:
    print(banner)
    config['layer_specs'] = layer_sizes

    best_model = Neuralnetwork(config)
    e, l, tr_acc, v_acc, te_acc = trainer(best_model, X_train, y_train, X_valid, y_valid, X_test, y_test, config)

    # Stop at len(...) rather than len(...) + 1: the old bound indexed one past
    # the end whenever the number of epochs was a multiple of 5.
    print("training accuracy")
    for epoch_idx in range(0, len(tr_acc), 5):
        print("Epoch" + str(epoch_idx) + ": " + str(tr_acc[epoch_idx]))

    print("testing accuracy")
    for epoch_idx in range(0, len(te_acc), 5):
        print("Epoch" + str(epoch_idx) + ": " + str(te_acc[epoch_idx]))

    # One figure per run; without this the second run's curves are drawn on top
    # of the first and end up in the same saved image.
    plt.figure()
    plt.plot(range(len(tr_acc)), tr_acc, 'b--', label="training accuracy")
    plt.plot(range(len(te_acc)), te_acc, 'r-', label="testing accuracy")

    plt.grid(True)

    plt.xlabel("Epoch")
    plt.ylabel("Accuracy")
    plt.title(plot_title)
    plt.legend(loc="lower right")

    plt.savefig(out_path)

# Put the shared config back to its default for the following cells.
config['layer_specs'] = [784, 50, 10]

In [None]:
# part f.b
# Repeat the experiment with two hidden layers and a similar number of weights.

config['early_stop'] = False

print("2 hidden layers")

# Weight counts (biases excluded):
#   original [784, 50, 10]:     784*50 + 50*10          = 39700
#   new      [784, 47, 47, 10]: 784*47 + 47*47 + 47*10  = 39527

config['layer_specs'] = [784, 47, 47, 10]

best_model = Neuralnetwork(config)
e, l, tr_acc, v_acc, te_acc = trainer(best_model, X_train, y_train, X_valid, y_valid, X_test, y_test, config)

# Stop at len(...) rather than len(...) + 1: the old bound indexed one past the
# end whenever the number of epochs was a multiple of 5.
print("training accuracy")
for epoch_idx in range(0, len(tr_acc), 5):
    print("Epoch" + str(epoch_idx) + ": " + str(tr_acc[epoch_idx]))

print("testing accuracy")
for epoch_idx in range(0, len(te_acc), 5):
    print("Epoch" + str(epoch_idx) + ": " + str(te_acc[epoch_idx]))

# Start a fresh figure so curves from earlier plots are not drawn into this one.
plt.figure()
plt.plot(range(len(tr_acc)), tr_acc, 'b--', label="training accuracy")
plt.plot(range(len(te_acc)), te_acc, 'r-', label="testing accuracy")

plt.grid(True)

plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.title("Training vs. Testing Accuracy For Model With 2 Hidden Layers")
plt.legend(loc="lower right")

plt.savefig("images/f3.png")

# Put the shared config back to its default for the following cells.
config['layer_specs'] = [784, 50, 10]