# Problem 2: Implementing a Multi-layer Perceptron

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [1]:
# !pip install seaborn
# Install seaborn (needed to plot confusion matrix) by uncommenting the above line

In [525]:


def sigmoid_forward(a):
    # calculates the sigmoid activation function
    # a: pre-activation values
    # returns: activated values
    return 1.0 / (1.0 + 1/np.exp(a))

def sigmoid_backward(grad_accum, a):
    # grad_accum: the gradient of the loss function w.r.t to z
    # a: the pre-activation values
    # returns: the gradient of the loss w.r.t to the preactivation values, a
    out = []
    for i in range(len(grad_accum)-1):
        out.append(grad_accum[i+1] *  np.multiply(a, (1.0 - a))[i])
    return np.array(out).T # np.dot(grad_accum[1:], np.multiply(a, (1.0 - a)).T)

def linear_forward(x, weight, bias):
    # Computes the forward pass of the linear layer
    # x: input of layer
    # weight, bias: weights and bias of neural network layer
    # returns: output of linear layer
    x_out = []
    for i_layer, (i_w, i_b) in enumerate(zip(weight, bias)):
        x_out.append(np.dot(x, i_w) + i_b)
    return np.array(x_out)

def linear_backward(grad_accum, x, weight, bias):
    #  Derivative of the linear layer w.r.t 
    # grad_accum: gradient of loss w.r.t function after linear layer
    # returns dl_dw: gradient of loss w.r.t to weights 
    # returns dl_dx: gradient of loss w.r.t to input, x  
    # return dl_dw, dl_dx 

    dl_dw = np.dot(np.insert(x, 0, 1, axis = 0), grad_accum).T
    dl_dx = np.dot(grad_accum, np.insert(weight, 0, bias, axis = 1)).T
    return dl_dw, dl_dx
    
def softmax_xeloss_forward(b, labels):
    # Input parameters: 
    ## b: pre-activation 
    # calculates the softmax of the vector b
    # calculates the cross entropy loss between the softmax of b and the labels 
    # returns: l
    y_hat = np.exp(b-100000) / np.sum(np.exp(b-100000))
    l = np.multiply(labels, np.log(y_hat).T)
    return l
    
def softmax_xeloss_backward(yhat, labels):
    # Input parameters:
    # yhat: predictions of the neural network
    # labels: target of the network
    # returns: dl_db gradient of loss w.r.t to b
    return yhat - labels
    
def data_load():
    # load in the data provided in "data/"
    # Unzip fashion_mnist.zip
    # Unzipped manually because I wasn unsure if I could import zipfile for this assignment base on conditions on pdf.
    x_train = pd.read_csv('./data/train.csv', header=None)
    x_test = pd.read_csv('./data/test.csv', header=None)
    
    y_train = x_train.iloc[:,-1]
    x_train = x_train.drop(x_train.columns[[-1]], axis = 1)
    y_test = x_test.iloc[:,-1]
    x_test = x_test.drop(x_test.columns[[-1]], axis = 1)
    
    return x_train, y_train, x_test, y_test

def load_params():
    alpha_weights = np.loadtxt('params/alpha1.txt', delimiter=',')
    beta_weights = np.loadtxt('params/alpha2.txt', delimiter=',')
    alpha_bias= np.loadtxt('params/beta1.txt', delimiter=',')
    beta_bias = np.loadtxt('params/beta2.txt', delimiter=',')
    return alpha_weights, beta_weights, alpha_bias, beta_bias

def one_hot_encode(y):
    # convert categorical target features to one hot encoded data
    labels = np.sort(y.unique())
    y_encoded = pd.DataFrame(np.zeros((len(y), len(labels))), columns = labels.astype(str))
    for i in labels:
        y_encoded.loc[y == i, str(i)] = 1
    return y_encoded




def train(batchsize=1 , eta = 0.01, num_epochs=100, h = 256, init='default'):
    X_train, y_train, X_test, y_test = data_load()  
    
    y_train = one_hot_encode(y_train) 
    y_test = one_hot_encode(y_test) 

    if init == 'default':
        alpha_weights, beta_weights, alpha_bias, beta_bias = load_params()
    elif init=='zeros':
        # initialize weights and biases to 0
        alpha_weights = np.zeros(h * len(X_train.columns))
        beta_weights = np.zeros(h * len(y_train.columns))
        alpha_bias = np.zeros(h)
        beta_bias = np.zeros(len(y_train.columns))
    elif init=='ones':
        # initialize weights and biases to 1
        alpha_weights = np.ones(h * len(X_train.columns))
        beta_weights = np.ones(h * len(y_train.columns))
        alpha_bias = np.ones(h)
        beta_bias = np.ones(len(y_train.columns))
    elif init=='random':
        # initialize weights and biases to random values between -1 and 1
        alpha_weights = np.random.uniform(-1, 1, h * len(X_train.columns))
        beta_weights = np.random.uniform(-1, 1, h * len(y_train.columns))
        alpha_bias = np.random.uniform(-1, 1, h)
        beta_bias = np.random.uniform(-1, 1, len(y_train.columns))

        
    train_loss_list = []
    test_loss_list = []
    acc_list = []
    
    
    for epoch in (range(num_epochs)):
        #continue
        error = []
        y_pred_train = np.zeros_like(y_train)
        # Iterate over batches of data        
        for i in range(batchsize):
    
            # do not shuffle data

            # select batch
            X_sample = X_train[i: i + 1]
            y_sample = y_train[i: i + 1]

            
            ######## FORWARD 
            # Linear -> Sigmoid -> Linear -> Softmax
            forward1 = linear_forward(X_sample, alpha_weights, alpha_bias)
            activation1 = sigmoid_forward(forward1)
            forward2 = linear_forward(activation1.T, beta_weights, beta_bias)
            y_pred_train[i:i+1] = (np.exp(forward2) / np.sum(np.exp(forward2))).T
            error.append(softmax_xeloss_forward(forward2, y_sample))
            
            ######## BACKWARD 
            grad_softmax = softmax_xeloss_backward(y_pred_train[i], y_sample)
            grad_beta, grad_z = linear_backward(grad_softmax, activation1, beta_weights, beta_bias)
            grad_sigmoid = sigmoid_backward(grad_z, forward1)
            grad_alpha, grad_x = linear_backward(grad_sigmoid, X_sample.to_numpy().T, alpha_weights, alpha_bias)

            ######## UPDATE
            alpha_weights = alpha_weights - eta * grad_alpha[:,1:]
            alpha_bias = alpha_bias - eta * grad_alpha[:,0]
            beta_weights = beta_weights - eta * grad_beta[:,1:]
            beta_bias = beta_bias - eta * grad_beta[:,0]



        # store average training loss for the epoch
        error.append(softmax_xeloss_forward(forward2, y_sample))
        train_loss_list.append(np.mean(error))
        
        # calculate test predictions and loss
        error = []
        y_pred_test = np.zeros_like(y_test)
        # Iterate over batches of data        
        for i in range(batchsize):
    
            # do not shuffle data

            # select batch
            x_sample = X_test[i: i + 1]
            y_sample = y_test[i: i + 1]
#             alpha_weights = alpha_weights[i:((i + 1) * 784)]
#             beta_weights = beta_weights[i:((i + 1) * 256)]
            
            ######## FORWARD 
            forward1 = linear_forward(X_sample, alpha_weights, alpha_bias)
            activation1 = sigmoid_forward(forward1)
            forward2 = linear_forward(activation1.T, beta_weights, beta_bias)
            y_pred_test[i:i+1] = (np.exp(forward2) / np.sum(np.exp(forward2))).T
            error.append(softmax_xeloss_forward(forward2, y_sample))
        error.append(softmax_xeloss_forward(forward2, y_sample))
        test_loss_list.append(np.mean(error))
        
        # calculate test accuracy
        total = len(y_test)
        correct = (y_pred_test == y_test).sum()
        acc_list.append(correct/float(total))
    # return train_loss_list, test_loss_list, as well as test and train predictions
    #pass
    return train_loss_list, test_loss_list, y_pred_train, y_pred_test
    


# Plot Loss 

In [240]:
# Plot training loss, testing loss as a function of epochs

In [526]:
train_loss, test_loss, train_pred, test_pred = train()

  y_hat = np.exp(b-100000) / np.sum(np.exp(b-100000))
  return 1.0 / (1.0 + 1/np.exp(a))
  return 1.0 / (1.0 + 1/np.exp(a))
  return 1.0 / (1.0 + 1/np.exp(a))
  out.append(grad_accum[i+1] *  np.multiply(a, (1.0 - a))[i])


# Confusion Matrix

In [484]:
def plot_confusion(yhat, y, title = '[Training or Test] Set'):

    pred_train = np.argmax(yhat, axis=1)
    true_train = np.argmax(y, axis=1)
    print(true_train.shape)
    conf_train = np.zeros((10,10))
    for i in range(len(y)):
        conf_train[ true_train[i], pred_train[i] ] += int(1)
        
    sns.heatmap(conf_train, annot=True, fmt='.3g')
    plt.xlabel('Predicted Labels')
    plt.ylabel('True Labels')
    plt.title('Title')
    plt.show()
# plot_confusion(yhat_train, y_train, title = "Training Set")
# plot_confusion(yhat_test, y_test, title = "Test Set")
#yhat: predictions
#y: one-hot-encoded labels


# Correct and Incorrect Classification Samples

In [485]:
def plot_image(vector, out_f_name, label=None):
    """
    Takes a vector as input of size (784) and saves as an image
    """
    image = np.asarray(vector).reshape(28, 28)
    plt.imshow(image, cmap='gray')
    if label:
        plt.title(label)
    plt.axis('off')
    plt.savefig(f'{out_f_name}.png', bbox_inches='tight')
    plt.show()


In [None]:
# Use plot_image function to display samples that are correctly and incorrectly predicted

# Effect Of Learning Rate

# Effect of Initialization