# A quick rewind on the process of learning

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn import datasets

First, we'll load the data. The integer class labels are turned into one-hot vectors later on, using the `to_onehot` helper defined below.

In [None]:
# Iris is used here; replace the call with datasets.load_wine() to try a
# different dataset.
dataset = datasets.load_iris()
# Feature matrix and the class label of each row.
X, y = dataset['data'], dataset['target']

X.shape, y.shape

Next, shuffle it and split into training and test datasets

In [None]:
# Draw one random ordering of the row indices and apply it to features and
# labels alike, so every sample keeps its own label.
rand_ind = np.random.permutation(y.shape[0])
X, y = X[rand_ind], y[rand_ind]

# Normalize: divide each feature column by that column's maximum.
# NOTE: the maxima are taken over the whole dataset, test rows included.
X = X / X.max(axis=0)

# The first `split` shuffled rows are the training set, the rest the test set.
split = 100
X_train, X_test = X[:split], X[split:]
y_train, y_test = y[:split], y[split:]

X.shape, y.shape, X_train.shape, y_train.shape, X_test.shape, y_test.shape

Now let's define a model. It will be a two-layer fully connected neural network with ReLU activation.

In [None]:
def new_model(d_in=X.shape[1], d_hidden=20, d_out=(y.max() + 1)):
    """Create the parameters of a two-layer fully connected network.

    Prints the three layer sizes, then returns a dict holding:
      "W1": (d_in, d_hidden) weights drawn with np.random.rand,
      "b1": d_hidden zero biases,
      "W2": (d_hidden, d_out) weights drawn with np.random.rand,
      "b2": d_out zero biases.

    The default d_in and d_out are computed from the module-level X and y
    once, when this function is defined.
    """
    print(d_in, d_hidden, d_out)
    return {
        # first layer
        "W1": np.random.rand(d_in, d_hidden),
        "b1": np.zeros(d_hidden),
        # second layer
        "W2": np.random.rand(d_hidden, d_out),
        "b2": np.zeros(d_out),
    }
    


We'll also define some utility functions:

In [None]:
# remember ReLU activation?
def relu(activation):
    """Apply the rectified linear unit element-wise: max(activation, 0).

    Args:
        activation: array (or scalar) of pre-activation values.

    Returns:
        An array of the same shape with every negative entry replaced by 0.

    np.maximum is used instead of multiplying by the mask
    ``activation > 0``: the product turns -inf into NaN (-inf * 0) and
    negative floats into -0.0, whereas the maximum yields a clean 0.
    """
    return np.maximum(activation, 0)

def accuracy(y_pred, y_actual):
    """Return the fraction of rows whose predicted class equals the actual one.

    Both arguments hold one row per sample; the class of a row is taken to be
    the index of its largest entry (so one-hot rows and raw score rows both
    work).
    """
    predicted_classes = np.argmax(y_pred, axis=1)
    actual_classes = np.argmax(y_actual, axis=1)
    n_correct = np.sum(predicted_classes == actual_classes)
    return n_correct / y_pred.shape[0]

def to_onehot(y_labels, n_classes=None):
    """Convert integer class labels into a matrix of one-hot rows.

    Args:
        y_labels: 1-D sequence of integer class indices.
        n_classes: number of columns in the result. When omitted it is
            inferred as ``max(y_labels) + 1``; pass it explicitly for a
            batch that may not contain the highest class, otherwise the
            matrix comes out too narrow.

    Returns:
        Float array of shape (len(y_labels), n_classes) with a single 1 in
        each row, at the column given by that row's label. Empty input
        yields an array with zero rows.
    """
    labels = np.asarray(y_labels)
    if n_classes is None:
        # max() of an empty array raises, so an empty batch gets zero columns.
        n_classes = int(labels.max()) + 1 if labels.size else 0
    onehot = np.zeros((labels.shape[0], n_classes))
    if labels.size:
        onehot[np.arange(labels.shape[0]), labels] = 1
    return onehot

The forward pass through the network looks like this.

As you can see, a forward pass through one layer of a fully connected network is nothing more than multiplying the input by a weight matrix and adding a bias vector to the result!


In [None]:
def forward(model, X):
    """Run the two-layer network on X.

    Returns a pair (y_pred, cache): y_pred is the raw output of the second
    layer, and cache is the tuple (input, layer-1 output, ReLU activation,
    layer-2 output) that training reuses during backpropagation.
    """
    # Layer 1: the input itself acts as the first activation.
    hidden_out = X @ model["W1"] + model["b1"]
    # ReLU turns the layer-1 output into the activation fed to layer 2.
    hidden_act = relu(hidden_out)
    # Layer 2: its output is used directly as the prediction.
    scores = hidden_act @ model["W2"] + model["b2"]

    # Training needs the intermediate values of the forward pass,
    # so they are handed back alongside the predictions.
    return scores, (X, hidden_out, hidden_act, scores)

Next, let's define what happens during training

In [None]:
def train(model, X=X, y=y, lr=1e-3):
    """Perform one gradient-descent step on the whole batch (X, y).

    Runs a forward pass, evaluates the mean softmax cross-entropy loss and
    the accuracy, backpropagates the loss, and updates the arrays stored in
    ``model`` in place.

    Args:
        model: parameter dict with keys "W1", "b1", "W2", "b2".
        X: input rows, one sample per row. The default is the module-level
            ``X`` as it was when this function was defined.
        y: integer class label of each row of ``X`` (used as column indices
            into the predictions). The default is bound the same way as ``X``.
        lr: learning rate; each gradient is scaled by it before being
            subtracted from its parameter.

    Returns:
        (loss, acc), both measured with the parameters as they were before
        this step's update.
    """
    n_samples = X.shape[0]

    # first, predictions need to be made
    y_pred, cache = forward(model, X)
    a1, o1, a2, _ = cache

    # we'll compute loss using softmax loss function
    # http://ufldl.stanford.edu/tutorial/supervised/SoftmaxRegression/
    # Subtracting each row's maximum leaves the softmax unchanged but keeps
    # np.exp from overflowing on large scores.
    shifted = y_pred - y_pred.max(axis=1, keepdims=True)
    XE = np.exp(shifted)
    XE_sums = XE.sum(axis=1, keepdims=True)
    # Log-probabilities are formed directly so a tiny probability never
    # goes through log(0).
    log_probs = shifted - np.log(XE_sums)

    true_indices = np.arange(y.shape[0]), y
    loss = -log_probs[true_indices].sum() / n_samples

    # a matrix of one-hot vectors representing the labels
    # example: with 3 classes, label 2 becomes [0, 0, 1]
    # Its width is taken from y_pred so the shapes agree even when the batch
    # does not contain the highest class.
    y_onehot = np.zeros_like(y_pred)
    y_onehot[true_indices] = 1
    acc = accuracy(y_pred, y_onehot)

    # gradient of cost with respect to y_pred
    # = how should y_pred change in order for cost to be greater?
    do2 = (XE / XE_sums - y_onehot) / n_samples

    # backpropagation of gradient through layer 2
    # compute gradient with respect to W2, b2, and activation_2
    dW2 = a2.T @ do2
    db2 = do2.sum(axis=0)
    da2 = do2 @ model["W2"].T

    # propagation through relu: only units that were active pass gradient
    do1 = da2 * (o1 > 0)

    # backpropagation through layer 1
    dW1 = a1.T @ do1
    db1 = do1.sum(axis=0)
    # (the gradient with respect to the input would be do1 @ W1.T;
    # nothing uses it, so it is not computed)

    # gradient-descent update, applied in place
    model["W2"] -= lr * dW2
    model["b2"] -= lr * db2
    model["W1"] -= lr * dW1
    model["b1"] -= lr * db1

    return loss, acc
    

Let's see that in action!

In [None]:
# Per-iteration history of the training loss and training accuracy.
cost_hist = []
acc_hist = []
model = new_model(d_hidden=20)
iters = 1000
for i in range(iters):
    # Pass the training split explicitly: train()'s default X and y are the
    # whole dataset, which would let the model see the test rows it is
    # scored on afterwards.
    loss, acc = train(model, X_train, y_train, lr=1e-2)
    cost_hist.append(loss)
    acc_hist.append(acc)

print('cost')
plt.plot(np.arange(iters), cost_hist)
plt.show()
print('accuracy')
plt.plot(np.arange(iters), acc_hist)
plt.show()

And what's the accuracy on the test set?

In [None]:
# Predict on the held-out split; the second value returned by forward (the
# cache of intermediate values) is not needed here, so it is discarded.
y_pred, _ = forward(model, X_test)
# Fraction of test rows whose highest-scoring class matches the true label.
accuracy(y_pred, to_onehot(y_test))