In [78]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [79]:
data = pd.read_csv('train.csv')

In [80]:
data.head()

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Each row is and example.

In [81]:
# convert the pandas dataframe to a numpy array so we can perform matrix operations
data = data.to_numpy()
print('Train: ',data.shape)

Train:  (42000, 785)


In [82]:
m, n = data.shape
# shuffle the data so we can split it into train and validation sets
np.random.shuffle(data)
# split the data into train and validation sets
train_set = data[:30000].T # we transpose the data so that columns are examples
y_train = train_set[0] # first column is the label
X_train = train_set[1:n] # rest of the columns are the features

# validation data
val_set = data[30000:m].T # validation data and we transpose the data
y_val = val_set[0] # first column is the label
X_val = val_set[1:n] # rest of the columns are the features

In [83]:
X_train.shape

(784, 30000)

In [84]:
y_train.shape

(30000,)

In [85]:
y_train

array([2, 6, 8, ..., 3, 7, 9], shape=(30000,))

In [86]:
X_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], shape=(784, 30000))

In [87]:
def init_params():
    '''Function to initialise the model parameters'''
    W1 = np.random.rand(10, 784) - 0.5 # weight matrix for hidden layer
    b1 = np.random.rand(10, 1) - 0.5 # bias for hidden layer
    W2 = np.random.rand(10, 10) - 0.5 # weight matrix for output layer
    b2 = np.random.rand(10, 1) - 0.5 #  bias for output layer
    return W1, b1, W2, b2

def ReLU(Z):
    '''ReLU activation function
    This is an elementwise operation so it goes
    through each element of Z and applies the ReLU function'''
    return np.maximum(0, Z)

def softmax(Z):
    '''Softmax activation function'''
    expZ = np.exp(Z - np.max(Z)) # subtracting the max for numerical stability
    return expZ / expZ.sum(axis=0, keepdims=True) # return softmax of Z

def forward_prop(W1, b1, W2, b2, X):
    '''Function to perform forward propagation'''
    Z1 = W1.dot(X) + b1 # linear activation for hidden layer, 784 x m
    A1 = ReLU(Z1) # non-linear activation for hidden layer
    Z2 = W2.dot(A1) + b2 # linear activation for output layer
    A2 = softmax(Z2) # non-linear activation for output layer
    return Z1, A1, Z2, A2 # return all the activations and linear combinations

def one_hot_encoding(y):
    '''Function to perform one hot encoding'''
    m = y.shape[0] # number of examples
    y_onehot = np.zeros((10, m)) # one hot encoding of y
    for i in range(m):
        y_onehot[y[i], i] = 1 # set the correct class to 1
    return y_onehot

def back_prop(Z1, A1, Z2, A2, W2, X, Y):
    '''Function to perform backpropagation'''
    m = Y.size # number of examples
    one_hot_y = one_hot_encoding(Y) # one hot encode the labels
    dZ2 = A2 - one_hot_y # derivative of loss with respect to Z2
    dW2 = (1/m) * dZ2.dot(A1.T) # derivative of loss with respect to W2
    db2 = (1/m) * np.sum(dZ2, axis=1, keepdims=True) # derivative of loss with respect to b2

    dA1 = W2.T.dot(dZ2) # derivative of loss with respect to A1
    dZ1 = dA1 * (Z1 > 0) # derivative of loss with respect to Z1. Z1 > 0 is an elegant derivative of ReLU
    dW1 = (1/m) * dZ1.dot(X.T) # derivative of loss with respect to W1
    db1 = (1/m) * np.sum(dZ1, axis=1, keepdims=True) # derivative of loss with respect to b1

    return dW1, db1, dW2, db2

def update_params(W1, b1, W2, b2, dW1, db1, dW2, db2, learning_rate):
    '''Function to update the model parameters using gradient descent'''
    W1 = W1 - learning_rate * dW1
    b1 = b1 - learning_rate * db1
    W2 = W2 - learning_rate * dW2
    b2 = b2 - learning_rate * db2
    return W1, b1, W2, b2

In [88]:
def get_predictions(A2):
    '''Function to get the predictions from the output of the network'''
    return np.argmax(A2, axis=0) # return the index of the max element in each column

def get_accuracy(predictions, Y):
    '''Function to get the accuracy of the model'''
    return np.sum(predictions == Y) / Y.size # return the accuracy

def gradient_descent(X, Y, epochs, learning_rate): # epochs are the number of iterations
    '''Function to perform gradient descent'''
    W1, b1, W2, b2 = init_params() # initialise the model parameters
    for i in range(epochs):
        Z1, A1, Z2, A2 = forward_prop(W1, b1, W2, b2, X) # forward propagation
        dW1, db1, dW2, db2 = back_prop(Z1, A1, Z2, A2, W2, X, Y) # backpropagation
        W1, b1, W2, b2 = update_params(W1, b1, W2, b2, dW1, db1, dW2, db2, learning_rate) # update the model parameters
        if i % 10 == 0:
            print('Epoch %d' % i)
            predictions = get_predictions(A2) # get the predictions
            print('Accuracy: %.2f' % get_accuracy(predictions, Y)) # print the accuracy
    return W1, b1, W2, b2

In [89]:
gradient_descent(X_train, y_train, epochs=100, learning_rate=0.1)

  return expZ / expZ.sum(axis=0, keepdims=True) # return softmax of Z


Epoch 0
Accuracy: 0.10
Epoch 10
Accuracy: 0.10
Epoch 20
Accuracy: 0.10
Epoch 30
Accuracy: 0.10
Epoch 40
Accuracy: 0.10
Epoch 50
Accuracy: 0.10
Epoch 60
Accuracy: 0.10
Epoch 70
Accuracy: 0.10
Epoch 80
Accuracy: 0.10
Epoch 90
Accuracy: 0.10


(array([[nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        ...,
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan]], shape=(10, 784)),
 array([[nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan]]),
 array([[nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
      