## Neural Network Practice on the MNIST data set

### Setup

In [None]:
import numpy as np
import struct
from array import array


class MnistDataloader(object):
    """Loads MNIST images and labels from IDX-format files.

    The four file paths are only stored at construction time; files are
    read when ``load_data`` or ``read_images_labels`` is called.
    """

    def __init__(
        self,
        training_images_filepath,
        training_labels_filepath,
        test_images_filepath,
        test_labels_filepath,
    ):
        self.training_images_filepath = training_images_filepath
        self.training_labels_filepath = training_labels_filepath
        self.test_images_filepath = test_images_filepath
        self.test_labels_filepath = test_labels_filepath

    def read_images_labels(self, images_filepath, labels_filepath):
        """Read one images file together with its labels file.

        Args:
            images_filepath: path to an IDX image file (magic number 2051).
            labels_filepath: path to an IDX label file (magic number 2049).

        Returns:
            A tuple ``(images, labels)``. ``images`` is a list with one entry
            per image; each entry is a list of ``rows`` NumPy arrays of
            length ``cols``. ``labels`` is an ``array("B")`` of label bytes.

        Raises:
            ValueError: if either file starts with the wrong magic number.
        """
        with open(labels_filepath, "rb") as file:
            magic, _label_count = struct.unpack(">II", file.read(8))
            if magic != 2049:
                raise ValueError(
                    "Magic number mismatch, expected 2049, got {}".format(magic)
                )
            labels = array("B", file.read())

        with open(images_filepath, "rb") as file:
            magic, size, rows, cols = struct.unpack(">IIII", file.read(16))
            if magic != 2051:
                raise ValueError(
                    "Magic number mismatch, expected 2051, got {}".format(magic)
                )
            image_data = array("B", file.read())

        pixels_per_image = rows * cols
        images = []
        for i in range(size):
            start = i * pixels_per_image
            img = np.array(image_data[start : start + pixels_per_image])
            # Use the dimensions declared in the file header rather than a
            # hard-coded 28x28, so any IDX image size is handled.
            img = img.reshape(rows, cols)
            # Each image is stored as a list of its row arrays.
            images.append(list(img))

        return images, labels

    def load_data(self):
        """Read both splits and return ``(x_train, y_train), (x_test, y_test)``."""
        x_train, y_train = self.read_images_labels(
            self.training_images_filepath, self.training_labels_filepath
        )
        x_test, y_test = self.read_images_labels(
            self.test_images_filepath, self.test_labels_filepath
        )
        return (x_train, y_train), (x_test, y_test)


In [None]:
import random
import matplotlib.pyplot as plt
import os

data_base_path = "../data/mnist"

# Each IDX file sits in a folder that has the same name as the file itself.
(
    training_images_filepath,
    training_labels_filepath,
    test_images_filepath,
    test_labels_filepath,
) = (
    os.path.join(data_base_path, relative_path)
    for relative_path in (
        "train-images-idx3-ubyte/train-images-idx3-ubyte",
        "train-labels-idx1-ubyte/train-labels-idx1-ubyte",
        "t10k-images-idx3-ubyte/t10k-images-idx3-ubyte",
        "t10k-labels-idx1-ubyte/t10k-labels-idx1-ubyte",
    )
)


def show_images(images, title_texts):
    """Plot ``images`` in a grid five columns wide, titling each subplot.

    Args:
        images: sequence of images accepted by ``plt.imshow``.
        title_texts: one title per image; an empty string leaves that
            subplot without a title.
    """
    cols = 5
    # Ceiling division: exactly as many rows as needed, so no empty trailing
    # row is reserved when the image count is a multiple of ``cols``.
    rows = -(-len(images) // cols)
    plt.figure(figsize=(30, 20))
    # Subplot positions are 1-based.
    for index, (image, title_text) in enumerate(zip(images, title_texts), start=1):
        plt.subplot(rows, cols, index)
        plt.imshow(image, cmap=plt.cm.gray)
        if title_text != "":
            plt.title(title_text, fontsize=15)


mnist_dataloader = MnistDataloader(
    training_images_filepath,
    training_labels_filepath,
    test_images_filepath,
    test_labels_filepath,
)
(x_train, y_train), (x_test, y_test) = mnist_dataloader.load_data()

# Show a random sample: 10 training images followed by 5 test images.
images_2_show = []
titles_2_show = []

# randrange(len(...)) always yields a valid index (0 .. len - 1). The inclusive
# randint(1, N) could return N, which is out of range, and never returned 0.
for _ in range(10):
    r = random.randrange(len(x_train))
    images_2_show.append(x_train[r])
    titles_2_show.append(f"training image [{r}] = {y_train[r]}")

for _ in range(5):
    r = random.randrange(len(x_test))
    images_2_show.append(x_test[r])
    titles_2_show.append(f"test image [{r}] = {y_test[r]}")

show_images(images_2_show, titles_2_show)


#### Identifying the shape of the data

In [None]:
import pandas as pd

# One row per variable: its name, its Python type, and what it holds.
df = pd.DataFrame(
    {
        "variable": ["x_train", "x_train[0]", "y_train"],
        "type": [type(x_train), type(x_train[0]), type(y_train)],
        "description": [
            "Raw image data",
            "2D array representing pixel values for a single image",
            "Labels",
        ],
    }
)
df


In [None]:
# Inspect the nesting of the loaded data: number of images, number of rows in
# the first image, number of entries in its first row, then the types of that
# row and of a single entry.
len(x_train), len(x_train[0]), len(x_train[0][0]), type(x_train[0][0]), type(x_train[0][0][0])

### Construct input and output layers

The next step is to construct a random neural network.  I'll start by constructing the input layer and output layer.

First, define some utilities:

1. need a way to flatten a 28x28 image into a single list
2. need a method to normalize a list of numbers

In [None]:
import itertools
from typing import Iterable, Sequence, TypeVar

T = TypeVar("T")

# unused
def _flatten_2d_data(data: Iterable[Iterable[T]]) -> itertools.chain[T]:
    """Lazily concatenate the rows of ``data`` into one flat iterator.

    Note: this helper is unused; it is kept as a small example of
    generic type hints.
    """
    chain_rows = itertools.chain.from_iterable
    return chain_rows(data)

def normalize(a: np.ndarray) -> np.ndarray:
    """Scale ``a`` so its entries sum to 1 by dividing by their total."""
    total = np.sum(a)
    return a / total


def construct_random_weights_matrix(input_size: int, output_size: int) -> np.ndarray:
    """Build an ``(output_size, input_size)`` matrix of small random weights.

    Entries are draws from the standard normal distribution multiplied
    by 0.01.

    Note: the arguments read input -> output, which is the reverse of the
    (rows, columns) order of the returned matrix's shape.
    """
    scale = 0.01
    standard_normal_draws = np.random.randn(output_size, input_size)
    return standard_normal_draws * scale

In [None]:
# Turn the first training image into an np.ndarray so its shape can be examined.
pixel_grid = np.array(x_train[0])

# One output unit per distinct label value; the input is the flattened image.
output_layer = np.zeros(np.unique(y_train).size)  # 10 possible categories
input_layer = pixel_grid.flatten()

print(f"{pixel_grid.shape=}")
print(f"{input_layer.shape=}")
print(f"{output_layer.shape=}")
print(f"{output_layer=}")

### Construct random weights matrix

`np.random.randn` returns values from the standard normal distribution, a common initialization
method for neural networks.

The matrix must be of size `len(output)` x `len(input)`, which makes sense as a linear transformation
from dimensionality 784 to dimensionality 10.

Here I scale the initial weights by 0.01, a relatively arbitrary value.  The values should be small because, in a network with many layers, large initial weights can lead to large outputs from neurons. When those outputs are passed through an activation function (like a sigmoid or a tanh), they might end up in the saturation region of the function. This can lead to vanishing gradients during backpropagation, making the network harder to train.

Note that techniques like Xavier initialization and He initialization improve upon this random method.

In [None]:
# Initialize the weights with small random values. The helper performs the
# same draw: np.random.randn(output size, input size) * 0.01.
weights = construct_random_weights_matrix(input_layer.size, output_layer.size)
weights

In [None]:
# Expect (number of output units, number of input units).
weights.shape

#### Predicting one image

Now find the output of applying the weights to the first image.

In [None]:
# Label of the first training image, the image input_layer was built from.
label1 = y_train[0]
print(f"{input_layer.shape=} {label1=}")

Apply the transformation

In [None]:
# Matrix-vector product: one raw score per output unit.
output1 = weights @ input_layer
output1

Normalize the output layer

In [None]:
# Divide by the sum so the entries add up to 1, which `loss` checks for.
# NOTE(review): the raw scores can be negative, so the result is not
# necessarily a probability distribution — confirm that is acceptable here.
output1 = normalize(output1)
output1

#### Loss function

Now define the *loss function*; I'll use the common **squared error loss**.

In [None]:
def loss(label: int, output_layer: np.ndarray) -> float:
    """Squared error loss against the one-hot target for ``label``.

    Args:
        label: index of the correct category.
        output_layer: 1-D vector of outputs whose entries sum to 1.

    Returns:
        The sum over all entries of ``(output - target) ** 2``, where the
        target is 1 at ``label`` and 0 elsewhere.

    Raises:
        ValueError: if ``output_layer`` does not sum to 1 (within 1e-6),
            contains NaN, or ``label`` is not a valid index into it.
    """
    output_layer = np.asarray(output_layer, dtype=float)
    # Written as "not (... <= tol)" so that a NaN sum is rejected too;
    # "abs(nan - 1) > tol" would be False and let NaN slip through.
    if not abs(np.sum(output_layer) - 1) <= 0.000001:
        raise ValueError("output_layer must be normalized")
    if not 0 <= label < output_layer.size:
        raise ValueError(
            f"label {label} is out of range for {output_layer.size} outputs"
        )
    target = np.zeros_like(output_layer)
    target[label] = 1.0
    return float(np.sum((output_layer - target) ** 2))


In [None]:
# Loss of the normalized single-layer output against the first image's label.
loss(label1, output1)

#### Challenges for next steps

- Can I add a hidden layer and define a procedure for applying the transformations?
- Can I compute the gradient vector from the loss function?

Let's construct the weights from scratch again to gain an intuition for how this can be made into a procedure.

In [None]:
# Rebuild the layers from the first training image, this time with one
# hidden layer between input and output.
pixel_grid = np.array(x_train[0])

input_layer = pixel_grid.flatten()
hidden_layer1 = np.zeros(50)
output_layer = np.zeros(np.unique(y_train).size)  # 10 possible categories

# weights0 maps input -> hidden; weights1 maps hidden -> output.
weights0 = construct_random_weights_matrix(input_layer.size, hidden_layer1.size)
weights1 = construct_random_weights_matrix(hidden_layer1.size, output_layer.size)

print(f"{pixel_grid.shape=}")
print(f"{input_layer.shape=}")
print(f"{hidden_layer1.shape=}")
print(f"{output_layer.shape=}")
print(f"{output_layer=}")

print(f"{weights0.shape=}")
print(f"{weights1.shape=}")

In [None]:
# Chain the two linear maps: input -> hidden -> output.
hidden_layer1 = weights0 @ input_layer
output1 = weights1 @ hidden_layer1
output1

In [None]:
# Loss of the two-layer output, normalized first so that it sums to 1.
loss(label1, normalize(output1))

Now let's try adding *N* hidden layers!

In [None]:
NUM_HIDDEN_LAYERS = 7

pixel_grid = np.array(x_train[0])

input_layer = pixel_grid.flatten()
output_layer = np.zeros(np.unique(y_train).size)  # 10 possible categories

# every hidden layer gets a random width in [10, 70)
hidden_layers: list[np.ndarray] = [
    np.zeros(random.randrange(10, 70)) for _ in range(NUM_HIDDEN_LAYERS)
]

# note to self: if I model a layer, I think that each layer should own
# the weights that feed into it, not the weights that follow it, simply
# because the output layer is more akin to a hidden layer than the input layer is.
#
# Layer widths in order (input, hidden..., output); each consecutive pair of
# widths gets one weight matrix.
layer_sizes = [
    input_layer.size,
    *(layer.size for layer in hidden_layers),
    output_layer.size,
]
weight_matrices = [
    construct_random_weights_matrix(size_in, size_out)
    for size_in, size_out in zip(layer_sizes, layer_sizes[1:])
]

assert len(weight_matrices) == NUM_HIDDEN_LAYERS + 1


In [None]:
def forward_propagate(
    input_layer: np.ndarray, weights_matrices: list[np.ndarray]
) -> np.ndarray:
    """Push ``input_layer`` through every weight matrix in order.

    Each matrix is multiplied onto the running result; the value left after
    the last matrix is returned. With no matrices the input comes back
    unchanged.
    """
    activation = input_layer
    for matrix in weights_matrices:
        activation = np.dot(matrix, activation)
    return activation


In [None]:
# Run the first image through all of the randomly initialized weight matrices.
output = forward_propagate(input_layer, weight_matrices)
output

## Tutorial assisted

First, get the training data into the correct shape.  I'll stick with row-major format.

In [None]:
# Row-major layout: one flattened image per row of `examples`.
labels = np.array(y_train)
examples = np.array(x_train).reshape(len(x_train), -1)

input_layer_size = examples.shape[1]
hidden_layer_size = 10
output_layer_size = np.unique(labels).size  # number of categories

print(f"{examples.shape=}")
print(f"{labels.shape=}")
print(f"{input_layer_size=}")
print(f"{hidden_layer_size=}")
print(f"{output_layer_size=}")

In [None]:
DEV_SIZE = 1000

# The first DEV_SIZE rows form the dev split; the remainder is the training split.
X_dev, X_train = examples[:DEV_SIZE], examples[DEV_SIZE:]
Y_dev, Y_train = labels[:DEV_SIZE], labels[DEV_SIZE:]

X_dev.shape, Y_dev.shape, X_train.shape, Y_train.shape

In [None]:
def ReLU(Z: np.ndarray) -> np.ndarray:
    """Element-wise rectifier: the larger of 0 and each entry of ``Z``."""
    floor = 0
    return np.maximum(floor, Z)


def deriv_ReLU(Z: np.ndarray) -> np.ndarray:
    """Boolean mask that is True where ``Z`` is strictly positive.

    Used as the ReLU slope: True acts as 1 and False as 0 when multiplied.
    """
    return np.greater(Z, 0)


def softmax(Z: np.ndarray) -> np.ndarray:
    """Softmax along the last axis of ``Z``.

    For a 1-D vector this normalizes the whole vector. For a 2-D row-major
    batch (one example per row) every row is normalized independently, so
    each example gets its own distribution instead of the whole batch
    sharing a single normalizer.

    Args:
        Z: array of raw scores.

    Returns:
        Array of the same shape whose entries along the last axis are
        non-negative and sum to 1.
    """
    # Subtracting the per-row maximum leaves the result unchanged
    # mathematically but keeps np.exp from overflowing on large scores.
    shifted = Z - np.max(Z, axis=-1, keepdims=True)
    exponentials = np.exp(shifted)
    return exponentials / np.sum(exponentials, axis=-1, keepdims=True)


def one_hot(Y: np.ndarray, num_classes: "int | None" = None) -> np.ndarray:
    """Construct the one-hot encoding of the labels ``Y``.

    Args:
        Y: 1-D array of integer labels.
        num_classes: number of columns in the result. When omitted, the
            module-level ``output_layer_size`` is used.

    Returns:
        A matrix with one row per label. Each row has a 1 in the column
        given by that label's value and 0 everywhere else.
    """
    if num_classes is None:
        num_classes = output_layer_size
    one_hot_Y = np.zeros((Y.size, num_classes))
    # Fancy indexing sets entry (row i, column Y[i]) for every i at once.
    one_hot_Y[np.arange(Y.size), Y] = 1
    return one_hot_Y


def forward_prop(
    X: np.ndarray,
    W1: np.ndarray,
    b1: np.ndarray,
    W2: np.ndarray,
    b2: np.ndarray,
):
    """Forward pass through the hidden layer and the output layer.

    ``X`` is transposed before each weight matrix is applied and the result
    is transposed back, so the outputs keep the row layout of ``X``.

    Returns:
        ``(Z1, A1, Z2, A2)``: hidden pre-activation, hidden ReLU activation,
        output pre-activation, and output softmax activation.
    """
    hidden_pre_activation = W1.dot(X.T).T + b1
    hidden_activation = ReLU(hidden_pre_activation)

    output_pre_activation = W2.dot(hidden_activation.T).T + b2
    output_activation = softmax(output_pre_activation)

    return (
        hidden_pre_activation,
        hidden_activation,
        output_pre_activation,
        output_activation,
    )


def back_prop(
    Y: np.ndarray,
    Z1: np.ndarray,
    A1: np.ndarray,
    Z2: np.ndarray,
    A2: np.ndarray,
    W2: np.ndarray,
    X: "np.ndarray | None" = None,
):
    """Compute parameter gradients for the two-layer network.

    All activations are row-major (one example per row), matching what
    ``forward_prop`` returns.

    Args:
        Y: integer labels, one per example.
        Z1: hidden pre-activations, shape (m, hidden).
        A1: hidden activations, shape (m, hidden).
        Z2: output pre-activations (unused; kept for a symmetric signature).
        A2: output activations, shape (m, classes).
        W2: output-layer weights, shape (classes, hidden).
        X: the input examples, shape (m, inputs). Required, because the
            gradient of W1 depends on the inputs, not on the labels.

    Returns:
        ``(dW1, db1, dW2, db2)``, each with the same shape as the parameter
        it updates.

    Raises:
        ValueError: if ``X`` is not supplied.
    """
    if X is None:
        raise ValueError("back_prop needs the input examples X to compute dW1")
    m = Y.size
    ohY = one_hot(Y)
    dZ2 = A2 - ohY  # (m, classes)
    dW2 = dZ2.T.dot(A1) / m  # (classes, hidden)
    db2 = np.sum(dZ2, axis=0) / m  # sum over examples -> (classes,)
    dZ1 = dZ2.dot(W2) * deriv_ReLU(Z1)  # (m, hidden)
    dW1 = dZ1.T.dot(X) / m  # (hidden, inputs)
    db1 = np.sum(dZ1, axis=0) / m  # sum over examples -> (hidden,)

    return dW1, db1, dW2, db2


def update_params(W1, b1, W2, b2, dW1, db1, dW2, db2, alpha):
    """Take one gradient-descent step of size ``alpha`` on every parameter.

    Each gradient has the same shape as its parameter, so the update is a
    plain element-wise subtraction and the parameter shapes are preserved.
    """
    W1 = W1 - alpha * dW1
    b1 = b1 - alpha * db1
    W2 = W2 - alpha * dW2
    b2 = b2 - alpha * db2
    return W1, b1, W2, b2


def get_predictions(A2):
    """Return the index of the largest output for each example.

    The argmax runs over the last axis, i.e. across the classes of each row
    of a row-major batch (or across a single 1-D output vector).
    """
    return np.argmax(A2, axis=-1)


def compute_accuracy(predictions, Y):
    """Return the fraction of ``predictions`` that equal the labels ``Y``."""
    return np.sum(predictions == Y) / Y.size


def gradient_descent(X, Y, iterations, alpha):
    """Train the two-layer network with full-batch gradient descent.

    Args:
        X: input examples, one per row.
        Y: integer labels, one per example.
        iterations: number of update steps to run.
        alpha: learning rate.

    Returns:
        The trained parameters ``(W1, b1, W2, b2)``.
    """
    W1, b1, W2, b2 = init_params()
    for i in range(iterations):
        Z1, A1, Z2, A2 = forward_prop(X, W1, b1, W2, b2)
        dW1, db1, dW2, db2 = back_prop(Y, Z1, A1, Z2, A2, W2, X)
        W1, b1, W2, b2 = update_params(W1, b1, W2, b2, dW1, db1, dW2, db2, alpha)
        # Report progress every 50 iterations, using this iteration's
        # forward-pass outputs (computed before the parameter update).
        if i % 50 == 0:
            a = compute_accuracy(get_predictions(A2), Y)
            print(f"iteration: {i}")
            print(f"accuracy: {a}")

    return W1, b1, W2, b2


# initialize the parameters
def init_params():
    """Draw initial weights and biases uniformly from [-0.5, 0.5).

    Layer sizes are read from the module-level ``input_layer_size``,
    ``hidden_layer_size`` and ``output_layer_size``.

    Returns:
        ``(W1, b1, W2, b2)`` with shapes (hidden, inputs), (hidden,),
        (classes, hidden) and (classes,).
    """
    W1 = np.random.rand(hidden_layer_size, input_layer_size) - 0.5
    # Biases use rand (uniform) like the weights. With randn, the "- 0.5"
    # shift would give a normal distribution centred on -0.5 instead of a
    # distribution centred on zero.
    b1 = np.random.rand(hidden_layer_size) - 0.5
    W2 = np.random.rand(output_layer_size, hidden_layer_size) - 0.5
    b2 = np.random.rand(output_layer_size) - 0.5
    return W1, b1, W2, b2


In [None]:
# Train on the training split only, so X_dev / Y_dev stay unseen for
# evaluation. The pixel bytes (0-255) are scaled into [0, 1] to keep the
# pre-activations small.
W1, b1, W2, b2 = gradient_descent(X_train / 255.0, Y_train, 100, 0.1)

In [None]:
# Scratch check: the dot product of two 1-D arrays is a single number
# (1*5 + 2*3 + 3*3 + 4*4 = 36).
a = np.array([1,2,3,4])
b = np.array([5,3,3,4])
a.dot(b)

In [None]:
# Scratch check: indexing with None adds a trailing axis, turning the
# length-4 vector into a (4, 1) column.
a[:, None]