In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
import tensorflow as tf

2023-09-18 22:37:18.652029: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [5]:
from tensorflow.keras.datasets import mnist

# Data Utility

Here we load the MNIST dataset (a widely used benchmark dataset). We use the TensorFlow loader, which abstracts the loading details away for us.

In [10]:
# The loader returns two (images, labels) pairs: the training split and the test split.
train_split, test_split = mnist.load_data()
train_images, train_labels = train_split
test_images, test_labels = test_split

We then normalize the pixel values so that they all lie in the range $[0, 1]$.

In [11]:
def _scale_pixels_to_unit_range(images):
    """Return `images` as float32 with values divided by 255.

    Only integer-typed arrays are rescaled. Anything else is returned
    unchanged, on the assumption that it has already been normalized.
    This keeps the cell idempotent: running it a second time no longer
    divides the pixel values by 255 again.
    """
    if np.issubdtype(images.dtype, np.integer):
        return images.astype("float32") / 255.0
    return images


train_images = _scale_pixels_to_unit_range(train_images)
test_images = _scale_pixels_to_unit_range(test_images)

In [13]:
# Quick sanity check: report the shape of every array we just loaded.
print(
    f"Train Data Shape: {train_images.shape}",
    f"Train Label Shape: {train_labels.shape}",
    f"Test Data Shape: {test_images.shape}",
    f"Test Label Shape: {test_labels.shape}",
    sep="\n",
)

Train Data Shape: (60000, 28, 28)
Train Label Shape: (60000,)
Test Data Shape: (10000, 28, 28)
Test Label Shape: (10000,)


# Dense Layer

Here I will build a dense layer from scratch. Note that I am not going to include all the checks, such as matrix dimension validation. However, building it by hand helps us see what happens under the hood. All dense layers are motivated by the equation:

$$
y = Wx + b
$$

In addition, all neurons are configured with a loose version of the Xavier initialization scheme.

In [None]:
class Dense_Layer(tf.Module):
    """
    Fully connected layer computing ``activation(x @ w + b)``.

    Args:
        input_size: Number of features per input row (first dimension of ``w``).
        output_size: Number of units in the layer (second dimension of ``w``).
        output_layer: Stored on the instance as-is; not used by this class.
        activation_function: Name of the non-linearity. Recognised values are
            "leaky_relu", "relu", "softmax" and "sigmoid". Any other value,
            including the default "linear", returns the affine output unchanged.
        **kwargs: Forwarded to ``tf.Module.__init__``.
    """

    # Name -> op lookup. Replaces a `match` statement so the class also runs
    # on Python versions older than 3.10.
    _ACTIVATIONS = {
        "leaky_relu": tf.nn.leaky_relu,
        "relu": tf.nn.relu,
        "softmax": tf.nn.softmax,
        "sigmoid": tf.nn.sigmoid,
    }

    def __init__(
        self,
        input_size,
        output_size,
        output_layer=False,
        activation_function="linear",
        **kwargs
    ):
        super().__init__(**kwargs)

        self.input_size = input_size
        self.output_size = output_size
        self.output_layer = output_layer
        self.activation_function = activation_function

        # Weights: standard-normal samples scaled by sqrt(2 / (fan_in + fan_out)),
        # i.e. a Xavier/Glorot-style standard deviation.
        self.w = tf.Variable(
            tf.random.normal([input_size, output_size]) * tf.sqrt(2 / (input_size + output_size)),
            name='w'
        )

        # Bias: one zero-initialised offset per output unit (previously a
        # single scalar shared by every unit). It broadcasts over the rows
        # of `x @ w` exactly as the scalar did.
        self.b = tf.Variable(tf.zeros([output_size]), name='b')

    def __call__(self, x):
        """Apply the layer to `x`, whose last dimension must equal `input_size`."""
        # Compute the affine part once, then apply the selected non-linearity.
        pre_activation = x @ self.w + self.b
        activation = self._ACTIVATIONS.get(self.activation_function)
        if activation is None:
            # Unrecognised names fall back to the linear (identity) output.
            return pre_activation
        return activation(pre_activation)

# Convolutional Layer 

The convolutional layer is specified by a kernel that moves across the image and aggregates information. Unlike dense networks, the kernel serves as the weight matrix! Oftentimes, convolutional layers go through the process of:

$$
\text{Convolution} \longrightarrow \text{Non-linear Activation} \longrightarrow \text{Pooling layer (optional)}
$$

For brevity, I will not be using a pooling layer (our task is also simple enough not to need one).

Here we implement a kernel that moves across an image or feature map to extract features. The GIF below depicts such a process:
<center>
    <img src="https://media.giphy.com/media/i4NjAwytgIRDW/giphy.gif" width="500" height="500" />
</center>

Since our images are small with dimensions of $28 \times 28$, we will use the popular $3 \times 3$ kernel. The kernel filter values will be using Xavier initialization similar to our dense network above. We imitate the figure above:

In [34]:
# 5 x 5 input tensor, matching the one shown in the animation above
matrix = tf.constant(
    value=[
        [1, 1, 1, 0, 0],
        [0, 1, 1, 1, 0],
        [0, 0, 1, 1, 1],
        [0, 0, 1, 1, 0],
        [0, 1, 1, 0, 0],
    ],
    dtype=tf.float32,
)

# 3 x 3 kernel that will slide over the input
kernel = tf.constant(
    value=[
        [1, 0, 1],
        [0, 1, 0],
        [1, 0, 1],
    ],
    dtype=tf.float32,
)

def simple_convolve_operation(matrix, kernel):
    """
    Slide a 2-D kernel over a 2-D matrix and return the resulting feature map.

    Simplifying assumptions:
      * no padding and a stride of 1
      * both `matrix` and `kernel` are rank-2 tensors (e.g. a grayscale image)
      * the kernel is no larger than the matrix

    Under these assumptions each output dimension equals
    (input size - kernel size + 1). The kernel is applied as-is (it is not
    flipped) at every position.
    """
    in_rows, in_cols = matrix.shape
    ker_rows, ker_cols = kernel.shape

    # Number of valid kernel positions along each axis
    out_rows = int(in_rows - ker_rows) + 1
    out_cols = int(in_cols - ker_cols) + 1

    # A Variable rather than a plain Tensor so single cells can be assigned in place
    feature_map = tf.Variable(tf.zeros([out_rows, out_cols], dtype=tf.float32))

    for top in range(out_rows):
        for left in range(out_cols):
            # Window of the input currently covered by the kernel
            window = matrix[top:top + ker_rows, left:left + ker_cols]
            feature_map[top, left].assign(tf.reduce_sum(window * kernel))

    return feature_map

# Convolve the 5 x 5 example with the 3 x 3 kernel and show the 3 x 3 result
feature_map_1 = simple_convolve_operation(matrix, kernel)
print(feature_map_1)

<tf.Variable 'Variable:0' shape=(3, 3) dtype=float32, numpy=
array([[4., 3., 4.],
       [2., 4., 3.],
       [2., 3., 4.]], dtype=float32)>


However, what if we want to convolve over a stack of feature maps? Say we have a $5 \times 5 \times 1$ feature map. We convolve it to get 32 (we specify this) feature maps. Now we want to get 64 feature maps from the $5 \times 5 \times 32$ tensor, so we also need to account for the number of channels. We can reimagine the kernel and the tensor (not a matrix, as it's stacked) as:

$$
\begin{bmatrix}
1 & 1 & 1 & 0 & 0 \\ 
0 & 1 & 1 & 1 & 0 \\ 
0 & 0 & 1 & 1 & 1 \\ 
0 & 0 & 1 & 1 & 0 \\ 
0 & 1 & 1 & 0 & 0 \\ 
\end{bmatrix}
\times 32 \quad \text{- Stacked on top of each other}
$$

$$
\begin{bmatrix}
1 & 0 & 1 \\ 
0 & 1 & 0 \\ 
1 & 0 & 1
\end{bmatrix}
\times 32 \quad \text{- Stacked on top of each other}
$$


We can do the following:

In [50]:
# Repeat the 2-D example and kernel 32 times along a new trailing (channel) axis
tensor_depth_32 = tf.stack([matrix for _ in range(32)], axis=-1)
kernel_depth_32 = tf.stack([kernel for _ in range(32)], axis=-1)

print(
    f"Tensor Size: {tensor_depth_32.shape}",
    f"Kernel Size: {kernel_depth_32.shape}",
    sep="\n",
)

def convolve_depthwise(matrix, kernel):
    """
    Apply one multi-channel kernel to a stack of feature maps.

    Both `matrix` and `kernel` are rank-3 tensors laid out as
    (height, width, depth) and must share the same depth. As in the 2-D
    version there is no padding, the stride is 1, and the kernel is assumed
    to be no larger than the matrix in height and width.

    Every output cell is the sum of the element-wise product over the whole
    window, including all channels. The result is therefore a single
    rank-2 feature map of shape
    (height - k_height + 1, width - k_width + 1).
    """
    in_h, in_w, in_depth = matrix.shape
    ker_h, ker_w, ker_depth = kernel.shape

    # The kernel has to cover exactly the channels present in the input
    assert in_depth == ker_depth

    # Number of valid kernel positions along each spatial axis
    out_h = in_h - ker_h + 1
    out_w = in_w - ker_w + 1

    # A Variable rather than a plain Tensor so single cells can be assigned in place
    result = tf.Variable(tf.zeros([out_h, out_w], dtype=tf.float32))

    for top in range(out_h):
        for left in range(out_w):
            # Window across every channel at the current spatial position
            window = matrix[top:top + ker_h, left:left + ker_w, :]
            # Collapse the spatial and channel dimensions into one scalar
            result[top, left].assign(tf.reduce_sum(window * kernel))

    return result

    
# Convolve the 32-channel stack with the 32-channel kernel; the result is one 3 x 3 map
feature_map_stacked = convolve_depthwise(tensor_depth_32, kernel_depth_32)
print(feature_map_stacked)

# Notice that 4 * 32 = 128, since the same 2-D example is stacked 32 times!
# Notice that 3 * 32 = 96, since the same 2-D example is stacked 32 times!

Tensor Size: (5, 5, 32)
Kernel Size: (3, 3, 32)
<tf.Variable 'Variable:0' shape=(3, 3) dtype=float32, numpy=
array([[128.,  96., 128.],
       [ 64., 128.,  96.],
       [ 64.,  96., 128.]], dtype=float32)>


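After convolving over the stacked feature maps, we need to repeat this 64 times with 64 unique kernels to get 64 feature maps! (For simplicity, the cell below reuses the same kernel each time, so all 64 maps come out identical.)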

In [51]:
# One feature map per filter; the same kernel is reused for all 64 filters here
tensors_list = [
    convolve_depthwise(tensor_depth_32, kernel_depth_32)
    for _ in range(64)
]

# tf.stack joins the maps along a new leading axis
stacked_tensors_64 = tf.stack(tensors_list)
print(stacked_tensors_64)

tf.Tensor(
[[[128.  96. 128.]
  [ 64. 128.  96.]
  [ 64.  96. 128.]]

 [[128.  96. 128.]
  [ 64. 128.  96.]
  [ 64.  96. 128.]]

 [[128.  96. 128.]
  [ 64. 128.  96.]
  [ 64.  96. 128.]]

 [[128.  96. 128.]
  [ 64. 128.  96.]
  [ 64.  96. 128.]]

 [[128.  96. 128.]
  [ 64. 128.  96.]
  [ 64.  96. 128.]]

 [[128.  96. 128.]
  [ 64. 128.  96.]
  [ 64.  96. 128.]]

 [[128.  96. 128.]
  [ 64. 128.  96.]
  [ 64.  96. 128.]]

 [[128.  96. 128.]
  [ 64. 128.  96.]
  [ 64.  96. 128.]]

 [[128.  96. 128.]
  [ 64. 128.  96.]
  [ 64.  96. 128.]]

 [[128.  96. 128.]
  [ 64. 128.  96.]
  [ 64.  96. 128.]]

 [[128.  96. 128.]
  [ 64. 128.  96.]
  [ 64.  96. 128.]]

 [[128.  96. 128.]
  [ 64. 128.  96.]
  [ 64.  96. 128.]]

 [[128.  96. 128.]
  [ 64. 128.  96.]
  [ 64.  96. 128.]]

 [[128.  96. 128.]
  [ 64. 128.  96.]
  [ 64.  96. 128.]]

 [[128.  96. 128.]
  [ 64. 128.  96.]
  [ 64.  96. 128.]]

 [[128.  96. 128.]
  [ 64. 128.  96.]
  [ 64.  96. 128.]]

 [[128.  96. 128.]
  [ 64. 128.  96.]
  [ 64.

# Writing the Convolutional Layer