In [1]:
import tensorflow as tf

### Loading the MNIST dataset

In [2]:
from tensorflow.keras.datasets import mnist
(train_images, train_labels), (test_images, test_labels) = mnist.load_data()

# Flatten every 28x28 image into a single 784-element row, then convert to
# float32 and divide by 255. The sample count is inferred with -1 rather than
# hard-coded, so the cell keeps working on a subset of the data; the explicit
# 28 * 28 still makes reshape fail loudly if the image size is not what we expect.
train_images = train_images.reshape((-1, 28 * 28)).astype("float32") / 255
test_images = test_images.reshape((-1, 28 * 28)).astype("float32") / 255

### A simple Dense class
Let’s implement a simple Python class, _NaiveDense_, that creates two TensorFlow
variables, **W** and **b**, and exposes a **\_\_call\_\_()** method that applies the following transformation:
```
output = activation(dot(input, W) + b)
```

In [3]:
class NaiveDense:
	"""Bare-bones fully connected layer computing ``activation(inputs @ W + b)``.

	Attributes:
		W: ``tf.Variable`` of shape ``(input_size, output_size)``, initialised
			with values drawn uniformly between 0 and 0.1.
		b: ``tf.Variable`` of shape ``(output_size,)``, initialised to zeros.
		activation: callable applied to the affine output.
	"""

	def __init__(self, input_size, output_size, activation):
		"""Create the kernel ``W`` and bias ``b`` and remember the activation."""
		# Kernel: small random starting values.
		self.W = tf.Variable(
			tf.random.uniform((input_size, output_size), minval=0, maxval=1e-1)
		)
		# Bias: one zero per output unit.
		self.b = tf.Variable(tf.zeros((output_size,)))
		self.activation = activation

	def __call__(self, inputs):
		"""Run the forward pass.

		``inputs`` must be something ``tf.matmul`` can multiply with ``W``
		(presumably a ``(batch, input_size)`` float tensor or array).
		"""
		pre_activation = tf.matmul(inputs, self.W) + self.b
		return self.activation(pre_activation)

	@property
	def weights(self):
		"""Return the layer's variables as the list ``[W, b]``."""
		return [self.W, self.b]

### A simple Sequential class
Now, let’s create a _NaiveSequential_ class to chain these layers. It wraps a list of layers
and exposes a **\_\_call\_\_()** method that simply calls the underlying layers on the
inputs, in order. It also features a _weights_ property to easily keep track of the layers’
parameters.

In [4]:
class NaiveSequential:
	"""Chain of layers applied one after another.

	Each element of ``layers`` must be callable on the previous layer's output
	and expose a ``weights`` iterable.
	"""

	def __init__(self, layers):
		"""Store the ordered list of layers."""
		self.layers = layers

	def __call__(self, inputs):
		"""Feed ``inputs`` through every layer in order and return the result."""
		outputs = inputs
		for layer in self.layers:
			outputs = layer(outputs)
		return outputs

	@property
	def weights(self):
		"""Return a flat list of all layers' weights, in layer order."""
		return [weight for layer in self.layers for weight in layer.weights]

### Mock Keras model
Using this _NaiveDense_ class and this _NaiveSequential_ class, we can create a mock Keras model.

In [5]:
model = NaiveSequential(layers=[
	# Hidden layer: 784 flattened pixels in, 512 ReLU units out.
	NaiveDense(input_size=28 * 28, output_size=512, activation=tf.nn.relu),
	# Output layer: 512 features in, 10 softmax class scores out.
	NaiveDense(input_size=512, output_size=10, activation=tf.nn.softmax),
])
# Sanity check: each NaiveDense contributes a kernel (W) and a bias (b),
# so two layers must yield four weight tensors.
assert len(model.weights) == 4

### Batch Generator
We need a way to iterate over the MNIST data in mini-batches.

In [6]:
import math

class BatchGenerator:
	"""Hands out consecutive fixed-size slices of a paired images/labels dataset.

	Attributes:
		images, labels: the two equally long sequences being sliced.
		batch_size: number of samples per slice.
		index: position where the next slice starts.
		num_batches: how many calls to ``next()`` cover the whole dataset.
	"""

	def __init__(self, images, labels, batch_size=128):
		"""Store the dataset and compute the number of batches.

		Fails an assertion when ``images`` and ``labels`` differ in length.
		"""
		assert len(images) == len(labels)

		self.images = images
		self.labels = labels
		self.batch_size = batch_size
		self.index = 0
		# Round up so a final, shorter batch is still counted.
		self.num_batches = math.ceil(len(images) / batch_size)

	def next(self):
		"""Return the next ``(images, labels)`` slice and advance the cursor.

		The last slice may be shorter than ``batch_size``; calling again after
		the data is exhausted yields empty slices.
		"""
		start = self.index
		stop = start + self.batch_size
		self.index = stop
		return self.images[start:stop], self.labels[start:stop]

### Running one training step
The most difficult part of the process is the “training step”: updating the weights of
the model after running it on one batch of data.

We need to:
1. Compute the predictions of the model for the images in the batch.
2. Compute the loss value for these predictions, given the actual labels.
3. Compute the gradient of the loss with regard to the model’s weights.
4. Move the weights by a small amount in the direction opposite to the gradient.
---

The purpose of the “weight update” step (represented by the `update_weights` function) is to move the weights by “a bit” in a direction that will reduce the loss on this batch.

In [7]:
from tensorflow.keras import optimizers

# Learning rate: scales how far each update moves the weights along the
# negative gradient.
lr = 1e-3
optimizer = optimizers.SGD(learning_rate=lr)

def update_weights(gradients, weights):
	"""Apply one optimizer step to ``weights`` using the matching ``gradients``.

	Both arguments are paired positionally, so entry ``i`` of ``gradients``
	is applied to entry ``i`` of ``weights``. Uses the module-level
	``optimizer``; returns nothing.
	"""
	grads_and_vars = zip(gradients, weights)
	optimizer.apply_gradients(grads_and_vars)

In [8]:
def one_training_step(model, images_batch, labels_batch):
	"""Run one gradient-descent update on a single batch.

	Computes the model's predictions for ``images_batch`` under a gradient
	tape, averages the sparse categorical cross-entropy against
	``labels_batch``, differentiates that average with respect to
	``model.weights`` and passes the gradients to ``update_weights``.

	Returns:
		The batch's average loss, computed before the weights were updated.
	"""
	with tf.GradientTape() as tape:
		# Forward pass: recorded by the tape so it can be differentiated.
		predictions = model(images_batch)
		average_loss = tf.reduce_mean(
			tf.keras.losses.sparse_categorical_crossentropy(labels_batch, predictions)
		)

	# One gradient per entry of the weight list, in the same order.
	trainable_weights = model.weights
	gradients = tape.gradient(average_loss, trainable_weights)
	update_weights(gradients, trainable_weights)

	return average_loss

### The full training loop
An epoch of training simply consists of repeating the training step for each batch in the training data, and the full training loop is simply the repetition of one epoch.

In [9]:
def fit(model, images, labels, epochs, batch_size=128):
	"""Train ``model`` by running ``one_training_step`` over mini-batches.

	Args:
		model: object passed unchanged to ``one_training_step``.
		images: training inputs, sliced into batches by ``BatchGenerator``.
		labels: targets aligned with ``images`` (same length).
		epochs: number of full passes over the data.
		batch_size: number of samples per mini-batch (default 128).

	Prints the epoch number at the start of each epoch and the batch loss
	every 100 batches. Returns nothing.
	"""
	for epoch_counter in range(epochs):
		print(f"Epoch {epoch_counter}")
		# A fresh generator per epoch restarts slicing at the first sample.
		# Forward batch_size explicitly: it used to be dropped here, so every
		# run silently used the generator's default of 128.
		batch_generator = BatchGenerator(images, labels, batch_size=batch_size)
		for batch_counter in range(batch_generator.num_batches):
			images_batch, labels_batch = batch_generator.next()
			loss = one_training_step(model, images_batch, labels_batch)
			if batch_counter % 100 == 0:
				print(f"loss at batch {batch_counter}: {loss:.2f}")

In [10]:
# Train the mock model for 10 passes over the training set.
fit(model, train_images, train_labels, epochs=10, batch_size=128)

Epoch 0
loss at batch 0: 6.58
loss at batch 100: 2.20
loss at batch 200: 2.16
loss at batch 300: 2.05
loss at batch 400: 2.20
Epoch 1
loss at batch 0: 1.89
loss at batch 100: 1.85
loss at batch 200: 1.79
loss at batch 300: 1.68
loss at batch 400: 1.82
Epoch 2
loss at batch 0: 1.56
loss at batch 100: 1.56
loss at batch 200: 1.47
loss at batch 300: 1.40
loss at batch 400: 1.50
Epoch 3
loss at batch 0: 1.31
loss at batch 100: 1.32
loss at batch 200: 1.22
loss at batch 300: 1.19
loss at batch 400: 1.27
Epoch 4
loss at batch 0: 1.11
loss at batch 100: 1.14
loss at batch 200: 1.03
loss at batch 300: 1.03
loss at batch 400: 1.11
Epoch 5
loss at batch 0: 0.97
loss at batch 100: 1.01
loss at batch 200: 0.89
loss at batch 300: 0.91
loss at batch 400: 0.99
Epoch 6
loss at batch 0: 0.86
loss at batch 100: 0.90
loss at batch 200: 0.79
loss at batch 300: 0.83
loss at batch 400: 0.90
Epoch 7
loss at batch 0: 0.78
loss at batch 100: 0.82
loss at batch 200: 0.71
loss at batch 300: 0.76
loss at batch 40

### Evaluating the model
We can evaluate the model by taking the _argmax_ of its predictions over the test images,
and comparing it to the expected labels.

In [11]:
import numpy as np

# Forward pass over the whole test set, converted from a tensor to a NumPy array.
predictions = model(test_images).numpy()
# The predicted class of each sample is the index of its highest score.
predicted_labels = predictions.argmax(axis=1)
# Boolean array: True where the prediction equals the expected label.
matches = predicted_labels == test_labels
print(f"accuracy: {matches.mean():.2f}")

accuracy: 0.81
