# Lab 4 - A Deeper Dive on CNNs
  <a target="_blank" href="https://colab.research.google.com/github/andrew-nash/CS6421-labs-2025/blob/main/CS6421_Lab_04.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

In [None]:
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np

# Data Loading And Cleaning

For this lab, we will continue to use the MNIST dataset

In [None]:
# Load the MNIST train/test splits through the Keras dataset helper.
mnist = tf.keras.datasets.mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Normalise the image arrays by dividing by 255.0
# (presumably mapping 8-bit pixel intensities into [0, 1]).
x_train = x_train / 255.0
x_test = x_test / 255.0

In [None]:
# Report the array shape of each image split.
for split_label, split_images in (("Train shape", x_train), ("Test shape", x_test)):
  print(split_label, split_images.shape)

In [None]:
# Display the first training image; no cmap is passed, so matplotlib's
# default colormap is applied to the single-channel data.
plt.imshow(x_train[0])


In [None]:
# Display the first test image; no cmap is passed, so matplotlib's
# default colormap is applied to the single-channel data.
plt.imshow(x_test[0])


Currently, `x_train` and `x_test` are arrays of square 28x28 greyscale images. For compatibility with CNN models, as per the last lab, we will reshape the dataset to add a colour channel.

In [None]:
# Append a trailing channel axis of size 1 so every image becomes 28x28x1;
# the -1 lets the number of images be inferred.
x_train_clean, x_test_clean = (
    images.reshape(-1, 28, 28, 1) for images in (x_train, x_test)
)

Now let's consider our output labels.

In [None]:
# Inspect the shapes of the label arrays for both splits
# (the notebook displays the resulting tuple).
y_train.shape, y_test.shape



### One-hot Encoding


In [None]:
# Convert each integer label into a length-10 one-hot vector
# (arguments are `indices` then `depth`).
y_train_clean = tf.one_hot(y_train, 10)
y_test_clean = tf.one_hot(y_test, 10)

It is important to note that if your labels do not consist of the integers $0,1,2,3,4,\dots$ additional processing will be required to produce the one-hot vectors

### Softmax Activation

We previously discussed the softmax activation function, which maps a set of arbitrary activations to a probability distribution.

The formula for this is

\begin{equation}
  softmax(x_i) = \frac{e^{x_i}}{\sum_{j=1}^Ne^{x_j}}
\end{equation}


In [None]:
@tf.function
def softmax(x):
  """Map activations to a probability distribution along the last axis.

  Args:
    x: Tensor of activations whose last axis indexes the classes, e.g.
      (BATCH SIZE, N). A 1-D tensor is treated as a single sample.

  Returns:
    A tensor with the same shape as `x` in which every slice along the
    last axis is non-negative and sums to 1.
  """
  # Subtracting the per-sample maximum does not change the result
  # mathematically, but it stops tf.exp from overflowing on large activations.
  shifted = x - tf.reduce_max(x, axis=-1, keepdims=True)
  exponentials = tf.exp(shifted)
  # Normalise each sample separately: summing over the whole tensor would
  # make the entire batch (rather than each row) sum to 1.
  return exponentials / tf.reduce_sum(exponentials, axis=-1, keepdims=True)

### Cross-Entropy Loss
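Cross-entropy loss rewards predicted probability distributions that place high confidence on the true class: the loss approaches zero as the probability assigned to the correct class approaches one, and grows without bound as that probability approaches zero.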

\begin{equation}
  CE(y, p) = -\sum_{i=1}^{N}y_i\log(p_i)
\end{equation}

where $y$ is the one-hot vector encoding the true class and $p$ is the predicted probability distribution over the classes.

In [None]:
@tf.function
def cross_entropy(y, p):
  """Return the categorical cross-entropy averaged over the batch.

  Args:
    y: One-hot encoded true labels with the class axis last,
      e.g. shape (BATCH SIZE, 10).
    p: Predicted class probabilities with the same shape as `y`.

  Returns:
    A scalar tensor holding the mean of the per-sample losses.
  """
  # log(0) is -inf and 0 * -inf is NaN, so keep every probability strictly
  # positive before taking the logarithm.
  p = tf.clip_by_value(p, 1e-7, 1.0)
  # Summing over the class axis gives one loss value per sample in the batch.
  per_sample_loss = -tf.reduce_sum(y * tf.math.log(p), axis=-1)
  # Averaging those values gives the average loss over the batch.
  return tf.reduce_mean(per_sample_loss)

In [None]:
# Demonstrates a per-row reduction: summing a (16, 10) array of ones over
# axis 1 yields one value per row (16 values in total).
tf.reduce_sum(np.ones((16,10)), axis=1)

In [None]:
!pip install -U keras-tuner

In [None]:
%load_ext tensorboard

In [None]:
import keras_tuner as kt

# Visualizing CNN Filters

In this lab, we will create a simple CNN model similar to the one in the last lab, but this time we will try to visualize the filters that are being learned to better understand the model's behaviour.

##  Visualize the feature maps for the first image in the Test Dataset

In [None]:
# Small CNN: one 4x4 convolution with 8 filters, a 2x2 max-pool, batch
# normalisation, then a 10-way dense classifier using the custom softmax.
model = tf.keras.models.Sequential([
    tf.keras.layers.Input(shape=(28, 28, 1)),
    tf.keras.layers.Conv2D(
        filters=8,
        kernel_size=(4, 4),
        strides=(1, 1),
        activation="elu",
    ),
    tf.keras.layers.MaxPool2D(pool_size=(2, 2)),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(10, activation=softmax),
])

# Train with the hand-written cross-entropy loss and Adam.
model.compile(
    loss=cross_entropy,
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    metrics=["accuracy"],
)

# 20% of the training data is held out for validation.
model.fit(
    x_train_clean,
    y_train_clean,
    validation_split=0.2,
    epochs=5,
    batch_size=16,
)

In [None]:
# Weights of the first layer (the convolution): kernels and biases.
filters, biases = model.layers[0].get_weights()

# The last axis of the kernel array indexes the filters.
N_FILTERS = filters.shape[-1]

# Min-max rescale all kernels together into [0, 1] for display.
f_min = filters.min()
f_max = filters.max()
filters = (filters - f_min) / (f_max - f_min)

# One panel per filter in a 2x4 grid, showing input channel 0 of each kernel.
for i in range(N_FILTERS):
  plt.subplot(2, 4, i + 1)
  plt.imshow(filters[..., 0, i], cmap='gray')
  plt.axis('off')

Keras allows us to extract feature maps from any part of our model as follows:

In [None]:
# Sub-model sharing the trained model's input but stopping at its first
# layer (the convolution), so calling it returns that layer's feature maps.
model_subsegment = tf.keras.models.Model(
    inputs=model.inputs,
    outputs=model.layers[0].output,
)

# Feed test image 0 as a batch of one and pull the result back into NumPy.
feature_maps = model_subsegment(x_test[0].reshape(1, 28, 28, 1)).numpy()
N_FILTERS = feature_maps.shape[-1]

# Min-max rescale all maps together into [0, 1] for display.
f_min = feature_maps.min()
f_max = feature_maps.max()
features = (feature_maps - f_min) / (f_max - f_min)

# One 25x25 map per filter; index 0 selects the only sample in the batch.
for i in range(N_FILTERS):
  plt.subplot(2, 4, i + 1)
  plt.imshow(features[0, :, :, i], cmap='gray')
  plt.axis('off')

And after maxpool, ...

In [None]:
# Sub-model that stops at the second layer (the max-pool), so calling it
# returns the pooled feature maps.
model_subsegment = tf.keras.models.Model(
    inputs=model.inputs,
    outputs=model.layers[1].output,
)

# Feed test image 0 as a batch of one and pull the result back into NumPy.
feature_maps = model_subsegment(x_test[0].reshape(1, 28, 28, 1)).numpy()
N_FILTERS = feature_maps.shape[-1]

# Min-max rescale all maps together into [0, 1] for display.
f_min = feature_maps.min()
f_max = feature_maps.max()
features = (feature_maps - f_min) / (f_max - f_min)

# One 12x12 pooled map per filter; index 0 selects the only sample.
for i in range(N_FILTERS):
  plt.subplot(2, 4, i + 1)
  plt.imshow(features[0, :, :, i], cmap='gray')
  plt.axis('off')

In [None]:
# Same convolution-output sub-model as before, this time applied to a
# different test image (index 3) for comparison.
model_subsegment = tf.keras.models.Model(
    inputs=model.inputs,
    outputs=model.layers[0].output,
)

# Feed test image 3 as a batch of one and pull the result back into NumPy.
feature_maps = model_subsegment(x_test[3].reshape(1, 28, 28, 1)).numpy()
N_FILTERS = feature_maps.shape[-1]

# Min-max rescale all maps together into [0, 1] for display.
f_min = feature_maps.min()
f_max = feature_maps.max()
features = (feature_maps - f_min) / (f_max - f_min)

# One 25x25 map per filter; index 0 selects the only sample in the batch.
for i in range(N_FILTERS):
  plt.subplot(2, 4, i + 1)
  plt.imshow(features[0, :, :, i], cmap='gray')
  plt.axis('off')

## Using a model with regularization

In [None]:
# Same architecture as the first CNN, but with a weight penalty on the
# convolution kernels.
# NOTE(review): the single positional argument to L1L2 is `l1`, so only the
# L1 term is active here - confirm an L2 term was not also intended.
model = tf.keras.models.Sequential([
    tf.keras.layers.Input(shape=(28, 28, 1)),
    tf.keras.layers.Conv2D(
        filters=8,
        kernel_size=(4, 4),
        strides=(1, 1),
        activation="elu",
        kernel_regularizer=tf.keras.regularizers.L1L2(l1=0.01),
    ),
    tf.keras.layers.MaxPool2D(pool_size=(2, 2)),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(10, activation=softmax),
])

# Train with the hand-written cross-entropy loss and Adam.
model.compile(
    loss=cross_entropy,
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    metrics=["accuracy"],
)

# 20% of the training data is held out for validation.
model.fit(
    x_train_clean,
    y_train_clean,
    validation_split=0.2,
    epochs=5,
    batch_size=16,
)

In [None]:
# Weights of the regularised model's first layer: kernels and biases.
filters, biases = model.layers[0].get_weights()

# The last axis of the kernel array indexes the filters.
N_FILTERS = filters.shape[-1]

# Min-max rescale all kernels together into [0, 1] for display.
f_min = filters.min()
f_max = filters.max()
filters = (filters - f_min) / (f_max - f_min)

# One panel per filter in a 2x4 grid, showing input channel 0 of each kernel.
for i in range(N_FILTERS):
  plt.subplot(2, 4, i + 1)
  plt.imshow(filters[..., 0, i], cmap='gray')
  plt.axis('off')

In [None]:
# Sub-model of the regularised network that stops at its first layer
# (the convolution), so calling it returns that layer's feature maps.
model_subsegment = tf.keras.models.Model(
    inputs=model.inputs,
    outputs=model.layers[0].output,
)

# Feed test image 0 as a batch of one and pull the result back into NumPy.
feature_maps = model_subsegment(x_test[0].reshape(1, 28, 28, 1)).numpy()
N_FILTERS = feature_maps.shape[-1]

# Min-max rescale all maps together into [0, 1] for display.
f_min = feature_maps.min()
f_max = feature_maps.max()
features = (feature_maps - f_min) / (f_max - f_min)

# One 25x25 map per filter; index 0 selects the only sample in the batch.
for i in range(N_FILTERS):
  plt.subplot(2, 4, i + 1)
  plt.imshow(features[0, :, :, i], cmap='gray')
  plt.axis('off')

After the maxpool, ...

In [None]:
# Sub-model of the regularised network that stops at the second layer
# (the max-pool), so calling it returns the pooled feature maps.
model_subsegment = tf.keras.models.Model(
    inputs=model.inputs,
    outputs=model.layers[1].output,
)

# Feed test image 0 as a batch of one and pull the result back into NumPy.
feature_maps = model_subsegment(x_test[0].reshape(1, 28, 28, 1)).numpy()
N_FILTERS = feature_maps.shape[-1]

# Min-max rescale all maps together into [0, 1] for display.
f_min = feature_maps.min()
f_max = feature_maps.max()
features = (feature_maps - f_min) / (f_max - f_min)

# One 12x12 pooled map per filter; index 0 selects the only sample.
for i in range(N_FILTERS):
  plt.subplot(2, 4, i + 1)
  plt.imshow(features[0, :, :, i], cmap='gray')
  plt.axis('off')

In [None]:
# Convolution-output sub-model of the regularised network, applied to a
# different test image (index 3) for comparison.
model_subsegment = tf.keras.models.Model(
    inputs=model.inputs,
    outputs=model.layers[0].output,
)

# Feed test image 3 as a batch of one and pull the result back into NumPy.
feature_maps = model_subsegment(x_test[3].reshape(1, 28, 28, 1)).numpy()
N_FILTERS = feature_maps.shape[-1]

# Min-max rescale all maps together into [0, 1] for display.
f_min = feature_maps.min()
f_max = feature_maps.max()
features = (feature_maps - f_min) / (f_max - f_min)

# One 25x25 map per filter; index 0 selects the only sample in the batch.
for i in range(N_FILTERS):
  plt.subplot(2, 4, i + 1)
  plt.imshow(features[0, :, :, i], cmap='gray')
  plt.axis('off')

In [None]:
# Max-pool-output sub-model of the regularised network, applied to test
# image 3.
model_subsegment = tf.keras.models.Model(
    inputs=model.inputs,
    outputs=model.layers[1].output,
)

# Feed test image 3 as a batch of one and pull the result back into NumPy.
feature_maps = model_subsegment(x_test[3].reshape(1, 28, 28, 1)).numpy()
N_FILTERS = feature_maps.shape[-1]

# Min-max rescale all maps together into [0, 1] for display.
f_min = feature_maps.min()
f_max = feature_maps.max()
features = (feature_maps - f_min) / (f_max - f_min)

# One 12x12 pooled map per filter; index 0 selects the only sample.
for i in range(N_FILTERS):
  plt.subplot(2, 4, i + 1)
  plt.imshow(features[0, :, :, i], cmap='gray')
  plt.axis('off')

# Transfer Learning

In your early lectures, you have seen the concepts of Transfer Learning.

The following is a simple example of using a pre-trained image classification model with some light transfer learning.

In [None]:
# Typically, we only use transfer learning on more complex data - we will upscale
# the MNIST data to simulate this, and include colour channels.
# The Keras ConvNeXt models normalise their input internally and expect pixel
# values in the [0, 255] range, so the earlier division by 255 is undone here
# (before resizing, while the arrays are still small).
x_train_modified = tf.image.grayscale_to_rgb(tf.image.resize(x_train_clean * 255.0, (56,56)))
x_test_modified = tf.image.grayscale_to_rgb(tf.image.resize(x_test_clean * 255.0, (56,56)))

In [None]:
# ImageNet-pretrained ConvNeXt-Tiny without its classification head
# (include_top=False), sized for the 56x56 RGB inputs.
pre_trained_model = tf.keras.applications.ConvNeXtTiny(
    input_shape=(56, 56, 3),
    weights="imagenet",
    include_top=False,
)

In [None]:
# Print the layer-by-layer summary of the pre-trained base model.
pre_trained_model.summary()

We don't want to undo the extensive training this model has undergone, so we shall freeze the weights and biases

In [None]:
# Freeze the base model so training does not update its pre-trained weights.
pre_trained_model.trainable = False

Our input shape is satisfactory for our re-processed data, but we need to add an appropriate final Dense layer to get the correct output shape.

This uses the Keras functional API

In [None]:
# With include_top=False the base model outputs a 4-D feature map
# (batch, height, width, channels). Average over the spatial axes first so
# the Dense layer yields one 10-way prediction per image, matching the
# (batch, 10) one-hot labels.
pooled_features = tf.keras.layers.GlobalAveragePooling2D()(pre_trained_model.output)
final_layer = tf.keras.layers.Dense(10, activation=softmax)(pooled_features)

Add this to the model

In [None]:
# Wire the base model's input to the new classification output.
our_model = tf.keras.models.Model(
    inputs=pre_trained_model.input,
    outputs=final_layer,
)

In [None]:
# Print the summary of the combined model (base model plus the new Dense head).
our_model.summary()

Now, the only weights and biases that will be trained are those in our final Dense layer.

In [None]:
# Same loss, optimiser and metric as the earlier CNNs.
our_model.compile(
    loss=cross_entropy,
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    metrics=["accuracy"],
)

# Train on the upscaled RGB images, holding out 20% for validation.
our_model.fit(
    x_train_modified,
    y_train_clean,
    validation_split=0.2,
    epochs=5,
    batch_size=16,
)

As you can see, even with a 'Tiny' model, this can take a considerable amount of time to train.