In [None]:
# Residual Networks for Robust Image Classification: A Hands-On Investigation of
#           Vanishing Gradient Mitigation Across MNIST and CIFAR-10 Datasets

# Language: Python 3 (TensorFlow, Keras, PyTorch)
# Author(s): Aman Kumpawat

# Description:
# This notebook implements a full deep learning pipeline for image classification using two datasets:
# MNIST (handwritten digits) and CIFAR-10 (colored object images). It explores various architectures
# including improved CNNs, ResNet-18, no-skip ResNet, and baseline models. The goal is to benchmark
# model performance, visualize training behaviors, and demonstrate both black-box and optional white-box testing.

# Inputs:
# - TensorFlow datasets: MNIST and CIFAR-10 loaded via keras.datasets
# - Model architectures: Keras Sequential APIs and PyTorch custom classes
# - Training parameters and manual configurations

# Outputs:
# - Model summaries and layer visualizations
# - Accuracy/loss curves and confusion matrices
# - Class distribution plots and prediction samples
# - Performance metrics from black-box testing (precision, recall, F1)

# Internal Structure (32 cells: Cells 1-31 plus Cell 18.5, across 5 major sections):

# SECTION 1: Setup & Dependencies
#   Cell 1: Mount Google Drive
#   Cell 2: Install & import packages (TensorFlow, Keras, PyTorch, Matplotlib, etc.)
#
# SECTION 2: MNIST Dataset (Improved CNN in Keras)
#   Cell 3: Load & preprocess MNIST, show class balance
#   Cell 4: Define Improved CNN for MNIST
#   Cell 5: Train the MNIST model with TensorBoard logging
#   Cell 6: Evaluate test performance and show confusion matrix
#   Cell 7: Visualize predictions and prediction probabilities
#   Cell 8: Visualize Conv layer filters and Dense output weights
#
# SECTION 3: CIFAR-10 Dataset (Improved CNN in Keras)
#   Cell 9: Load & normalize CIFAR-10, plot class distribution
#   Cell 10: Build deeper CIFAR-10 CNN with regularization
#   Cell 11: Train CIFAR-10 CNN, apply class weights
#   Cell 12: Plot accuracy/loss over epochs
#   Cell 13: Plot log-scale loss
#   Cell 14: Evaluate test performance and confusion matrix
#   Cell 15: Show CIFAR-10 predictions + prediction bars
#   Cell 16: Visualize Conv and Dense weights (CIFAR)
#
# SECTION 4: ResNet Experiments (PyTorch)
#   Cell 17: Define ResidualBlock, ResNet18, NoSkipResNet, BaselineCNN
#   Cell 18: Define training loop, optimizer, loss functions
#   Cell 18.5: Training Utility Function for ResNet18
#   Cell 19: Train ResNet-18 on MNIST
#   Cell 20: Plot MNIST ResNet accuracy/loss (log scale)
#   Cell 21: Evaluate MNIST ResNet (confusion matrix, predictions)
#   Cell 22: Train ResNet-18 on CIFAR-10
#   Cell 23: Plot CIFAR-10 ResNet training curves
#   Cell 24: Evaluate CIFAR-10 ResNet
#   Cell 25: Train NoSkip ResNet on CIFAR
#   Cell 26: NoSkip training curves
#   Cell 27: Train Baseline CNN on CIFAR
#   Cell 28: Evaluate Baseline CNN
#
# SECTION 5: Final Summary & Testing
#   Cell 29: Display model summaries from PyTorch (torchsummary)
#   Cell 30: Visualize PyTorch model weights
#   Cell 31: Explicit black-box testing using sklearn classification report
#
# Notes:
# - All visualizations follow a clean and consistent matplotlib style.
# - TensorBoard is used to track training logs.
# - All testing is done with black-box evaluation; white-box is optional and present in weight plots.


In [None]:
# SECTION 1: Setup & Dependencies

In [None]:
# Cell 1: Mount Google Drive
# The google.colab package only exists inside Google Colab. Guard the import so a
# "Restart & Run All" on a local Jupyter kernel skips the mount instead of failing
# on the very first cell. IN_COLAB records which environment was detected.
try:
    from google.colab import drive
except ImportError:
    IN_COLAB = False
    print("google.colab not available - skipping Google Drive mount.")
else:
    IN_COLAB = True
    drive.mount('/content/drive')

In [None]:
# Cell 2: Install and Import Required Libraries

# Description:
#   This cell installs and imports all necessary libraries used throughout the deep learning project.
#   It ensures the environment is prepared with required packages for model development, training,
#   evaluation, and visualization.
#   Input: None
#   Output: Installs and loads all necessary libraries.

# This cell ensures all necessary libraries for deep learning, data preprocessing, evaluation, and visualization are available
# in the environment.
# It includes both core packages (TensorFlow, Keras, NumPy, Matplotlib) and supporting modules for metrics, model summaries,
# and confusion matrices.

!pip install -q tensorflow numpy matplotlib seaborn scikit-learn tensorflow-addons --no-deps

# import core libraries for deep learning and data analysis
import numpy as np                             # numerical computations
import matplotlib.pyplot as plt                # plotting and visualizations
import seaborn as sns                          # statistical data visualizations
import tensorflow as tf                        # core deep learning framework
from tensorflow import keras                   # high-level TensorFlow API
from tensorflow.keras import layers, models    # neural network components and model building
from sklearn.metrics import confusion_matrix   # evaluation metric
from tensorflow.keras.utils import to_categorical  # one-hot encoding utility


In [None]:
# SECTION 2: MNIST Dataset (Improved CNN in Keras)

In [None]:
# Cell 3: Load and Preprocess MNIST Dataset

# Description:
#   This cell loads and preprocesses the MNIST dataset.
#   It performs normalization of pixel values, reshapes input for CNNs, computes class distributions,
#   visualizes class frequencies, and calculates class weights to compensate for imbalance.
#   Input: None
#   Output: Preprocessed datasets (x_train, x_test), label arrays (y_train, y_test), class distribution plot,
#   class weights dictionary.

# This cell loads the MNIST digit dataset, normalizes the pixel data, computes class distributions,
# and prepares class weights for handling imbalance (if needed).

# fetch the MNIST handwritten-digit train/test split through keras
(x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data()

# rescale pixels from 0–255 to 0–1 and append the single grayscale channel axis,
# giving the channels-last (samples, 28, 28, 1) layout expected by Conv2D
x_train = (x_train.astype("float32") / 255.0).reshape(-1, 28, 28, 1)
x_test = (x_test.astype("float32") / 255.0).reshape(-1, 28, 28, 1)

# report sample counts plus the per-image and label-array shapes
print(f"MNIST training samples: {x_train.shape[0]}, test samples: {x_test.shape[0]}")
print(f"MNIST image shape: {x_train.shape[1:]}, Label shape: {y_train.shape}")

# tally how many training examples carry each digit label (label -> count)
unique, counts = np.unique(y_train, return_counts=True)
class_counts = {label: n_examples for label, n_examples in zip(unique, counts)}

# list the per-digit counts
print("\nMNIST Class Distribution:")
for digit in range(10):
    print(f" - digit {digit}: {class_counts[digit]}")

# bar chart of the per-digit frequencies
plt.figure(figsize=(6, 4))
sns.barplot(x=list(class_counts), y=list(class_counts.values()), color='skyblue')
plt.xlabel("Digit Class")
plt.ylabel("Number of Examples")
plt.title("MNIST Training Dataset Class Distribution")
plt.tight_layout()
plt.show()

# per-class loss weights: total_samples / (num_classes * class_count),
# so digits with fewer examples receive a proportionally larger weight
total_samples = len(y_train)
class_weights = {
    cls: total_samples / (len(class_counts) * count)
    for cls, count in class_counts.items()
}

print("\nComputed class weights for MNIST:")
for cls, count in class_counts.items():
    weight = class_weights[cls]
    print(f" - digit {cls}: weight = {weight:.4f}")


In [None]:
# Cell 4: Define Improved CNN Model for MNIST

# Description:
#   This cell defines an improved CNN model architecture for the MNIST dataset.
#   The model includes two convolutional blocks with batch normalization and dropout layers for regularization.
#   A final dense layer maps features to digit classes.
#   Input: None (definition only)
#   Output: A compiled Keras Sequential model object ready for training.

# This cell defines a deeper CNN model with batch normalization and dropout for better generalization on MNIST.

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Dense, Flatten, Dropout, BatchNormalization

def build_mnist_model():
    """Build and compile the improved CNN used for MNIST digit classification.

    Architecture (layers are created and added in this order):
      * two convolutional blocks (32 filters, then 64 filters); each block is
        two 3x3 same-padded ReLU convolutions, each followed by batch
        normalization, then 2x2 max pooling and 25% dropout;
      * Flatten -> Dense(128, relu) -> Dropout(0.5) -> Dense(10, softmax).

    The model is compiled with the Adam optimizer, sparse categorical
    cross-entropy (labels are integer class ids) and an accuracy metric.

    Parameters: None
    Returns: the compiled Sequential model named "MNIST_CNN_Improved".
    """

    def conv_block(filters, **first_conv_kwargs):
        # One block: conv + BN, conv + BN, 2x2 max pool, 25% dropout.
        # `first_conv_kwargs` carries input_shape for the very first layer only.
        return [
            Conv2D(filters, (3, 3), padding='same', activation='relu', **first_conv_kwargs),
            BatchNormalization(),
            Conv2D(filters, (3, 3), padding='same', activation='relu'),
            BatchNormalization(),
            MaxPooling2D(pool_size=(2, 2)),
            Dropout(0.25),
        ]

    # The list is evaluated left to right, so layers are instantiated in
    # exactly the order in which they end up in the model.
    layer_stack = [
        *conv_block(32, input_shape=(28, 28, 1)),   # block 1 (takes 28x28x1 input)
        *conv_block(64),                            # block 2
        Flatten(),                                  # feature maps -> 1D vector
        Dense(128, activation='relu'),              # dense hidden layer
        Dropout(0.5),                               # heavier dropout before the output
        Dense(10, activation='softmax'),            # one probability per digit class
    ]

    model = Sequential(name="MNIST_CNN_Improved")
    for layer in layer_stack:
        model.add(layer)

    # integer labels -> sparse categorical cross-entropy
    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    return model

# Object Header
# Purpose: Build the compiled MNIST CNN once via build_mnist_model() and bind it to
#          `mnist_model`, the name used by the training, evaluation and
#          visualization cells (Cells 5-8).
mnist_model = build_mnist_model()

# print the layer-by-layer architecture summary in the cell output
mnist_model.summary()


In [None]:
# Cell 5: Train MNIST_CNN_Improved Model

# Description:
#   This cell trains the improved CNN model (MNIST_CNN_Improved) on the MNIST dataset.
#   Training uses class weights to handle class imbalance and a validation split to monitor performance.
#   A TensorBoard callback is included for logging training metrics.
#   Input: Preprocessed training data, compiled model
#   Output: Training history object (loss, accuracy, val_loss, val_accuracy over epochs)

# This cell trains the improved CNN on MNIST using computed class weights to balance the training process.
# A validation split is used to monitor overfitting, and a TensorBoard callback logs the training metrics.

# hyperparameters for the MNIST training run
batch_size_mnist = 128        # samples per training iteration
epochs_mnist = 50             # number of passes over the training data

# Object Header
# Purpose: TensorBoard callback that writes training logs (histogram_freq=1) to ./logs/mnist
# Note: Viewable via `%tensorboard --logdir logs/mnist` in a notebook environment
tensorboard_cb = keras.callbacks.TensorBoard(histogram_freq=1, log_dir="./logs/mnist")

# Object Header
# Purpose: Fit the MNIST CNN with class weighting and a held-out validation slice
# Returns: Keras History object holding per-epoch loss/accuracy (train and validation)
history_mnist = mnist_model.fit(
    x_train,
    y_train,
    epochs=epochs_mnist,
    batch_size=batch_size_mnist,
    validation_split=0.1,          # 10% of the training data is used for validation
    class_weight=class_weights,    # per-digit weights computed in Cell 3
    callbacks=[tensorboard_cb],    # TensorBoard logging
    verbose=2,                     # show training progress per epoch
)


In [None]:
# Cell 6: Evaluate MNIST Model Performance (Accuracy, Loss, Confusion Matrix)

# Description:
#   This cell evaluates the performance of the trained CNN model on the MNIST test dataset.
#   It uses black-box testing to report test accuracy and loss, and visualizes model predictions
#   using a confusion matrix heatmap to show classification accuracy per digit class.
#   Input: Trained model, test data (x_test, y_test)
#   Output: Accuracy and loss metrics, confusion matrix plot

# This cell evaluates the trained CNN on the MNIST test set using black-box testing.
# It computes accuracy, loss, and shows a heatmap of the confusion matrix for visualizing prediction
# performance across classes.

# evaluate model performance on the test set (black-box: only inputs and outputs are used)
test_loss, test_acc = mnist_model.evaluate(x_test, y_test, verbose=0)
print(f"MNIST Test Accuracy: {test_acc:.4f}")
print(f"MNIST Test Loss: {test_loss:.4f}")

# class probabilities for every test image, reduced to the most likely digit
y_pred_probs = mnist_model.predict(x_test, verbose=0)
y_pred = np.argmax(y_pred_probs, axis=1)

# confusion matrix comparing true labels (rows) with predicted labels (columns);
# passing `labels` pins it to a 10x10 matrix in digit order so rows/columns
# always line up with the tick labels below
digit_labels = np.arange(10)
cm = confusion_matrix(y_test, y_pred, labels=digit_labels)

# visualize confusion matrix as an annotated heatmap
plt.figure(figsize=(6, 5))
# Tick labels are handed to seaborn, which centers them on the cells (i + 0.5).
# Overriding the tick positions with integer locations via plt.xticks/plt.yticks
# would shift every label onto a cell edge.
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", cbar=True,
            xticklabels=digit_labels, yticklabels=digit_labels)
plt.title("MNIST Confusion Matrix")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.yticks(rotation=0)  # keep the row labels horizontal
plt.tight_layout()
plt.show()


In [None]:
# Cell 7: Visualize MNIST Predictions and Class Probabilities

# Description:
#   This cell visualizes model predictions on sample MNIST test images.
#   Each image is shown alongside its predicted label and a softmax probability bar chart.
#   This visual illustrates model confidence and prediction correctness across digit classes.
#   Input: Trained model, sample subset of x_test/y_test
#   Output: Grid of digit images and confidence bar plots per sample

# This cell shows a grid of sample predictions alongside their true labels, and plots softmax probability
# bars for each image.
# It highlights the model’s confidence, correctness, and class-level distinctions (499testing3-style).

# how many test digits to display
num_images = 8

# take the leading `num_images` test samples together with their true labels
sample_images = x_test[:num_images]
sample_labels = y_test[:num_images]

# softmax outputs for the chosen samples and the most probable class of each
sample_probs = mnist_model.predict(sample_images, verbose=0)
sample_preds = sample_probs.argmax(axis=1)

# 2 x num_images grid: images on the top row, probability bars underneath
fig, axes = plt.subplots(2, num_images, figsize=(16, 5))
fig.suptitle("MNIST Predictions and Class Probabilities", fontsize=16)

digit_ticks = np.arange(10)
for i in range(num_images):
    image_ax, prob_ax = axes[0, i], axes[1, i]

    # green title for a correct prediction, red for a wrong one
    title_color = 'green' if sample_preds[i] == sample_labels[i] else 'red'

    # top row: the grayscale digit with its true and predicted label
    image_ax.imshow(sample_images[i].squeeze(), cmap='gray')
    image_ax.set_title(
        f"True: {sample_labels[i]}\nPred: {sample_preds[i]}",
        fontsize=10,
        color=title_color
    )
    image_ax.axis('off')

    # bottom row: softmax probability of each of the 10 digit classes
    prob_ax.bar(digit_ticks, sample_probs[i], color='gray')
    prob_ax.set_xticks(digit_ticks)
    prob_ax.set_ylim(0, 1)
    prob_ax.set_title("Confidence", fontsize=10)
    prob_ax.set_xlabel("Digit Class")

# keep titles and axes from overlapping
plt.tight_layout()
plt.show()


In [None]:
# Cell 8: Visualize Conv and Dense Weights (MNIST)

# Description:
#   This cell provides white-box testing visualizations of the MNIST model.
#   It displays the learned convolutional filters from the first Conv2D layer
#   and the weights from the final dense (output) layer as a heatmap.
#   Input: Trained CNN model
#   Output: Visualizations of conv filters and dense layer weights

# This cell visualizes the learned filters from the first convolutional layer, and the dense layer weights as a heatmap.
# It helps evaluate internal structure and feature separation (white-box testing).

def show_conv1_weights(model, dataset_name="MNIST", max_filters=8):
    """Plot the leading filters of the model's first convolutional layer.

    The first layer whose leading weight tensor is 4-dimensional is treated
    as the first convolutional layer, so the function does not depend on the
    convolution being ``model.layers[0]``. For each displayed filter only
    input channel 0 is drawn, as a grayscale image.

    Parameters:
        model        - trained Keras model (anything exposing ``layers`` whose
                       items provide ``get_weights()``)
        dataset_name - label used in the figure title
        max_filters  - upper bound on the number of filters drawn (default 8)
    Returns: None (displays one row of filter images, or prints a message
             when no convolutional kernel is found)
    Raises: ValueError if ``max_filters`` is smaller than 1.
    """
    if max_filters < 1:
        raise ValueError("max_filters must be at least 1")

    # locate the first 4D kernel
    conv_weights = None
    for layer in model.layers:
        layer_weights = layer.get_weights()
        if layer_weights and np.ndim(layer_weights[0]) == 4:
            conv_weights = layer_weights[0]
            break

    if conv_weights is None:
        print("Convolutional layer not found.")
        return

    num_shown = min(max_filters, conv_weights.shape[-1])

    # squeeze=False keeps `axes` two-dimensional even for a single filter;
    # otherwise plt.subplots returns a bare Axes that cannot be indexed
    fig, axes = plt.subplots(1, num_shown, figsize=(12, 3), squeeze=False)
    fig.suptitle(f"{dataset_name} Conv Layer 1 Filters", fontsize=14)

    for i, ax in enumerate(axes[0]):
        ax.imshow(conv_weights[:, :, 0, i], cmap='gray')  # filter i, input channel 0
        ax.axis("off")
    plt.tight_layout()
    plt.show()

def show_dense_weights(model, dataset_name="MNIST"):
    """Draw the output layer's weight matrix as a heatmap.

    The output layer is taken to be the first Dense layer in ``model.layers``
    that has exactly 10 units. Its kernel is transposed before plotting, so
    heatmap rows correspond to the 10 classes and columns to input features.

    Parameters:
        model        - trained Keras model
        dataset_name - label used in the plot title
    Returns: None (shows the heatmap, or prints a message if no 10-unit Dense
             layer exists)
    """
    # lazily scan the layers; only the first match has its weights fetched
    matching_kernels = (
        candidate.get_weights()[0]
        for candidate in model.layers
        if isinstance(candidate, Dense) and candidate.units == 10
    )
    kernel = next(matching_kernels, None)

    # guard clause: nothing to plot without an output layer
    if kernel is None:
        print("Dense output layer not found.")
        return

    plt.figure(figsize=(10, 4))
    sns.heatmap(kernel.T, cmap='magma', cbar=True)
    plt.xlabel("Features")
    plt.ylabel("Digit Classes")
    plt.title(f"{dataset_name} Dense Layer Weight Matrix (Output Layer)")
    plt.tight_layout()
    plt.show()

# Object Header
# Purpose: Run both white-box visualizations on the trained MNIST model:
#          first the Conv layer 1 filters, then the output Dense layer weight heatmap.
show_conv1_weights(mnist_model)
show_dense_weights(mnist_model)


In [None]:
# SECTION 3: CIFAR-10 Dataset (Improved CNN in Keras)

In [None]:
# Cell 9: Load and Preprocess CIFAR-10 Dataset

# Description:
#   This cell loads and preprocesses the CIFAR-10 dataset.
#   It includes normalization of pixel values, flattening label arrays, class name mapping,
#   visualization of class distribution, and computation of class weights for balanced training.
#   Input: None
#   Output: Preprocessed CIFAR-10 training/test sets, class weights dictionary, class distribution plot

# This cell loads the CIFAR-10 dataset, normalizes the image data, maps class labels to names,
# and computes class weights for balanced training.

import pandas as pd  # fix: import pandas
from sklearn.utils import class_weight

# human-readable names for the ten CIFAR-10 labels (list position == label id)
cifar10_classes = [
    'airplane', 'automobile', 'bird', 'cat', 'deer',
    'dog', 'frog', 'horse', 'ship', 'truck',
]

# fetch the CIFAR-10 train/test split through keras
(x_train_cifar, y_train_cifar), (x_test_cifar, y_test_cifar) = keras.datasets.cifar10.load_data()

# collapse the label arrays to 1D
y_train_cifar = y_train_cifar.flatten()
y_test_cifar = y_test_cifar.flatten()

# scale pixel values into [0, 1] for stable training
x_train_cifar = x_train_cifar.astype("float32") / 255.0
x_test_cifar = x_test_cifar.astype("float32") / 255.0

# report dataset sizes and shapes
print(f"CIFAR-10 training samples: {x_train_cifar.shape[0]}, test samples: {x_test_cifar.shape[0]}")
print(f"Image shape: {x_train_cifar.shape[1:]}, Label shape: {y_train_cifar.shape}")

# examples per class, ordered by label id
cifar_counts = pd.Series(y_train_cifar).value_counts().sort_index()

# list each class name with its label id and count
print("\nCIFAR-10 Class Distribution:")
for i, count in cifar_counts.items():
    print(f" - {cifar10_classes[i]} ({i}): {count}")

# bar chart of the class distribution
plt.figure(figsize=(7, 4))
sns.barplot(x=cifar10_classes, y=cifar_counts.values, hue=cifar10_classes, legend=False, palette='Greens')
plt.xlabel("Class")
plt.ylabel("Number of Samples")
plt.title("CIFAR-10 Training Dataset Class Distribution")
plt.xticks(rotation=30)
plt.tight_layout()
plt.show()

# 'balanced' class weights from sklearn, re-keyed by position (0, 1, 2, ...)
# into the dictionary form passed to fit(class_weight=...)
cifar_class_weights = dict(enumerate(
    class_weight.compute_class_weight(
        class_weight='balanced',
        classes=np.unique(y_train_cifar),
        y=y_train_cifar,
    )
))

# show the weight assigned to every class
print("\nComputed class weights for CIFAR-10:")
for k, v in cifar_class_weights.items():
    print(f" - {cifar10_classes[k]} ({k}): {v:.4f}")


In [None]:
# Cell 10: Build CIFAR10_CNN_Improved Model

# Description:
#   This cell defines and compiles an improved CNN model for the CIFAR-10 dataset.
#   The architecture consists of three convolutional blocks with increasing filter counts (32, 64, 128),
#   each followed by batch normalization and dropout. Fully connected layers follow for final classification.
#   Input: None (model structure defined internally)
#   Output: A compiled Keras Sequential model object for CIFAR-10 classification

# This cell defines a deeper CNN model for CIFAR-10 with 3 convolutional blocks using
# increasing filter counts (32, 64, 128), batch normalization, dropout, and two fully connected layers.

from tensorflow.keras import models, layers

def build_cifar_model():
    """Build and compile the improved CNN used for CIFAR-10 classification.

    Architecture (layers are created and added in this order):
      * three convolutional blocks with 32, 64 and 128 filters; each block is
        two 3x3 same-padded ReLU convolutions, each followed by batch
        normalization, then 2x2 max pooling and dropout (0.25, 0.3 and 0.4);
      * Flatten -> Dense(256, relu) -> BatchNormalization -> Dropout(0.5)
        -> Dense(10, softmax).

    The model is compiled with the Adam optimizer, sparse categorical
    cross-entropy (labels are integer class ids) and an accuracy metric.

    Returns: the compiled Keras model named "CIFAR10_CNN_Improved".
    """

    def conv_block(filters, dropout_rate, **first_conv_kwargs):
        # One block: conv + BN, conv + BN, 2x2 max pool, dropout.
        # `first_conv_kwargs` carries input_shape for the very first layer only.
        return [
            layers.Conv2D(filters, (3, 3), padding='same', activation='relu', **first_conv_kwargs),
            layers.BatchNormalization(),
            layers.Conv2D(filters, (3, 3), padding='same', activation='relu'),
            layers.BatchNormalization(),
            layers.MaxPooling2D(pool_size=(2, 2)),
            layers.Dropout(dropout_rate),
        ]

    # The list is evaluated left to right, so layers are instantiated in
    # exactly the order in which they end up in the model.
    layer_stack = [
        *conv_block(32, 0.25, input_shape=(32, 32, 3)),   # block 1 (takes 32x32x3 input)
        *conv_block(64, 0.3),                             # block 2
        *conv_block(128, 0.4),                            # block 3
        layers.Flatten(),
        layers.Dense(256, activation='relu'),
        layers.BatchNormalization(),
        layers.Dropout(0.5),
        layers.Dense(10, activation='softmax'),           # one probability per CIFAR-10 class
    ]

    model = models.Sequential(name="CIFAR10_CNN_Improved")
    for layer in layer_stack:
        model.add(layer)

    # integer labels -> sparse categorical cross-entropy
    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    return model

# Object Header
# Purpose: Build the compiled CIFAR-10 CNN once via build_cifar_model(), bind it to
#          `cifar_model` (used by the training, evaluation and visualization cells),
#          and print its architecture summary.
cifar_model = build_cifar_model()
cifar_model.summary()


In [None]:
# Cell 11: Train CIFAR-10 CNN Improved Model

# Description:
#   This cell trains the improved CNN model for CIFAR-10.
#   It uses class weights to balance training, a validation split to monitor overfitting,
#   and TensorBoard for logging accuracy/loss during training.
#   Input: Preprocessed CIFAR-10 data, compiled model
#   Output: Training history object with accuracy/loss metrics

# This cell trains the improved CIFAR-10 model using the training data, with validation split
# and class weights applied.
# A clean training log is shown using TensorBoard logging for visualization of performance curves.

# hyperparameters for the CIFAR-10 training run
batch_size_cifar = 128        # samples per training iteration
epochs_cifar = 50             # number of passes over the training data

# Object Header
# Purpose: TensorBoard callback for the CIFAR-10 run (histogram_freq=1)
# Output: Logs are written to ./logs/cifar10_improved
tensorboard_cb_cifar = keras.callbacks.TensorBoard(histogram_freq=1, log_dir="./logs/cifar10_improved")

# Object Header
# Purpose: Fit the CIFAR-10 CNN with class weighting and a held-out validation slice
# Returns: Keras History object holding per-epoch loss/accuracy (train and validation)
history_cifar = cifar_model.fit(
    x_train_cifar,
    y_train_cifar,
    validation_split=0.1,               # 10% of the training data is held out for validation
    class_weight=cifar_class_weights,   # per-class weights computed in Cell 9
    batch_size=batch_size_cifar,
    epochs=epochs_cifar,
    callbacks=[tensorboard_cb_cifar],   # TensorBoard logging
    verbose=2,
)


In [None]:
# Cell 12: Plot CIFAR-10 Training and Validation Curves

# Description:
#   This cell visualizes training and validation accuracy/loss over epochs for the CIFAR-10 model.
#   It plots both curves to assess learning trends, convergence, and overfitting.
#   Input: Keras training history object
#   Output: Accuracy and loss plots for training vs. validation sets

# Cell 12: Plot CIFAR-10 Training and Validation Curves
# This cell visualizes training accuracy and loss per epoch alongside validation metrics.
# It helps track convergence and potential overfitting (mirrors MNIST plotting structure).

# pull the per-epoch series out of the training history
cifar_hist = history_cifar.history
acc, val_acc = cifar_hist['accuracy'], cifar_hist['val_accuracy']   # training / validation accuracy
loss, val_loss = cifar_hist['loss'], cifar_hist['val_loss']         # training / validation loss
epochs_range = range(1, len(acc) + 1)                               # x-axis: epochs numbered from 1

# one figure per metric: (training series, validation series, legend labels, title, y-axis label)
curve_specs = [
    (acc, val_acc, 'Training Accuracy', 'Validation Accuracy', "CIFAR-10 Accuracy per Epoch", "Accuracy"),
    (loss, val_loss, 'Training Loss', 'Validation Loss', "CIFAR-10 Loss per Epoch", "Loss"),
]

# accuracy figure first, then the loss figure
for train_curve, val_curve, train_label, val_label, plot_title, y_label in curve_specs:
    plt.figure(figsize=(6, 4))
    plt.plot(epochs_range, train_curve, label=train_label)
    plt.plot(epochs_range, val_curve, label=val_label)
    plt.title(plot_title)
    plt.xlabel("Epoch")
    plt.ylabel(y_label)
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()


In [None]:
# Cell 13: Log-Scale Loss Plot for CIFAR-10

# Description:
#   This cell visualizes training and validation loss curves on a logarithmic scale for the CIFAR-10 model.
#   Log-scaling makes it easier to interpret trends in loss values, especially when they become small.
#   Input: Training and validation loss arrays
#   Output: Log-scaled loss plot per epoch

# This cell plots the training and validation loss curves using a logarithmic scale on the y-axis.
# It helps visualize performance trends more clearly when loss values become small.

# loss curves on a logarithmic y-axis
# (reuses `epochs_range`, `loss` and `val_loss` produced by Cell 12)
plt.figure(figsize=(6, 4))
for loss_curve, loss_label in ((loss, 'Training Loss'), (val_loss, 'Validation Loss')):
    plt.plot(epochs_range, loss_curve, label=loss_label)   # raw (untransformed) loss values
plt.yscale('log')                                          # only the axis is log-scaled
plt.xlabel("Epoch")
plt.ylabel("Log Loss")
plt.title("CIFAR-10 Log-Scale Loss per Epoch")
plt.legend()
plt.grid(True, which="both", ls="--", linewidth=0.5)       # major and minor grid lines
plt.tight_layout()
plt.show()


In [None]:
# Cell 14: Evaluate Test Accuracy and Confusion Matrix (CIFAR-10)

# Description:
#   This cell evaluates the final performance of the trained CIFAR-10 model on the test set.
#   It prints test accuracy and loss (black-box testing), then visualizes prediction correctness
#   using a labeled confusion matrix.
#   Input: Trained model and CIFAR-10 test set
#   Output: Accuracy/loss metrics and confusion matrix heatmap

# This cell evaluates the trained CIFAR-10 model using the test set, prints the test accuracy and loss,
# and visualizes the confusion matrix to show true vs predicted class distributions.

# loss and accuracy on the CIFAR-10 test set
test_loss, test_acc = cifar_model.evaluate(x_test_cifar, y_test_cifar, verbose=0)
print(f"CIFAR-10 Test Accuracy: {test_acc:.4f}, Test Loss: {test_loss:.4f}")

# class probabilities for every test image, reduced to the most likely class index
y_pred_probs = cifar_model.predict(x_test_cifar, verbose=0)
y_pred = y_pred_probs.argmax(axis=1)

# confusion matrix: true labels against predicted labels
conf_mat = confusion_matrix(y_test_cifar, y_pred)

# annotated heatmap labelled with the class names on both axes
heatmap_style = dict(annot=True, fmt='d', cmap='Blues')
plt.figure(figsize=(7, 6))
sns.heatmap(conf_mat,
            xticklabels=cifar10_classes,
            yticklabels=cifar10_classes,
            **heatmap_style)
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("CIFAR-10 Confusion Matrix")
plt.yticks(rotation=0)
plt.xticks(rotation=30)
plt.tight_layout()
plt.show()


In [None]:
# Cell 15: Show CIFAR-10 Predictions and Probability Bars

# Description:
#   This cell performs black-box testing visualization for the CIFAR-10 model.
#   It randomly selects test images, shows predicted vs true labels, and
#   plots softmax probability distributions to interpret model confidence.
#   Input: Trained model, CIFAR-10 test data
#   Output: Grid of test images with class probability bar plots

# This cell displays a few test images from CIFAR-10 alongside their predicted labels,
# true labels, and class probability distributions.

# draw `num_images` distinct test indices at random (a different sample on every run)
num_images = 6
indices = np.random.choice(len(x_test_cifar), num_images, replace=False)
images = x_test_cifar[indices]             # sampled images
true_labels = y_test_cifar[indices]        # their true class ids

# softmax probabilities and the most probable class for each sampled image
pred_probs = cifar_model.predict(images)
pred_labels = pred_probs.argmax(axis=1)

# one row per image: picture on the left, probability bars on the right
fig, axes = plt.subplots(num_images, 2, figsize=(10, 2.2 * num_images))

class_positions = np.arange(10)
for i in range(num_images):
    image_ax, prob_ax = axes[i]

    # green title when the prediction matches the true class, red otherwise
    title_color = 'green' if true_labels[i] == pred_labels[i] else 'red'

    # left: the image titled with its true and predicted class names
    image_ax.imshow(images[i])
    image_ax.axis('off')
    image_ax.set_title(
        f"True: {cifar10_classes[true_labels[i]]}\nPred: {cifar10_classes[pred_labels[i]]}",
        color=title_color,
        fontsize=11
    )

    # right: probability assigned to each of the 10 classes
    prob_ax.bar(class_positions, pred_probs[i], color='skyblue')
    prob_ax.set_xticks(class_positions)
    prob_ax.set_xticklabels(cifar10_classes, rotation=30)
    prob_ax.set_ylim(0, 1)
    prob_ax.set_title("Prediction Probabilities", fontsize=10)

plt.tight_layout()
plt.show()


In [None]:
# Cell 16: Visualize Weights of Conv and Dense Layers (CIFAR-10)

# Description:
#   This cell performs white-box testing visualizations of the trained CIFAR-10 model.
#   It displays the learned filters from the first convolutional layer and the weights
#   of the final dense layer in heatmap form to inspect internal feature representations.
#   Input: Trained model
#   Output: Filter visualizations and dense layer heatmap

# This cell visualizes the first layer's learned filters and the fully connected (dense) layer weight matrix.

# Function Header
# Description:
#   Displays visualizations of the learned filters from the first convolutional layer of the CIFAR-10 model.
#   Parameters: model - trained Keras model
#   Returns: None (shows a grid of RGB filters)
def show_cifar_conv_filters(model):
    """Show every kernel of the model's first layer as a small RGB image.

    Parameters:
        model: trained Keras model. ``model.layers[0]`` is read as the first
            convolution; its kernel is assumed to be (h, w, 3, n_filters)
            (the original note cites (3, 3, 3, 32) - confirm against the
            model definition).
    Returns:
        None (displays a grid with 8 filters per row).
    """
    filters = model.layers[0].get_weights()[0]

    # rescale all weights jointly to [0, 1] so imshow can treat them as RGB;
    # a constant kernel tensor has zero range, so map it to zeros instead of 0/0
    lowest = filters.min()
    value_range = filters.max() - lowest
    if value_range > 0:
        filters = (filters - lowest) / value_range
    else:
        filters = np.zeros_like(filters)

    num_filters = filters.shape[-1]
    n_cols = 8
    n_rows = (num_filters + n_cols - 1) // n_cols  # ceiling division to get rows needed

    # squeeze=False keeps `axes` 2-D even when a single row is enough
    # (a layer with 8 or fewer filters would otherwise yield a 1-D array)
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(12, 1.5 * n_rows), squeeze=False)
    fig.suptitle("CIFAR-10 Conv1 Filters", fontsize=14)

    # axes.flat walks the grid row by row; cells past the last filter stay blank
    for i, ax in enumerate(axes.flat):
        if i < num_filters:
            ax.imshow(filters[:, :, :, i])  # visualize each filter as RGB image
        ax.axis('off')
    plt.tight_layout()
    plt.show()

# Function Header
# Description:
#   Visualizes the learned weight matrix from the final dense layer using a heatmap.
#   Parameters: model - trained Keras model
#   Returns: None (displays heatmap of weights)
def show_cifar_fc_weights(model):
    """Draw a heatmap of the kernel of the model's last Dense layer.

    Parameters:
        model: trained Keras model containing at least one Dense layer.
    Returns:
        None (displays the heatmap).
    Raises:
        ValueError: if the model has no Dense layer.
    """
    weights = None
    for layer in reversed(model.layers):  # search backward for the last Dense layer
        if isinstance(layer, layers.Dense):
            weights = layer.get_weights()[0]
            break
    if weights is None:
        raise ValueError("model has no Dense layer to visualize")

    plt.figure(figsize=(10, 4))
    # A Dense kernel is stored as (input_features, units). Transposing puts one
    # row per class and one column per feature, which is what the axis labels
    # below (and the wide 10x4 figure) describe.
    sns.heatmap(weights.T, cmap='magma', cbar=True)
    plt.title("CIFAR-10 Final Dense Layer Weights")
    plt.xlabel("Features")
    plt.ylabel("Classes")
    plt.tight_layout()
    plt.show()

# Object Header
# Purpose: Run both white-box visualizations on the trained Keras CIFAR-10 model:
#          first the grid of first-layer filters, then the last Dense layer's weight heatmap
show_cifar_conv_filters(cifar_model)
show_cifar_fc_weights(cifar_model)


In [None]:
# SECTION 4: ResNet Models (PyTorch)

In [None]:
# Cell 17: Define ADTs - ResidualBlock, ResNet18, BaselineCNN
# This cell defines the core architecture components used in the custom ResNet and baseline CNN models.
# These user-defined ADTs include ResidualBlock (with skip connections), a simplified ResNet18, and a basic CNN without skip connections.
# These modules allow experimentation with model depth and skip vs no-skip architectures across datasets.

import torch
import torch.nn as nn
import torch.nn.functional as F

# user-defined ADT: ResidualBlock (standard skip connection block for ResNet)
# description: builds a 2-layer convolutional residual block with optional downsampling
class ResidualBlock(nn.Module):
    """Residual unit: two 3x3 conv + batch-norm layers added to a shortcut of the input.

    Parameters:
        in_channels: channels of the incoming feature map
        out_channels: channels produced by the block
        stride: stride of the first convolution (and of the shortcut projection)
    """

    def __init__(self, in_channels, out_channels, stride=1):
        super().__init__()
        conv3x3 = dict(kernel_size=3, padding=1, bias=False)

        # main path: only the first convolution may change resolution
        self.conv1 = nn.Conv2d(in_channels, out_channels, stride=stride, **conv3x3)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(out_channels, out_channels, stride=1, **conv3x3)
        self.bn2 = nn.BatchNorm2d(out_channels)

        # shortcut path: an empty Sequential passes the input through unchanged;
        # a 1x1 conv + batch-norm projection is used when shapes would not match
        shape_changes = stride != 1 or in_channels != out_channels
        if shape_changes:
            self.skip = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, kernel_size=1,
                          stride=stride, bias=False),
                nn.BatchNorm2d(out_channels),
            )
        else:
            self.skip = nn.Sequential()

    def forward(self, x):
        """Return relu(main_path(x) + shortcut(x))."""
        hidden = F.relu(self.bn1(self.conv1(x)))
        residual = self.bn2(self.conv2(hidden))
        return F.relu(residual + self.skip(x))

# user-defined ADT: ResNet18
# description: defines a 4-stage ResNet model (two residual blocks per stage) with global average pooling and a final linear classifier
class ResNet18(nn.Module):
    """ResNet built from a 3x3 stem, four stages of two ResidualBlocks, and a linear head.

    Parameters:
        num_classes: size of the output logit vector
        in_channels: channels of the input image (the notebook passes 1 for MNIST, 3 for CIFAR-10)
    """

    def __init__(self, num_classes=10, in_channels=3):
        super().__init__()
        self.in_channels = 64  # running channel count consumed by _make_layer

        # stem: keeps the input resolution (stride 1, padding 1)
        self.conv1 = nn.Conv2d(in_channels, 64, kernel_size=3, stride=1,
                               padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(64)

        # stages layer1..layer4: (output width, stride of the first block)
        stage_plan = ((64, 1), (128, 2), (256, 2), (512, 2))
        for stage_no, (width, first_stride) in enumerate(stage_plan, start=1):
            setattr(self, f"layer{stage_no}", self._make_layer(width, 2, stride=first_stride))

        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512, num_classes)

    def _make_layer(self, out_channels, blocks, stride):
        """Build one stage: a (possibly strided) first block followed by stride-1 blocks."""
        first = ResidualBlock(self.in_channels, out_channels, stride)
        self.in_channels = out_channels  # later blocks and the next stage start from this width
        followers = [ResidualBlock(out_channels, out_channels) for _ in range(1, blocks)]
        return nn.Sequential(first, *followers)

    def forward(self, x):
        """Return class logits of shape (batch, num_classes)."""
        x = F.relu(self.bn1(self.conv1(x)))
        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            x = stage(x)
        pooled = torch.flatten(self.avgpool(x), 1)
        return self.fc(pooled)

# user-defined ADT: BaselineCNN
# description: defines a simple 4-layer convolutional model with downsampling but no skip connections
class BaselineCNN(nn.Module):
    """Plain four-convolution classifier with no skip connections.

    Each convolution is followed by batch-norm and ReLU; the first three stages
    are also followed by 2x2 max pooling. A global average pool and a linear
    layer produce the logits.

    Parameters:
        num_classes: size of the output logit vector
        in_channels: channels of the input image
    """

    def __init__(self, num_classes=10, in_channels=3):
        super().__init__()
        widths = (in_channels, 64, 128, 256, 512)

        # registers conv1/bn1 ... conv4/bn4 in that order
        for stage_no, (c_in, c_out) in enumerate(zip(widths, widths[1:]), start=1):
            setattr(self, f"conv{stage_no}",
                    nn.Conv2d(c_in, c_out, kernel_size=3, padding=1, bias=False))
            setattr(self, f"bn{stage_no}", nn.BatchNorm2d(c_out))

        self.pool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(widths[-1], num_classes)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        """Return class logits of shape (batch, num_classes)."""
        downsampling_stages = (
            (self.conv1, self.bn1),
            (self.conv2, self.bn2),
            (self.conv3, self.bn3),
        )
        for conv, norm in downsampling_stages:
            x = F.max_pool2d(self.relu(norm(conv(x))), 2)

        # the last stage is not max-pooled; global average pooling follows instead
        x = self.relu(self.bn4(self.conv4(x)))
        return self.fc(torch.flatten(self.pool(x), 1))


In [None]:
# Cell 18: Define Training Loop, Optimizer, and Learning Rate Scheduler

# Description:
#   This cell defines training and evaluation utilities for PyTorch models using black-box testing methodology.
#   It includes label smoothing loss, learning rate scheduling, mixed-precision support, and training loops.
#   Input: PyTorch model, dataloaders, optimizer, loss function, epoch count
#   Output: Best trained model (by validation accuracy), accuracy and loss per epoch

# This cell defines helper functions for training and evaluation of PyTorch models using black-box testing principles.
# It includes support for mixed-precision training (optional), accuracy tracking, and adjustable learning rate.

import torch.optim as optim
import copy
import time

# User-Defined ADT Header
# ADT Name: SmoothCrossEntropyLoss
# Description: Implements label smoothing to regularize classification and prevent overconfident predictions.
class SmoothCrossEntropyLoss(nn.Module):
    """Cross-entropy against label-smoothed targets.

    The true class receives probability ``1 - smoothing``; the remaining
    ``smoothing`` mass is split evenly over the other ``num_classes - 1`` classes.

    Parameters:
        smoothing: probability mass moved away from the true class
    """

    def __init__(self, smoothing=0.1):
        super().__init__()
        self.smoothing = smoothing

    def forward(self, pred, target):
        """Return the batch-mean smoothed cross-entropy.

        Parameters:
            pred: logits; dimension 1 is the class dimension
            target: class indices, one per row of ``pred`` (used as a scatter index)
        """
        class_count = pred.size(1)

        # hard one-hot targets: write a 1 at each row's true-class column
        hard = torch.zeros_like(pred)
        hard.scatter_(1, target[:, None], 1)

        # blend toward the uniform distribution over the wrong classes
        soft = hard * (1 - self.smoothing) + (1 - hard) * self.smoothing / (class_count - 1)

        log_prob = torch.log_softmax(pred, dim=1)
        per_sample = (soft * log_prob).sum(dim=1)
        return -per_sample.mean()

# Function Header
# Description:
#   Adjusts the learning rate based on epoch number using step decay.
#   Decreases learning rate after fixed epoch thresholds.
# Parameters: optimizer - optimizer to adjust, epoch - current epoch number, base_lr - starting learning rate
# Returns: None
def adjust_learning_rate(optimizer, epoch, base_lr=0.1):
    """Apply a step-decay schedule by overwriting every param group's 'lr'.

    Schedule: ``base_lr`` up to epoch 20, ``base_lr * 0.1`` for epochs 21-40,
    and ``base_lr * 0.01`` after epoch 40.

    Parameters:
        optimizer: object exposing ``param_groups`` (a list of dicts)
        epoch: current epoch number
        base_lr: learning rate before any decay
    Returns:
        None
    """
    # thresholds are checked from the latest one down; the first match wins
    decay_steps = ((40, 0.01), (20, 0.1))
    for threshold, factor in decay_steps:
        if epoch > threshold:
            new_lr = base_lr * factor
            break
    else:
        new_lr = base_lr

    for group in optimizer.param_groups:
        group['lr'] = new_lr

# Function Header
# Description:
#   Trains the model for one epoch using mixed-precision if enabled.
#   Computes running loss and accuracy across the epoch.
# Parameters: model, loader, criterion, optimizer, scaler, epoch, total_epochs
# Returns: Average loss, accuracy
def train_one_epoch(model, loader, criterion, optimizer, scaler, epoch, total_epochs):
    """Run one optimisation pass over `loader` and report the epoch's loss and accuracy.

    Reads the module-level names `device` (where batches are moved) and
    `USE_FP16` (whether the autocast context is enabled).

    Parameters:
        model: network to train (switched to train mode here)
        loader: iterable of (inputs, targets) batches
        criterion: loss callable taking (outputs, targets)
        optimizer: optimizer stepped once per batch through `scaler`
        scaler: gradient scaler providing scale/step/update
        epoch, total_epochs: used only in the progress line
    Returns:
        (average loss per sample, accuracy) for the epoch
    """
    model.train()
    began = time.time()
    loss_sum = 0.0
    n_correct = 0
    n_seen = 0

    for batch_x, batch_y in loader:
        batch_x = batch_x.to(device)
        batch_y = batch_y.to(device)
        optimizer.zero_grad()

        # forward pass and loss run under autocast when mixed precision is on
        with torch.cuda.amp.autocast(enabled=USE_FP16):
            logits = model(batch_x)
            batch_loss = criterion(logits, batch_y)

        # the scaler scales the loss for backward, then steps the optimizer and updates itself
        scaler.scale(batch_loss).backward()
        scaler.step(optimizer)
        scaler.update()

        # weight the batch loss by batch size so the epoch figure is a per-sample mean
        loss_sum += batch_loss.item() * batch_x.size(0)
        n_seen += batch_y.size(0)
        n_correct += logits.max(1)[1].eq(batch_y).sum().item()

    acc = n_correct / n_seen
    avg_loss = loss_sum / n_seen
    elapsed = time.time() - began
    print(f"[Epoch {epoch}/{total_epochs}] Train Loss: {avg_loss:.4f} | Train Acc: {acc:.4f} | Time: {elapsed:.2f}s")
    return avg_loss, acc

# Function Header
# Description:
#   Evaluates model performance on validation/test set.
#   Computes average loss and classification accuracy.
# Parameters: model, loader, criterion
# Returns: Average loss, accuracy
def evaluate(model, loader, criterion):
    """Measure loss and accuracy of `model` on `loader` without updating weights.

    Reads the module-level name `device` to place each batch.

    Parameters:
        model: network to evaluate (switched to eval mode here)
        loader: iterable of (inputs, targets) batches
        criterion: loss callable taking (outputs, targets)
    Returns:
        (average loss per sample, accuracy)
    """
    model.eval()
    loss_sum = 0.0
    n_correct = 0
    n_seen = 0

    with torch.no_grad():  # no gradients are needed for scoring
        for batch_x, batch_y in loader:
            batch_x = batch_x.to(device)
            batch_y = batch_y.to(device)
            logits = model(batch_x)
            batch_loss = criterion(logits, batch_y)

            # weight by batch size so the result is a per-sample mean
            loss_sum += batch_loss.item() * batch_x.size(0)
            n_seen += batch_y.size(0)
            n_correct += logits.max(1)[1].eq(batch_y).sum().item()

    acc = n_correct / n_seen
    avg_loss = loss_sum / n_seen
    return avg_loss, acc

# Function Header
# Description:
#   Trains and evaluates a PyTorch model over multiple epochs.
#   Tracks and restores the best model based on validation accuracy.
# Parameters: model, train_loader, test_loader, epochs, criterion, optimizer
# Returns: Best-performing model and its validation accuracy
def train_and_evaluate_model(model, train_loader, test_loader, epochs, criterion, optimizer):
    """Train with step-decayed learning rate and keep the weights with the best eval accuracy.

    Each epoch calls adjust_learning_rate, train_one_epoch and evaluate, in that
    order. Reads the module-level names `device` and `USE_FP16`.

    NOTE(review): Cell 18.5 defines another `train_and_evaluate_model` that
    returns six values. When the cells run in order, that later definition
    replaces this one, and the notebook's callers unpack six values.

    Parameters:
        model: network to train (moved to `device` here)
        train_loader, test_loader: batch iterables for training and evaluation
        epochs: number of epochs; numbering starts at 1
        criterion: loss callable
        optimizer: optimizer whose learning rate is overwritten every epoch
    Returns:
        (model with the best weights loaded, best evaluation accuracy)
    """
    model.to(device)
    scaler = torch.cuda.amp.GradScaler(enabled=USE_FP16)

    # start from the initial weights so something valid is restored even if
    # no epoch ever scores above 0.0
    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for completed in range(epochs):
        epoch = completed + 1
        adjust_learning_rate(optimizer, epoch)
        train_one_epoch(model, train_loader, criterion, optimizer, scaler, epoch, epochs)
        val_loss, val_acc = evaluate(model, test_loader, criterion)
        print(f"Eval Loss: {val_loss:.4f} | Eval Acc: {val_acc:.4f}\n")

        if val_acc > best_acc:
            # deep copy, because state_dict() tensors keep changing as training continues
            best_model_wts = copy.deepcopy(model.state_dict())
            best_acc = val_acc

    model.load_state_dict(best_model_wts)
    return model, best_acc


In [None]:
# Cell 18.5: Training Utility Function for ResNet18

# Description:
#   This function implements a PyTorch training utility for models such as ResNet18.
#   It trains and evaluates over a given number of epochs, tracks accuracy and loss for both training
#   and validation sets, and returns the best model based on validation accuracy.
#   Input: PyTorch model, DataLoaders, epoch count, loss function, optimizer
#   Output: Trained model, best validation accuracy, and full training/validation performance history

# This function handles training, evaluation, and tracks loss/accuracy across epochs (PyTorch).

# Function Header
# Description:
#   Executes the training and evaluation process over multiple epochs for a given PyTorch model.
#   Tracks and prints epoch-wise performance, storing the best validation accuracy.
# Parameters:
#   - model: a PyTorch model (e.g., ResNet18)
#   - train_loader: DataLoader for training data
#   - test_loader: DataLoader for validation/testing
#   - epochs: number of training epochs
#   - criterion: loss function
#   - optimizer: optimizer object (e.g., SGD or Adam)
# Returns:
#   - model: trained model with best weights
#   - best_acc: highest validation accuracy achieved
#   - train_acc, val_acc: lists of accuracy per epoch
#   - train_loss, val_loss: lists of loss per epoch
def _run_epoch(model, loader, criterion, run_device, optimizer=None):
    """Run one pass over `loader`: train when `optimizer` is given, otherwise evaluate.

    Returns (average loss per sample, accuracy).
    """
    training = optimizer is not None
    model.train(training)  # train(False) is the same as eval()
    loss_sum, correct, total = 0.0, 0, 0

    with torch.set_grad_enabled(training):
        for images, labels in loader:
            images, labels = images.to(run_device), labels.to(run_device)

            if training:
                optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            if training:
                loss.backward()
                optimizer.step()

            # weight by batch size so a smaller final batch does not skew the mean
            batch_size = labels.size(0)
            loss_sum += loss.item() * batch_size
            correct += outputs.max(1)[1].eq(labels).sum().item()
            total += batch_size

    return loss_sum / total, correct / total


def train_and_evaluate_model(model, train_loader, test_loader, epochs, criterion, optimizer):
    """Train `model`, record per-epoch metrics, and return it with its best weights loaded.

    Parameters:
        model: PyTorch model, already placed on the device it should train on
        train_loader: iterable of (images, labels) training batches
        test_loader: iterable of (images, labels) validation/test batches
        epochs: number of training epochs
        criterion: loss callable taking (outputs, labels)
        optimizer: optimizer bound to the model's parameters
    Returns:
        (model, best_acc, train_acc, val_acc, train_loss, val_loss):
        the model with the weights of the first epoch that reached the highest
        validation accuracy, that accuracy, and four per-epoch history lists.
    """
    train_acc, val_acc = [], []
    train_loss, val_loss = [], []  # two separate lists (`= []` alone cannot be unpacked)
    best_acc = 0.0
    best_state = None

    # batches follow the model's own device instead of relying on a global
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    for epoch in range(1, epochs + 1):
        epoch_train_loss, epoch_train_acc = _run_epoch(
            model, train_loader, criterion, run_device, optimizer)
        epoch_val_loss, epoch_val_acc = _run_epoch(
            model, test_loader, criterion, run_device)

        train_loss.append(epoch_train_loss)
        train_acc.append(epoch_train_acc)
        val_loss.append(epoch_val_loss)
        val_acc.append(epoch_val_acc)

        # snapshot on the first epoch and on every strict improvement; deep copy
        # because state_dict() tensors keep changing as training continues
        if best_state is None or epoch_val_acc > best_acc:
            best_acc = epoch_val_acc
            best_state = copy.deepcopy(model.state_dict())

        print(f"Epoch {epoch}/{epochs} - Train Acc: {train_acc[-1]:.4f}, Val Acc: {val_acc[-1]:.4f}, Train Loss: {train_loss[-1]:.4f}, Val Loss: {val_loss[-1]:.4f}")

    # hand back the best-scoring weights rather than the last epoch's
    if best_state is not None:
        model.load_state_dict(best_state)

    return model, best_acc, train_acc, val_acc, train_loss, val_loss


In [None]:
# Cell 19: Train ResNet-18 on MNIST

# Description:
#   This cell trains a ResNet-18 model on the MNIST dataset using PyTorch.
#   It uses label smoothing for regularization, SGD optimizer with momentum,
#   and tracks performance via black-box testing (accuracy/loss on test set).
#   Input: MNIST data (converted to PyTorch tensors)
#   Output: Trained model and printed best test accuracy

# This cell initializes and trains a ResNet-18 model on the MNIST dataset using PyTorch.
# It uses label smoothing, SGD optimizer, and black-box testing metrics to track best validation accuracy.

# setup: training hyperparameters
# (module-level constants; later cells reuse LEARNING_RATE, MOMENTUM, WEIGHT_DECAY and LABEL_SMOOTHING)
EPOCHS_MNIST_RESNET = 50
LEARNING_RATE = 0.1
MOMENTUM = 0.9
WEIGHT_DECAY = 5e-4
LABEL_SMOOTHING = 0.1
# NOTE(review): USE_FP16 is read by train_one_epoch and by the Cell 18 trainer only.
# The Cell 18.5 trainer, which is the active train_and_evaluate_model when cells run
# in order, never reads it - so this flag presumably has no effect on the run below. Confirm.
USE_FP16 = True  # enable mixed-precision training

# Object Header
# Purpose: Select appropriate hardware (GPU if available) for model training
#          `device` is a global that the training helpers and later cells read
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# reshape MNIST dataset to match PyTorch (N, C, H, W) input format
# NOTE(review): assumes x_train/x_test hold 28*28 values per image and y_train/y_test
# are integer class ids (not one-hot), as SmoothCrossEntropyLoss unsqueezes a 1-D target - confirm against Cell 3
x_train_torch = torch.tensor(x_train.reshape(-1, 1, 28, 28), dtype=torch.float32)
y_train_torch = torch.tensor(y_train, dtype=torch.long)
x_test_torch = torch.tensor(x_test.reshape(-1, 1, 28, 28), dtype=torch.float32)
y_test_torch = torch.tensor(y_test, dtype=torch.long)

# Object Header
# Purpose: Wrap MNIST tensors in DataLoader objects for batch training and evaluation
#          (training batches are shuffled; test batches keep dataset order so predictions
#          can later be matched to x_test_torch by index)
from torch.utils.data import DataLoader, TensorDataset
mnist_train_loader = DataLoader(TensorDataset(x_train_torch, y_train_torch), batch_size=64, shuffle=True)
mnist_test_loader = DataLoader(TensorDataset(x_test_torch, y_test_torch), batch_size=64, shuffle=False)

# Object Header
# Purpose: Initialize ResNet-18 model for grayscale MNIST (1 input channel)
model_mnist = ResNet18(num_classes=10, in_channels=1).to(device)

# Object Header
# Purpose: Define cross-entropy loss with label smoothing
criterion_mnist = SmoothCrossEntropyLoss(smoothing=LABEL_SMOOTHING)

# Object Header
# Purpose: Configure SGD optimizer with momentum and weight decay
optimizer_mnist = optim.SGD(model_mnist.parameters(), lr=LEARNING_RATE,
                            momentum=MOMENTUM, weight_decay=WEIGHT_DECAY)

# Object Header
# Purpose: Train ResNet-18 model on MNIST and store training history
#          The six-value unpacking matches the Cell 18.5 trainer's return; the test
#          loader doubles as the per-epoch validation set.
print("Training ResNet-18 on MNIST...")
model_mnist, best_acc_mnist, train_acc_mnist, val_acc_mnist, train_loss_mnist, val_loss_mnist = train_and_evaluate_model(
    model_mnist, mnist_train_loader, mnist_test_loader,
    EPOCHS_MNIST_RESNET, criterion_mnist, optimizer_mnist
)

# output best test accuracy from training
print(f"Best MNIST ResNet18 Test Accuracy: {best_acc_mnist:.4f}")


In [None]:
# Cell 20: Plot ResNet-18 MNIST Training Curves

# Description:
#   This cell visualizes the training and validation performance of the ResNet-18 model on MNIST.
#   Accuracy is plotted on a standard scale, while loss is shown on a logarithmic scale to highlight small improvements.
#   Input: Training/validation accuracy and loss lists
#   Output: Accuracy and loss plots across all epochs

# This cell plots the training and validation accuracy/loss curves for ResNet-18 on MNIST.
# Log-scale is used for the loss plot to emphasize changes during training.

# x-axis: one tick per recorded epoch, numbered from 1
epochs_range = range(1, len(train_acc_mnist) + 1)

# accuracy figure: training vs. validation on a linear scale
fig_acc, ax_acc = plt.subplots(figsize=(6, 4))
ax_acc.plot(epochs_range, train_acc_mnist, label='Train Accuracy')
ax_acc.plot(epochs_range, val_acc_mnist, label='Validation Accuracy')
ax_acc.set_title("MNIST ResNet-18 Accuracy")
ax_acc.set_xlabel("Epoch")
ax_acc.set_ylabel("Accuracy")
ax_acc.legend()
fig_acc.tight_layout()
plt.show()

# loss figure: same layout, with a logarithmic y-axis so small late-epoch changes stay visible
fig_loss, ax_loss = plt.subplots(figsize=(6, 4))
ax_loss.plot(epochs_range, train_loss_mnist, label='Train Loss')
ax_loss.plot(epochs_range, val_loss_mnist, label='Validation Loss')
ax_loss.set_yscale("log")
ax_loss.set_title("MNIST ResNet-18 Loss (Log Scale)")
ax_loss.set_xlabel("Epoch")
ax_loss.set_ylabel("Loss")
ax_loss.legend()
fig_loss.tight_layout()
plt.show()


In [None]:
# Cell 21: Evaluate ResNet-18 on MNIST

# Description:
#   This cell evaluates a trained ResNet-18 model on the MNIST test dataset using black-box testing.
#   It computes overall accuracy, displays the confusion matrix, and visualizes correct and incorrect predictions.
#   Input: Trained model, test DataLoader, test dataset tensors
#   Output: Accuracy score, confusion matrix heatmap, and sample prediction grid

# This cell evaluates the ResNet-18 model on the MNIST test set.
# It outputs the test accuracy, plots the confusion matrix, and displays correct vs. incorrect predictions.

from sklearn.metrics import confusion_matrix
import seaborn as sns

# set model to evaluation mode
model_mnist.eval()
# predictions and labels are accumulated batch by batch as plain Python lists
all_preds = []
all_labels = []

# run predictions across entire test set (no gradients needed for inference)
with torch.no_grad():
    for images, labels in mnist_test_loader:
        # only the images are moved to the device; labels are read straight from the loader
        images = images.to(device)
        outputs = model_mnist(images)
        preds = outputs.argmax(dim=1).cpu().numpy()  # convert logits to predicted classes
        all_preds.extend(preds)
        all_labels.extend(labels.numpy())

# compute confusion matrix and overall test accuracy
# (confusion_matrix takes true labels first, so rows are true classes and columns are predictions)
conf_mat = confusion_matrix(all_labels, all_preds)
accuracy = np.mean(np.array(all_preds) == np.array(all_labels))
print(f"MNIST ResNet-18 Test Accuracy: {accuracy:.4f}")

# plot confusion matrix heatmap with integer counts in each cell
plt.figure(figsize=(6, 5))
sns.heatmap(conf_mat, annot=True, fmt='d', cmap='Blues', cbar=False)
plt.title("MNIST ResNet-18 Confusion Matrix")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.tight_layout()
plt.show()

# identify indices for correct and incorrect classifications
# (the lists are rebound to NumPy arrays here; the visualization call below uses these names)
all_preds = np.array(all_preds)
all_labels = np.array(all_labels)
correct_idx = np.where(all_preds == all_labels)[0]
incorrect_idx = np.where(all_preds != all_labels)[0]

# Function Header
# Description:
#   Displays sample correct and incorrect predictions from MNIST using ResNet-18.
#   Highlights accurate predictions in green and misclassifications in red.
# Parameters:
#   - x_test_torch: full test image tensor (N, 1, 28, 28)
#   - correct_idx, incorrect_idx: indices of correct and incorrect predictions
#   - preds, labels: predicted and true labels
# Returns: None (plots sample images with prediction labels)
def show_resnet_preds(x_test_torch, correct_idx, incorrect_idx, preds, labels, n_each=4):
    """Plot random correct (green, top row) and incorrect (red, bottom row) MNIST predictions.

    Parameters:
        x_test_torch: test image tensor; each item is squeezed to 2-D before display
        correct_idx, incorrect_idx: index arrays of correct / incorrect predictions
        preds, labels: predicted and true labels, indexable by those indices
        n_each: maximum number of samples drawn from each group (default 4)
    Returns:
        None (shows the figure). A group with fewer than `n_each` members is
        shown in full, and an empty group leaves its row blank.
    """
    def _sample(pool):
        # np.random.choice(..., replace=False) raises when asked for more items than exist
        take = min(n_each, len(pool))
        if take == 0:
            return np.array([], dtype=int)
        return np.random.choice(pool, size=take, replace=False)

    # colour follows the group an index came from, not its position in the grid
    rows = [(_sample(correct_idx), 'green'), (_sample(incorrect_idx), 'red')]

    plt.figure(figsize=(10, 4))
    for row, (indices, color) in enumerate(rows):
        for col, idx in enumerate(indices):
            img = x_test_torch[idx].squeeze().numpy()
            true = labels[idx]
            pred = preds[idx]
            plt.subplot(2, n_each, row * n_each + col + 1)
            plt.imshow(img, cmap='gray')
            plt.title(f"T:{true}, P:{pred}", color=color)
            plt.axis('off')
    plt.suptitle("MNIST ResNet18 Predictions: Correct (green) vs Incorrect (red)")
    plt.tight_layout()
    plt.show()

# call visualization function to show prediction examples
# (uses the index arrays and NumPy label arrays computed just above in this cell)
show_resnet_preds(x_test_torch, correct_idx, incorrect_idx, all_preds, all_labels)


In [None]:
# Cell 22: Train ResNet-18 on CIFAR-10

# Description:
#   This cell trains a ResNet-18 model on the CIFAR-10 dataset using PyTorch.
#   It uses label smoothing, SGD optimizer with momentum, and tracks model performance over 50 epochs.
#   Input: CIFAR-10 training and test sets
#   Output: Trained model, performance metrics, and best test accuracy

# This cell trains a ResNet-18 model on the CIFAR-10 dataset using PyTorch.
# It uses SGD optimizer, label smoothing, and tracks accuracy/loss for evaluation.

# setup: training parameters
# (LEARNING_RATE, MOMENTUM, WEIGHT_DECAY and LABEL_SMOOTHING are re-assigned to the
#  same values used in the MNIST cell)
EPOCHS_CIFAR_RESNET = 50
LEARNING_RATE = 0.1
MOMENTUM = 0.9
WEIGHT_DECAY = 5e-4
LABEL_SMOOTHING = 0.1

# reshape CIFAR-10 data to PyTorch format (N, C, H, W)
# transpose(0, 3, 1, 2) moves the last axis (channels) to position 1
# NOTE(review): assumes y_train_cifar / y_test_cifar are flat integer class ids,
# since SmoothCrossEntropyLoss unsqueezes a 1-D target - confirm against Cell 9
x_train_cifar_torch = torch.tensor(x_train_cifar.transpose(0, 3, 1, 2), dtype=torch.float32)
y_train_cifar_torch = torch.tensor(y_train_cifar, dtype=torch.long)
x_test_cifar_torch = torch.tensor(x_test_cifar.transpose(0, 3, 1, 2), dtype=torch.float32)
y_test_cifar_torch = torch.tensor(y_test_cifar, dtype=torch.long)

# create DataLoader objects for training and evaluation
# (DataLoader / TensorDataset come from the import in the MNIST training cell;
#  the test loader is unshuffled so predictions line up with x_test_cifar by index)
cifar_train_loader = DataLoader(TensorDataset(x_train_cifar_torch, y_train_cifar_torch), batch_size=64, shuffle=True)
cifar_test_loader = DataLoader(TensorDataset(x_test_cifar_torch, y_test_cifar_torch), batch_size=64, shuffle=False)

# initialize ResNet-18 for CIFAR-10 (3-channel RGB input)
model_cifar = ResNet18(num_classes=10, in_channels=3).to(device)

# define label smoothing loss
criterion_cifar = SmoothCrossEntropyLoss(smoothing=LABEL_SMOOTHING)

# configure SGD optimizer with weight decay and momentum
optimizer_cifar = optim.SGD(model_cifar.parameters(), lr=LEARNING_RATE,
                            momentum=MOMENTUM, weight_decay=WEIGHT_DECAY)

# train the model and store training history
# (six-value unpacking matches the Cell 18.5 trainer; the test loader doubles as the validation set)
print("Training ResNet-18 on CIFAR-10...")
model_cifar, best_acc_cifar, train_acc_cifar, val_acc_cifar, train_loss_cifar, val_loss_cifar = \
    train_and_evaluate_model(model_cifar, cifar_train_loader, cifar_test_loader,
                             EPOCHS_CIFAR_RESNET, criterion_cifar, optimizer_cifar)

# display best validation accuracy
print(f"Best CIFAR-10 ResNet18 Test Accuracy: {best_acc_cifar:.4f}")


In [None]:
# Cell 23: CIFAR-10 ResNet-18 Training Curves

# Description:
#   This cell plots the training and validation accuracy and loss curves for the ResNet-18 model trained on CIFAR-10.
#   Accuracy is plotted on a linear scale, while loss is shown on a logarithmic scale to highlight convergence trends.
#   Input: Lists of accuracy and loss over epochs
#   Output: Two plots: accuracy curve and log-scaled loss curve

# Cell 23: CIFAR-10 ResNet-18 Training Curves
# This cell visualizes the training and validation accuracy and loss for the ResNet-18 model trained on CIFAR-10.
# Log-scale is used for the loss plot to highlight convergence behavior.

# x-axis: one tick per recorded epoch, numbered from 1
epochs_range = range(1, len(train_acc_cifar) + 1)

# accuracy figure: training vs. validation on a linear scale
fig_acc, ax_acc = plt.subplots(figsize=(6, 4))
ax_acc.plot(epochs_range, train_acc_cifar, label='Train Accuracy')
ax_acc.plot(epochs_range, val_acc_cifar, label='Validation Accuracy')
ax_acc.set_title("CIFAR-10 ResNet-18 Accuracy")
ax_acc.set_xlabel("Epoch")
ax_acc.set_ylabel("Accuracy")
ax_acc.legend()
fig_acc.tight_layout()
plt.show()

# loss figure: same layout, with a logarithmic y-axis to expose convergence behaviour
fig_loss, ax_loss = plt.subplots(figsize=(6, 4))
ax_loss.plot(epochs_range, train_loss_cifar, label='Train Loss')
ax_loss.plot(epochs_range, val_loss_cifar, label='Validation Loss')
ax_loss.set_yscale("log")
ax_loss.set_title("CIFAR-10 ResNet-18 Loss (Log Scale)")
ax_loss.set_xlabel("Epoch")
ax_loss.set_ylabel("Loss")
ax_loss.legend()
fig_loss.tight_layout()
plt.show()


In [None]:
# Cell 24: Evaluate ResNet-18 on CIFAR-10

# Description:
#   This cell evaluates the trained ResNet-18 model on the CIFAR-10 dataset using black-box testing.
#   It computes and displays the overall test accuracy and a confusion matrix,
#   and visualizes example predictions, distinguishing between correct and incorrect outputs.
#   Input: Trained model, CIFAR-10 test data
#   Output: Accuracy score, confusion matrix plot, and sample image predictions

# This cell evaluates the trained ResNet-18 model on the CIFAR-10 dataset.
# It visualizes the confusion matrix and shows sample predictions (correct/incorrect).

# set model to evaluation mode
model_cifar.eval()
# predictions and labels are accumulated batch by batch as plain Python lists
# (this rebinds the all_preds / all_labels names used by the MNIST evaluation cell)
all_preds = []
all_labels = []

# generate predictions on entire test set (no gradients needed for inference)
with torch.no_grad():
    for images, labels in cifar_test_loader:
        # only the images are moved to the device; labels are read straight from the loader
        images = images.to(device)
        outputs = model_cifar(images)
        preds = outputs.argmax(dim=1).cpu().numpy()  # index of the largest output per sample
        all_preds.extend(preds)
        all_labels.extend(labels.numpy())

# compute confusion matrix and accuracy
# (confusion_matrix takes true labels first, so rows are true classes and columns are predictions)
cifar_conf = confusion_matrix(all_labels, all_preds)
accuracy = np.mean(np.array(all_preds) == np.array(all_labels))
print(f"CIFAR-10 ResNet-18 Test Accuracy: {accuracy:.4f}")

# plot confusion matrix with class labels on both axes
plt.figure(figsize=(6, 5))
sns.heatmap(cifar_conf, annot=True, fmt='d', cmap='Blues',
            xticklabels=cifar10_classes, yticklabels=cifar10_classes)
plt.title("CIFAR-10 ResNet-18 Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.tight_layout()
plt.show()

# convert predictions and labels to NumPy arrays for indexing
# (the visualization call below uses these names)
all_preds = np.array(all_preds)
all_labels = np.array(all_labels)
correct_idx = np.where(all_preds == all_labels)[0]
incorrect_idx = np.where(all_preds != all_labels)[0]

# Function Header
# Description:
#   Displays a sample of correct and incorrect CIFAR-10 predictions made by ResNet-18.
#   Correct predictions are shown in green; incorrect ones in red.
# Parameters:
#   - x_data: original input images (N, H, W, C)
#   - correct_idx, incorrect_idx: arrays of correct/incorrect sample indices
#   - preds: predicted class indices
#   - labels: true class indices
#   - class_names: list of class name strings for CIFAR-10
# Returns: None (shows plot)
def show_resnet_cifar_preds(x_data, correct_idx, incorrect_idx, preds, labels, class_names, n_each=4):
    """Plot random correct (green, top row) and incorrect (red, bottom row) CIFAR-10 predictions.

    Parameters:
        x_data: images indexable by sample index and displayable by imshow
        correct_idx, incorrect_idx: index arrays of correct / incorrect predictions
        preds, labels: predicted and true class indices, indexable by those indices
        class_names: sequence mapping a class index to its display name
        n_each: maximum number of samples drawn from each group (default 4)
    Returns:
        None (shows the figure). A group with fewer than `n_each` members is
        shown in full, and an empty group leaves its row blank.
    """
    def _sample(pool):
        # np.random.choice(..., replace=False) raises when asked for more items than exist
        take = min(n_each, len(pool))
        if take == 0:
            return np.array([], dtype=int)
        return np.random.choice(pool, size=take, replace=False)

    # colour follows the group an index came from, not its position in the grid
    rows = [(_sample(correct_idx), 'green'), (_sample(incorrect_idx), 'red')]

    plt.figure(figsize=(10, 4))
    for row, (indices, color) in enumerate(rows):
        for col, idx in enumerate(indices):
            img = x_data[idx]
            true = class_names[labels[idx]]
            pred = class_names[preds[idx]]
            plt.subplot(2, n_each, row * n_each + col + 1)
            plt.imshow(img)
            plt.title(f"T: {true}\nP: {pred}", color=color, fontsize=9)
            plt.axis('off')
    plt.suptitle("CIFAR-10 ResNet-18 Predictions: Correct (green) vs Incorrect (red)")
    plt.tight_layout()
    plt.show()

# call function to visualize correct and incorrect predictions
# (passes the original x_test_cifar array for display, not the transposed torch tensor)
show_resnet_cifar_preds(x_test_cifar, correct_idx, incorrect_idx, all_preds, all_labels, cifar10_classes)


In [None]:
# Cell 25: Train ResNet-18 No-Skip on CIFAR-10

# Description:
#   This cell defines and trains a variant of ResNet-18 with no skip connections for ablation testing on CIFAR-10.
#   The goal is to compare performance against the standard ResNet-18 with residual shortcuts.
#   Input: CIFAR-10 dataset, training hyperparameters
#   Output: Trained no-skip model, performance history, and best test accuracy

# This cell trains a modified ResNet-18 model with no skip connections (ResidualBlockNoSkip) for ablation testing.
# The architecture is otherwise similar, and training follows the same procedure as standard ResNet.

# training settings
# (same epoch count as EPOCHS_CIFAR_RESNET; optimizer and loss settings are reused from the earlier cells)
EPOCHS_NOSKIP = 50

# User-Defined ADT Header
# ADT Name: ResidualBlockNoSkip
# Description:
#   A basic residual-style convolutional block with no identity/skip connection.
#   Used in constructing no-skip versions of ResNet for architecture ablation.
class ResidualBlockNoSkip(nn.Module):
    """Two 3x3 conv + batch-norm + ReLU layers in sequence, with no shortcut addition.

    Parameters:
        in_channels: channels of the incoming feature map
        out_channels: channels produced by the block
        stride: stride of the first convolution (the second always uses stride 1)
    """

    def __init__(self, in_channels, out_channels, stride=1):
        super().__init__()
        conv3x3 = dict(kernel_size=3, padding=1, bias=False)
        self.conv1 = nn.Conv2d(in_channels, out_channels, stride=stride, **conv3x3)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(out_channels, out_channels, stride=1, **conv3x3)
        self.bn2 = nn.BatchNorm2d(out_channels)

    def forward(self, x):
        """Apply conv -> batch-norm -> ReLU twice and return the result."""
        for conv, norm in ((self.conv1, self.bn1), (self.conv2, self.bn2)):
            x = self.relu(norm(conv(x)))
        return x

# User-Defined ADT Header
# ADT Name: ResNet18NoSkip
# Description:
#   A ResNet-like architecture composed of ResidualBlockNoSkip blocks (no skip connections).
#   Used to compare performance with the standard ResNet-18 on CIFAR-10.
class ResNet18NoSkip(nn.Module):
    """ResNet-18-shaped classifier assembled from ResidualBlockNoSkip blocks.

    Layout: 3x3 stem conv + BN + ReLU -> four stages of two blocks each
    (64, 128, 256, 512 channels; stages 2-4 open with a stride-2 block)
    -> adaptive average pool to 1x1 -> linear classification head.

    Parameters:
      - num_classes: number of output logits
      - in_channels: number of channels in the input images
    """

    def __init__(self, num_classes=10, in_channels=3):
        super().__init__()
        # Running channel count: read and updated by _make_layer as stages are built.
        self.in_channels = 64

        # Stem
        self.conv1 = nn.Conv2d(in_channels, 64, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)

        # (output channels, stride of the first block) for layer1 .. layer4;
        # setattr on an nn.Module registers each stage under that attribute name.
        stage_plan = ((64, 1), (128, 2), (256, 2), (512, 2))
        for index, (width, first_stride) in enumerate(stage_plan, start=1):
            setattr(self, f"layer{index}", self._make_layer(width, 2, stride=first_stride))

        # Head
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512, num_classes)

    def _make_layer(self, out_channels, blocks, stride):
        """Build one stage of `blocks` no-skip blocks.

        Only the first block uses `stride`; the rest use stride 1. After each
        block is created, self.in_channels is set to `out_channels` so the next
        block (and the next stage) receives the right input width.
        """
        stage = []
        for position in range(blocks):
            block_stride = 1 if position else stride
            stage.append(ResidualBlockNoSkip(self.in_channels, out_channels, block_stride))
            self.in_channels = out_channels
        return nn.Sequential(*stage)

    def forward(self, x):
        """Return class logits for a batch of images."""
        out = self.relu(self.bn1(self.conv1(x)))
        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            out = stage(out)
        pooled = self.avgpool(out)
        flat = torch.flatten(pooled, 1)  # keep dim 0, flatten the rest
        return self.fc(flat)

# Object Header
# Purpose: Build the no-skip ResNet-18 with its loss and optimizer, then train it on CIFAR-10
model_noskip = ResNet18NoSkip(num_classes=10, in_channels=3).to(device)
criterion_noskip = SmoothCrossEntropyLoss(smoothing=LABEL_SMOOTHING)
optimizer_noskip = optim.SGD(
    model_noskip.parameters(),
    lr=LEARNING_RATE,
    momentum=MOMENTUM,
    weight_decay=WEIGHT_DECAY,
)

print("Training ResNet-18 No-Skip on CIFAR-10...")
# The helper's six return values are unpacked into the model, its best accuracy,
# and the accuracy/loss histories that Cell 26 plots against the epoch index.
# Note: cifar_test_loader is the loader passed in the evaluation slot here.
(
    model_noskip,
    best_acc_noskip,
    train_acc_noskip,
    val_acc_noskip,
    train_loss_noskip,
    val_loss_noskip,
) = train_and_evaluate_model(
    model_noskip,
    cifar_train_loader,
    cifar_test_loader,
    EPOCHS_NOSKIP,
    criterion_noskip,
    optimizer_noskip,
)

print(f"Best Test Accuracy (No-Skip): {best_acc_noskip:.4f}")


In [None]:
# Cell 26: No-Skip ResNet Training Curves (CIFAR-10)

# Description:
#   This cell visualizes the training and validation performance of the no-skip ResNet-18 model on CIFAR-10.
#   It includes an accuracy curve and a log-scaled loss curve for ablation comparison with the standard ResNet.
#   Input: Accuracy and loss history from no-skip model training
#   Output: Accuracy plot and loss plot (log scale)

# This cell plots the training and validation curves for ResNet-18 without skip connections.
# Curves include standard accuracy and log-scaled loss to support ablation comparison.

# x-axis values: one entry per recorded epoch, counted from 1
epochs_range = range(1, len(train_acc_noskip) + 1)

# accuracy curves (training vs. validation)
fig_acc, ax_acc = plt.subplots(figsize=(6, 4))
ax_acc.plot(epochs_range, train_acc_noskip, label="Train Accuracy")
ax_acc.plot(epochs_range, val_acc_noskip, label="Validation Accuracy")
ax_acc.set_title("CIFAR-10 ResNet-18 No-Skip Accuracy")
ax_acc.set_xlabel("Epoch")
ax_acc.set_ylabel("Accuracy")
ax_acc.legend()
fig_acc.tight_layout()
plt.show()

# loss curves (training vs. validation) drawn on a logarithmic y-axis
fig_loss, ax_loss = plt.subplots(figsize=(6, 4))
ax_loss.plot(epochs_range, train_loss_noskip, label="Train Loss")
ax_loss.plot(epochs_range, val_loss_noskip, label="Validation Loss")
ax_loss.set_yscale("log")
ax_loss.set_title("CIFAR-10 ResNet-18 No-Skip Loss (Log Scale)")
ax_loss.set_xlabel("Epoch")
ax_loss.set_ylabel("Loss")
ax_loss.legend()
fig_loss.tight_layout()
plt.show()


In [None]:
# Cell 27: Train Baseline CNN on CIFAR-10

# Description:
#   This cell builds and trains a baseline convolutional neural network (CNN) on the CIFAR-10 dataset.
#   The model uses stacked convolutional layers without any skip/residual connections,
#   providing a control architecture for ablation comparisons against ResNet variants.
#   Input: CIFAR-10 dataset, training hyperparameters
#   Output: Trained model, accuracy/loss history, and best validation accuracy

# This cell builds and trains a basic CNN without residual connections, useful for ablation comparisons.

# User-Defined ADT Header
# ADT Name: BaselineCNN
# Description:
#   A simple convolutional neural network architecture using 4 convolutional layers,
#   followed by global average pooling and a fully connected classification head.
#   No skip connections or residual blocks are used.
class BaselineCNN(nn.Module):
    """Plain four-convolution CNN without skip connections.

    Layout: conv(64) -> conv(128) -> conv(256) -> conv(512), each followed by
    ReLU; the first three are additionally followed by 2x2 max pooling. The
    result is average-pooled to 1x1, flattened, and fed to a linear head.

    Parameters:
      - num_classes: number of output logits
      - in_channels: number of channels in the input images
    """

    def __init__(self, num_classes=10, in_channels=3):
        super().__init__()
        # Channel widths chained through the four 3x3 (padding 1) convolutions.
        widths = (in_channels, 64, 128, 256, 512)
        self.conv1 = nn.Conv2d(widths[0], widths[1], kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(widths[1], widths[2], kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d(widths[2], widths[3], kernel_size=3, padding=1)
        self.conv4 = nn.Conv2d(widths[3], widths[4], kernel_size=3, padding=1)
        self.pool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(widths[4], num_classes)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        """Return class logits for a batch of images."""
        # First three stages: conv -> ReLU -> halve the spatial size.
        for conv in (self.conv1, self.conv2, self.conv3):
            x = F.max_pool2d(self.relu(conv(x)), 2)
        # Last stage has no max pooling; the adaptive pool reduces it to 1x1.
        x = self.pool(self.relu(self.conv4(x)))
        return self.fc(torch.flatten(x, 1))

# Object Header
# Purpose: Create the baseline CNN (default arguments) and move it to the active device
model_cnn = BaselineCNN().to(device)

# Object Header
# Purpose: Loss function with label smoothing, configured from LABEL_SMOOTHING
criterion_cnn = SmoothCrossEntropyLoss(smoothing=LABEL_SMOOTHING)

# Object Header
# Purpose: SGD optimizer with momentum and weight decay over the baseline CNN's parameters
optimizer_cnn = optim.SGD(
    model_cnn.parameters(),
    lr=LEARNING_RATE,
    momentum=MOMENTUM,
    weight_decay=WEIGHT_DECAY,
)

# Train the baseline CNN and keep its accuracy/loss histories.
# EPOCHS_NOSKIP (defined in Cell 25) is reused so the baseline trains for the
# same number of epochs as the no-skip ResNet.
print("Training Baseline CNN on CIFAR-10...")
(
    model_cnn,
    best_acc_cnn,
    train_acc_cnn,
    val_acc_cnn,
    train_loss_cnn,
    val_loss_cnn,
) = train_and_evaluate_model(
    model_cnn,
    cifar_train_loader,
    cifar_test_loader,
    EPOCHS_NOSKIP,
    criterion_cnn,
    optimizer_cnn,
)

# Report the best accuracy returned by the training helper
print(f"Best Test Accuracy (Baseline CNN): {best_acc_cnn:.4f}")


In [None]:
# Cell 28: Evaluate Baseline CNN on CIFAR-10

# Description:
#   This cell evaluates the trained Baseline CNN model on the CIFAR-10 test dataset.
#   It reports test accuracy, generates a confusion matrix for class-wise evaluation,
#   and visualizes prediction examples with correct and incorrect classifications.
#   Input: Trained BaselineCNN model, CIFAR-10 test data
#   Output: Accuracy score, confusion matrix, and prediction visualization

# This cell evaluates the baseline CNN model, including confusion matrix and prediction examples.

# put the model in evaluation mode and start with empty result lists
model_cnn.eval()
all_preds = []
all_labels = []

# collect predicted and true class indices over every batch of the test loader
with torch.no_grad():
    for images, labels in cifar_test_loader:
        logits = model_cnn(images.to(device))
        all_preds.extend(logits.argmax(dim=1).cpu().numpy())  # predicted class indices
        all_labels.extend(labels.numpy())

# array views of the collected results, built once and reused below
pred_arr = np.array(all_preds)
label_arr = np.array(all_labels)

# confusion matrix (rows: true class, columns: predicted class) and overall accuracy
cnn_conf = confusion_matrix(all_labels, all_preds)
print(f"Baseline CNN Test Accuracy: {np.mean(pred_arr == label_arr):.4f}")

# confusion matrix heatmap annotated with the CIFAR-10 class names
fig_conf, ax_conf = plt.subplots(figsize=(6, 5))
sns.heatmap(cnn_conf, annot=True, fmt='d', cmap='Blues',
            xticklabels=cifar10_classes, yticklabels=cifar10_classes, ax=ax_conf)
ax_conf.set_title("CIFAR-10 Baseline CNN Confusion Matrix")
ax_conf.set_xlabel("Predicted")
ax_conf.set_ylabel("True")
fig_conf.tight_layout()
plt.show()

# visualize example predictions using helper from Cell 24
# NOTE(review): the indices below are positions in loader order; this assumes
# cifar_test_loader yields samples in the same order as x_test_cifar - confirm.
show_resnet_cifar_preds(
    x_test_cifar,
    np.where(pred_arr == label_arr)[0],  # correct indices
    np.where(pred_arr != label_arr)[0],  # incorrect indices
    all_preds,
    all_labels,
    cifar10_classes
)


In [None]:
# SECTION 5: Model Summaries, Weight Visualization, and Black-Box Testing (Cells 29-31)

In [None]:
# Cell 29: Print Model Summaries

# Description:
#   This cell prints model summaries using `torchsummary` for all trained networks in the project.
#   It allows architectural comparison across ResNet-18 (with/without skips), baseline CNN, and MNIST/CIFAR variants.
#   Input: Model instances (already trained)
#   Output: Model layer breakdowns, parameter counts, output shapes

# This cell summarizes all trained models using torchsummary for architecture comparison.

from torchsummary import summary

# print layer-wise structure and parameter counts for each model
# Each call passes a per-sample input shape in (channels, height, width) order:
# (1, 28, 28) for the MNIST model and (3, 32, 32) for the three CIFAR-10 models.
# The models are the instances created in earlier cells (model_mnist and
# model_cifar are defined outside this section).

# ResNet-18 used for MNIST (single-channel 28x28 input)
print("\n--- ResNet-18 Summary (MNIST) ---")
summary(model_mnist, input_size=(1, 28, 28))

# ResNet-18 used for CIFAR-10 (three-channel 32x32 input)
print("\n--- ResNet-18 Summary (CIFAR-10) ---")
summary(model_cifar, input_size=(3, 32, 32))

# ablation variant from Cell 25 (no skip connections)
print("\n--- No-Skip ResNet Summary ---")
summary(model_noskip, input_size=(3, 32, 32))

# baseline CNN from Cell 27
print("\n--- Baseline CNN Summary ---")
summary(model_cnn, input_size=(3, 32, 32))


In [None]:
# Cell 30: Visualize ResNet and Baseline Weights

# Description:
#   This cell performs white-box testing visualizations for trained PyTorch models.
#   It displays the learned filters from the first convolutional layer (conv1)
#   and visualizes the dense (fully connected) output layer's weight matrix using heatmaps.
#   Input: Trained models (ResNet and BaselineCNN)
#   Output: Grid of filter visualizations and heatmaps of fully connected layer weights

# This cell visualizes conv1 filters and dense layer weights from trained models.

# Function Header
# Description:
#   Visualizes the first convolutional layer filters of a given model.
#   Filters are normalized to [0, 1] and shown as individual images.
# Parameters:
#   - model: a PyTorch CNN or ResNet model
#   - title: title prefix for the plot
# Returns: None (displays matplotlib figure)
def show_conv_weights(model, title):
    """Plot up to eight filters of ``model.conv1``, min-max scaled to [0, 1].

    Parameters:
      - model: a module exposing a ``conv1`` layer with a ``weight`` tensor
      - title: prefix used in the figure title
    Returns: None (displays a matplotlib figure)

    Three-dimensional filters (after squeezing) are shown channel-last as
    colour images; two-dimensional ones are shown in grayscale.
    """
    conv_weights = model.conv1.weight.data.clone().cpu()

    # Min-max scale over the whole tensor. When every weight is identical the
    # range is zero; skip the division in that case (the shifted tensor is then
    # all zeros) instead of producing NaNs.
    lowest = conv_weights.min()
    value_range = conv_weights.max() - lowest
    conv_weights = conv_weights - lowest
    if value_range > 0:
        conv_weights = conv_weights / value_range

    n_shown = min(8, conv_weights.shape[0])
    # squeeze=False keeps `axes` a 2-D array even for a single column, so the
    # indexing below also works when the layer has only one filter.
    fig, axes = plt.subplots(1, n_shown, figsize=(12, 3), squeeze=False)
    fig.suptitle(f"{title} - First Conv Layer Filters", fontsize=14)

    for i in range(n_shown):
        ax = axes[0, i]
        img = conv_weights[i].squeeze()
        # (C, H, W) -> (H, W, C) for imshow; single-channel filters are already 2-D.
        ax.imshow(img.permute(1, 2, 0) if img.ndim == 3 else img, cmap='gray')
        ax.axis("off")
    plt.tight_layout()
    plt.show()

# Function Header
# Description:
#   Visualizes the fully connected (FC) output layer weights as a 2D heatmap.
# Parameters:
#   - model: a PyTorch model with a .fc linear output layer
#   - title: title prefix for the heatmap
# Returns: None (displays matplotlib heatmap)
def show_fc_weights(model, title):
    """Draw the weight matrix of ``model.fc`` as a seaborn heatmap.

    Parameters:
      - model: a module exposing an ``fc`` layer with a 2-D ``weight`` tensor
      - title: prefix used in the plot title
    Returns: None (displays a matplotlib figure)
    """
    weight_matrix = model.fc.weight.data.clone().cpu()
    # Rows are labelled "Classes" and columns "Features" on the axes below.
    n_classes, n_features = weight_matrix.shape[0], weight_matrix.shape[1]
    heading = f"{title} - FC Layer Weight Matrix ({n_classes} classes × {n_features} features)"

    plt.figure(figsize=(10, 4))
    sns.heatmap(weight_matrix, cmap="viridis")
    plt.title(heading)
    plt.xlabel("Features")
    plt.ylabel("Classes")
    plt.tight_layout()
    plt.show()

# visualize weights for all trained PyTorch models:
# for each model, first the conv1 filters, then the FC weight heatmap
for _viz_model, _viz_title in (
    (model_mnist, "MNIST ResNet-18"),
    (model_cifar, "CIFAR-10 ResNet-18"),
    (model_noskip, "CIFAR-10 ResNet-18 No-Skip"),
    (model_cnn, "CIFAR-10 Baseline CNN"),
):
    show_conv_weights(_viz_model, _viz_title)
    show_fc_weights(_viz_model, _viz_title)


In [None]:
# Cell 31: Black-Box Testing for Final Model Performance

# Description:
#   This cell performs formal black-box testing on each trained model using unseen test data.
#   Models are treated as black boxes — their outputs are assessed solely based on input-output behavior.
#   Evaluation includes a detailed classification report with precision, recall, F1-score, and support for each class.
#   Input: Trained model, test data tensors
#   Output: Formatted classification reports showing generalization performance

# This cell performs explicit black-box testing on the final trained models by evaluating their input–output behavior.
# Models are evaluated on unseen test sets without accessing internal implementation or weights.
# It confirms functional correctness and compares generalization across models.

from sklearn.metrics import classification_report

# Function Header
# Description:
#   Runs black-box evaluation of a model by comparing predicted and true labels.
#   Outputs a classification report summarizing key performance metrics.
# Parameters:
#   - model: a trained PyTorch model
#   - x_test_tensor: input images tensor
#   - y_test_tensor: ground truth labels tensor
#   - dataset_name: string label for output report title
# Returns: None (prints output)
def black_box_test(model, x_test_tensor, y_test_tensor, dataset_name, batch_size=64):
    """Run black-box testing on test data and print an evaluation summary.

    The model is switched to evaluation mode, predictions are computed batch by
    batch without gradient tracking, and a scikit-learn classification report
    comparing them with the true labels is printed.

    Parameters:
      - model: a trained PyTorch model (left in eval mode afterwards)
      - x_test_tensor: input images tensor, already shaped as the model expects
      - y_test_tensor: ground truth labels tensor
      - dataset_name: string label for the report title
      - batch_size: number of samples per forward pass (default 64, the value
        that was previously hard-coded)
    Returns: None (prints output)
    """
    model.eval()
    # Tensor.numpy() only works for CPU tensors that do not require grad, so
    # detach and move to host memory first; this is a no-op for plain CPU labels.
    y_true = y_test_tensor.detach().cpu().numpy()
    y_pred = []

    batches = DataLoader(TensorDataset(x_test_tensor, y_test_tensor), batch_size=batch_size)
    with torch.no_grad():
        for inputs, _ in batches:
            inputs = inputs.to(device)  # input already correctly shaped
            outputs = model(inputs)
            # argmax over dim 1 picks the predicted class index for each sample
            y_pred.extend(outputs.argmax(dim=1).cpu().numpy())

    print(f"\n─── BLACK BOX TEST REPORT ({dataset_name}) ───")
    print(classification_report(y_true, y_pred, digits=4))

# Run black-box tests on all final models
black_box_test(model_mnist, x_test_torch, y_test_torch, "MNIST ResNet-18")

# Build the CIFAR-10 test tensors once and share them across the three CIFAR
# models instead of repeating the identical conversion for every call.
# transpose(0, 3, 1, 2) moves the last axis to position 1 (channel-first
# layout, assuming x_test_cifar is stored channel-last).
# NOTE(review): x_test_cifar is fed as-is; confirm it carries the same
# preprocessing as the batches served by cifar_test_loader.
x_cifar_blackbox = torch.tensor(x_test_cifar.transpose(0, 3, 1, 2)).float()
y_cifar_blackbox = torch.tensor(y_test_cifar)

for _bb_model, _bb_name in (
    (model_cifar, "CIFAR-10 ResNet-18"),
    (model_noskip, "CIFAR-10 No-Skip ResNet"),
    (model_cnn, "CIFAR-10 Baseline CNN"),
):
    black_box_test(_bb_model, x_cifar_blackbox, y_cifar_blackbox, _bb_name)
