# COMP 551 — Assignment 3

Authors:
 - Bernier, Audréanne
 - Coull-Neveu, Ben
 - Trachsel-Bourbeau, Anjara

Imports

In [1]:
from torchvision import datasets, transforms
import torch
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

CHECKLIST
- Implementation of data normalization (1 points)
- Implementation and training of MLP with no hidden layers (1 points)
- Implementation and training of MLP with one hidden layer and ReLU activation (3 points)
- Implementation and training of MLP with two hidden layers and ReLU activation (1 points)
- Implementation and training of the two hidden layers MLP with tanh activation (2 points)
- Implementation and training of the two hidden layers MLP with Leaky-ReLU activation (2 points)
- L1 regularization (2 points)
- L2 regularization (2 points)
- Train MLP without normalization (1 points)
- Correctly plot and compare the results of the 8 trained models above (5 points)
- Train the MLP with the 128 x 128 FashionMNIST data (1 points)
- Plot the results of the larger models, and compare classification performance and training time (2 points)
- Implement and train the CNN (2 points)
- Re-train the CNN with the 128 x 128 FashionMNIST data (1 points)
- Plot the results of the two CNN trained, and compare classification performance and training time to the best MLP. (4 points)
- Implement and train the pre-trained model with the trainable fully connected layer(s). (3 points)
- Plot the results of the pre-trained model, and compare its performance to the best MLP and regular CNN. (4 points)
- Run an experiment to justify the choice of fully connected layers for the pre-trained model, and show
supporting plots. (3 points)

You can report your findings either in the form of a table or a plot in the write-up. However, include in your
colab notebooks the plots of the test and train performance of the MLPs / CNN / pre-trained model as a function
of training epochs. This will allow you to see how much the network should be trained before it starts to overfit
to the training data.

Note 2: We expect you to provide plots/tables in your report that justifies your choice of hyperparameters
(the learning rates of the MLPs / CNNs / pretrained models, the architectural parameters of the CNNs and
pretrained models). You are not required to perform cross-validation in this project.

# Implement MLP

In [None]:
# Activation functions
def logistic(z):
    """Element-wise logistic sigmoid, 1 / (1 + exp(-z)) (this is not a softmax)."""
    return 1. / (1 + np.exp(-z))


def relu(z):
    """Element-wise rectified linear unit, max(0, z)."""
    return np.maximum(0, z)

In [None]:
class MLP:
    """
    Fully connected network with L = 0, 1 or 2 hidden layers and a single
    scalar output per sample.

    Constructor takes:
    g = output activation function (defaults to identity)
    h = hidden activation function (defaults to identity)
    L = number of hidden layers (0, 1 or 2)
    M = number of hidden units: an int (same width for every hidden layer)
        or a sequence with one entry per hidden layer
    D = number of input features (required)

    -> Weights are initialized randomly (standard normal * 0.01), biases to 0

    NOTE: fit/predict currently use logistic units in every layer (hidden and
    output) and train the weight matrices only; g, h and the biases are
    stored but not used yet (see the TODO above fit).
    """

    def __init__(self, g=None, h=None, L=1, M=64, D=None):
        self.g = g if g is not None else lambda x: x  # default identity
        self.h = h if h is not None else lambda x: x
        # Any other L would leave the instance without weights.
        assert L in (0, 1, 2), "L must be 0, 1 or 2"
        self.L = L

        if isinstance(M, int):
            self.M = [M] * L  # same num of units in each layer
        else:
            assert len(M) == L, "Length of M must equal L"
            self.M = list(M)

        # Initialize weights & biases depending on # of layers (L)
        assert D is not None, "Need number of features"
        if L == 0:  # no hidden layer
            self.w = np.random.randn(D) * 0.01
            self.b = 0.0
        elif L == 1:  # 1 hidden layer
            self.v = np.random.randn(D, self.M[0]) * 0.01  # 1st hidden layer, D x M
            self.c = np.zeros(self.M[0])
            self.w = np.random.randn(self.M[0]) * 0.01  # output layer, M x 1
            self.b = 0.0
        else:  # 2 hidden layers
            self.v1 = np.random.randn(D, self.M[0]) * 0.01  # 1st hidden layer, D x M1
            self.c1 = np.zeros(self.M[0])
            self.v2 = np.random.randn(self.M[0], self.M[1]) * 0.01  # 2nd hidden layer, M1 x M2
            self.c2 = np.zeros(self.M[1])
            self.w = np.random.randn(self.M[1]) * 0.01  # output layer, M2 x 1
            self.b = 0.0

    @staticmethod
    def _logistic(z):
        """Element-wise logistic sigmoid used by fit/predict."""
        return 1. / (1 + np.exp(-z))

    def _initial_params(self):
        """Return copies of the constructor weights, ordered input -> output.

        Copies are returned so that an optimizer updating its parameters in
        place cannot overwrite the initial weights stored on the instance.
        """
        if self.L == 0:
            weights = [self.w]
        elif self.L == 1:
            weights = [self.v, self.w]
        else:
            weights = [self.v1, self.v2, self.w]
        return [wt.copy() for wt in weights]

    @classmethod
    def _forward(cls, x, params):
        """Run the forward pass.

        Returns (acts, yh): acts[0] is x and acts[i] is the output of hidden
        layer i; yh holds the network output, one value per row of x.
        """
        *hidden, w_out = params
        acts = [x]
        for v in hidden:
            acts.append(cls._logistic(np.dot(acts[-1], v)))
        yh = cls._logistic(np.dot(acts[-1], w_out))
        return acts, yh

    # TODO: use self.g / self.h (requires their derivatives) and the biases.
    # IMPLEMENT BACK PROP MANUALLY FOR ALL CASES (SEE ED ANNOUNCEMENT #384)
    def fit(self, x, y, optimizer):
        """Train the weights, starting from the constructor initialization.

        x is an N x D array. y is assumed to be a length-N vector of targets
        in [0, 1] (the error signal below is yh - y) -- TODO confirm once the
        softmax / multi-class output is implemented.
        optimizer must provide run(gradient_fn, x, y, params) and return the
        trained parameter list, which is stored in self.params.

        Returns self.
        """
        N, D = x.shape
        params0 = self._initial_params()
        assert D == params0[0].shape[0], "x must have the D features given to the constructor"

        def gradient(x, y, params):
            n = x.shape[0]
            *hidden, w_out = params
            acts, yh = self._forward(x, params)
            dy = yh - y  # N
            grads = [np.dot(acts[-1].T, dy) / n]  # output layer weights
            if hidden:
                back = np.outer(dy, w_out)  # N x M_last, error w.r.t. last hidden output
            # Walk the hidden layers from last to first.
            for i in range(len(hidden) - 1, -1, -1):
                a = acts[i + 1]
                d_pre = back * a * (1 - a)  # logistic derivative
                grads.append(np.dot(acts[i].T, d_pre) / n)
                if i > 0:
                    back = np.dot(d_pre, hidden[i].T)
            grads.reverse()  # same order as params: input -> output
            return grads

        self.params = optimizer.run(gradient, x, y, params0)
        return self

    def predict(self, x):
        """Return the network output for each row of x (requires a prior fit)."""
        return self._forward(x, self.params)[1]

In [64]:
class GradientDescent:
    """Plain gradient descent over a list of parameters.

    learning_rate: step size applied to every gradient.
    max_iters: maximum number of update steps (may be a float such as 1e4).
    epsilon: stop early once no gradient norm exceeds this value.
    """

    def __init__(self, learning_rate=.001, max_iters=1e4, epsilon=1e-8):
        self.learning_rate = learning_rate
        self.max_iters = max_iters
        self.epsilon = epsilon

    def run(self, gradient_fn, x, y, params):
        """Minimize by repeatedly stepping against gradient_fn(x, y, params).

        gradient_fn must return one gradient per entry of params, in the same
        order. The caller's params are not modified; a new list holding the
        updated parameters is returned.

        Raises ValueError if the number of gradients differs from the number
        of parameters.
        """
        params = list(params)
        iteration = 0
        # Count from 0 so that exactly max_iters updates can be performed.
        while iteration < self.max_iters:
            grad = gradient_fn(x, y, params)
            if len(grad) != len(params):
                raise ValueError("gradient_fn must return one gradient per parameter")
            # Build new values rather than updating in place so the arrays
            # passed in by the caller stay untouched.
            params = [p - self.learning_rate * g for p, g in zip(params, grad)]
            iteration += 1
            if not any(np.linalg.norm(g) > self.epsilon for g in grad):
                break
        return params

# Other Functions

# Load Data

In [18]:
# Normalization statistics, computed from the training split only
dataset = datasets.FashionMNIST(
    root='./data',
    train=True,
    download=True,
    transform=transforms.ToTensor(),
)
# Work on the stored image tensor directly: cast to float, rescale to [0,1]
data = dataset.data.float().div(255.0)

# Greyscale images, so one mean/std over every pixel of the training set
mean, std = data.mean(), data.std()
print(mean, std)

tensor(0.2860) tensor(0.3530)


In [None]:
# Load the FashionMNIST dataset
# Pipeline: ToTensor scales to [0,1], then Normalize gives mean 0, std 1
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((mean,), (std,)),
])

train_dataset = datasets.FashionMNIST(root='./data', train=True, download=True, transform=transform)
test_dataset = datasets.FashionMNIST(root='./data', train=False, download=True, transform=transform)

In [11]:
# get loaders (only the training loader reshuffles its samples)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=64, shuffle=False)

"In the context of images, normalization means converting pixel values (originally in the range [0, 255]) to a specific range, usually between 0 and 1 or -1 and 1, usually by first scaling to [0, 1] (i.e., dividing by 255), and then subtracting the mean and dividing by the standard deviation, either per-channel (for color images) or over the entire training set (for grayscale). This helps stabilize training by centering the data and ensuring all features (pixels) contribute proportionally during optimization."

# MLP Experiments

## 1 - Vary number of hidden layers

First of all, create three different models: (1) an MLP with no hidden layers, i.e., it directly maps the inputs
to outputs, (2) an MLP with a single hidden layer having 256 units and ReLU activations, (3) an MLP with 2
hidden layers each having 256 units with ReLU activations. It should be noted that since we want to perform
classification, all of these models should have a softmax layer at the end. After training, compare the test
accuracy of these three models on the FashionMNIST dataset. Comment on how non-linearity and network
depth affects the accuracy. Are the results that you obtain expected?

In [None]:
# --- 0 Hidden Layers ---
# MLP requires the input size D; use the number of pixels in one flattened image
model0 = MLP(g=logistic, L=0, D=train_dataset[0][0].numel())

In [None]:
# --- 1 Hidden Layer ---
# MLP requires the input size D; use the number of pixels in one flattened image
model1 = MLP(g=logistic, h=relu, L=1, M=256, D=train_dataset[0][0].numel())

In [None]:
# --- 2 Hidden Layers ---
# MLP requires the input size D; use the number of pixels in one flattened image
model2 = MLP(g=logistic, h=relu, L=2, M=[256, 256], D=train_dataset[0][0].numel())

## 2 - Changing activations in 2-layer MLP

Take the last model above, the one with 2 hidden layers, and create two different copies of it in which the
activations are now tanh and Leaky-ReLU. After training these two models compare their test accuracies with
model having ReLU activations. Comment on the performances of these models: which one is better and why?
Are certain activations better than others? If the results are not as you expected, what could be the reason?

## 3 - Adding regularization to 2-layer MLP

Create an MLP with 2 hidden layers each having 256 units with ReLU activations as above. However, this
time, independently add L1 and L2 regularization to the network and train the MLP in this way. How do
these regularizations affect the accuracy? This proportion can be varied as a tunable hyperparameter that can be
explored as part of other project requirements.

## 4 - Unnormalized 2-layer MLP

Create an MLP with 2 hidden layers each having 256 units with ReLU activations as above. However, this time,
train it with unnormalized images. How does this affect the accuracy?

## 5 - Data augmentation

Re-train the MLP from question 3 on a version of FashionMNIST using data augmentation. You can use the
transforms.Compose() function to set your transformations for data augmentation, and the
transform=train_transform argument in the dataset constructor to set the transforms. Is the accuracy
affected, and how? What are the benefits and/or drawbacks of using data augmentation? Can you think of a
situation in which certain types of data augmentation would be harmful?

In [None]:
# reload the data and add other transformations

# CNN Experiments

## 6 - Create a CNN

Using existing libraries such as TensorFlow or PyTorch, create a convolutional neural network (CNN) with 2
convolutional layers, one fully connected hidden layer and one fully connected output layer. Although you
are free in your choice of the hyperparameters of the convolutional layers, set the number of units in the fully
connected layers to be 256. Also, set the activations in all of the layers to be ReLU. Train this CNN on the
FashionMNIST dataset. Does using a CNN increase/decrease the accuracy compared to using MLPs? Provide
comments on your results.

## 7 - Data augmentation

Train the above CNN using FashionMNIST with the data augmentation from Q5. How is the performance
(accuracy and speed) affected?

## 8 - Pre-trained model

Load a pre-trained model that you see fit (e.g., a ResNet) using existing libraries such as TensorFlow or PyTorch,
and then freeze all the convolutional layers and remove all the fully connected ones. Add a number of fully
connected layers of your choice right after the convolutional layers. Train only the fully connected layers of the
pre-trained model on the FashionMNIST dataset with the data augmentation from Q5. How does this pre-trained
model compare to the best MLP in part 5 and to the CNN in part 7 in terms of accuracy? How does it compare
to the previous models in terms of the required training time? Justify your choice of how many fully connected
layers that you have added to the pre-trained model through careful experiments.

# TO CLEAN:

In [None]:
# TODO: find better way to normalize dataset *****FIXED? SEE LOADING DATA SECTION

def compute_loader_stats(loader, return_distribution=False):
    """Compute the mean and standard deviation of all pixel values in a loader.

    loader yields (images, labels) pairs; only images is used. The statistics
    are accumulated batch by batch from running sums, so the full data never
    has to be held in memory unless return_distribution is set.

    Returns (mean, std), or (mean, std, dist) when return_distribution is
    True, where dist is a list holding one flattened NumPy array per batch.

    Raises ValueError if the loader yields no pixels.
    """
    n_pixels = 0
    sum_pixels = 0.0
    sum_squared = 0.0

    dist = []
    for images, _ in loader:
        n_pixels += images.numel()
        sum_pixels += images.sum().item()
        sum_squared += (images ** 2).sum().item()
        if return_distribution:
            dist.append(images.numpy().flatten())

    if n_pixels == 0:
        raise ValueError("loader yielded no pixels")

    mean = sum_pixels / n_pixels
    # E[x^2] - E[x]^2 can come out slightly negative through rounding;
    # clamp at 0 so the square root never produces NaN.
    var = max((sum_squared / n_pixels) - (mean ** 2), 0.0)
    std = np.sqrt(var)
    # Explicit branches: the conditional expression previously applied only
    # to the last tuple element, so a 3-tuple was always returned.
    if return_distribution:
        return mean, std, dist
    return mean, std


def compute_data_stats(*loaders, return_distribution=False):
    """Average the per-loader pixel statistics over several loaders.

    Calls compute_loader_stats on each loader, prints its mean/std, and then
    prints and returns the unweighted average of the per-loader means and of
    the per-loader stds (loaders are not weighted by their size).

    Returns (mean, std), or (mean, std, dist) when return_distribution is
    True, where dist collects the per-batch arrays of every loader.

    Raises ValueError if no loader is given.
    """
    if not loaders:
        raise ValueError("at least one loader is required")

    total_mean = 0
    total_std = 0
    dist = []
    for loader in loaders:
        stats = compute_loader_stats(loader, return_distribution=return_distribution)
        total_mean += stats[0]
        total_std += stats[1]
        if return_distribution:
            # Keep every loader's batches, not only those of the last loader.
            dist.extend(stats[2])
        print(f'Loader mean: {stats[0]:.4f}, std: {stats[1]:.4f}')
    n_loaders = len(loaders)
    total_mean /= n_loaders
    total_std /= n_loaders
    print(f'Overall mean: {total_mean:.4f}, std: {total_std:.4f}')
    # Explicit branches: the conditional expression previously applied only
    # to the last tuple element, so a 3-tuple was always returned.
    if return_distribution:
        return total_mean, total_std, dist
    return total_mean, total_std

In [61]:
# placeholder aliases for the datasets until normalization is done
norm_train, norm_test = train_dataset, test_dataset

In [None]:
# stats = compute_data_stats(train_loader, test_loader, return_distribution=True)
# mean, std = stats[0], stats[1]
# print(f'Final mean: {mean:.4f}, std: {std:.4f}')

# distribution_prenorm = np.concatenate(stats[2])

In [56]:
# sns.histplot(distribution_prenorm, bins=100, kde=True)
# plt.yscale('log')
# plt.title('Pixel Value Distribution Before Normalization')

In [55]:
# # normalize datasets
# train_dataset.transform = transforms.Compose([
#     transforms.ToTensor(),
#     transforms.Normalize((mean,), (std,))
# ])

# test_dataset.transform = transforms.Compose([
#     transforms.ToTensor(),
#     transforms.Normalize((mean,), (std,))
# ])

In [62]:
# stats = compute_data_stats(train_loader, test_loader, return_distribution=True)
# mean_postnorm, std_postnorm = stats[0], stats[1]
# print(f'Final mean: {mean_postnorm:.4f}, std: {std_postnorm:.4f}')

# distribution_postnorm = np.concatenate(stats[2])

In [58]:
# sns.histplot(distribution_postnorm, bins=100, kde=True)
# plt.yscale('log')
# plt.title('Pixel Value Distribution After Normalization')