# Programming assignment 2: Convolutional Neural Networks (100 points)

## Overview
<font size='4'> In this assignment you will practice putting together a Convolutional Neural Network (CNN) classification pipeline. So far we have worked with deep fully-connected networks, using them to explore different optimization strategies and network architectures. Fully-connected networks are a good testbed for experimentation because they are computationally efficient, but in practice CNNs work better for image classification.

<font size='4'>In the first part, you will implement several layer types that are used in convolutional networks using NumPy. In the second part, you will implement a custom CNN and networks based on ResNet using PyTorch. You will also practice hyperparameter tuning to achieve the desired accuracy.

## Submission format
* <font size='4'>`<your_nu_username>_pa2.ipynb`


## setup

In [None]:
# As usual, a bit of setup
import time
import numpy as np
import matplotlib.pyplot as plt

# render matplotlib figures inline in the notebook
%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest' # default interpolation used when showing images
plt.rcParams['image.cmap'] = 'gray' # default colormap used when showing images

# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

def rel_error(x, y):
    """Return the largest element-wise relative error between x and y.

    The relative error of a pair is |x - y| / max(1e-8, |x| + |y|); the
    1e-8 floor keeps the division defined when both entries are zero.
    """
    abs_diff = np.abs(x - y)
    scale = np.maximum(1e-8, np.abs(x) + np.abs(y))
    return np.max(abs_diff / scale)

In [None]:
# Download the CIFAR-10 python archive into the current directory, extract it,
# and delete the archive afterwards.
# !mkdir ../datasets
# !cd ../datasets

# Pick the download method that matches your platform:
# 1 -- Linux
# 2 -- MacOS
# 3 -- Command Prompt on Windows
# 4 -- manually downloading the data
choice = 1


if choice == 1:
    # should work well on Linux and in Powershell on Windows
    !wget http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
elif choice == 2 or choice ==3:
    # if wget is not available for you, try curl
    # should work well on MacOS
    !curl http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz --output cifar-10-python.tar.gz
else:
    print('Please manually download the data from http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz and put it under the datasets folder.')
# NOTE(review): extraction and cleanup run for every choice, including the
# manual one, and operate on the archive in the current working directory --
# confirm that matches where a manually downloaded archive is expected to be.
!tar -xzvf cifar-10-python.tar.gz

# remove the archive once extracted (`del` for Windows Command Prompt, `rm` otherwise)
if choice==3:
    !del cifar-10-python.tar.gz
else:
    !rm cifar-10-python.tar.gz

--2025-02-20 17:08:30--  http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
Resolving www.cs.toronto.edu (www.cs.toronto.edu)... 128.100.3.30
Connecting to www.cs.toronto.edu (www.cs.toronto.edu)|128.100.3.30|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 170498071 (163M) [application/x-gzip]
Saving to: ‘cifar-10-python.tar.gz’


2025-02-20 17:08:35 (37.7 MB/s) - ‘cifar-10-python.tar.gz’ saved [170498071/170498071]

cifar-10-batches-py/
cifar-10-batches-py/data_batch_4
cifar-10-batches-py/readme.html
cifar-10-batches-py/test_batch
cifar-10-batches-py/data_batch_3
cifar-10-batches-py/batches.meta
cifar-10-batches-py/data_batch_2
cifar-10-batches-py/data_batch_5
cifar-10-batches-py/data_batch_1


In [None]:
# helpful functions to process and load the data
from six.moves import cPickle as pickle
import numpy as np
import os
from imageio import imread
import platform

def load_pickle(f):
    """Unpickle and return the object stored in the open binary file ``f``.

    On Python 3 the payload is loaded with ``encoding='latin1'``; on
    Python 2 the plain ``pickle.load`` call is used.

    Raises:
        ValueError: if the interpreter's major version is neither 2 nor 3.

    NOTE: unpickling can execute arbitrary code -- only use on trusted files.
    """
    version = platform.python_version_tuple()
    major = version[0]
    if major not in ('2', '3'):
        raise ValueError("invalid python version: {}".format(version))
    if major == '2':
        return pickle.load(f)
    return pickle.load(f, encoding='latin1')

def load_CIFAR_batch(filename):
    """Load a single CIFAR batch file.

    Returns a tuple (X, Y): X is the 'data' entry reshaped to
    (10000, 3, 32, 32), moved to channels-last (10000, 32, 32, 3) and cast
    to float; Y is the 'labels' entry as a NumPy array.
    """
    with open(filename, 'rb') as f:
        datadict = load_pickle(f)

    images = datadict['data'].reshape(10000, 3, 32, 32)
    images = images.transpose(0, 2, 3, 1).astype("float")
    labels = np.array(datadict['labels'])
    return images, labels

def load_CIFAR10(ROOT):
    """Load the full CIFAR-10 dataset stored under the directory ``ROOT``.

    The training set is the concatenation of 'data_batch_1' ... 'data_batch_5';
    the test set comes from 'test_batch'.

    Returns a tuple (Xtr, Ytr, Xte, Yte).
    """
    train_paths = [os.path.join(ROOT, 'data_batch_%d' % (b, )) for b in range(1, 6)]
    batches = [load_CIFAR_batch(path) for path in train_paths]

    Xtr = np.concatenate([images for images, _ in batches])
    Ytr = np.concatenate([labels for _, labels in batches])
    # the per-batch arrays are no longer needed once concatenated
    del batches

    Xte, Yte = load_CIFAR_batch(os.path.join(ROOT, 'test_batch'))
    return Xtr, Ytr, Xte, Yte


def get_CIFAR10_data(cifar10_dir, num_training=49000, num_validation=1000, num_test=1000,
                     subtract_mean=True):
    """
    Read CIFAR-10 from ``cifar10_dir`` and return train / validation / test
    splits ready for a classifier.

    The first ``num_training`` training images form the training split, the
    following ``num_validation`` images form the validation split, and the
    first ``num_test`` test images form the test split. When ``subtract_mean``
    is true, the mean image of the training split is subtracted from all
    three splits. Images are returned channels-first.

    Returns a dict with keys 'X_train', 'y_train', 'X_val', 'y_val',
    'X_test' and 'y_test'.
    """
    def take_rows(arr, start, stop):
        # index with an explicit list so the result is a fresh array, not a view
        return arr[list(range(start, stop))]

    def channels_first(images):
        return images.transpose(0, 3, 1, 2).copy()

    raw_X, raw_y, raw_X_test, raw_y_test = load_CIFAR10(cifar10_dir)

    # carve out the three splits
    val_stop = num_training + num_validation
    X_val = take_rows(raw_X, num_training, val_stop)
    y_val = take_rows(raw_y, num_training, val_stop)
    X_train = take_rows(raw_X, 0, num_training)
    y_train = take_rows(raw_y, 0, num_training)
    X_test = take_rows(raw_X_test, 0, num_test)
    y_test = take_rows(raw_y_test, 0, num_test)

    # center every split on the training-set mean image (in place)
    if subtract_mean:
        mean_image = np.mean(X_train, axis=0)
        for split_images in (X_train, X_val, X_test):
            split_images -= mean_image

    return {
        'X_train': channels_first(X_train), 'y_train': y_train,
        'X_val': channels_first(X_val), 'y_val': y_val,
        'X_test': channels_first(X_test), 'y_test': y_test,
    }

In [None]:
# Load the (preprocessed) CIFAR10 data.
cifar10_dir = './cifar-10-batches-py'

# subtract_mean=True: the training-set mean image is removed from every split
data = get_CIFAR10_data(cifar10_dir, subtract_mean=True)

# per-channel mean / std used by the normalization below
# NOTE(review): these look like the commonly used ImageNet statistics -- confirm
pix_mean = (0.485, 0.456, 0.406)
pix_std = (0.229, 0.224, 0.225)

# Scale each channel by 1/255, then shift by pix_mean and divide by pix_std.
# NOTE(review): the data was already mean-subtracted by get_CIFAR10_data above,
# so it is centered twice here -- confirm this is intended.
for c in range(3):
    data['X_train'][:, c] = (data['X_train'][:, c] / 255 - pix_mean[c]) / pix_std[c]
    data['X_val'][:, c] = (data['X_val'][:, c] / 255 - pix_mean[c]) / pix_std[c]
    data['X_test'][:, c] = (data['X_test'][:, c] / 255 - pix_mean[c]) / pix_std[c]

# report the shape and value range of every split
for split in ['train', 'val', 'test']:
    print('===\nFor the split {}'.format(split))
    print('shape: {}'.format(data['X_{}'.format(split)].shape))
    print('data value range, min: {}, max: {}\n'.format(data['X_{}'.format(split)].min(), data['X_{}'.format(split)].max()))

===
For the split train
shape: (49000, 3, 32, 32)
data value range, min: -4.489820571085577, max: 0.8966644435551998

===
For the split val
shape: (1000, 3, 32, 32)
data value range, min: -4.489820571085577, max: 0.8966644435551998

===
For the split test
shape: (1000, 3, 32, 32)
data value range, min: -4.489820571085577, max: 0.8966644435551998



## Part 1: Implementing convolution and batch normalization layers using Numpy (25 points)
(adapted from the work done by Erik Learned-Miller, which was originally developed by Fei-Fei Li, Andrej Karpathy, and Justin Johnson)

<font size="4" color="red">**task 1.1: forward pass of a convolution layer with two nested for loops (10 points)**

In [None]:
def conv_forward_naive(x, w, b, conv_param):
    """
    A naive implementation of the forward pass for a convolutional layer.

    The input consists of N data points, each with C channels, height H and
    width W. We convolve each input with F different filters, where each filter
    spans all C channels and has height HH and width WW.

    Only the two spatial output axes are looped over in Python; the batch and
    filter axes are handled in one contraction per output position.

    Input:
    - x: Input data of shape (N, C, H, W)
    - w: Filter weights of shape (F, C, HH, WW)
    - b: Biases, of shape (F,)
    - conv_param: A dictionary with the following keys:
      - 'stride': The number of pixels between adjacent receptive fields in the
        horizontal and vertical directions.
      - 'pad': The number of pixels that will be used to zero-pad the input.

    'pad' zeros are placed symmetrically (i.e. equally on both sides) along the
    height and width axes of the input. The original input x is not modified.

    Returns a tuple of:
    - out: Output data, of shape (N, F, H', W') where H' and W' are given by
      H' = ceil((H + 2 * pad - HH + 1) / stride)
      W' = ceil((W + 2 * pad - WW + 1) / stride)
    - cache: (x, w, b, conv_param)

    Raises:
    - ValueError: if x and w disagree on the number of channels.
    """
    N, C, H, W = x.shape
    F, C_w, HH, WW = w.shape
    if C != C_w:
        raise ValueError(
            'Channel mismatch: input has %d channels, filters have %d' % (C, C_w))
    pad, stride = conv_param['pad'], conv_param['stride']

    # ceil((d + 1) / stride) == d // stride + 1 for integer d, so the output
    # size can be computed exactly in integer arithmetic.
    H_out = (H + 2 * pad - HH) // stride + 1
    W_out = (W + 2 * pad - WW) // stride + 1
    out = np.zeros((N, F, H_out, W_out))

    # np.pad returns a new array, so x itself stays untouched
    x_padded = np.pad(x, ((0, 0), (0, 0), (pad, pad), (pad, pad)), mode='constant')

    for i in range(H_out):
        h_start = i * stride
        for j in range(W_out):
            w_start = j * stride
            # receptive field of every sample at this output position: (N, C, HH, WW)
            window = x_padded[:, :, h_start:h_start + HH, w_start:w_start + WW]
            # contract (C, HH, WW) against every filter -> (N, F), then add the bias
            out[:, :, i, j] = np.tensordot(window, w, axes=([1, 2, 3], [1, 2, 3])) + b

    cache = (x, w, b, conv_param)
    return out, cache

In [None]:
# check your forward pass implementation
# deterministic inputs: evenly spaced values reshaped to the tensor shapes
x_shape = (2, 3, 4, 4)
w_shape = (3, 3, 4, 4)
x = np.linspace(-0.1, 0.5, num=np.prod(x_shape)).reshape(x_shape)
w = np.linspace(-0.2, 0.3, num=np.prod(w_shape)).reshape(w_shape)
b = np.linspace(-0.1, 0.2, num=3)

conv_param = {'stride': 2, 'pad': 1}
out, _ = conv_forward_naive(x, w, b, conv_param)
# reference output for the inputs above, shape (2, 3, 2, 2)
correct_out = np.array([[[[-0.08759809, -0.10987781],
                           [-0.18387192, -0.2109216 ]],
                          [[ 0.21027089,  0.21661097],
                           [ 0.22847626,  0.23004637]],
                          [[ 0.50813986,  0.54309974],
                           [ 0.64082444,  0.67101435]]],
                         [[[-0.98053589, -1.03143541],
                           [-1.19128892, -1.24695841]],
                          [[ 0.69108355,  0.66880383],
                           [ 0.59480972,  0.56776003]],
                          [[ 2.36270298,  2.36904306],
                           [ 2.38090835,  2.38247847]]]])

# Compare your output to ours; difference should be around e-8
print('Testing conv_forward_naive')
print('difference: ', rel_error(out, correct_out))

Testing conv_forward_naive
difference:  2.2121476417505994e-08


<font size='4' color='red'>**Task 1.2: forward pass of a (normal) batch norm layer (10 points).**

<font size='4'>Batch normalization is a very useful technique for training deep neural networks. As proposed in the original paper [1], batch normalization can also be used for convolutional networks, but we need to tweak it a bit; the modification will be called "spatial batch normalization."

<font size='4'>Normally batch-normalization accepts inputs of shape `(N, D)` and produces outputs of shape `(N, D)`, where we normalize across the minibatch dimension `N`.

[1] [Sergey Ioffe and Christian Szegedy, "Batch Normalization: Accelerating Deep Network Training by Reducing
Internal Covariate Shift", ICML 2015.](https://arxiv.org/abs/1502.03167)

In [None]:
def batchnorm_forward(x, gamma, beta, bn_param):
    """
    Forward pass for batch normalization.

    In 'train' mode the per-feature mean and (uncorrected) variance of the
    minibatch are used to normalize x, and the running statistics are updated
    with an exponential decay controlled by momentum:

    running_mean = momentum * running_mean + (1 - momentum) * sample_mean
    running_var = momentum * running_var + (1 - momentum) * sample_var

    In 'test' mode the stored running statistics are used to normalize x
    instead. In both modes the normalized data is scaled by gamma and
    shifted by beta.

    Note that the batch normalization paper suggests a different test-time
    behavior (statistics estimated over a large number of training images);
    running averages are used here because they need no extra estimation step.

    Input:
    - x: Data of shape (N, D)
    - gamma: Scale parameter of shape (D,)
    - beta: Shift parameter of shape (D,)
    - bn_param: Dictionary with the following keys:
      - mode: 'train' or 'test'; required
      - eps: Constant for numeric stability (default 1e-5)
      - momentum: Constant for running mean / variance (default 0.9)
      - running_mean: Array of shape (D,) giving running mean of features
      - running_var: Array of shape (D,) giving running variance of features
      Missing running statistics start as zeros; both are written back into
      bn_param before returning.

    Returns a tuple of:
    - out: of shape (N, D)
    - cache: In 'train' mode the tuple (x, x_norm, mean, var, gamma, beta, eps)
      needed in the backward pass; None in 'test' mode

    Raises:
    - ValueError: if mode is neither 'train' nor 'test'.
    """
    mode = bn_param['mode']
    eps = bn_param.get('eps', 1e-5)
    momentum = bn_param.get('momentum', 0.9)

    N, D = x.shape
    running_mean = bn_param.get('running_mean', np.zeros(D, dtype=x.dtype))
    running_var = bn_param.get('running_var', np.zeros(D, dtype=x.dtype))

    if mode not in ('train', 'test'):
        raise ValueError('Invalid forward batchnorm mode "%s"' % mode)

    if mode == 'train':
        # minibatch statistics, one value per feature
        batch_mean = x.mean(axis=0)
        batch_var = x.var(axis=0)

        # normalize with the standard deviation, then scale and shift
        x_hat = (x - batch_mean) / np.sqrt(batch_var + eps)
        out = gamma * x_hat + beta

        # fold the minibatch statistics into the running averages used at test time
        running_mean = momentum * running_mean + (1 - momentum) * batch_mean
        running_var = momentum * running_var + (1 - momentum) * batch_var

        # intermediates for the backward pass
        cache = (x, x_hat, batch_mean, batch_var, gamma, beta, eps)
    else:
        # test time: normalize with the running statistics
        x_hat = (x - running_mean) / np.sqrt(running_var + eps)
        out = gamma * x_hat + beta
        cache = None

    # persist the (possibly updated) running statistics
    bn_param['running_mean'] = running_mean
    bn_param['running_var'] = running_var

    return out, cache

<font size='4' color='red'>**Task 1.3: forward pass of a spatial batch norm layer (5 points).**

<font size='4'>For data coming from convolutional layers, batch normalization needs to accept inputs of shape `(N, C, H, W)` and produce outputs of shape `(N, C, H, W)` where the `N` dimension gives the minibatch size and the `(H, W)` dimensions give the spatial size of the feature map. Specifically, we expect the statistics of each feature channel to be relatively consistent both between different images and between different locations within the same image. Therefore spatial batch normalization computes a mean and variance for each of the `C` feature channels by computing statistics over both the minibatch dimension `N` and the spatial dimensions `H` and `W`.

In [None]:
def spatial_batchnorm_forward(x, gamma, beta, bn_param):
    """
    Computes the forward pass for spatial batch normalization.

    Every (sample, row, column) position is treated as one observation of the
    C channel features, so the vanilla batchnorm_forward can be reused on a
    (N * H * W, C) view of the input.

    Inputs:
    - x: Input data of shape (N, C, H, W)
    - gamma: Scale parameter, of shape (C,)
    - beta: Shift parameter, of shape (C,)
    - bn_param: Dictionary with the following keys:
      - mode: 'train' or 'test'; required
      - eps: Constant for numeric stability
      - momentum: Constant for running mean / variance. momentum=0 means that
        old information is discarded completely at every time step, while
        momentum=1 means that new information is never incorporated. The
        default of momentum=0.9 should work well in most situations.
      - running_mean: Array of shape (C,) giving running mean of features
      - running_var: Array of shape (C,) giving running variance of features

    Returns a tuple of:
    - out: Output data, of shape (N, C, H, W)
    - cache: Values needed for the backward pass, as returned by
      batchnorm_forward for the flattened input
    """
    N, C, H, W = x.shape

    # channels last, then collapse batch and spatial axes: (N*H*W, C)
    channels_last = x.transpose(0, 2, 3, 1)
    flat_in = channels_last.reshape(-1, C)

    flat_out, cache = batchnorm_forward(flat_in, gamma, beta, bn_param)

    # undo the flattening and bring channels back to axis 1
    out = flat_out.reshape(N, H, W, C).transpose(0, 3, 1, 2)
    return out, cache

In [None]:
np.random.seed(231)
# Check the training-time forward pass by checking means and variances
# of features both before and after spatial batch normalization

N, C, H, W = 2, 3, 4, 5
x = 4 * np.random.randn(N, C, H, W) + 10

# per-channel statistics: reduce over the batch and both spatial axes
print('Before spatial batch normalization:')
print('  Shape: ', x.shape)
print('  Means: ', x.mean(axis=(0, 2, 3)))
print('  Stds: ', x.std(axis=(0, 2, 3)))

# Means should be close to zero and stds close to one
gamma, beta = np.ones(C), np.zeros(C)
bn_param = {'mode': 'train'}
out, _ = spatial_batchnorm_forward(x, gamma, beta, bn_param)
print('After spatial batch normalization:')
print('  Shape: ', out.shape)
print('  Means: ', out.mean(axis=(0, 2, 3)))
print('  Stds: ', out.std(axis=(0, 2, 3)))

# Means should be close to beta and stds close to gamma
# (the same bn_param dict is reused for this second training-mode call)
gamma, beta = np.asarray([3, 4, 5]), np.asarray([6, 7, 8])
out, _ = spatial_batchnorm_forward(x, gamma, beta, bn_param)
print('After spatial batch normalization (nontrivial gamma, beta):')
print('  Shape: ', out.shape)
print('  Means: ', out.mean(axis=(0, 2, 3)))
print('  Stds: ', out.std(axis=(0, 2, 3)))

Before spatial batch normalization:
  Shape:  (2, 3, 4, 5)
  Means:  [9.33463814 8.90909116 9.11056338]
  Stds:  [3.61447857 3.19347686 3.5168142 ]
After spatial batch normalization:
  Shape:  (2, 3, 4, 5)
  Means:  [ 6.18949336e-16  5.99520433e-16 -1.22124533e-16]
  Stds:  [0.99999962 0.99999951 0.9999996 ]
After spatial batch normalization (nontrivial gamma, beta):
  Shape:  (2, 3, 4, 5)
  Means:  [6. 7. 8.]
  Stds:  [2.99999885 3.99999804 4.99999798]


In [None]:
np.random.seed(231)
# Check the test-time forward pass by running the training-time
# forward pass many times to warm up the running averages, and then
# checking the means and variances of activations after a test-time
# forward pass.
N, C, H, W = 10, 4, 11, 12

bn_param = {'mode': 'train'}
gamma = np.ones(C)
beta = np.zeros(C)
# warm-up: 50 training-mode passes on fresh random batches; only the running
# statistics accumulated in bn_param are kept, the outputs are discarded
for t in range(50):
  x = 2.3 * np.random.randn(N, C, H, W) + 13
  spatial_batchnorm_forward(x, gamma, beta, bn_param)
# switch to test mode and normalize a new batch with the running statistics
bn_param['mode'] = 'test'
x = 2.3 * np.random.randn(N, C, H, W) + 13
a_norm, _ = spatial_batchnorm_forward(x, gamma, beta, bn_param)

# Means should be close to zero and stds close to one, but will be
# noisier than training-time forward passes.
print('After spatial batch normalization (test-time):')
print('  means: ', a_norm.mean(axis=(0, 2, 3)))
print('  stds: ', a_norm.std(axis=(0, 2, 3)))

After spatial batch normalization (test-time):
  means:  [-0.08034406  0.07562881  0.05716371  0.04378383]
  stds:  [0.96718744 1.0299714  1.02887624 1.00585577]


## Part 2: Implementing CNNs (Convolutional Neural Networks) using PyTorch (75 points)
<font size='4'>You may find the documentation of PyTorch useful https://pytorch.org/docs/stable/index.html.

<font size='4' color='red'>**Task 2.1: Implement a custom CNN (12 points).**

In [None]:
import torch
import torch.nn as nn
from torch import Tensor

class ConvNet(nn.Module):
    """
    A simple convolutional network with the following architecture:

    [conv - bn - relu] x M - global_average_pooling - affine - softmax

    "[conv - bn - relu] x M" means the "conv-bn-relu" block is repeated for
    M times, where M is implicitly defined by the convolution layers' parameters.
    Whether to use the batch normalization layer (bn) in-between is a design choice.

    For each convolution layer, we do downsampling of factor 2 by setting the stride
    to be 2. So we can have a large receptive field size.

    The network operates on minibatches of data that have shape (N, C, H, W)
    consisting of N images, each with height H and width W and with C input
    channels. The module itself outputs logits; no softmax layer is applied
    inside forward.
    """

    def __init__(self, input_dim=(3, 32, 32), filter_sizes=(7,), filter_channels=(32,),
            num_classes=10, use_batch_norm=True):
        """
        Initialize a new CNN.

        Inputs:
        - input_dim: Tuple (C, H, W) giving size of input data
        - filter_sizes: Width/height of filters to use in the convolutional layer. It is a
          sequence whose length defines the number of convolution layers.
        - filter_channels: Number of filters to use in each convolutional layer. It has the
          same length as filter_sizes.
        - num_classes: Number of output classes
        - use_batch_norm: A boolean variable indicating whether to use batch normalization
        """
        super().__init__()

        assert len(filter_sizes) == len(filter_channels), "Inconsistent filter sizes and channels."

        # only the channel count is needed: global average pooling makes the
        # classifier independent of the spatial size
        in_channels, _, _ = input_dim

        layers = []
        for kernel_size, out_channels in zip(filter_sizes, filter_channels):
            # (k - 1) // 2 padding preserves width/height for odd k before striding;
            # stride 2 then halves the resolution
            layers.append(nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size,
                                    stride=2, padding=(kernel_size - 1) // 2))
            if use_batch_norm:
                layers.append(nn.BatchNorm2d(out_channels))
            layers.append(nn.ReLU(inplace=True))
            in_channels = out_channels

        self.conv_layers = nn.Sequential(*layers)
        # global average pooling: (N, C, H, W) -> (N, C, 1, 1)
        self.global_average_pooling = nn.AdaptiveAvgPool2d(1)
        # affine layer mapping the pooled channels to class scores
        self.fc = nn.Linear(in_channels, num_classes)

    def forward(self, x):
        """
        Compute class scores for a minibatch.

        Inputs:
        - x: Input tensor of shape (N, C, H, W)

        Returns a tuple of:
        - logits: Class scores of shape (N, num_classes)
        - feat_before_gap: Feature map right before the global average pooling
          layer, returned for debugging purposes
        """
        feat_before_gap = self.conv_layers(x)
        pooled = self.global_average_pooling(feat_before_gap)
        # flatten from dim 1 so the batch axis survives even when N == 1 or
        # the last layer has a single channel (a bare squeeze() would drop it)
        logits = self.fc(pooled.flatten(1))
        return logits, feat_before_gap

In [None]:
# Sanity check of the model
# three stride-2 conv blocks with 4, 8 and 16 filters
model = ConvNet(filter_sizes=[3, 3, 3], filter_channels=[4, 8, 16])
print(model)

# a random minibatch of 4 images with 3 channels and 32x32 pixels
x = torch.rand((4, 3, 32, 32))
logits, feat_before_gap = model(x)
assert logits.shape == torch.Size([4, 10]), "Incorrect shape for the logits"
print(feat_before_gap.shape)
# each of the three stride-2 layers halves the resolution: 32 -> 16 -> 8 -> 4
assert feat_before_gap.shape == torch.Size([4, 16, 4, 4]), "Incorrect shape for the feature map before the GAP layer"

ConvNet(
  (conv_layers): Sequential(
    (0): Conv2d(3, 4, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (1): BatchNorm2d(4, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): Conv2d(4, 8, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (4): BatchNorm2d(8, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ReLU(inplace=True)
    (6): Conv2d(8, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (7): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (8): ReLU(inplace=True)
  )
  (global_average_pooling): AdaptiveAvgPool2d(output_size=1)
  (fc): Linear(in_features=16, out_features=10, bias=True)
)
torch.Size([4, 16, 4, 4])


<font size='4' color='red'>**Task 2.2: Implement a function to test a CNN (6 points).**

In [None]:
# Function to test an already trained model
def test_model(model, data_loader):
    """
    Compute accuracy of the model.

    Inputs:
      - model: A CNN implemented in PyTorch
      - data_loader: A data loader that will provide batched images and labels
    """

    # set the model in evaluation mode so the batch norm layers will behave correctly
    model.eval()

    correct = 0
    total = 0
    # since we're not training, we don't need to calculate the gradients for our outputs
    with torch.no_grad():
        for batch_data in data_loader:
            images, labels = batch_data
            images = images.cuda()
            labels = labels.cuda()

            predicted = None
            ############################################################################
            # TODO: Compute the predicted labels of the batched input images and store #
            # them in the predicted varaible.                                          #
            ############################################################################
            logits, _ = model(images) # calculate the logits
            predicted = torch.argmax(logits, dim=1) # get the label for the logits
            ############################################################################
            #                             END OF YOUR CODE                             #
            ############################################################################

            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    acc = 100 * correct // total
    return acc

<font size='4' color='red'>**Task 2.3: Implement a function to train and validate a CNN (11 points).**

In [None]:
def train_val_model(model, train_data_loader, val_data_loader, loss_fn, optimizer, lr_scheduler, num_epochs, print_freq=50):
    """
    Training and validating a CNN model using PyTorch.

    Inputs:
      - model: A CNN implemented in PyTorch whose forward pass returns a
        (logits, extra) pair; only the logits are used here
      - train_data_loader: A data loader that provides batched training images and labels
      - val_data_loader: A data loader that provides batched validation images and labels
      - loss_fn: A loss function (e.g., cross entropy loss)
      - optimizer: Optimizer that updates the model parameters
      - lr_scheduler: Learning rate scheduler, stepped once per epoch
      - num_epochs: Number of epochs in total
      - print_freq: Frequency (in mini-batches) to print training statistics

    Output:
      - model: Trained CNN model
    """

    # Train on the device the model already lives on instead of assuming a
    # GPU. A model without parameters falls back to CUDA when it is available.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        device = first_param.device
    else:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    for epoch_i in range(num_epochs):
        # set the model in the train mode so the batch norm layers will behave correctly
        model.train()

        running_loss = 0.0
        running_total = 0
        running_correct = 0
        running_batches = 0  # mini-batches accumulated since the last report
        for i, batch_data in enumerate(train_data_loader):
            # Every data instance is an image + label pair
            images, labels = batch_data
            images = images.to(device)
            labels = labels.to(device)

            predicted = None
            ############################################################################
            # TODO: Finish loss computation, gradient backpropagation, weight update,  #
            # and computing the predicted labels of the input images and store them in #
            # the predicted variable, which will be used to monitor the training       #
            # accuracy.                                                                #
            #                                                                          #
            # Note: The learning rate is updated after each **epoch**.                 #
            ############################################################################
            # calculate the logits
            logits, _ = model(images)
            # the loss function needs integer class indices
            labels = labels.long()
            loss = loss_fn(logits, labels)
            # clear stale gradients, backpropagate, then update the weights
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # index of the largest logit is the predicted class
            predicted = torch.argmax(logits, dim=1)
            ############################################################################
            #                             END OF YOUR CODE                             #
            ############################################################################

            # print statistics
            running_loss += loss.item()
            running_total += labels.size(0)
            running_correct += (predicted == labels).sum().item()
            running_batches += 1
            if i % print_freq == 0:    # print every certain number of mini-batches
                # Average over the batches actually accumulated: the first
                # report of an epoch covers one batch, not print_freq batches.
                avg_loss = running_loss / running_batches
                running_acc = running_correct / running_total * 100
                last_lr = lr_scheduler.get_last_lr()[0]
                print(f'[{epoch_i + 1}/{num_epochs}, {i + 1:5d}/{len(train_data_loader)}] loss: {avg_loss:.3f} acc: {running_acc:.3f} lr: {last_lr:.5f}')
                running_loss = 0.0
                running_total = 0
                running_correct = 0
                running_batches = 0

        # adjust the learning rate
        lr_scheduler.step()

        val_acc = test_model(model, val_data_loader)
        print(f'[{epoch_i + 1}/{num_epochs}] val acc: {val_acc:.3f}')

    return model











































































<font size='4' color='red'>**Task 2.4: Implement a function to set up the loss function, optimizer, and learning rate scheduler (8 points).**

In [None]:
def set_up_loss_optimizer_lr_scheduler(model, learning_rate, momentum, lr_step_size, lr_gamma):
    """
    Build the three training components used in this programming assignment:
    a cross entropy loss, an SGD + momentum optimizer, and a StepLR learning
    rate scheduler. Please refer to https://pytorch.org/docs/stable/optim.html#algorithms
    and https://pytorch.org/docs/stable/generated/torch.optim.lr_scheduler.StepLR.html#torch.optim.lr_scheduler.StepLR for more details.

    Inputs:
      - model: Model whose parameters will be optimized
      - learning_rate: Initial learning rate for SGD
      - momentum: Momentum factor for SGD
      - lr_step_size: Value passed to StepLR as step_size
      - lr_gamma: Value passed to StepLR as gamma

    Output:
      - (loss_fn, optimizer, lr_scheduler) tuple
    """
    ############################################################################
    # TODO: Define the loss function, optimizer (SGD + momentum), and          #
    # learning rate scheduler (StepLR).                                        #
    #                                                                          #
    # Note: We expect you to set up the learning rate in an epoch-based way.   #
    # We will run the learning rate scheduler after each epoch.                #
    ############################################################################
    criterion = nn.CrossEntropyLoss()
    sgd = torch.optim.SGD(
        model.parameters(),
        lr=learning_rate,
        momentum=momentum,
    )
    step_lr = torch.optim.lr_scheduler.StepLR(
        sgd,
        step_size=lr_step_size,
        gamma=lr_gamma,
    )
    ############################################################################
    #                             END OF YOUR CODE                             #
    ############################################################################

    return criterion, sgd, step_lr

In [None]:
# no need to implement anything here
def set_up_cifar10_data_loader(images, labels, batch_size, shuffle=True):
    """
    Wrap image and label arrays in a batched PyTorch data loader.

    Both inputs are converted with torch.Tensor, so the labels come out as
    floating point tensors.
    """
    image_tensor = torch.Tensor(images)
    label_tensor = torch.Tensor(labels)
    return torch.utils.data.DataLoader(
        torch.utils.data.TensorDataset(image_tensor, label_tensor),
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=2,
    )

<font size='4' color='red'>**Task 2.5: Train a good custom CNN (10 points).**

<font size='4'>By tweaking different hyper parameters, such as number of convolution layers, number of filters (channels), learning rate, batch size, etc, you should achieve greater than 60% accuracy on the testing set **with 3 epochs using the SGD + momentum optimizer**.
    
<font size='4' color='red'>**Note: The total number of parameters of your custom CNN should be smaller than 180K.**

In [None]:
# In practice, this is a hyperparameter to tune.
# But here we use a fixed number to make the comparisons fair.
num_epochs = 3

model = None
loss_fn = None
optimizer = None
lr_scheduler = None
############################################################################
# TODO: Set up and tune the hyper parameters.                              #
############################################################################
batch_size = 32
learning_rate = 0.01
momentum = 0.9
lr_step_size = 2  # number of epochs between learning rate decays
lr_gamma = 0.5    # factor passed to StepLR as gamma

# ConvNet is defined earlier in the notebook; the recorded run of this
# configuration reports 99.210K parameters, under the 180K limit.
model = ConvNet(filter_sizes=[3, 3, 3, 3], filter_channels=[16, 32, 64, 128])
loss_fn, optimizer, lr_scheduler = set_up_loss_optimizer_lr_scheduler(
    model, learning_rate, momentum, lr_step_size, lr_gamma
)
############################################################################
#                             END OF YOUR CODE                             #
############################################################################

model = model.cuda()
num_params = sum(p.numel() for p in model.parameters())
print('Number of parameters: {:.3f}K'.format(num_params / 1000))

# set up the data loaders
# note the usage of the batch_size hyperparameter here
train_loader = set_up_cifar10_data_loader(data['X_train'], data['y_train'], batch_size, shuffle=True)
print("There are {} batches in the training set.".format(len(train_loader)))

val_loader = set_up_cifar10_data_loader(data['X_val'], data['y_val'], batch_size, shuffle=False)
print("There are {} batches in the validation set.".format(len(val_loader)))

test_loader = set_up_cifar10_data_loader(data['X_test'], data['y_test'], batch_size, shuffle=False)
print("There are {} batches in the testing set.".format(len(test_loader)))

# train for num_epochs (validating after every epoch), then report test accuracy
model = train_val_model(model, train_loader, val_loader, loss_fn, optimizer, lr_scheduler, num_epochs)
test_acc = test_model(model, test_loader)
print(f"testing accuracy: {test_acc:.3f}")

Number of parameters: 99.210K
There are 1532 batches in the training set.
There are 32 batches in the validation set.
There are 32 batches in the testing set.
[1/3,     1/1532] loss: 0.045 acc: 12.500 lr: 0.01000
[1/3,    51/1532] loss: 2.066 acc: 24.688 lr: 0.01000
[1/3,   101/1532] loss: 1.849 acc: 32.688 lr: 0.01000
[1/3,   151/1532] loss: 1.761 acc: 35.875 lr: 0.01000
[1/3,   201/1532] loss: 1.684 acc: 37.250 lr: 0.01000
[1/3,   251/1532] loss: 1.624 acc: 39.500 lr: 0.01000
[1/3,   301/1532] loss: 1.614 acc: 41.812 lr: 0.01000
[1/3,   351/1532] loss: 1.531 acc: 44.250 lr: 0.01000
[1/3,   401/1532] loss: 1.455 acc: 47.188 lr: 0.01000
[1/3,   451/1532] loss: 1.480 acc: 47.688 lr: 0.01000
[1/3,   501/1532] loss: 1.472 acc: 47.312 lr: 0.01000
[1/3,   551/1532] loss: 1.433 acc: 46.375 lr: 0.01000
[1/3,   601/1532] loss: 1.451 acc: 47.750 lr: 0.01000
[1/3,   651/1532] loss: 1.390 acc: 49.062 lr: 0.01000
[1/3,   701/1532] loss: 1.458 acc: 48.688 lr: 0.01000
[1/3,   751/1532] loss: 1.427 a

<font size='4' color='red'>**Task 2.6: Implement a ResNet-like CNN (11 points).**

<font size='4'> In practice, we can borrow an existing model design for our task. ResNet (residual network) is a classic design that is used in many places. Let's experiment with it here. Since we are dealing with small images (32x32), regular ResNets are too deep and apply too much downsampling. We need to chop off a few blocks to reduce the depth and the downsampling factor.

In [None]:
from functools import partial
from typing import Any, Callable, List, Optional, Tuple, Type, Union

from torchvision.models.resnet import conv1x1, conv3x3, BasicBlock, Bottleneck, ResNet

class MyResNet(ResNet):
    """ResNet variant that keeps only layer1 and layer2, for small (32x32) inputs."""

    def __init__(
        self,
        block: Type[Union[BasicBlock, Bottleneck]],
        layers: List[int],
        num_classes: int = 1000,
        zero_init_residual: bool = False,
        groups: int = 1,
        width_per_group: int = 64,
        replace_stride_with_dilation: Optional[List[bool]] = None,
        norm_layer: Optional[Callable[..., nn.Module]] = None,
    ) -> None:
        """
        Here we will design a model architecture MyResNet, inherited from the ResNet model.
        First check here https://github.com/pytorch/vision/blob/main/torchvision/models/resnet.py about the
        implementation of ResNet in PyTorch.
        What you need to do in this part is to remove layer3 and layer4 and also modify the final
        fully-connected layer accordingly.

        All arguments are forwarded unchanged to the parent ResNet constructor,
        so `layers` is still passed in full even though only layer1 and layer2
        are kept afterwards.
        """
        super().__init__(
            block, layers, num_classes, zero_init_residual, groups,
            width_per_group, replace_stride_with_dilation, norm_layer
        )

        ############################################################################
        # TODO: Remove the layer3 and layer4 block in the original implementation  #
        # of ResNet and modify the fully-connected layer (classifier) accordingly. #
        ############################################################################
        del self.layer3
        del self.layer4
        # the classifier now consumes the output of layer2, which has
        # 128 * block.expansion channels
        self.fc = nn.Linear(128 * block.expansion, num_classes)
        ############################################################################
        #                             END OF YOUR CODE                             #
        ############################################################################

    def _forward_impl(self, x: Tensor) -> Tuple[Tensor, Tensor]:
        """
        Run the truncated network on a batch of images.

        Returns a (logits, feat_before_gap) pair: the class scores and the
        feature map produced by layer2, right before global average pooling.
        """
        logits = None
        feat_before_gap = None
        ############################################################################
        # TODO: Implement the forward pass for the ResNet-like model,              #
        # computing the class scores for x and storing them in the logits          #
        # variable. Also, store the feature map right before the global average    #
        # pooling (GAP) layer in the feat_before_gap variable for debugging        #
        # purpose only.                                                            #
        ############################################################################

        # stem: conv + BN + ReLU + max pooling
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        # the two remaining residual stages
        x = self.layer1(x)
        x = self.layer2(x)
        # store the feature map
        feat_before_gap = x

        x = self.avgpool(x)  # (N, C, H, W) -> (N, C, 1, 1)
        x = torch.flatten(x, 1)  # (N, C)
        logits = self.fc(x)
        ############################################################################
        #                             END OF YOUR CODE                             #
        ############################################################################

        return logits, feat_before_gap

In [None]:
# Let's run a sanity check of your model
# Build the truncated ResNet with BasicBlock and 10 output classes.
model = MyResNet(BasicBlock, [2, 2, 2, 2], num_classes=10)

# A random batch of 4 three-channel 32x32 inputs.
x = torch.rand((4, 3, 32, 32))
logits, feat_before_gap = model(x)
# One score per class for each of the 4 inputs.
assert logits.shape == torch.Size([4, 10]), "Incorrect shape for the logits"
# Only the spatial size is checked here; it must be 4x4 for a 32x32 input.
assert feat_before_gap.shape[2:] == torch.Size([4, 4]), "Incorrect shape for the feature map before the GAP layer"

<font size='4' color='red'>**Task 2.7: Train a good custom ResNet-like model (6 points).**

<font size='4'>Here we use the same batch size used in the tweaking of your custom CNN. We will also simply use (part of) the ResNet18 model. You only need to tune learning rate, momentum, learning rate decay rate here. You should achieve greater than 70% accuracy on the testing set **with 3 epochs using the SGD + momentum optimizer**.

In [None]:
# In practice, this is a hyperparameter to tune.
# But here we use a fixed number to make the comparisons fair.
num_epochs = 3

model = MyResNet(BasicBlock, [2, 2, 2, 2], num_classes=10)
num_params = sum(p.numel() for p in model.parameters())
print('Number of parameters: {:.3f}K'.format(num_params / 1000))

############################################################################
# TODO: Set up and tune the hyper parameters.                              #
############################################################################
batch_size = 32
learning_rate = 0.01
momentum = 0.9
lr_step_size = 2  # number of epochs between learning rate decays
lr_gamma = 0.5    # factor passed to StepLR as gamma

# reuse the Task 2.4 helper so every experiment builds its loss, optimizer
# and scheduler the same way
loss_fn, optimizer, lr_scheduler = set_up_loss_optimizer_lr_scheduler(
    model, learning_rate, momentum, lr_step_size, lr_gamma
)
############################################################################
#                             END OF YOUR CODE                             #
############################################################################

# set up the data loaders
# note the usage of the batch_size hyperparameter here
train_loader = set_up_cifar10_data_loader(data['X_train'], data['y_train'], batch_size, shuffle=True)
print("There are {} batches in the training set.".format(len(train_loader)))

val_loader = set_up_cifar10_data_loader(data['X_val'], data['y_val'], batch_size, shuffle=False)
print("There are {} batches in the validation set.".format(len(val_loader)))

test_loader = set_up_cifar10_data_loader(data['X_test'], data['y_test'], batch_size, shuffle=False)
print("There are {} batches in the testing set.".format(len(test_loader)))

# train for num_epochs (validating after every epoch), then report test accuracy
model = model.cuda()
model = train_val_model(model, train_loader, val_loader, loss_fn, optimizer, lr_scheduler, num_epochs)
test_acc = test_model(model, test_loader)
print(f"testing accuracy: {test_acc:.3f}")

Number of parameters: 684.362K
There are 1532 batches in the training set.
There are 32 batches in the validation set.
There are 32 batches in the testing set.
[1/3,     1/1532] loss: 0.047 acc: 15.625 lr: 0.01000
[1/3,    51/1532] loss: 2.025 acc: 23.875 lr: 0.01000
[1/3,   101/1532] loss: 1.857 acc: 30.938 lr: 0.01000
[1/3,   151/1532] loss: 1.719 acc: 37.062 lr: 0.01000
[1/3,   201/1532] loss: 1.655 acc: 37.562 lr: 0.01000
[1/3,   251/1532] loss: 1.615 acc: 40.750 lr: 0.01000
[1/3,   301/1532] loss: 1.539 acc: 44.375 lr: 0.01000
[1/3,   351/1532] loss: 1.542 acc: 43.125 lr: 0.01000
[1/3,   401/1532] loss: 1.514 acc: 43.750 lr: 0.01000
[1/3,   451/1532] loss: 1.484 acc: 45.000 lr: 0.01000
[1/3,   501/1532] loss: 1.433 acc: 49.250 lr: 0.01000
[1/3,   551/1532] loss: 1.372 acc: 48.312 lr: 0.01000
[1/3,   601/1532] loss: 1.399 acc: 46.938 lr: 0.01000
[1/3,   651/1532] loss: 1.366 acc: 50.750 lr: 0.01000
[1/3,   701/1532] loss: 1.440 acc: 47.312 lr: 0.01000
[1/3,   751/1532] loss: 1.376 

<font size='4' color='red'>**Task 2.8: Train the same ResNet-like model but with ImageNet pre-trained weights (transfer learning, 8 points).**

<font size='4'>Here we use the same batch size used in the tweaking of your custom CNN. We will also simply use the same ResNet18-like model. You only need to tune learning rate, momentum, learning rate decay rate here. You should achieve greater than 80% accuracy on the testing set **with 3 epochs using the SGD + momentum optimizer**.

In [None]:
# Let's experiment with transfer learning by borrowing the weights of a ResNet model pre-trained on ImageNet.
import torchvision
# `weights` takes a member of the weights enum (or None), not the enum class itself.
imagenet_resnet18 = torchvision.models.resnet18(weights=torchvision.models.ResNet18_Weights.IMAGENET1K_V1)
model = MyResNet(BasicBlock, [2, 2, 2, 2], num_classes=10)
num_params = sum(p.numel() for p in model.parameters())
print('Number of parameters: {:.3f}K'.format(num_params / 1000))

############################################################################
# TODO: Copy the appropriate weights from imagenet_resnet18 to our custom  #
# model, which shares part of the network architecture.                    #
############################################################################
# Copy (rather than alias) every pre-trained tensor whose name and shape
# match an entry of our model. Entries that exist only in the pre-trained
# network, or whose shape differs (the classifier), are skipped, and
# imagenet_resnet18 itself is left untouched by the training below.
own_state = model.state_dict()
pretrained_state = {
    name: tensor
    for name, tensor in imagenet_resnet18.state_dict().items()
    if name in own_state and tensor.shape == own_state[name].shape
}
# strict=False because the skipped entries are intentionally missing
model.load_state_dict(pretrained_state, strict=False)
############################################################################
#                             END OF YOUR CODE                             #
############################################################################

############################################################################
# TODO: Set up and tune the hyper parameters.                              #
############################################################################
learning_rate = 0.01
momentum = 0.9
lr_step_size = 2  # number of epochs between learning rate decays
lr_gamma = 0.5    # factor passed to StepLR as gamma

# reuse the Task 2.4 helper so every experiment builds its loss, optimizer
# and scheduler the same way
loss_fn, optimizer, lr_scheduler = set_up_loss_optimizer_lr_scheduler(
    model, learning_rate, momentum, lr_step_size, lr_gamma
)
############################################################################
#                             END OF YOUR CODE                             #
############################################################################

# set up the data loaders
# note the usage of the batch_size hyperparameter here
# (batch_size and num_epochs are not set in this cell; they carry over from
# the earlier cells)
train_loader = set_up_cifar10_data_loader(data['X_train'], data['y_train'], batch_size, shuffle=True)
print("There are {} batches in the training set.".format(len(train_loader)))

val_loader = set_up_cifar10_data_loader(data['X_val'], data['y_val'], batch_size, shuffle=False)
print("There are {} batches in the validation set.".format(len(val_loader)))

test_loader = set_up_cifar10_data_loader(data['X_test'], data['y_test'], batch_size, shuffle=False)
print("There are {} batches in the testing set.".format(len(test_loader)))

# train for num_epochs (validating after every epoch), then report test accuracy
model = model.cuda()
model = train_val_model(model, train_loader, val_loader, loss_fn, optimizer, lr_scheduler, num_epochs)
test_acc = test_model(model, test_loader)
print(f"testing accuracy: {test_acc:.3f}")



Number of parameters: 684.362K
There are 1532 batches in the training set.
There are 32 batches in the validation set.
There are 32 batches in the testing set.
[1/3,     1/1532] loss: 0.046 acc: 12.500 lr: 0.01000
[1/3,    51/1532] loss: 2.268 acc: 15.375 lr: 0.01000
[1/3,   101/1532] loss: 2.112 acc: 34.875 lr: 0.01000
[1/3,   151/1532] loss: 1.807 acc: 41.812 lr: 0.01000
[1/3,   201/1532] loss: 1.556 acc: 47.938 lr: 0.01000
[1/3,   251/1532] loss: 1.357 acc: 51.625 lr: 0.01000
[1/3,   301/1532] loss: 1.246 acc: 54.937 lr: 0.01000
[1/3,   351/1532] loss: 1.169 acc: 59.000 lr: 0.01000
[1/3,   401/1532] loss: 1.131 acc: 60.125 lr: 0.01000
[1/3,   451/1532] loss: 1.058 acc: 63.125 lr: 0.01000
[1/3,   501/1532] loss: 1.024 acc: 64.062 lr: 0.01000
[1/3,   551/1532] loss: 0.973 acc: 66.188 lr: 0.01000
[1/3,   601/1532] loss: 0.966 acc: 65.875 lr: 0.01000
[1/3,   651/1532] loss: 0.937 acc: 66.625 lr: 0.01000
[1/3,   701/1532] loss: 0.865 acc: 70.438 lr: 0.01000
[1/3,   751/1532] loss: 0.916 

<font size='4' color='red'>**Task 2.9: Briefly explain below why you got increasingly better accuracy from Task 2.5 to Task 2.8 (3 points)**

- **From CNN to CNN + Residual Block:**
  - Residual blocks provide shortcut connections, allowing gradients to flow more easily.
  - Helps the model learn features more effectively, reducing gradient vanishing and explosion.
  - Improves training stability and enables deeper networks.

- **From CNN to CNN + Pretrained Model on ImageNet:**
  - Leverages pretrained weights from a large dataset (ImageNet), which already captures basic image features.
  - Speeds up convergence and improves accuracy on the target dataset.
  - Enhances the model’s generalization ability by transferring learned representations.

Overall, these incremental improvements lead to better performance and accuracy.


<font size='4' color='red'> **Part 3: Extra credits (10 points).**

<font size='4'> Let's do something fun here. You can use whatever techniques you like. Earn full credit by achieving at least 91% accuracy on the testing set with the following restrictions:
- Train the model for no more than 5 epochs.
- Use a model whose number of parameters is smaller than 2M.
- Use a convolutional neural network

<font size='4' color='red'> **Note**: If you have to override any function you implemented earlier, write new code below. Do not change the function definition in previous sections so that we can grade your implementation appropriately.

<font size='4' color='red'> **No partial credit will be given for this part. In other words, you won't get any credit if your final testing accuracy is lower than 91%.**

In [None]:
# prepare data
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image

class CIFAR10Dataset(Dataset):
    """Dataset that turns array images into PIL images and applies an optional transform."""

    def __init__(self, images, labels, transform=None):
        # per the original author's notes, images is expected to have shape
        # (N, 3, 32, 32) and labels shape (N,)
        self.images, self.labels, self.transform = images, labels, transform

    def __len__(self):
        """Number of samples, taken from the image container."""
        return len(self.images)

    def __getitem__(self, idx):
        """Return the (image, label) pair at position idx."""
        chw = self.images[idx]
        hwc = chw.transpose(1, 2, 0)  # channels-first -> channels-last for PIL
        # NOTE(review): the * 255 scaling presumes pixel values in [0, 1] —
        # confirm against how the data dict is preprocessed.
        pixels = (hwc * 255).astype('uint8')
        sample = Image.fromarray(pixels)

        if self.transform:
            sample = self.transform(sample)

        return sample, self.labels[idx]

# Per-channel mean / std handed to Normalize. Defined once so the training
# and evaluation pipelines cannot drift apart.
CIFAR10_MEAN = [0.4914, 0.4822, 0.4465]
CIFAR10_STD = [0.247, 0.243, 0.261]

# training pipeline: random crop / flip / rotation augmentation, then
# tensor conversion and normalization
transform_train = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(15),
    transforms.ToTensor(),
    transforms.Normalize(mean=CIFAR10_MEAN, std=CIFAR10_STD),
])

# evaluation pipeline: no augmentation, same tensor conversion and normalization
transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=CIFAR10_MEAN, std=CIFAR10_STD),
])

# Extended set_up_cifar10_data_loader: same name as the earlier helper, plus
# an optional per-sample transform.
def set_up_cifar10_data_loader(images, labels, batch_size, transform=None, shuffle=True):
    """
    Build a batched data loader, optionally applying a transform to every image.

    Inputs:
      - images: Images to serve
      - labels: Labels matching the images
      - batch_size: Number of samples per batch
      - transform: Optional transform applied to each image through
        CIFAR10Dataset. When it is None the images and labels are wrapped in a
        TensorDataset exactly like the earlier definition of this function, so
        earlier cells calling it without a transform keep working after this
        cell has been run.
      - shuffle: Whether to reshuffle the data every epoch

    Output:
      - data_loader: A torch DataLoader over the chosen dataset
    """
    if transform is None:
        dataset = torch.utils.data.TensorDataset(torch.Tensor(images), torch.Tensor(labels))
    else:
        dataset = CIFAR10Dataset(images, labels, transform=transform)
    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, num_workers=2)
    return data_loader

batch_size = 32
# Build the three loaders with the transform-aware helper: the training split
# uses the augmenting transform_train and is shuffled, while the validation
# and test splits use transform_test and keep their order.
train_loader = set_up_cifar10_data_loader(data['X_train'], data['y_train'], batch_size, transform_train, shuffle=True)
val_loader = set_up_cifar10_data_loader(data['X_val'], data['y_val'], batch_size, transform_test, shuffle=False)
test_loader = set_up_cifar10_data_loader(data['X_test'], data['y_test'], batch_size, transform_test, shuffle=False)

# report how many batches the training and testing loaders will yield
print(f"train batch: {len(train_loader)}, test batch: {len(test_loader)}")


训练集批次数: 1532, 测试集批次数: 32
