<a href="https://colab.research.google.com/github/armanheydari/Advance-Deep-Learning_Winter-2024/blob/main/Assignment2/cmpt489_828_a2_q1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**CMPT 489/828 Assignment 2**

Follow the instructions in this notebook and complete the missing code.

**NOTE: Do Not Change Any Provided Code or Given Variable Names!**

**Q1**. Create a simple fully-connected neural network from scratch (**30 points**)

**NOTE: Do Not Use torch.nn Module for This Question! (Except for nn.Softmax())**

In [25]:
import torch
from torch import nn
import torchvision
from torchvision import transforms
import math

In [26]:
# Load CIFAR-10 through torchvision.
# Every image is converted to a tensor, normalized per channel, and finally
# flattened into a 1-D vector so it can be fed to a fully-connected layer.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.4914, 0.4822, 0.4465), std=(0.2470, 0.2435, 0.2616)),
    transforms.Lambda(torch.flatten),
])

trainset = torchvision.datasets.CIFAR10(transform=transform, download=True, train=True, root='./data')
testset = torchvision.datasets.CIFAR10(transform=transform, download=True, train=False, root='./data')

# Sample indices of each split: training uses the first 4000 images of the
# training set, validation the following 1000, and testing the first 500
# images of the test set.
train_id = list(range(0, 4000))
val_id = list(range(4000, 5000))
test_id = list(range(0, 500))

# Restrict each dataset to its indices.
train_sub_set = torch.utils.data.Subset(trainset, train_id)
val_sub_set = torch.utils.data.Subset(trainset, val_id)
test_sub_set = torch.utils.data.Subset(testset, test_id)

# All loaders yield one (image, label) pair per step (batch_size=1) in shuffled order.
loader_kwargs = dict(batch_size=1, shuffle=True)
train_loader = torch.utils.data.DataLoader(train_sub_set, **loader_kwargs)
val_loader = torch.utils.data.DataLoader(val_sub_set, **loader_kwargs)
test_loader = torch.utils.data.DataLoader(test_sub_set, **loader_kwargs)

# image_size is the length of one flattened image (the transform above already
# flattened it, so this is the product C*H*W, not a CxHxW shape).
# class_map is only useful for visualization and sanity checks.
image_size = trainset[0][0].shape[0]
class_map = dict(enumerate(
    ['plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']
))

Files already downloaded and verified
Files already downloaded and verified


a. Implement a fully-connected model (**16 points**)

In [27]:
# implement operations for our model

def activation(x):
    """
    Activation function of the network: element-wise hyperbolic tangent.
    :param x: input tensor
    :return: tensor of the same shape as x holding tanh of every element
    """
    ###############################################################################
    # TODO:                                                                       #
    # 1. calculate act = tanh(x)                                                  #
    ###############################################################################
    # *****BEGIN YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    act = x.tanh()
    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    return act


def activation_grad(x):
    """
    Derivative of activation() with respect to its input, evaluated element-wise.
    Since activation(x) = tanh(x), the derivative is 1 - tanh(x)^2.
    :param x: input tensor (the pre-activation values)
    :return: tensor of the same shape as x with the element-wise derivative
    """
    ###############################################################################
    # TODO:                                                                       #
    # 1. find maths representation of activation()                                #
    # 2. calculate gradient respect to x                                          #
    ###############################################################################
    # *****BEGIN YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    # Reuse activation() so the derivative always matches the forward function.
    tanh_x = activation(x)
    delta_act = 1 - tanh_x.pow(2)
    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    return delta_act


def cross_entropy(pred, label):
    """
    Calculate the cross entropy loss, L(pred, label), for one image.
    The scores in pred are turned into log-probabilities with a softmax over
    the last dimension, then the loss is -sum(label * log(softmax(pred))).
    :param pred: predicted score tensor; the class scores lie on the last
                 dimension (e.g. shape (1, nb_classes) or (nb_classes,))
    :param label: one-hot encoded label tensor, broadcastable against pred
    :return: the cross entropy loss, L(pred, label), as a 0-dim tensor
    """
    ###############################################################################
    # TODO:                                                                       #
    # 1. convert pred into a probability distribution use softmax()               #
    # 2. calculate cross entropy loss                                             #
    ###############################################################################
    # *****BEGIN YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    # log_softmax computes log(softmax(pred)) in one numerically stable step.
    # This avoids adding an epsilon inside the log, which would bias the loss
    # and cap it at -log(epsilon) for confidently wrong predictions.
    # dim=-1 keeps the function usable for both (1, C) and (C,) shaped scores.
    log_prob = torch.log_softmax(pred, dim=-1)
    # Only the entries selected by the one-hot label contribute to the sum.
    loss = -torch.sum(label * log_prob)
    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    return loss


def cross_entropy_grad(pred, label):
    """
    Calculate the gradient of cross_entropy() with respect to pred, for one image.
    Because cross_entropy() applies softmax to pred internally, the gradient
    with respect to the raw scores is softmax(pred) - label for a one-hot label.
    :param pred: predicted score tensor; the class scores lie on the last
                 dimension (e.g. shape (1, nb_classes) or (nb_classes,))
    :param label: one-hot encoded label tensor, broadcastable against pred
    :return: gradient of cross entropy with respect to pred
    """

    ###############################################################################
    # TODO:                                                                       #
    # 1. calculate element-wise gradient respect to pred = softmax(pred) - label  #
    ###############################################################################
    # *****BEGIN YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    # Note: the quantity described in the TODO is what is returned as delta_loss,
    # i.e. delta_loss = softmax(pred) - label.
    # dim=-1 normalizes over the class scores for both (1, C) and (C,) inputs.
    prob = torch.softmax(pred, dim=-1)
    delta_loss = prob - label
    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    return delta_loss


def forward(w1, b1, w2, b2, x):
    """
    Forward pass of the two-layer network:
    1. linear layer (w1, b1) followed by activation
    2. linear layer (w2, b2) followed by activation
    :param w1: weights of the first linear layer (multiplied as x @ w1.T)
    :param b1: bias of the first linear layer
    :param w2: weights of the second linear layer (multiplied as x1 @ w2.T)
    :param b2: bias of the second linear layer
    :param x: input tensor
    :return: x0, s1, x1, s2, x2 -- the input, and the pre-activation and
             post-activation values of both layers
    """
    x0 = x
    ###############################################################################
    # TODO:                                                                       #
    # 1. calculate s1 using w1, x0, b1                                            #
    # 2. calculate x1 using activation()                                          #
    # 3. calculate s2 using w2, x1, b2                                            #
    # 4. calculate x2 using activation()                                          #
    ###############################################################################
    # *****BEGIN YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    # First layer: affine map, then non-linearity.
    s1 = x0 @ w1.T + b1
    x1 = activation(s1)
    # Second layer: same pattern applied to the hidden representation.
    s2 = x1 @ w2.T + b2
    x2 = activation(s2)
    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    return x0, s1, x1, s2, x2


def backward(w1, b1, w2, b2, t, x, s1, x1, s2, x2,
             grad_dw1, grad_db1, grad_dw2, grad_db2):
    """
    Backward propagation: computes dl_dw1, dl_db1, dl_dw2, dl_db2 with the
    chain rule and ADDS them in place to the given gradient buffers.
    The activations are expected to be 2-D with the samples on dim 0
    (as returned by forward() for a DataLoader batch -- TODO confirm for other
    callers). Gradients are summed over that dimension, so a batch of one
    sample behaves exactly as before and larger batches are supported too.
    :param w1: weights of the first layer (not used in the computation)
    :param b1: bias of the first layer (not used in the computation)
    :param w2: weights of the second layer, used to propagate the gradient to x1
    :param b2: bias of the second layer (not used in the computation)
    :param t: label (one-hot encoded)
    :param x: input tensor
    :param s1: pre-activation of the first layer
    :param x1: output of the first layer
    :param s2: pre-activation of the second layer
    :param x2: output of the second layer
    :param grad_dw1: gradient of w1, updated in place
    :param grad_db1: gradient of b1, updated in place
    :param grad_dw2: gradient of w2, updated in place
    :param grad_db2: gradient of b2, updated in place
    :return: None, results are accumulated into the grad_* buffers
    """
    x0 = x
    ###############################################################################
    # TODO:                                                                       #
    # 1. calculate grad_dx2 using x2, t                                           #
    # 2. calculate grad_ds2 using s2, grad_dx2                                    #
    # 3. calculate grad_dx1 using w2, grad_ds2                                    #
    # 4. calculate grad_ds1 using s1, grad_dx1                                    #
    # 5. calculate and accumulate grad_dw2 using grad_ds2, x1                     #
    # 6. calculate and accumulate grad_db2 using grad_ds2                         #
    # 7. calculate and accumulate grad_dw1 using grad_ds1, x0                     #
    # 8. calculate and accumulate grad_db1 using grad_ds1                         #
    ###############################################################################
    # *****BEGIN YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    # Loss gradient with respect to the network output.
    grad_dx2 = cross_entropy_grad(x2, t)
    # Through the second activation (element-wise).
    grad_ds2 = activation_grad(s2) * grad_dx2
    # Through the second linear layer, back to the hidden output.
    grad_dx1 = torch.matmul(grad_ds2, w2)
    # Through the first activation (element-wise).
    grad_ds1 = activation_grad(s1) * grad_dx1
    # Weight gradients: the matmul over the sample dimension already sums the
    # per-sample outer products.
    grad_dw2.add_(torch.matmul(grad_ds2.T, x1))
    # Bias gradients: sum over the sample dimension. For a single sample this
    # equals flattening, but unlike view(-1) it keeps the bias shape for any
    # batch size.
    grad_db2.add_(grad_ds2.sum(dim=0))
    grad_dw1.add_(torch.matmul(grad_ds1.T, x0))
    grad_db1.add_(grad_ds1.sum(dim=0))
    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

b. Training loop (**12 points**)

In [28]:
# training loop, we have 10 classes
nb_classes = 10
# len() of a DataLoader is its number of batches; with batch_size=1 that is the sample count
nb_train_samples = len(train_loader)

# set number of hidden neurons for first linear layer
nb_hidden = 50
# set learn rate and weights initialization std
# gradients are summed over every training sample before each step, so dividing
# by nb_train_samples makes the step correspond to the mean gradient
lr = 1e-1 / nb_train_samples
init_std = 1e-6

# initialize weights and biases to small values from normal distribution
w1 = torch.empty(nb_hidden, image_size).normal_(0, init_std)
b1 = torch.empty(nb_hidden).normal_(0, init_std)
w2 = torch.empty(nb_classes, nb_hidden).normal_(0, init_std)
b2 = torch.empty(nb_classes).normal_(0, init_std)

# initialize empty tensor for gradients of weights and biases
# (contents are uninitialized here; they are zeroed at the start of every epoch)
grad_dw1 = torch.empty(w1.size())
grad_db1 = torch.empty(b1.size())
grad_dw2 = torch.empty(w2.size())
grad_db2 = torch.empty(b2.size())

# run for 1000 epochs
for k in range(1000):

    # initialize loss and train error counts
    acc_loss = 0
    nb_train_errors = 0

    ###############################################################################
    # TODO:                                                                       #
    # 1. clear all gradients of weights and biases using zero_()                  #
    ###############################################################################
    # *****BEGIN YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    # backward() accumulates into these buffers, so reset them once per epoch
    for grad_buffer in (grad_dw1, grad_dw2, grad_db1, grad_db2):
        grad_buffer.zero_()
    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

    for x, y in train_loader:
        train_target_one_hot = nn.functional.one_hot(y.squeeze(dim=0), num_classes=nb_classes)
        ###############################################################################
        # TODO:                                                                       #
        # 1. do forward propagation use forward()                                     #
        # 2. get prediction of x                                                      #
        # 3. accumulate train error: nb_train_errors                                  #
        # 4. accumulate train loss: acc_loss use cross_entropy                        #
        # 5. do backward propagation use backward()                                   #
        ###############################################################################
        # *****BEGIN YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
        x0, s1, x1, s2, x2 = forward(w1, b1, w2, b2, x)
        # the predicted / true class is the index of the largest entry
        predicted_class = x2.argmax()
        label_class = train_target_one_hot.argmax()
        # the comparison gives a single boolean, counted as 0 or 1
        nb_train_errors += int(predicted_class != label_class)
        acc_loss += cross_entropy(x2, train_target_one_hot).item()
        backward(w1, b1, w2, b2, train_target_one_hot, x, s1, x1, s2, x2,
                 grad_dw1, grad_db1, grad_dw2, grad_db2)
        # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

    # Gradient step
    ###############################################################################
    # TODO:                                                                       #
    # 1. step w1 using lr, dl_dw1                                                 #
    # 2. step b1 using lr, dl_db1                                                 #
    # 3. step w2 using lr, dl_dw2                                                 #
    # 4. step b2 using lr, dl_db2                                                 #
    ###############################################################################
    # *****BEGIN YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    # one plain gradient-descent step per epoch, using the accumulated gradients
    w1, b1 = w1 - lr * grad_dw1, b1 - lr * grad_db1
    w2, b2 = w2 - lr * grad_dw2, b2 - lr * grad_db2
    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

    # Val error, initialize val error count
    nb_val_errors = 0

    for x_val, y_val in val_loader:
        ###############################################################################
        # TODO:                                                                       #
        # 1. do forward propagation use forward()                                     #
        # 2. get prediction of x_val                                                  #
        # 3. accumulate val error: nb_val_errors                                      #
        ###############################################################################
        # *****BEGIN YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
        x0, s1, x1, s2, x2 = forward(w1, b1, w2, b2, x_val)
        pred = x2
        val_target_one_hot = nn.functional.one_hot(y_val.squeeze(dim=0), num_classes=nb_classes)
        predicted_class = pred.argmax()
        label_class = val_target_one_hot.argmax()
        nb_val_errors += int(predicted_class != label_class)
        # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

    # print train and val information at end of each epoch
    train_accuracy = 100 - (100 * nb_train_errors) / len(train_loader)
    val_accuracy = 100 - (100 * nb_val_errors) / len(val_loader)
    print('{:d}: acc_train_loss {:.02f}, acc_train_accuracy {:.02f}%, val_accuracy {:.02f}%'
          .format(k, acc_loss, train_accuracy, val_accuracy))


0: acc_train_loss 9210.34, acc_train_accuracy 10.33%, val_accuracy 9.70%
1: acc_train_loss 9210.27, acc_train_accuracy 10.55%, val_accuracy 9.70%
2: acc_train_loss 9210.19, acc_train_accuracy 10.55%, val_accuracy 9.70%
3: acc_train_loss 9210.12, acc_train_accuracy 10.55%, val_accuracy 9.70%
4: acc_train_loss 9210.05, acc_train_accuracy 10.55%, val_accuracy 9.70%
5: acc_train_loss 9209.98, acc_train_accuracy 10.55%, val_accuracy 9.70%
6: acc_train_loss 9209.91, acc_train_accuracy 10.55%, val_accuracy 9.70%
7: acc_train_loss 9209.85, acc_train_accuracy 10.55%, val_accuracy 9.70%
8: acc_train_loss 9209.78, acc_train_accuracy 10.55%, val_accuracy 9.70%
9: acc_train_loss 9209.72, acc_train_accuracy 10.55%, val_accuracy 9.70%
10: acc_train_loss 9209.66, acc_train_accuracy 10.55%, val_accuracy 9.70%
11: acc_train_loss 9209.60, acc_train_accuracy 10.55%, val_accuracy 9.70%
12: acc_train_loss 9209.54, acc_train_accuracy 10.55%, val_accuracy 9.70%
13: acc_train_loss 9209.48, acc_train_accuracy 1

c. Test model (**2 points**)

In [30]:
# Test accuracy: count misclassified samples of the test subset
nb_test_errors = 0
for x_test, y_test in test_loader:
    ###############################################################################
    # TODO:                                                                       #
    # 1. do forward propagation use forward()                                     #
    # 2. get prediction of x_test                                                 #
    # 3. accumulate test error: nb_test_errors                                    #
    ###############################################################################
    # *****BEGIN YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    x0, s1, x1, s2, x2 = forward(w1, b1, w2, b2, x_test)
    pred = x2
    test_target_one_hot = nn.functional.one_hot(y_test.squeeze(dim=0), num_classes=nb_classes)
    # the predicted / true class is the index of the largest entry
    predicted_class = pred.argmax()
    label_class = test_target_one_hot.argmax()
    # the comparison gives a single boolean, counted as 0 or 1
    nb_test_errors += int(predicted_class != label_class)
    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

# print test set accuracy (100% minus the error rate)
test_accuracy = 100 - (100 * nb_test_errors) / len(test_loader)
print('test_accuracy {:.02f}%'.format(test_accuracy))

test_accuracy 33.80%
