# Deep Learning
## HW3 - Problem 2

Name: Amin Robatian

Student Number: 400301075

In [1]:
import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.utils.data import random_split
from torchvision import datasets, transforms, ops
from torchvision.transforms import ToTensor, Lambda
from torchsummary import summary
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score
import numpy as np
import pandas as pd
import random
from math import floor
import string
import matplotlib.pyplot as plt
%matplotlib inline

torch.manual_seed(85)
random.seed(85)
np.random.seed(85)

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")

Using cuda device


# Part (A) - ResNet50

In [2]:
model_ResNet50 = torchvision.models.resnet50(pretrained=True)
print(model_ResNet50)

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

In [3]:
pytorch_total_params = sum(p.numel() for p in model_ResNet50.parameters() if p.requires_grad)
print(f"Number of Trainable params: {pytorch_total_params:,}")

Number of Trainable params: 25,557,032


In [4]:
for param in model_ResNet50.parameters():
  param.requires_grad = False

pytorch_total_params = sum(p.numel() for p in model_ResNet50.parameters() if p.requires_grad)
print(f"Number of Trainable params: {pytorch_total_params:,}")

Number of Trainable params: 0


In [5]:
model_ResNet50.fc = nn.Sequential(nn.Linear(2048, 10))

model_ResNet50 = model_ResNet50.to(device)

pytorch_total_params = sum(p.numel() for p in model_ResNet50.parameters() if p.requires_grad)
print(f"Number of Trainable params: {pytorch_total_params:,}")

Number of Trainable params: 20,490


In [6]:
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor()
])


target_transform = Lambda(lambda y: torch.zeros(10, 
                                                dtype=torch.float).scatter_(dim=0, index=torch.tensor(y),value=1))

train_dataset = datasets.CIFAR10(
    root="data",
    train=True,
    transform=transform,
    target_transform=target_transform,
    download=True
)

print(f"{len(train_dataset):,} Training Images")

test_dataset = datasets.CIFAR10(
    root="data",
    train=False,
    transform=transform,
    target_transform=target_transform,
    download=True
)

print(f"{len(test_dataset):,} Test Images")

Files already downloaded and verified
50,000 Training Images
Files already downloaded and verified
10,000 Test Images


In [7]:
learning_rate = 1e-3
batch_size = 32
epochs = 10

In [8]:
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [9]:
def TrainDataSet_Accuracy(dataloader, model, loss_fn):
    size = floor(len(dataloader.dataset) / batch_size) * batch_size
    num_batches = floor(len(dataloader.dataset) / batch_size)
    train_loss, correct = 0, 0

    with torch.no_grad():
        for X, y in dataloader:
            X = X.to(device)
            y = y.to(device)
            if len(X) < batch_size:
              break
            pred = model(X)
            train_loss += loss_fn(pred, y).item()
            #
            for i in range(batch_size):
              v1 = pred[i]
              v2 = y[i]
              if torch.argmax(v1) == torch.argmax(v2):
                correct += 1
            #correct += (pred.argmax(1) == y).type(torch.float).sum().item()

    train_loss /= num_batches
    correct /= size
    print(f"Train Dataset: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {train_loss:>8f} \n")
    accuracy = 100*correct
    return train_loss, accuracy

In [10]:
def TestDataSet_Accuracy(dataloader, model, loss_fn):
    size = floor(len(dataloader.dataset) / batch_size) * batch_size
    num_batches = floor(len(dataloader.dataset) / batch_size)
    test_loss, correct = 0, 0

    with torch.no_grad():
        for X, y in dataloader:
            X = X.to(device)
            y = y.to(device)
            if len(X) < batch_size:
              break
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            #
            for i in range(batch_size):
              v1 = pred[i]
              v2 = y[i]
              if torch.argmax(v1) == torch.argmax(v2):
                correct += 1
            #correct += (pred.argmax(1) == y).type(torch.float).sum().item()

    test_loss /= num_batches
    correct /= size
    print(f"Test Dataset: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
    accuracy = 100*correct
    return test_loss, accuracy

In [11]:
def train_loop(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    for batch, (X, y) in enumerate(dataloader):
        # Compute prediction and loss
        X = X.to(device)
        y = y.to(device)
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [12]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model_ResNet50.parameters(), lr=learning_rate, momentum=0.9)

train_loss = np.zeros(epochs)
train_accuracy = np.zeros(epochs)
test_loss = np.zeros(epochs)
test_accuracy = np.zeros(epochs)


for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    model_ResNet50.train()
    train_loop(train_dataloader, model_ResNet50, loss_fn, optimizer)
    model_ResNet50.eval()
    train_loss[t], train_accuracy[t] = TrainDataSet_Accuracy(train_dataloader, model_ResNet50, loss_fn)
    test_loss[t], test_accuracy[t] = TestDataSet_Accuracy(test_dataloader, model_ResNet50, loss_fn)
print("Done!")

Epoch 1
-------------------------------
Train Dataset: 
 Accuracy: 79.3%, Avg loss: 0.630236 

Test Dataset: 
 Accuracy: 78.4%, Avg loss: 0.640711 

Epoch 2
-------------------------------
Train Dataset: 
 Accuracy: 80.4%, Avg loss: 0.578347 

Test Dataset: 
 Accuracy: 79.7%, Avg loss: 0.596580 

Epoch 3
-------------------------------
Train Dataset: 
 Accuracy: 81.3%, Avg loss: 0.548008 

Test Dataset: 
 Accuracy: 80.4%, Avg loss: 0.569842 

Epoch 4
-------------------------------
Train Dataset: 
 Accuracy: 81.6%, Avg loss: 0.536568 

Test Dataset: 
 Accuracy: 80.7%, Avg loss: 0.557460 

Epoch 5
-------------------------------
Train Dataset: 
 Accuracy: 82.2%, Avg loss: 0.524954 

Test Dataset: 
 Accuracy: 81.0%, Avg loss: 0.552790 

Epoch 6
-------------------------------
Train Dataset: 
 Accuracy: 82.6%, Avg loss: 0.506352 

Test Dataset: 
 Accuracy: 81.6%, Avg loss: 0.536827 

Epoch 7
-------------------------------
Train Dataset: 
 Accuracy: 82.9%, Avg loss: 0.499738 

Test Datase

In [13]:
for param in model_ResNet50.parameters():
  param.requires_grad = False

pytorch_total_params = sum(p.numel() for p in model_ResNet50.parameters() if p.requires_grad)
print(f"Number of Trainable params: {pytorch_total_params:,}")

Number of Trainable params: 0


# Part (B) - Teacher: ResNet50, Student: ResNet18

In [14]:
model_ResNet18 = torchvision.models.resnet18()
print(model_ResNet18)

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [15]:
model_ResNet18.fc = nn.Sequential(nn.Linear(512, 10))

model_ResNet18 = model_ResNet18.to(device)

pytorch_total_params = sum(p.numel() for p in model_ResNet18.parameters() if p.requires_grad)
print(f"Number of Trainable params: {pytorch_total_params:,}")

Number of Trainable params: 11,181,642


In [16]:
def CrossEntropy(y_hat, y):
  CE =  (-1 / batch_size) * torch.sum(y * torch.log(y_hat) + (1 - y) * torch.log(1 - y_hat))
  return CE

In [17]:
def distillation_loss(logit_teacher, logit_student, y):
    m = nn.Softmax(dim=1)
    alpha = 0.5
    T = 3
    loss =  (1 - alpha) * CrossEntropy(m(logit_student / T), y) \
    + alpha * CrossEntropy(m(logit_student / T), m(logit_teacher / T)) 
    return loss

In [18]:
def train_loop(dataloader, model_teacher, model_student, loss_fn, optimizer):
    size = len(dataloader.dataset)
    for batch, (X, y) in enumerate(dataloader):
        # Compute prediction and loss
        X = X.to(device)
        y = y.to(device)
 
        logit_teacher = model_teacher(X)
        logit_student = model_student(X)

        loss = loss_fn(logit_teacher, logit_student, y)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [19]:
def TrainDataSet_Accuracy(dataloader, model_teacher, model_student, loss_fn):
    size = floor(len(dataloader.dataset) / batch_size) * batch_size
    num_batches = floor(len(dataloader.dataset) / batch_size)
    train_loss, correct = 0, 0

    with torch.no_grad():
        for X, y in dataloader:
            X = X.to(device)
            y = y.to(device)
            
            logit_teacher = model_teacher(X)
            logit_student = model_student(X)
            
            if len(X) < batch_size:
              break
            pred = model_student(X)
            train_loss += loss_fn(logit_teacher, logit_student, y).item()
            #
            for i in range(batch_size):
              v1 = pred[i]
              v2 = y[i]
              if torch.argmax(v1) == torch.argmax(v2):
                correct += 1
            #correct += (pred.argmax(1) == y).type(torch.float).sum().item()

    train_loss /= num_batches
    correct /= size
    print(f"Train Dataset: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {train_loss:>8f} \n")
    accuracy = 100*correct
    return train_loss, accuracy

In [20]:
def TestDataSet_Accuracy(dataloader, model_teacher, model_student, loss_fn):
    size = floor(len(dataloader.dataset) / batch_size) * batch_size
    num_batches = floor(len(dataloader.dataset) / batch_size)
    test_loss, correct = 0, 0

    with torch.no_grad():
        for X, y in dataloader:
            X = X.to(device)
            y = y.to(device)
            
            logit_teacher = model_teacher(X)
            logit_student = model_student(X)
        
            if len(X) < batch_size:
              break
            pred = model_student(X)
            test_loss += loss_fn(logit_teacher, logit_student, y).item()
            #
            for i in range(batch_size):
              v1 = pred[i]
              v2 = y[i]
              if torch.argmax(v1) == torch.argmax(v2):
                correct += 1
            #correct += (pred.argmax(1) == y).type(torch.float).sum().item()

    test_loss /= num_batches
    correct /= size
    print(f"Test Dataset: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
    accuracy = 100*correct
    return test_loss, accuracy

In [21]:
loss_fn = distillation_loss
optimizer = torch.optim.SGD(model_ResNet18.parameters(), lr=learning_rate, momentum=0.9)
model_ResNet50.eval()

train_loss = np.zeros(epochs)
train_accuracy = np.zeros(epochs)
test_loss = np.zeros(epochs)
test_accuracy = np.zeros(epochs)

for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    model_ResNet18.train()
    train_loop(train_dataloader,model_ResNet50, model_ResNet18, loss_fn, optimizer)
    model_ResNet18.eval()
    train_loss[t], train_accuracy[t] = TrainDataSet_Accuracy(train_dataloader, model_ResNet50, model_ResNet18, loss_fn)
    test_loss[t], test_accuracy[t] = TestDataSet_Accuracy(test_dataloader, model_ResNet50, model_ResNet18, loss_fn)
print("Done!")

Epoch 1
-------------------------------
Train Dataset: 
 Accuracy: 44.3%, Avg loss: 2.658878 

Test Dataset: 
 Accuracy: 44.6%, Avg loss: 2.658935 

Epoch 2
-------------------------------
Train Dataset: 
 Accuracy: 54.2%, Avg loss: 2.497886 

Test Dataset: 
 Accuracy: 52.9%, Avg loss: 2.514237 

Epoch 3
-------------------------------
Train Dataset: 
 Accuracy: 42.7%, Avg loss: 2.943191 

Test Dataset: 
 Accuracy: 42.6%, Avg loss: 2.965683 

Epoch 4
-------------------------------
Train Dataset: 
 Accuracy: 64.7%, Avg loss: 2.303910 

Test Dataset: 
 Accuracy: 62.2%, Avg loss: 2.342232 

Epoch 5
-------------------------------
Train Dataset: 
 Accuracy: 70.4%, Avg loss: 2.212764 

Test Dataset: 
 Accuracy: 67.2%, Avg loss: 2.258058 

Epoch 6
-------------------------------
Train Dataset: 
 Accuracy: 61.4%, Avg loss: 2.408392 

Test Dataset: 
 Accuracy: 58.4%, Avg loss: 2.459478 

Epoch 7
-------------------------------
Train Dataset: 
 Accuracy: 77.0%, Avg loss: 2.111169 

Test Datase

# Part (C) - ResNet18

In [22]:
model_ResNet18 = torchvision.models.resnet18()

model_ResNet18.fc = nn.Sequential(nn.Linear(512, 10))

model_ResNet18 = model_ResNet18.to(device)

pytorch_total_params = sum(p.numel() for p in model_ResNet18.parameters() if p.requires_grad)
print(f"Number of Trainable params: {pytorch_total_params:,}")

Number of Trainable params: 11,181,642


In [23]:
def TrainDataSet_Accuracy(dataloader, model, loss_fn):
    size = floor(len(dataloader.dataset) / batch_size) * batch_size
    num_batches = floor(len(dataloader.dataset) / batch_size)
    train_loss, correct = 0, 0

    with torch.no_grad():
        for X, y in dataloader:
            X = X.to(device)
            y = y.to(device)
            if len(X) < batch_size:
              break
            pred = model(X)
            train_loss += loss_fn(pred, y).item()
            #
            for i in range(batch_size):
              v1 = pred[i]
              v2 = y[i]
              if torch.argmax(v1) == torch.argmax(v2):
                correct += 1
            #correct += (pred.argmax(1) == y).type(torch.float).sum().item()

    train_loss /= num_batches
    correct /= size
    print(f"Train Dataset: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {train_loss:>8f} \n")
    accuracy = 100*correct
    return train_loss, accuracy

In [24]:
def TestDataSet_Accuracy(dataloader, model, loss_fn):
    size = floor(len(dataloader.dataset) / batch_size) * batch_size
    num_batches = floor(len(dataloader.dataset) / batch_size)
    test_loss, correct = 0, 0

    with torch.no_grad():
        for X, y in dataloader:
            X = X.to(device)
            y = y.to(device)
            if len(X) < batch_size:
              break
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            #
            for i in range(batch_size):
              v1 = pred[i]
              v2 = y[i]
              if torch.argmax(v1) == torch.argmax(v2):
                correct += 1
            #correct += (pred.argmax(1) == y).type(torch.float).sum().item()

    test_loss /= num_batches
    correct /= size
    print(f"Test Dataset: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
    accuracy = 100*correct
    return test_loss, accuracy

In [25]:
def train_loop(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    for batch, (X, y) in enumerate(dataloader):
        # Compute prediction and loss
        X = X.to(device)
        y = y.to(device)
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [26]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model_ResNet18.parameters(), lr=learning_rate, momentum=0.9)

train_loss = np.zeros(epochs)
train_accuracy = np.zeros(epochs)
test_loss = np.zeros(epochs)
test_accuracy = np.zeros(epochs)


for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    model_ResNet18.train()
    train_loop(train_dataloader, model_ResNet18, loss_fn, optimizer)
    model_ResNet18.eval()
    train_loss[t], train_accuracy[t] = TrainDataSet_Accuracy(train_dataloader, model_ResNet18, loss_fn)
    test_loss[t], test_accuracy[t] = TestDataSet_Accuracy(test_dataloader, model_ResNet18, loss_fn)
print("Done!")

Epoch 1
-------------------------------
Train Dataset: 
 Accuracy: 52.1%, Avg loss: 1.324067 

Test Dataset: 
 Accuracy: 51.1%, Avg loss: 1.340711 

Epoch 2
-------------------------------
Train Dataset: 
 Accuracy: 57.3%, Avg loss: 1.198751 

Test Dataset: 
 Accuracy: 55.3%, Avg loss: 1.270733 

Epoch 3
-------------------------------
Train Dataset: 
 Accuracy: 69.1%, Avg loss: 0.863157 

Test Dataset: 
 Accuracy: 65.2%, Avg loss: 0.977534 

Epoch 4
-------------------------------
Train Dataset: 
 Accuracy: 76.5%, Avg loss: 0.665944 

Test Dataset: 
 Accuracy: 70.4%, Avg loss: 0.833737 

Epoch 5
-------------------------------
Train Dataset: 
 Accuracy: 80.3%, Avg loss: 0.561722 

Test Dataset: 
 Accuracy: 72.3%, Avg loss: 0.797561 

Epoch 6
-------------------------------
Train Dataset: 
 Accuracy: 83.3%, Avg loss: 0.474058 

Test Dataset: 
 Accuracy: 73.3%, Avg loss: 0.765521 

Epoch 7
-------------------------------
Train Dataset: 
 Accuracy: 89.8%, Avg loss: 0.304579 

Test Datase

# **Result:**




*   As the model becomes smaller, it becomes difficult for the model to learn many complex properties of the data, so the accuracy of the model on the test data decreases.
*    In general, the use of the teacher model helps the accuracy of the student model and its training, but considering that in our problem, the accuracy of the teacher model is about the accuracy of the student model on the new dataset, using Knowledge Distillation technique has not helped us much.

# Part (D) - ResNet50 Fine Tunning

In [27]:
model_ResNet50 = torchvision.models.resnet50(pretrained=True)

model_ResNet50.fc = nn.Sequential(nn.Linear(2048, 10))

model_ResNet50 = model_ResNet50.to(device)

pytorch_total_params = sum(p.numel() for p in model_ResNet50.parameters() if p.requires_grad)
print(f"Number of Trainable params: {pytorch_total_params:,}")

Number of Trainable params: 23,528,522


In [28]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model_ResNet50.parameters(), lr=learning_rate, momentum=0.9)

train_loss = np.zeros(epochs)
train_accuracy = np.zeros(epochs)
test_loss = np.zeros(epochs)
test_accuracy = np.zeros(epochs)


for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    model_ResNet50.train()
    train_loop(train_dataloader, model_ResNet50, loss_fn, optimizer)
    model_ResNet50.eval()
    train_loss[t], train_accuracy[t] = TrainDataSet_Accuracy(train_dataloader, model_ResNet50, loss_fn)
    test_loss[t], test_accuracy[t] = TestDataSet_Accuracy(test_dataloader, model_ResNet50, loss_fn)
print("Done!")

Epoch 1
-------------------------------
Train Dataset: 
 Accuracy: 97.6%, Avg loss: 0.079643 

Test Dataset: 
 Accuracy: 95.3%, Avg loss: 0.139978 

Epoch 2
-------------------------------
Train Dataset: 
 Accuracy: 99.2%, Avg loss: 0.029195 

Test Dataset: 
 Accuracy: 95.9%, Avg loss: 0.129463 

Epoch 3
-------------------------------
Train Dataset: 
 Accuracy: 99.6%, Avg loss: 0.014123 

Test Dataset: 
 Accuracy: 96.1%, Avg loss: 0.127551 

Epoch 4
-------------------------------
Train Dataset: 
 Accuracy: 99.9%, Avg loss: 0.006038 

Test Dataset: 
 Accuracy: 96.1%, Avg loss: 0.124864 

Epoch 5
-------------------------------
Train Dataset: 
 Accuracy: 100.0%, Avg loss: 0.003117 

Test Dataset: 
 Accuracy: 96.4%, Avg loss: 0.121331 

Epoch 6
-------------------------------
Train Dataset: 
 Accuracy: 99.9%, Avg loss: 0.003166 

Test Dataset: 
 Accuracy: 96.3%, Avg loss: 0.121296 

Epoch 7
-------------------------------
Train Dataset: 
 Accuracy: 100.0%, Avg loss: 0.001476 

Test Data

# **Result:**




*   ResNet50 is a large model that can easily learn the complex features of the dataset and since we train the model from the scratch, it is trained well on CIFAR-10 dataset. 
But due to the large size of the network, training is very time-consuming and the amount of calculations is very huge.
