# Mohammad Amin Rami

# Student ID: 98101588
## Deep Learning HW3
## Question 2

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torchvision
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
import torchvision.transforms as transforms
from torchvision.transforms import ToTensor
from tqdm import tqdm

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [3]:
def one_hot(label):
    y = torch.zeros((10,))
    y[label] = 1
    return y.to(device)

batch_size = 16
transformer = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor()
])

trainset = torchvision.datasets.CIFAR10(
    root='./Data', 
    train=True,
    download=False,
    transform=lambda x: transformer(x).to(device),
    target_transform=one_hot
)

testset = torchvision.datasets.CIFAR10(
    root='./Data',
    train=False,
    download=False,
    transform=lambda x: transformer(x).to(device),
    target_transform=one_hot
)

trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True)
testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size, shuffle=True)

### Part A: Trining the teacher using a pre trained model

In [4]:
from torchvision.models import resnet50

resnet50 = resnet50(weights='IMAGENET1K_V1').to(device)

for param in resnet50.parameters():
    param.requires_grad = False

in_num = resnet50.fc.in_features
resnet50.fc = nn.Linear(in_num, 10).to(device)

In [5]:
lr = 1e-3
loss_fn = nn.CrossEntropyLoss()
teacher_optimizer = torch.optim.Adam(resnet50.parameters(), lr=lr)

In [10]:
def train_loop(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)

    for batch, (X, y) in enumerate(dataloader):
        pred = model(X)
        loss = loss_fn(pred, y)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()     
    
        if batch % 64 * 100 == 0:
            loss, current = loss.item(), batch * len(X)
            print(f"loss: {loss:>11f}  [{current:>5d}/{size:>5d}]")

In [5]:
def test_loop(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)

    test_loss, correct = 0, 0
    
    with torch.no_grad():
        for batch, (X, y) in enumerate(dataloader):
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y.argmax(1)).type(torch.float).sum().item()
    test_loss /= num_batches
    correct /= size

    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
    return correct

In [8]:
max_epochs = 5


for epoch in range(max_epochs):
    print(f"Epoch {epoch+1}\n-------------------------------")
    train_loop(trainloader, resnet50, loss_fn, teacher_optimizer)
    teacher_acc = test_loop(testloader, resnet50, loss_fn)
    
print("Done!")

Epoch 1
-------------------------------
loss:    2.328330  [    0/50000]
loss:    0.984811  [ 4096/50000]
loss:    0.909545  [ 8192/50000]
loss:    0.689388  [12288/50000]
loss:    0.813920  [16384/50000]
loss:    0.700437  [20480/50000]
loss:    0.761752  [24576/50000]
loss:    0.470369  [28672/50000]
loss:    0.647181  [32768/50000]
loss:    0.836232  [36864/50000]
loss:    0.461987  [40960/50000]
loss:    0.895684  [45056/50000]
loss:    0.638397  [49152/50000]
Test Error: 
 Accuracy: 78.2%, Avg loss: 0.629915 

Epoch 2
-------------------------------
loss:    0.551104  [    0/50000]
loss:    0.397680  [ 4096/50000]
loss:    0.742310  [ 8192/50000]
loss:    0.544798  [12288/50000]
loss:    0.596137  [16384/50000]
loss:    0.520657  [20480/50000]
loss:    0.464992  [24576/50000]
loss:    0.715164  [28672/50000]
loss:    0.799786  [32768/50000]
loss:    0.457682  [36864/50000]
loss:    0.991606  [40960/50000]
loss:    0.387928  [45056/50000]
loss:    0.455101  [49152/50000]
Test Error

In [9]:
print('======= Teacher Performance Report =======')
print(f'The accuracy achieved for teacher model is: {teacher_acc*100:0.2f}%')

The accuracy achieved for teacher model is: 79.75%


--- 
**As it is shown, an accuracy of approximately 80% percent is achieved for teacher**     
using a pre-trained model makes the training phase faster because we only have to train the last layer of the network.

### Part B: Training the student using teacher-student training model

In [10]:
from torchvision.models import resnet18
resnet18 = resnet18(weights='IMAGENET1K_V1').to(device)

in_num = resnet18.fc.in_features
resnet18.fc = nn.Linear(in_num, 10).to(device)

In [11]:
def softmax(y, tau):
    y_exp = torch.exp(y/tau)
    return y_exp/torch.reshape(torch.sum(y_exp, dim=1), (-1, 1))

def teacher_student_loss_fn(student, teacher, X, y, alpha, tau, batch_size):
    ce = nn.CrossEntropyLoss()
    pred = student(X)
    t_pred = teacher(X)
    loss = alpha * tau * tau * (-1) * torch.sum(softmax(t_pred, tau) * softmax(pred, tau).log2()) + (1-alpha)*ce(pred, y)
    return loss/batch_size

In [11]:
def ts_train_loop(dataloader, student, teacher, loss_fn, optimizer, alpha, tau, batch_size):
    size = len(dataloader.dataset)

    for batch, (X, y) in enumerate(dataloader):
        loss = loss_fn(student, teacher, X, y, alpha, tau, batch_size)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()     
    
        if batch % 64 *100 == 0:
            loss, current = loss.item(), batch * len(X)
            print(f"loss: {loss:>11f}  [{current:>5d}/{size:>5d}]")

In [13]:
lr = 1e-4
student_optimizer = torch.optim.Adam(resnet18.parameters(), lr=lr)

In [15]:
max_epochs = 5
alpha = 0.2
tau = 1

for epoch in range(max_epochs):
    print(f"Epoch {epoch+1}\n-------------------------------")
    ts_train_loop(trainloader, resnet18, resnet50, teacher_student_loss_fn, student_optimizer, alpha, tau, batch_size)
    student_acc = test_loop(testloader, resnet18, loss_fn)
print("Done!")

Epoch 1
-------------------------------
loss:    0.207074  [    0/50000]
loss:    0.162495  [ 4096/50000]
loss:    0.193539  [ 8192/50000]
loss:    0.207572  [12288/50000]
loss:    0.246111  [16384/50000]
loss:    0.206524  [20480/50000]
loss:    0.198682  [24576/50000]
loss:    0.196588  [28672/50000]
loss:    0.202725  [32768/50000]
loss:    0.200311  [36864/50000]
loss:    0.177496  [40960/50000]
loss:    0.164825  [45056/50000]
loss:    0.217797  [49152/50000]
Test Error: 
 Accuracy: 85.8%, Avg loss: 0.454565 

Epoch 2
-------------------------------
loss:    0.192510  [    0/50000]
loss:    0.158930  [ 4096/50000]
loss:    0.209561  [ 8192/50000]
loss:    0.174529  [12288/50000]
loss:    0.202334  [16384/50000]
loss:    0.237954  [20480/50000]
loss:    0.192443  [24576/50000]
loss:    0.162217  [28672/50000]
loss:    0.211491  [32768/50000]
loss:    0.213350  [36864/50000]
loss:    0.166758  [40960/50000]
loss:    0.184743  [45056/50000]
loss:    0.201246  [49152/50000]
Test Error

---

**As it can be seen, using the teacher-student trainig model has caused an improvement in the student**  
**The setudent has achieved an accuracy of 87% which better than the teacher itself**     
This because the student uses both the training data and the knowledge of the teacher and this causes an improvemnt to the student's performance.      
I have chosen the following hyperparameters:     
1. alpha = 0.2    
2. tau = 1

### Part C: Training the student without the teacher

In [16]:
from torchvision.models import resnet18
resnet18 = resnet18().to(device)

in_num = resnet18.fc.in_features
resnet18.fc = nn.Linear(in_num, 10).to(device)

In [17]:
lr = 1e-3
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(resnet18.parameters(), lr=lr)

In [18]:
max_epochs = 5


for epoch in range(max_epochs):
    print(f"Epoch {epoch+1}\n-------------------------------")
    train_loop(trainloader, resnet18, loss_fn, optimizer)
    acc = test_loop(testloader, resnet18, loss_fn)
print("Done!")

Epoch 1
-------------------------------
loss:    2.439258  [    0/50000]
loss:    1.816784  [ 4096/50000]
loss:    1.725463  [ 8192/50000]
loss:    1.761023  [12288/50000]
loss:    1.556396  [16384/50000]
loss:    1.303082  [20480/50000]
loss:    1.299468  [24576/50000]
loss:    1.155498  [28672/50000]
loss:    1.231911  [32768/50000]
loss:    1.314616  [36864/50000]
loss:    1.247056  [40960/50000]
loss:    1.054740  [45056/50000]
loss:    0.972304  [49152/50000]
Test Error: 
 Accuracy: 63.7%, Avg loss: 1.009833 

Epoch 2
-------------------------------
loss:    0.859626  [    0/50000]
loss:    1.002671  [ 4096/50000]
loss:    0.686511  [ 8192/50000]
loss:    0.967483  [12288/50000]
loss:    0.846845  [16384/50000]
loss:    0.872856  [20480/50000]
loss:    0.819767  [24576/50000]
loss:    0.786501  [28672/50000]
loss:    0.824263  [32768/50000]
loss:    0.743037  [36864/50000]
loss:    0.589118  [40960/50000]
loss:    0.860406  [45056/50000]
loss:    0.706413  [49152/50000]
Test Error

---
**We have a achieved a weaker performance comparing to teacher-student model**    
This demonstrate the power of teacher-student model. Here we have achieved an accuracy of 82%      
So if the student trains on it's own, it will get a weaker result compared to having a teacher

### Part D: Training Resnet50 from scratch

In [6]:
from torchvision.models import resnet50

In [7]:
res50 = resnet50().to(device)

in_num = res50.fc.in_features
res50.fc = nn.Linear(in_num, 10)
res50 = res50.to(device)

In [8]:
lr = 1e-4
loss_fn = nn.CrossEntropyLoss()
res50_optimizer = torch.optim.Adam(res50.parameters(), lr=lr)

In [12]:
max_epochs = 5


for epoch in range(max_epochs):
    print(f"Epoch {epoch+1}\n-------------------------------")
    train_loop(trainloader, res50, loss_fn, res50_optimizer)
    acc = test_loop(testloader, res50, loss_fn)
print("Done!")

Epoch 1
-------------------------------
loss:    1.618962  [    0/50000]
loss:    2.052775  [ 1024/50000]
loss:    1.653041  [ 2048/50000]
loss:    1.568250  [ 3072/50000]
loss:    1.672568  [ 4096/50000]
loss:    1.429227  [ 5120/50000]
loss:    1.280605  [ 6144/50000]
loss:    1.797416  [ 7168/50000]
loss:    1.569487  [ 8192/50000]
loss:    1.667472  [ 9216/50000]
loss:    1.542448  [10240/50000]
loss:    1.350139  [11264/50000]
loss:    1.543443  [12288/50000]
loss:    1.187486  [13312/50000]
loss:    1.166818  [14336/50000]
loss:    1.905272  [15360/50000]
loss:    1.716818  [16384/50000]
loss:    1.563241  [17408/50000]
loss:    1.590838  [18432/50000]
loss:    1.363685  [19456/50000]
loss:    1.655000  [20480/50000]
loss:    1.484124  [21504/50000]
loss:    1.602441  [22528/50000]
loss:    1.625207  [23552/50000]
loss:    1.465005  [24576/50000]
loss:    1.337627  [25600/50000]
loss:    1.249995  [26624/50000]
loss:    1.209847  [27648/50000]
loss:    1.334068  [28672/50000]
los

loss:    0.151452  [40960/50000]
loss:    0.499910  [41984/50000]
loss:    0.827349  [43008/50000]
loss:    0.363077  [44032/50000]
loss:    0.325536  [45056/50000]
loss:    0.621426  [46080/50000]
loss:    0.829272  [47104/50000]
loss:    0.403509  [48128/50000]
loss:    0.127339  [49152/50000]
Test Error: 
 Accuracy: 79.5%, Avg loss: 0.607955 

Done!


---
**Trainig the teacher from scratch has the following disadvantages:**    
    1- makes the training proccess much longer specialy if you have limited computational resources. Using a pre-trained model and tuning it is much more efficient and takes much less time. For example, fine tuning Resnet50 took much less time than training it from scratch    
    2- People who have developed the pre-trained model, have probably trained the model several times and thus have chosen the best paramters for that model. Because as you know, training a neural network is stochastic proccess. Because weights are initialized randomly and also optimization algorithms are also stochastic such as SGD. Therefore the people who have developed the pre-trained models have chosen the best hyperparamters and initializations and thus it's better to use their results.     
    Thus, it is a good idea to use a pre-tained model.
    
**As you can see, training Resnet50 from scratch has not improved performance comparing to fine tuning and we have gotten the same result. only we have waited a much longer time for the model to be trained**     
Both methods give an accuracy of 80%