<a href="https://colab.research.google.com/github/ZahraSorkhei/HW_DeepLearning/blob/main/DL_HW03_Q2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##**Libraries**##

In [2]:
import time
import copy
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torchvision
from torchvision import datasets, transforms
from torchsummary import summary
from torch.optim import lr_scheduler
import torch.nn.functional as F
import torch.nn as nn
import torchvision.models as models
from torch import nn, optim
import torch.nn as nn
import torch
from torch import Tensor
from typing import Type

##**Load Dataset**##

In [3]:
transform = transforms.Compose([transforms.Resize((224,224)),
                                transforms.ToTensor(),
                                transforms.Normalize([0.485,0.456,  
                                0.406], [0.229, 0.224, 0.225])])
trainset = datasets.CIFAR10('/content/train/', download=True, train=True, transform=transform)
valset = datasets.CIFAR10('/content/val/', download=True, train=False, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=64, shuffle=True)
valloader = torch.utils.data.DataLoader(valset, batch_size=64, shuffle=True)
len_trainset = len(trainset)
len_valset = len(valset)
classes = ('plane', 'car', 'bird', 'cat','deer', 'dog', 'frog', 'horse', 'ship', 'truck')
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to /content/train/cifar-10-python.tar.gz


  0%|          | 0/170498071 [00:00<?, ?it/s]

Extracting /content/train/cifar-10-python.tar.gz to /content/train/
Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to /content/val/cifar-10-python.tar.gz


  0%|          | 0/170498071 [00:00<?, ?it/s]

Extracting /content/val/cifar-10-python.tar.gz to /content/val/


In [None]:
dataiter = iter(trainloader)
images, labels =  next(dataiter)
print(images.shape)
print(labels.shape)

torch.Size([64, 3, 224, 224])
torch.Size([64])


##**Part A**##

In [23]:
resnet = models.resnet50(pretrained=True)
for param in resnet.parameters():
   param.requires_grad = False
num_ftrs = resnet.fc.in_features
resnet.fc = nn.Linear(num_ftrs, 10)
resnet = resnet.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(resnet.fc.parameters())

Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth


  0%|          | 0.00/97.8M [00:00<?, ?B/s]

In [10]:
def train_and_evaluate(model, trainloader, valloader, criterion, optimizer, len_trainset, len_valset, num_epochs=25):
   model.train()
   best_model_wts = copy.deepcopy(model.state_dict())
   best_acc = 0.0
   for epoch in range(num_epochs):
      model.train()
      print("Epoch {}/{}".format(epoch, num_epochs - 1))
      print('-' * 10)
      running_loss = 0.0
      running_corrects = 0
      for inputs, labels in trainloader:
         inputs = inputs.to(device)
         labels = labels.to(device)
         optimizer.zero_grad()
         outputs = model(inputs)
         loss = criterion(outputs, labels)
         _, preds = torch.max(outputs, 1)
         loss.backward() 
         optimizer.step()  
         running_loss += loss.item() * inputs.size(0)
         running_corrects += torch.sum(preds == labels.data)
      epoch_loss = running_loss / len_trainset
      epoch_acc = running_corrects.double() / len_trainset
      print('Train Loss: {:.4f} Acc: {:.4f}'.format(epoch_loss,
             epoch_acc)) 
         
      model.eval()
      running_loss_val = 0.0 
      running_corrects_val = 0
      for inputs, labels in valloader:
         inputs = inputs.to(device)
         labels = labels.to(device)
         outputs = model(inputs) 
         loss = criterion(outputs,labels)
         _, preds = torch.max(outputs, 1)
         running_loss_val += loss.item() * inputs.size(0)
         running_corrects_val += torch.sum(preds == labels.data)
      
      epoch_loss_val = running_loss_val / len_valset
      epoch_acc_val = running_corrects_val.double() / len_valset
      
      if epoch_acc_val > best_acc:
         best_acc = epoch_acc_val
         best_model_wts = copy.deepcopy(model.state_dict())
      
      print(' Val Loss: {:.4f} Acc: {:.4f}'.format(epoch_loss_val,
             epoch_acc_val))
      
   print()
   print('Best val Acc: {:4f}'.format(best_acc))
   model.load_state_dict(best_model_wts)
   return model

In [None]:
resnet_teacher = train_and_evaluate(resnet,trainloader,
                                   valloader,criterion,optimizer,
                                   len_trainset,len_valset,3)

Epoch 0/2
----------
Train Loss: 0.7636 Acc: 0.7471
 Val Loss: 0.5794 Acc: 0.8000
Epoch 1/2
----------
Train Loss: 0.5909 Acc: 0.7951
 Val Loss: 0.5439 Acc: 0.8117
Epoch 2/2
----------
Train Loss: 0.5688 Acc: 0.8021
 Val Loss: 0.5361 Acc: 0.8135

Best val Acc: 0.813500


##**Part B**##

In [None]:
net=  models.resnet18(pretrained=True).to(device)
num_ftrs = net.fc.in_features
net.fc = nn.Linear(num_ftrs, 10)
net = net.to(device)

In [None]:
dataiter = iter(trainloader)
images, labels = next(dataiter)
out = net(images.to(device))
print(out.shape)

torch.Size([64, 10])


In [12]:
def loss_kd(outputs, labels, teacher_outputs, temparature, alpha):
   KD_loss = nn.KLDivLoss()(F.log_softmax(outputs/temparature, dim=1),F.softmax(teacher_outputs/temparature,dim=1)) * (alpha * temparature * temparature) + F.cross_entropy(outputs, labels) * (1. - alpha)
   return KD_loss
def get_outputs(model, dataloader):
   '''
   Used to get the output of the teacher network
   '''
   outputs = []
   for inputs, labels in dataloader:
      inputs_batch, labels_batch = inputs.cuda(), labels.cuda()
      output_batch = model(inputs_batch).data.cpu().numpy()
      outputs.append(output_batch)
   return outputs

In [11]:
def train_kd(model,teacher_out, optimizer, loss_kd, dataloader, temparature, alpha):
   model.train()
   running_loss = 0.0
   running_corrects = 0
   for i,(images, labels) in enumerate(dataloader):
      inputs = images.to(device)
      labels = labels.to(device)
      optimizer.zero_grad()
      outputs = model(inputs)
      outputs_teacher = torch.from_numpy(teacher_out[i]).to(device)
      loss = loss_kd(outputs,labels,outputs_teacher,temparature, 
                     alpha)
      _, preds = torch.max(outputs, 1)
      loss.backward()
      optimizer.step()
      running_loss += loss.item() * inputs.size(0)
      running_corrects += torch.sum(preds == labels.data)
   
   epoch_loss = running_loss / len(trainset)
   epoch_acc = running_corrects.double() / len(trainset)
   print('Train Loss: {:.4f} Acc: {:.4f}'.format(epoch_loss, epoch_acc))
def eval_kd(model,teacher_out, optimizer, loss_kd, dataloader, temparature, alpha):
   model.eval()
   running_loss = 0.0
   running_corrects = 0
   for i,(images, labels) in enumerate(dataloader):
      inputs = images.to(device)
      labels = labels.to(device)
      outputs = model(inputs)
      outputs_teacher = torch.from_numpy(teacher_out[i]).cuda()
      loss = loss_kd(outputs,labels,outputs_teacher,temparature, 
                     alpha)
      _, preds = torch.max(outputs, 1)
      running_loss += loss.item() * inputs.size(0)
      running_corrects += torch.sum(preds == labels.data)
   epoch_loss = running_loss / len(valset)
   epoch_acc = running_corrects.double() / len(valset)
   print('Val Loss: {:.4f} Acc: {:.4f}'.format(epoch_loss,
          epoch_acc))
   return epoch_acc
def train_and_evaluate_kd(model, teacher_model, optimizer, loss_kd, trainloader, valloader, temparature, alpha, num_epochs=25):
   teacher_model.eval()
   best_model_wts = copy.deepcopy(model.state_dict())
   outputs_teacher_train = get_outputs(teacher_model, trainloader)
   outputs_teacher_val = get_outputs(teacher_model, valloader)
   print("Teacher’s outputs are computed now starting the training process-")
   best_acc = 0.0
   for epoch in range(num_epochs):
      print('Epoch {}/{}'.format(epoch, num_epochs -1))
      print('-' * 10)
      
      # Training the student with the soft labes as the outputs from the teacher and using the loss_kd function
      train_kd(model, outputs_teacher_train, 
               optim.Adam(net.parameters()),loss_kd,trainloader, 
               temparature, alpha)
     
      # Evaluating the student network
      epoch_acc_val = eval_kd(model, outputs_teacher_val, 
                          optim.Adam(net.parameters()), loss_kd, 
                          valloader, temparature, alpha)
      if epoch_acc_val > best_acc:
         best_acc = epoch_acc_val
         best_model_wts = copy.deepcopy(model.state_dict())
         print('Best val Acc: {:4f}'.format(best_acc))
         model.load_state_dict(best_model_wts)
   return model

In [None]:
stud=train_and_evaluate_kd(net,resnet_teacher,optim.Adam(net.parameters()),loss_kd,trainloader,valloader,1,0.5,3)

Teacher’s outputs are computed now starting the training process-
Epoch 0/2
----------
Train Loss: 0.4747 Acc: 0.8220
Val Loss: 0.4345 Acc: 0.8501
Best val Acc: 0.850100
Epoch 1/2
----------
Train Loss: 0.3873 Acc: 0.8882
Val Loss: 0.3993 Acc: 0.8779
Best val Acc: 0.877900
Epoch 2/2
----------
Train Loss: 0.3382 Acc: 0.9265
Val Loss: 0.3692 Acc: 0.9006
Best val Acc: 0.900600


In [None]:
stud=train_and_evaluate_kd(net,resnet_teacher,optim.Adam(net.parameters()),loss_kd,trainloader,valloader,0.5,0.5,3)

Teacher’s outputs are computed now starting the training process-
Epoch 0/2
----------




Train Loss: 0.3654 Acc: 0.8173
Val Loss: 0.3918 Acc: 0.8021
Best val Acc: 0.802100
Epoch 1/2
----------
Train Loss: 0.2772 Acc: 0.8849
Val Loss: 0.2887 Acc: 0.8738
Best val Acc: 0.873800
Epoch 2/2
----------
Train Loss: 0.2262 Acc: 0.9231
Val Loss: 0.2842 Acc: 0.8764
Best val Acc: 0.876400


In [None]:
stud=train_and_evaluate_kd(net,resnet_teacher,optim.Adam(net.parameters()),loss_kd,trainloader,valloader,0.25,0.5,3)

Teacher’s outputs are computed now starting the training process-
Epoch 0/2
----------
Train Loss: 0.3297 Acc: 0.8181
Val Loss: 0.2541 Acc: 0.8736
Best val Acc: 0.873600
Epoch 1/2
----------
Train Loss: 0.2388 Acc: 0.8835
Val Loss: 0.2499 Acc: 0.8744
Best val Acc: 0.874400
Epoch 2/2
----------
Train Loss: 0.1854 Acc: 0.9227
Val Loss: 0.2330 Acc: 0.8822
Best val Acc: 0.882200


In [None]:
stud=train_and_evaluate_kd(net,resnet_teacher,optim.Adam(net.parameters()),loss_kd,trainloader,valloader,1,1,3)

Teacher’s outputs are computed now starting the training process-
Epoch 0/2
----------
Train Loss: 0.0077 Acc: 0.1021
Val Loss: 0.0074 Acc: 0.1000
Best val Acc: 0.100000
Epoch 1/2
----------
Train Loss: 0.0075 Acc: 0.0998
Val Loss: 0.0074 Acc: 0.0998
Epoch 2/2
----------
Train Loss: 0.0075 Acc: 0.0988
Val Loss: 0.0078 Acc: 0.1013
Best val Acc: 0.101300


Base on on our experiment the best high parameter is alpha = 0.5 and Temp=1

##**Part C with PyTyorch**##

In [None]:
model2=  models.resnet18(pretrained=True).to(device)
for param in model2.parameters():
   param.requires_grad = False
num_ftrs = model2.fc.in_features
model2.fc = nn.Linear(num_ftrs, 10)
model2 = model2.to(device)

In [None]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model2.fc.parameters())
resnet_teacher = train_and_evaluate(model2,trainloader,
                                   valloader,criterion,optimizer,
                                   len_trainset,len_valset,5)

Epoch 0/4
----------
Train Loss: 2.4450 Acc: 0.1082
 Val Loss: 2.4435 Acc: 0.1128
Epoch 1/4
----------
Train Loss: 2.4442 Acc: 0.1090
 Val Loss: 2.4414 Acc: 0.1129
Epoch 2/4
----------
Train Loss: 2.4444 Acc: 0.1095
 Val Loss: 2.4461 Acc: 0.1132
Epoch 3/4
----------
Train Loss: 2.4443 Acc: 0.1092
 Val Loss: 2.4439 Acc: 0.1114
Epoch 4/4
----------
Train Loss: 2.4442 Acc: 0.1085
 Val Loss: 2.4465 Acc: 0.1126

Best val Acc: 0.113200


##**Part D**##


It can be seen that the percentage is slightly improved. The initial weights of the resnet network are a good starting point, but they are not necessarily the optimal points for our dataset. By training the network from the initial layers, we find the best weights, and as a result, we will have a better percentage of accuracy at the end.

In [27]:
resnet_all = models.resnet50(pretrained=False)
for param in resnet_all.parameters():
   param.requires_grad = False
num_ftrs = resnet_all.fc.in_features
resnet_all.fc = nn.Linear(num_ftrs, 10)
resnet_all = resnet_all.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(resnet_all.fc.parameters())

In [7]:
resnet_teacher_all = train_and_evaluate(resnet_all,trainloader,
                                   valloader,criterion,optimizer,
                                   len_trainset,len_valset,15)

Epoch 0/14
----------
Train Loss: 2.2901 Acc: 0.1671
 Val Loss: 2.2305 Acc: 0.1964
Epoch 1/14
----------
Train Loss: 2.1606 Acc: 0.2154
 Val Loss: 2.0537 Acc: 0.2362
Epoch 2/14
----------
Train Loss: 2.1114 Acc: 0.2333
 Val Loss: 2.1394 Acc: 0.2151
Epoch 3/14
----------
Train Loss: 2.0914 Acc: 0.2437
 Val Loss: 2.1046 Acc: 0.2547
Epoch 4/14
----------
Train Loss: 2.0669 Acc: 0.2543
 Val Loss: 2.1339 Acc: 0.2287
Epoch 5/14
----------
Train Loss: 2.0466 Acc: 0.2617
 Val Loss: 1.9516 Acc: 0.2920
Epoch 6/14
----------
Train Loss: 2.0333 Acc: 0.2656
 Val Loss: 1.9882 Acc: 0.2844
Epoch 7/14
----------
Train Loss: 2.0248 Acc: 0.2702
 Val Loss: 1.9898 Acc: 0.2628
Epoch 8/14
----------
Train Loss: 2.0165 Acc: 0.2763
 Val Loss: 2.1274 Acc: 0.2611
Epoch 9/14
----------
Train Loss: 2.0155 Acc: 0.2772
 Val Loss: 1.9349 Acc: 0.3147
Epoch 10/14
----------
Train Loss: 2.0017 Acc: 0.2825
 Val Loss: 2.0316 Acc: 0.2817
Epoch 11/14
----------
Train Loss: 2.0075 Acc: 0.2820
 Val Loss: 2.0669 Acc: 0.2606
Ep

In [16]:
net=  models.resnet18(pretrained=True).to(device)
num_ftrs = net.fc.in_features
net.fc = nn.Linear(num_ftrs, 10)
net = net.to(device)

In [17]:
stud=train_and_evaluate_kd(net,resnet_teacher_all,optim.Adam(net.parameters()),loss_kd,trainloader,valloader,1,0.5,5)

Teacher’s outputs are computed now starting the training process-
Epoch 0/4
----------




Train Loss: 0.4031 Acc: 0.8250
Val Loss: 0.3581 Acc: 0.8541
Best val Acc: 0.854100
Epoch 1/4
----------
Train Loss: 0.3194 Acc: 0.8885
Val Loss: 0.3109 Acc: 0.8933
Best val Acc: 0.893300
Epoch 2/4
----------
Train Loss: 0.2685 Acc: 0.9267
Val Loss: 0.2925 Acc: 0.9049
Best val Acc: 0.904900
Epoch 3/4
----------
Train Loss: 0.2332 Acc: 0.9539
Val Loss: 0.2875 Acc: 0.9071
Best val Acc: 0.907100
Epoch 4/4
----------
Train Loss: 0.2125 Acc: 0.9682
Val Loss: 0.2845 Acc: 0.9099
Best val Acc: 0.909900


##**Part C with Tensorflow**##

As can be seen, there are several advantages of this knowledge distillation process: Using a lighter model. Less compute requirements and superior performance under stringent production constraints. Better accuracy than stand-alone model. Distillation enables us to train another neural network using a pre-trained network, without the dead weight of the original neural network. Enabling us to compress the size of the network without much loss of accuracy. Hence distilled models have higher accuracies than their normally trained counterparts.

In [29]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import keras
from tensorflow.keras import datasets,models,layers
from keras.datasets import cifar10
(X_train, Y_train), (X_test, Y_test) = cifar10.load_data()
X_train = X_train.astype('float32')
X_test = X_test.astype('float32')
X_train /= 255.0
X_test /= 255.0
from sklearn.model_selection import train_test_split
X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size = 0.2,shuffle = True)
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder()
encoder.fit(Y_train)
Y_train = encoder.transform(Y_train).toarray()
Y_test = encoder.transform(Y_test).toarray()
Y_val =  encoder.transform(Y_val).toarray()
from keras.preprocessing.image import ImageDataGenerator
aug = ImageDataGenerator(horizontal_flip=True, width_shift_range=0.05,
                             height_shift_range=0.05)
aug.fit(X_train)

Downloading data from https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz


In [34]:
from keras.callbacks import EarlyStopping
from keras.layers import Dense, Conv2D,  MaxPool2D, Flatten, GlobalAveragePooling2D,  BatchNormalization, Layer, Add
from keras.models import Sequential
from keras.models import Model
import tensorflow as tf


class ResnetBlock(Model):
    """
    A standard resnet block.
    """

    def __init__(self, channels: int, down_sample=False):
        """
        channels: same as number of convolution kernels
        """
        super().__init__()

        self.__channels = channels
        self.__down_sample = down_sample
        self.__strides = [2, 1] if down_sample else [1, 1]

        KERNEL_SIZE = (3, 3)
        # use He initialization, instead of Xavier (a.k.a 'glorot_uniform' in Keras), as suggested in [2]
        INIT_SCHEME = "he_normal"

        self.conv_1 = Conv2D(self.__channels, strides=self.__strides[0],
                             kernel_size=KERNEL_SIZE, padding="same", kernel_initializer=INIT_SCHEME)
        self.bn_1 = BatchNormalization()
        self.conv_2 = Conv2D(self.__channels, strides=self.__strides[1],
                             kernel_size=KERNEL_SIZE, padding="same", kernel_initializer=INIT_SCHEME)
        self.bn_2 = BatchNormalization()
        self.merge = Add()

        if self.__down_sample:
            # perform down sampling using stride of 2, according to [1].
            self.res_conv = Conv2D(
                self.__channels, strides=2, kernel_size=(1, 1), kernel_initializer=INIT_SCHEME, padding="same")
            self.res_bn = BatchNormalization()

    def call(self, inputs):
        res = inputs

        x = self.conv_1(inputs)
        x = self.bn_1(x)
        x = tf.nn.relu(x)
        x = self.conv_2(x)
        x = self.bn_2(x)

        if self.__down_sample:
            res = self.res_conv(res)
            res = self.res_bn(res)

        # if not perform down sample, then add a shortcut directly
        x = self.merge([x, res])
        out = tf.nn.relu(x)
        return out

In [31]:
class ResNet18(Model):

    def __init__(self, num_classes, **kwargs):
        """
            num_classes: number of classes in specific classification task.
        """
        super().__init__(**kwargs)
        self.conv_1 = Conv2D(64, (7, 7), strides=2,
                             padding="same", kernel_initializer="he_normal")
        self.init_bn = BatchNormalization()
        self.pool_2 = MaxPool2D(pool_size=(2, 2), strides=2, padding="same")
        self.res_1_1 = ResnetBlock(64)
        self.res_1_2 = ResnetBlock(64)
        self.res_2_1 = ResnetBlock(128, down_sample=True)
        self.res_2_2 = ResnetBlock(128)
        self.res_3_1 = ResnetBlock(256, down_sample=True)
        self.res_3_2 = ResnetBlock(256)
        self.res_4_1 = ResnetBlock(512, down_sample=True)
        self.res_4_2 = ResnetBlock(512)
        self.avg_pool = GlobalAveragePooling2D()
        self.flat = Flatten()
        self.fc = Dense(num_classes, activation="softmax")

    def call(self, inputs):
        out = self.conv_1(inputs)
        out = self.init_bn(out)
        out = tf.nn.relu(out)
        out = self.pool_2(out)
        for res_block in [self.res_1_1, self.res_1_2, self.res_2_1, self.res_2_2, self.res_3_1, self.res_3_2, self.res_4_1, self.res_4_2]:
            out = res_block(out)
        out = self.avg_pool(out)
        out = self.flat(out)
        out = self.fc(out)
        return out

In [35]:
model = ResNet18(10)
model.build(input_shape = (None,32,32,3))
#use categorical_crossentropy since the label is one-hot encoded
from keras.optimizers import SGD
# opt = SGD(learning_rate=0.1,momentum=0.9,decay = 1e-04) #parameters suggested by He [1]
model.compile(optimizer = "adam",loss='categorical_crossentropy', metrics=["accuracy"])

Cause: mangled names are not yet supported


Cause: mangled names are not yet supported


In [36]:
from keras.callbacks import EarlyStopping

es = EarlyStopping(patience= 8, restore_best_weights=True, monitor="val_acc")
#I did not use cross validation, so the validate performance is not accurate.
STEPS = len(X_train) / 256
history = model.fit(aug.flow(X_train,Y_train,batch_size = 256), steps_per_epoch=STEPS, batch_size = 256, epochs=3, validation_data=(X_train, Y_train),callbacks=[es])

Epoch 1/3



Epoch 2/3



Epoch 3/3



