#### Libraries Imported and Setup

In [1]:
import torch
from torch import nn
from torch import functional as F
from torch import optim
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from tqdm import tqdm
import math
import matplotlib.pyplot as plt
import numpy as np
import os
from scipy.ndimage import gaussian_filter
import time
from datetime import timedelta
from torch.utils.data import DataLoader, SubsetRandomSampler
%matplotlib inline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
if torch.cuda.is_available():
    device = torch.device("cuda")
    print('GPU: ', torch.cuda.get_device_name(0))

else:
    device = torch.device("cpu")
    print('No GPU available')

GPU:  NVIDIA GeForce RTX 2080 Ti


In [4]:
work_dir = '/storageStudents/ncsmmlab/tungufm/VitFromScratch/notebooks'
os.chdir(work_dir)
print(f"Đã chuyển đến {os.getcwd()}")

Đã chuyển đến /storageStudents/ncsmmlab/tungufm/VitFromScratch/notebooks


#### Model Architecture

**Vision Transformer Model**

In [18]:
class Attention(nn.Module):
    '''
    Attention Module used to perform self-attention operation allowing the model to attend
    information from different representation subspaces on an input sequence of embeddings.
    The sequence of operations is as follows :-

    Input -> Query, Key, Value -> ReshapeHeads -> Query.TransposedKey -> Softmax -> Dropout
    -> AttentionScores.Value -> ReshapeHeadsBack -> Output

    Args:
        embed_dim: Dimension size of the hidden embedding
        heads: Number of parallel attention heads (Default=8)
        activation: Optional activation function to be applied to the input while
                    transforming to query, key and value matrixes (Default=None)
        dropout: Dropout value for the layer on attention_scores (Default=0.1)

    Methods:
        _reshape_heads(inp) :-
        Changes the input sequence embeddings to reduced dimension according to the number
        of attention heads to parallelize attention operation
        (batch_size, seq_len, embed_dim) -> (batch_size * heads, seq_len, reduced_dim)

        _reshape_heads_back(inp) :-
        Changes the reduced dimension due to parallel attention heads back to the original
        embedding size
        (batch_size * heads, seq_len, reduced_dim) -> (batch_size, seq_len, embed_dim)

        forward(inp) :-
        Performs the self-attention operation on the input sequence embedding.
        Returns the output of self-attention as well as atttention scores
        (batch_size, seq_len, embed_dim) -> (batch_size, seq_len, embed_dim), (batch_size * heads, seq_len, seq_len)

    Examples:
        >>> attention = Attention(embed_dim, heads, activation, dropout)
        >>> out, weights = attention(inp)
    '''
    def __init__(self, embed_dim, heads=8, activation=None, dropout=0.1):
        super(Attention, self).__init__()
        self.heads = heads
        self.embed_dim = embed_dim
        self.query = nn.Linear(embed_dim, embed_dim)
        self.key = nn.Linear(embed_dim, embed_dim)
        self.value = nn.Linear(embed_dim, embed_dim)
        self.softmax = nn.Softmax(dim=-1)
        if activation == 'relu':
            self.activation = nn.ReLU()
        else:
            self.activation = nn.Identity()
        self.dropout = nn.Dropout(dropout)

    def forward(self, inp):
        # inp: (batch_size, seq_len, embed_dim)
        batch_size, seq_len, embed_dim = inp.size()
        assert embed_dim == self.embed_dim

        query = self.activation(self.query(inp))
        key   = self.activation(self.key(inp))
        value = self.activation(self.value(inp))

        # output of _reshape_heads(): (batch_size * heads, seq_len, reduced_dim) | reduced_dim = embed_dim // heads
        query = self._reshape_heads(query)
        key   = self._reshape_heads(key)
        value = self._reshape_heads(value)

        # attention_scores: (batch_size * heads, seq_len, seq_len) | Softmaxed along the last dimension
        attention_scores = self.softmax(torch.matmul(query, key.transpose(1, 2)))

        # out: (batch_size * heads, seq_len, reduced_dim)
        out = torch.matmul(self.dropout(attention_scores), value)

        # output of _reshape_heads_back(): (batch_size, seq_len, embed_size)
        out = self._reshape_heads_back(out)

        return out, attention_scores

    def _reshape_heads(self, inp):
        # inp: (batch_size, seq_len, embed_dim)
        batch_size, seq_len, embed_dim = inp.size()

        reduced_dim = self.embed_dim // self.heads
        assert reduced_dim * self.heads == self.embed_dim
        out = inp.reshape(batch_size, seq_len, self.heads, reduced_dim)
        out = out.permute(0, 2, 1, 3)
        out = out.reshape(-1, seq_len, reduced_dim)

        # out: (batch_size * heads, seq_len, reduced_dim)
        return out

    def _reshape_heads_back(self, inp):
        # inp: (batch_size * heads, seq_len, reduced_dim) | reduced_dim = embed_dim // heads
        batch_size_mul_heads, seq_len, reduced_dim = inp.size()
        batch_size = batch_size_mul_heads // self.heads

        out = inp.reshape(batch_size, self.heads, seq_len, reduced_dim)
        out = out.permute(0, 2, 1, 3)
        out = out.reshape(batch_size, seq_len, self.embed_dim)

        # out: (batch_size, seq_len, embed_dim)
        return out

In [19]:
# Check if Dropout should be used after second Linear Layer
class FeedForward(nn.Module):
    '''
    FeedForward Network with two sequential linear layers with GELU activation function
    ,applied to the output of self attention operation. The sequence of operations is as
    follows :-

    Input -> FC1 -> GELU -> Dropout -> FC2 -> Output

    Args:
        embed_dim: Dimension size of the hidden embedding
        forward_expansion: The scale used to transform the input embedding to a higher dimension
                           and then scaled back to capture richer information (Default=1)
        dropout: Dropout value for the layer on attention_scores (Default=0.1)

    Methods:
        forward(inp) :-
        Applies the sequence of operations mentioned above.
        (batch_size, seq_len, embed_dim) -> (batch_size, seq_len, embed_dim)

    Examples:
        >>> FF = FeedForward(8, 1)
        >>> out = FF(inp)
    '''
    def __init__(self, embed_dim, forward_expansion=1, dropout=0.1):
        super(FeedForward, self).__init__()
        self.embed_dim = embed_dim
        self.fc1 = nn.Linear(embed_dim, embed_dim * forward_expansion)
        self.activation = nn.GELU()
        self.fc2 = nn.Linear(embed_dim * forward_expansion, embed_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, inp):
        # inp: (batch_size, seq_len, embed_dim)
        batch_size, seq_len, embed_dim = inp.size()
        assert embed_dim == self.embed_dim

        out = self.dropout(self.activation(self.fc1(inp)))
        # out = self.dropout(self.fc2(out))
        out = self.fc2(out)

        # out: (batch_size, seq_len, embed_dim)
        return out

In [20]:
class TransformerBlock(nn.Module):
    '''
    Transformer Block combines both the attention module and the feed forward module with layer
    normalization, dropout and residual connections. The sequence of operations is as follows :-

    Input -> LayerNorm1 -> Attention -> Residual -> LayerNorm2 -> FeedForward -> Output
      |                                   |  |                                      |
      |-------------Addition--------------|  |---------------Addition---------------|

    Args:
        embed_dim: Dimension size of the hidden embedding
        heads: Number of parallel attention heads (Default=8)
        activation: Optional activation function to be applied to the input while
                    transforming to query, key and value matrixes (Default=None)
        forward_expansion: The scale used to transform the input embedding to a higher dimension
                           and then scaled back to capture richer information (Default=1)
        dropout: Dropout value for the layer on attention_scores (Default=0.1)

    Methods:
        forward(inp) :-
        Applies the sequence of operations mentioned above.
        (batch_size, seq_len, embed_dim) -> (batch_size, seq_len, embed_dim)

    Examples:
        >>> TB = TransformerBlock(embed_dim, heads, activation, forward_expansion, dropout)
        >>> out = TB(inp)
    '''
    def __init__(self, embed_dim, heads=8, activation=None, forward_expansion=1, dropout=0.1):
        super(TransformerBlock, self).__init__()
        self.embed_dim = embed_dim
        self.norm1 = nn.LayerNorm(embed_dim)
        self.attention = Attention(embed_dim, heads, activation, dropout)
        self.norm2 = nn.LayerNorm(embed_dim)
        self.feed_forward = FeedForward(embed_dim, forward_expansion, dropout)

    def forward(self, inp):
        # inp: (batch_size, seq_len, embed_dim)
        batch_size, seq_len, embed_dim = inp.size()
        assert embed_dim == self.embed_dim

        res = inp
        out = self.norm1(inp)
        out, _ = self.attention(out)
        out = out + res

        res = out
        out = self.norm2(out)
        out = self.feed_forward(out)
        out = out + res

        # out: (batch_size, seq_len, embed_dim)
        return out

In [21]:
class Transformer(nn.Module):
    '''
    Transformer combines multiple layers of Transformer Blocks in a sequential manner. The sequence
    of the operations is as follows -

    Input -> TB1 -> TB2 -> .......... -> TBn (n being the number of layers) -> Output

    Args:
        embed_dim: Dimension size of the hidden embedding
        layers: Number of Transformer Blocks in the Transformer
        heads: Number of parallel attention heads (Default=8)
        activation: Optional activation function to be applied to the input while
                    transforming to query, key and value matrixes (Default=None)
        forward_expansion: The scale used to transform the input embedding to a higher dimension
                           and then scaled back to capture richer information (Default=1)
        dropout: Dropout value for the layer on attention_scores (Default=0.1)

    Methods:
        forward(inp) :-
        Applies the sequence of operations mentioned above.
        (batch_size, seq_len, embed_dim) -> (batch_size, seq_len, embed_dim)

    Examples:
        >>> transformer = Transformer(embed_dim, layers, heads, activation, forward_expansion, dropout)
        >>> out = transformer(inp)
    '''
    def __init__(self, embed_dim, layers, heads=8, activation=None, forward_expansion=1, dropout=0.1):
        super(Transformer, self).__init__()
        self.embed_dim = embed_dim
        self.trans_blocks = nn.ModuleList(
            [TransformerBlock(embed_dim, heads, activation, forward_expansion, dropout) for i in range(layers)]
        )

    def forward(self, inp):
        # inp: (batch_size, seq_len, embed_dim)

        out = inp
        for block in self.trans_blocks:
            out = block(out)

        # out: (batch_size, seq_len, embed_dim)
        return out

In [22]:
# Not Exactly Same as Paper
class ClassificationHead(nn.Module):
    '''
    Classification Head attached to the first sequence token which is used as the arbitrary
    classification token and used to optimize the transformer model by applying Cross-Entropy
    loss. The sequence of operations is as follows :-

    Input -> FC1 -> GELU -> Dropout -> FC2 -> Output

    Args:
        embed_dim: Dimension size of the hidden embedding
        classes: Number of classification classes in the dataset
        dropout: Dropout value for the layer on attention_scores (Default=0.1)

    Methods:
        forward(inp) :-
        Applies the sequence of operations mentioned above.
        (batch_size, embed_dim) -> (batch_size, classes)

    Examples:
        >>> CH = ClassificationHead(embed_dim, classes, dropout)
        >>> out = CH(inp)
    '''
    def __init__(self, embed_dim, classes, dropout=0.1):
        super(ClassificationHead, self).__init__()
        self.embed_dim = embed_dim
        self.classes = classes
        self.fc1 = nn.Linear(embed_dim, embed_dim // 2)
        self.activation = nn.GELU()
        self.fc2 = nn.Linear(embed_dim // 2, classes)
        self.softmax = nn.Softmax(dim=-1)
        self.dropout = nn.Dropout(dropout)

    def forward(self, inp):
        # inp: (batch_size, embed_dim)
        batch_size, embed_dim = inp.size()
        assert embed_dim == self.embed_dim

        out = self.dropout(self.activation(self.fc1(inp)))
        # out = self.softmax(self.fc2(out))
        out = self.fc2(out)

        # out: (batch_size, classes)
        return out

In [23]:
class VisionTransformer(nn.Module):
    '''
    Vision Transformer is the complete end to end model architecture which combines all the above modules
    in a sequential manner. The sequence of the operations is as follows -

    Input -> CreatePatches -> ClassToken, PatchToEmbed , PositionEmbed -> Transformer -> ClassificationHead -> Output
                                   |            | |                |
                                   |---Concat---| |----Addition----|

    Args:
        patch_size: Length of square patch size
        max_len: Max length of learnable positional embedding
        embed_dim: Dimension size of the hidden embedding
        classes: Number of classes in the dataset
        layers: Number of Transformer Blocks in the Transformer
        channels: Number of channels in the input (Default=3)
        heads: Number of parallel attention heads (Default=8)
        activation: Optional activation function to be applied to the input while
                    transforming to query, key and value matrixes (Default=None)
        forward_expansion: The scale used to transform the input embedding to a higher dimension
                           and then scaled back to capture richer information (Default=1)
        dropout: Dropout value for the layer on attention_scores (Default=0.1)

    Methods:
        forward(inp) :-
        Applies the sequence of operations mentioned above.
        It outputs the classification output as well as the sequence output of the transformer
        (batch_size, channels, width, height) -> (batch_size, classes), (batch_size, seq_len+1, embed_dim)

    Examples:
        >>> ViT = VisionTransformer(atch_size, max_len, embed_dim, classes, layers, channels, heads, activation, forward_expansion, dropout)
        >>> class_out, hidden_seq = ViT(inp)
    '''
    def __init__(self, patch_size, max_len, embed_dim, classes, layers, channels=3, heads=8, activation=None, forward_expansion=1, dropout=0.1):
        super(VisionTransformer, self).__init__()
        self.name = 'VisionTransformer'
        self.patch_size = patch_size
        self.embed_dim = embed_dim
        self.channels = channels
        self.patch_to_embed = nn.Linear(patch_size * patch_size * channels, embed_dim)
        self.position_embed = nn.Parameter(torch.randn((max_len, embed_dim)))
        self.transformer = Transformer(embed_dim, layers, heads, activation, forward_expansion, dropout)
        self.classification_head = ClassificationHead(embed_dim, classes)
        self.class_token = nn.Parameter(torch.zeros(1, 1, embed_dim))

    def forward(self, inp):
        # inp: (batch_size, channels, width, height)
        batch_size, channels, width, height = inp.size()
        assert channels == self.channels

        out = inp.unfold(2, self.patch_size, self.patch_size).unfold(3, self.patch_size, self.patch_size).contiguous()
        out = out.view(batch_size, channels, -1, self.patch_size, self.patch_size)
        out = out.permute(0, 2, 3, 4, 1)
        # out: (batch_size, seq_len, patch_size, patch_size, channels) | seq_len would be (width*height)/(patch_size**2)
        batch_size, seq_len, patch_size, _, channels = out.size()

        out = out.reshape(batch_size, seq_len, -1)
        out = self.patch_to_embed(out)
        # out: (batch_size, seq_len, embed_dim)

        class_token = self.class_token.expand(batch_size, -1, -1)
        out = torch.cat([class_token, out], dim=1)
        # out: (batch_size, seq_len+1, embed_dim)

        position_embed = self.position_embed[:seq_len+1]
        position_embed = position_embed.unsqueeze(0).expand(batch_size, seq_len+1, self.embed_dim)
        out = out + position_embed
        # out: (batch_size, seq_len+1, embed_dim) | Added Positional Embeddings

        out = self.transformer(out)
        # out: (batch_size, seq_len+1, embed_dim)
        class_token = out[:, 0]
        # class_token: (batch_size, embed_dim)

        class_out = self.classification_head(class_token)
        # class_out: (batch_size, classes)

        return class_out, out

In [24]:
import torch
import torch.nn as nn
from torchvision.models import resnet34

class ResNetFeatures(nn.Module):
    '''
    ResNetFeatures outputs the lower level features from pretrained ResNet34 till the intial 5 layers
    (conv1, bn1, relu, maxpool, layer1(3 conv layers)) to be used in the hybrid architecture to be
    able to kickstart the learining faster. The sequence of operations is as follows :-

    Input -> conv1 -> bn1 -> relu -> maxpool -> layer1 -> Output

    Args:
        No arguments required

    Methods:
        forward(inp) :-
        Applies the sequence of operations mentioned above.
        (batch_size, 3, 224, 224) -> (batch_size, 64, 56, 56)

    Examples:
        >>> resnet_features = ResNetFeatures()
        >>> out = resnet_features(inp)
    '''
    def __init__(self):
        super(ResNetFeatures, self).__init__()
        layers = list(resnet34(pretrained=True).children())[:5] #all layer expect last layer
        self.feature_extractor = nn.Sequential(*layers)

    def forward(self, inp):
        # inp: (batch_size, 3, 224, 224)

        out = self.feature_extractor(inp)

        # out: (batch_size, 64, 56, 56)
        return out

##### ResNet Models

In [25]:
class ResidualBlockSmall(nn.Module):
    '''
    ResidualBlockSmall implements the smaller block of the Residual Networks. It optionally also downsamples
    the input according to the stride to match the output while adding the residual. The sequence of operations
    is as follows :-

    Input -> Conv1 -> BNorm1 -> ReLU -> Conv2 -> BNorm2 -> ReLU -> Output
      |                                                              |
      |-----------------Residual_Downsample (Optional)---------------|

    Args:
        input_channels: Number of input channels
        out_channels: Number of output channels
        residual_downsample: Residual Downsample dependent on if either height, width or channels change
        stride: Stride value for the convolutional layers (Default=1)

    Methods:
        forward(inp) :-
        Applies the sequence of operations mentioned above.
        (batch_size, input_channels, height, width) -> (batch_size, out_channels, height, width)

    Examples:
        >>> RBS = ResidualBlockSmall(input_channels, out_channels, residual_downsample, stride)
        >>> out = RBS(inp)
    '''
    def __init__(self, input_channels, out_channels, residual_downsample=None, stride=1):
        super(ResidualBlockSmall, self).__init__()
        self.conv1 = nn.Conv2d(input_channels, out_channels, kernel_size=3, stride=stride, padding=1)
        self.bnorm1 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
        self.bnorm2 = nn.BatchNorm2d(out_channels)
        self.activation = nn.ReLU()
        self.residual_downsample = residual_downsample

    def forward(self, inp):
        # inp: (batch_size, input_channels, height, width)

        res = inp
        out = self.activation(self.bnorm1(self.conv1(inp)))
        out = self.activation(self.bnorm2(self.conv2(out)))

        if self.residual_downsample is not None:
            res = self.residual_downsample(res)

        out = self.activation(out + res)

        # out: (batch_size, out_channels, height, width) | height, width depending on stride
        return out

In [26]:
class ResNetSmall(nn.Module):
    '''
    ResNetSmall consists of layers of the smaller residual block defined above (ResidualBlockSmall).
    The layers are the residual blocks. The sequence of operations is as follows :-

    Input -> Conv1 -> BNorm1 -> ReLU -> MaxPool -> Layer1 -> Layer2 -> Layer3 -> Layer4 -> AvgPool -> FC

    Args:
        layers: A four value array containing number of conv layers in each residual block
        input_channels: number of input channels
        classes: Number of classes in the dataset

    Methods:
        _layer(num_layers (Number of conv layers)
               ,input_channels (Number of input channels)
               ,output_channels (Number of output channels)
               ,stride (Stride value for conv layer)) :-
        Returns the sequential wrapper with all the layers in the residual block constructed according
        to the parameters.

        forward(inp) :-
        Applies the sequence of operations mentioned above.
        (batch_size, input_channels, height, width) -> (batch_size, classes)

    Examples:
        >>> resnet = ResNetSmall(layers, input_channels, classes)
        >>> out = resnet(inp)
    '''
    def __init__(self, layers, input_channels, classes):
        super(ResNetSmall, self).__init__()
        self.name = 'ResNet'
        self.conv1 = nn.Conv2d(input_channels, 64, kernel_size=7, stride=2, padding=3)
        self.bnorm1 = nn.BatchNorm2d(64)
        self.activation = nn.ReLU()
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        self.layer1 = self._layer(layers[0], input_channels=64, output_channels=64, stride=1)
        self.layer2 = self._layer(layers[1], input_channels=64, output_channels=128, stride=2)
        self.layer3 = self._layer(layers[2], input_channels=128, output_channels=256, stride=2)
        self.layer4 = self._layer(layers[3], input_channels=256, output_channels=512, stride=2)

        self.avppool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512, classes)

    def forward(self, inp):
        # inp: (batch_size, input_channels, height, width)

        out = self.activation(self.bnorm1(self.conv1(inp)))
        out = self.maxpool(out)
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)

        out = self.avppool(out)
        out = out.reshape(out.shape[0], -1)
        out = self.fc(out)

        # out: (batch_size, classes)
        return out

    def _layer(self, num_layers, input_channels, output_channels, stride):
        residual_downsample = None
        layers = []

        if stride != 1:
            residual_downsample = nn.Sequential(
                nn.Conv2d(input_channels, output_channels, kernel_size=1, stride=stride),
                nn.BatchNorm2d(output_channels * 4)
            )

        layers.append(ResidualBlockSmall(input_channels, output_channels, residual_downsample, stride))

        for i in range(num_layers - 1):
            layers.append(ResidualBlockSmall(output_channels, output_channels))

        return nn.Sequential(*layers)

In [27]:
class ResidualBlockLarge(nn.Module):
    '''
    ResidualBlockLarge implements the larger block of the Residual Networks. It optionally also downsamples
    the input according to the stride or output channels to match the output while adding the residual. The
    sequence of operations is as follows :-

    Input -> Conv1 -> BNorm1 -> ReLU -> Conv2 -> BNorm2 -> ReLU -> Conv3 -> BNorm3 -> ReLU -> Output
      |                                                                                          |
      |-----------------------------Residual_Downsample (Optional)-------------------------------|

    Args:
        input_channels: Number of input channels
        out_channels: Number of output channels
        residual_downsample: Residual Downsample dependent on if either height, width or channels change
        stride: Stride value for the convolutional layers (Default=1)
        expansion: Expansion of the input channels during convolutions (Default=4)

    Methods:
        forward(inp) :-
        Applies the sequence of operations mentioned above.
        (batch_size, input_channels, height, width) -> (batch_size, out_channels * expansion, height, width)

    Examples:
        >>> RBL = ResidualBlockLarge(input_channels, out_channels, residual_downsample, stride, expansion)
        >>> out = RBL(inp)
    '''
    def __init__(self, input_channels, out_channels, residual_downsample=None, stride=1, expansion=4):
        super(ResidualBlockLarge, self).__init__()
        self.expansion = expansion
        self.conv1 = nn.Conv2d(input_channels, out_channels, kernel_size=1, stride=1, padding=0)
        self.bnorm1 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=stride, padding=1)
        self.bnorm2 = nn.BatchNorm2d(out_channels)
        self.conv3 = nn.Conv2d(out_channels, out_channels * expansion, kernel_size=1, stride=1, padding=0)
        self.bnorm3 = nn.BatchNorm2d(out_channels * expansion)
        self.activation = nn.ReLU()
        self.residual_downsample = residual_downsample

    def forward(self, inp):
        # inp: (batch_size, input_channels, height, width)

        res = inp
        out = self.activation(self.bnorm1(self.conv1(inp)))
        out = self.activation(self.bnorm2(self.conv2(out)))
        out = self.activation(self.bnorm3(self.conv3(out)))

        if self.residual_downsample is not None:
            res = self.residual_downsample(res)

        out = self.activation(out + res)

        # out: (batch_size, out_channels * expansion, height, width) | height, width depending on stride
        return out

In [28]:
class ResNetLarge(nn.Module):
    '''
    ResNetLarge consists of layers of the larger residual block defined above (ResidualBlockLarger).
    The layers are the residual blocks. The sequence of operations is as follows :-

    Input -> Conv1 -> BNorm1 -> ReLU -> MaxPool -> Layer1 -> Layer2 -> Layer3 -> Layer4 -> AvgPool -> FC

    Args:
        layers: A four value array containing number of conv layers in each residual block
        input_channels: number of input channels
        classes: Number of classes in the dataset

    Methods:
        _layer(num_layers (Number of conv layers)
               ,input_channels (Number of input channels)
               ,output_channels (Number of output channels)
               ,stride (Stride value for conv layer)) :-
        Returns the sequential wrapper with all the layers in the residual block constructed according
        to the parameters.

        forward(inp) :-
        Applies the sequence of operations mentioned above.
        (batch_size, input_channels, height, width) -> (batch_size, classes)

    Examples:
        >>> resnet = ResNetLarge(layers, input_channels, classes)
        >>> out = resnet(inp)
    '''
    def __init__(self, layers, input_channels, classes):
        super(ResNetLarge, self).__init__()
        self.name = 'ResNet'
        self.conv1 = nn.Conv2d(input_channels, 64, kernel_size=7, stride=2, padding=3)
        self.bnorm1 = nn.BatchNorm2d(64)
        self.activation = nn.ReLU()
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        self.layer1 = self._layer(layers[0], input_channels=64, output_channels=64, stride=1)
        self.layer2 = self._layer(layers[1], input_channels=256, output_channels=128, stride=2)
        self.layer3 = self._layer(layers[2], input_channels=512, output_channels=256, stride=2)
        self.layer4 = self._layer(layers[3], input_channels=1024, output_channels=512, stride=2)

        self.avppool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(2048, classes)

    def forward(self, inp):
        # inp: (batch_size, input_channels, height, width)

        out = self.activation(self.bnorm1(self.conv1(inp)))
        out = self.maxpool(out)
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)

        out = self.avppool(out)
        out = out.reshape(out.shape[0], -1)
        out = self.fc(out)

        # out: (batch_size, classes)
        return out

    def _layer(self, num_layers, input_channels, output_channels, stride):
        residual_downsample = None
        layers = []

        # Checks if there would be potential mismatch in any of height, width or channels between input and output.
        # 4 is the value of the expansion for large ResNets
        if stride != 1 or input_channels != output_channels * 4:
            residual_downsample = nn.Sequential(
                nn.Conv2d(input_channels, output_channels * 4, kernel_size=1, stride=stride),
                nn.BatchNorm2d(output_channels * 4)
            )

        layers.append(ResidualBlockLarge(input_channels, output_channels, residual_downsample, stride))

        for i in range(num_layers - 1):
            layers.append(ResidualBlockLarge(output_channels * 4, output_channels))

        return nn.Sequential(*layers)

In [29]:
def ResNet34(input_channels, classes):
    '''
    Initalization of ResNet34 using the layers as mentioned in the paper and using ResNetSmall module.

    Args:
        input_channels: Number of input channels
        classes: Number of classes in the dataset

    Output:
        ResNetSmall Object
    '''
    return ResNetSmall([3, 4, 6, 3], input_channels, classes)

In [30]:
def ResNet50(input_channels, classes):
    '''
    Initalization of ResNet50 using the layers as mentioned in the paper and using ResNetLarge module.

    Args:
        input_channels: Number of input channels
        classes: Number of classes in the dataset

    Output:
        ResNetLarge Object
    '''
    return ResNetLarge([3, 4, 6, 3], input_channels, classes)

#### Data Loading Functions

In [31]:
def create_data_sampler(dataset_size, split_size, seed=42):
    """
    Create a sampler for dataset splitting

    Args:
        dataset_size: Total size of the dataset
        split_size: Size of the split needed
        seed: Random seed
    Returns:
        torch.utils.data.Sampler object
    """
    indices = list(range(dataset_size))
    np.random.seed(seed)
    np.random.shuffle(indices)
    return SubsetRandomSampler(indices[:split_size])

In [32]:
def get_transforms(size='32', normalize='standard', is_training=True):
    """
    Get data transforms for CIFAR100

    Args:
        size: Image size ('32' or '224')
        normalize: Normalization type ('standard' or 'imagenet')
        is_training: Whether to include augmentation transforms
    Returns:
        torchvision.transforms.Compose object
    """
    if normalize == 'imagenet':
        mean, std = [0.485, 0.456, 0.406], [0.229, 0.224, 0.225]
    else:
        mean, std = [0.5, 0.5, 0.5], [0.5, 0.5, 0.5]

    transforms_list = []

    # Add data augmentation for training
    if is_training:
        if size == '32':
            transforms_list.extend([
                transforms.RandomCrop(32, padding=4),
                transforms.RandomHorizontalFlip(),
                transforms.RandomRotation(15)
            ])
        else:  # size == '224'
            transforms_list.append(
                transforms.RandomResizedCrop((224, 224), scale=(0.8, 1.0))
            )
    elif size == '224':  # For validation/test with size 224
        transforms_list.append(transforms.Resize((224, 224)))

    # Add basic transforms
    transforms_list.extend([
        transforms.ToTensor(),
        transforms.Normalize(mean, std)
    ])

    return transforms.Compose(transforms_list)

In [33]:
def CIFAR100DataLoader(split= 'train',
                       batch_size=8,
                       num_workers=2,
                       shuffle=True,
                       size='32',
                       normalize='standard',
                       train_size=45000,
                       val_size=5000,
                       seed=42):
    '''
    A wrapper function that creates a DataLoader for CIFAR100 dataset loaded from torchvision using
    the parameters supplied and applies the required data augmentations.

    Args:
        split: (string Values: 'train', 'valt' hoặc 'test')
        batch_size: Batch size to used for loading data (Default=8)
        num_workers: Number of parallel workers used to load data (Default=2)
        shuffle: Boolean value to decide if data should be randomized (Default=True)
        size: A string to decide the size of the input images (Default='32') (Values: '32','224')
        normalize: A string to decide the normalization to applied to the input images
                   (Default='standard') (Values: 'standard', 'imagenet')
        train_size: Number of training samples
        val_size: Number of validation samples
        seed: Random seed

    Output:
        torch.utils.data.DataLoader object
    '''
    # Set seed for reproducibility
    torch.manual_seed(seed)
    np.random.seed(seed)

    # Get appropriate transforms
    is_training = (split == 'train')
    transforms = get_transforms(size, normalize, is_training)

    if split in ['train', 'val']:
        dataset = torchvision.datasets.CIFAR100(
            root='./data',
            train=True,
            download=True,
            transform=transforms
        )

        # Chia dữ liệu train và val
        if split == 'train':
            sampler = create_data_sampler(len(dataset), train_size, seed)
        else:  # val
            val_indices = list(range(len(dataset)))[train_size:train_size + val_size]
            sampler = SubsetRandomSampler(val_indices)

        dataloader = DataLoader(
            dataset,
            batch_size=batch_size,
            num_workers=num_workers,
            sampler=sampler
        )

    else:  # test
        dataset = torchvision.datasets.CIFAR100(
            root='./data',
            train=False,
            download=True,
            transform=transforms
        )

        dataloader = DataLoader(
            dataset,
            batch_size=batch_size,
            num_workers=num_workers,
            shuffle=shuffle
        )

    return dataloader

In [34]:
def load_cifar100_data(
    batch_size=32,
    num_workers=2,
    size='32',
    normalize='standard',
    train_size=45000,
    val_size=5000,
    seed=42
):
    """
    Load all CIFAR100 dataloaders

    Args:
        batch_size: Batch size
        num_workers: Number of workers
        size: Image size ('32' or '224')
        normalize: Normalization type ('standard' or 'imagenet')
        train_size: Number of training samples
        val_size: Number of validation samples
        seed: Random seed
    Returns:
        train_loader, val_loader, test_loader
    """
    train_loader = CIFAR100DataLoader(
        split='train',
        batch_size=batch_size,
        num_workers=num_workers,
        shuffle=True,
        size=size,
        normalize=normalize,
        train_size=train_size,
        val_size=val_size,
        seed=seed
    )

    val_loader = CIFAR100DataLoader(
        split='val',
        batch_size=batch_size,
        num_workers=num_workers,
        shuffle=False,
        size=size,
        normalize=normalize,
        train_size=train_size,
        val_size=val_size,
        seed=seed
    )

    test_loader = CIFAR100DataLoader(
        split='test',
        batch_size=batch_size,
        num_workers=num_workers,
        shuffle=False,
        size=size,
        normalize=normalize,
        train_size=train_size,
        val_size=val_size,
        seed=seed
    )

    # Print dataset information
    print(f'Dataset sizes:')
    print(f'Training:   {train_size:,} images')
    print(f'Validation: {val_size:,} images')
    print(f'Test:       {10000:,} images')

    return train_loader, val_loader, test_loader

In [None]:
def CIFAR10DataLoader(split, batch_size=8, num_workers=2, shuffle=True, size='32', normalize='standard'):
    '''
    A wrapper function that creates a DataLoader for CIFAR10 dataset loaded from torchvision using
    the parameters supplied and applies the required data augmentations.

    Args:
        split: (string Values: 'train', 'valt' hoặc 'test')
        batch_size: Batch size to used for loading data (Default=8)
        num_workers: Number of parallel workers used to load data (Default=2)
        shuffle: Boolean value to decide if data should be randomized (Default=True)
        size: A string to decide the size of the input images (Default='32') (Values: '32','224')
        normalize: A string to decide the normalization to applied to the input images
                   (Default='standard') (Values: 'standard', 'imagenet')

    Output:
        DataLoader Object
    '''
    if normalize == 'imagenet':
        mean = [0.485, 0.456, 0.406]
        std = [0.229, 0.224, 0.225]
    elif normalize == 'standard':
        mean = [0.5, 0.5, 0.5]
        std =  [0.5, 0.5, 0.5]

    if split == 'train':
        if size == '224':
            train_transform = transforms.Compose([
                transforms.RandomResizedCrop((224,224), scale=(0.5, 1.0)),
                transforms.ToTensor(),
                transforms.Normalize(mean, std)
            ])
        elif size == '32':
            train_transform = transforms.Compose([
                transforms.Resize((48, 48)),
                transforms.RandomCrop(32),
                transforms.RandomHorizontalFlip(),
                transforms.RandomRotation(15),
                transforms.ToTensor(),
                transforms.Normalize(mean, std)
            ])

        cifar10 = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=train_transform)
        dataloader = DataLoader(cifar10, batch_size=batch_size, num_workers=num_workers, shuffle=shuffle)

    elif split == 'test':
        if size == '224':
            test_transform = transforms.Compose([
                transforms.Resize((224, 224)),
                transforms.ToTensor(),
                transforms.Normalize(mean, std)
            ])
        elif size == '32':
            test_transform = transforms.Compose([
                transforms.ToTensor(),
                transforms.Normalize(mean, std)
            ])

        cifar10 = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=test_transform)
        dataloader = DataLoader(cifar10, batch_size=batch_size, num_workers=num_workers, shuffle=shuffle)

    return dataloader

#### Training and Evaluation Functions

In [35]:
# Initializations of all the constants used in the training and testing process

lr = 0.003
batch_size = 32 #256
num_workers = 2
shuffle = True
patch_size = 16 #4
image_sz = 32
max_len = 1000 # All sequences must be less than 1000 including class token
embed_dim = 768
classes = 100
layers = 12
channels = 3
resnet_features_channels = 64
heads = 12
epochs = 100

In [36]:
def train(model, dataloader, criterion, optimizer, scheduler, resnet_features=None):
    '''
    Function used to train the model over a single epoch and update it according to the
    calculated gradients.

    Args:
        model: Model supplied to the function
        dataloader: DataLoader supplied to the function
        criterion: Criterion used to calculate loss
        optimizer: Optimizer used update the model
        scheduler: Scheduler used to update the learing rate for faster convergence
                   (Commented out due to poor results)
        resnet_features: Model to get Resnet Features for the hybrid architecture (Default=None)

    Output:
        running_loss: Training Loss (Float)
        running_accuracy: Training Accuracy (Float)
    '''
    running_loss = 0.0
    running_accuracy = 0.0

    for data, target in tqdm(dataloader):
        data = data.to(device)
        target = target.to(device)

        if model.name == 'VisionTransformer':
            with torch.no_grad():
                if resnet_features != None:
                    data = resnet_features(data)
            output, _ = model(data)
        elif model.name == 'ResNet':
            output = model(data)
        loss = criterion(output, target)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # scheduler.step()

        acc = (output.argmax(dim=1) == target).float().mean()
        running_accuracy += acc / len(dataloader)
        running_loss += loss.item() / len(dataloader)

    return running_loss, running_accuracy

In [37]:
def evaluation(model, dataloader, criterion, resnet_features=None):
    '''
    Function used to evaluate the model on the test dataset.

    Args:
        model: Model supplied to the function
        dataloader: DataLoader supplied to the function
        criterion: Criterion used to calculate loss
        resnet_features: Model to get Resnet Features for the hybrid architecture (Default=None)

    Output:
        test_loss: Testing Loss (Float)
        test_accuracy: Testing Accuracy (Float)
    '''
    with torch.no_grad():
        test_accuracy = 0.0
        test_loss = 0.0
        for data, target in tqdm(dataloader):
            data = data.to(device)
            target = target.to(device)

            if model.name == 'VisionTransformer':
                if resnet_features != None:
                    data = resnet_features(data)
                output, _ = model(data)
            elif model.name == 'ResNet':
                output = model(data)
            loss = criterion(output, target)

            acc = (output.argmax(dim=1) == target).float().mean()
            test_accuracy += acc / len(dataloader)
            test_loss += loss.item() / len(dataloader)

    return test_loss, test_accuracy

#### Model Initialization

Run either one the following subcells according to the models selected to train and test

##### Model - Vision Transformer

Recommended Values for the following Architecture

- patch_size = 4
- max_len = 100
- embed_dim = 512
- classes = According to Dataset
- layers = 12
- channels = 3
- heads = 16

In [38]:
# Vision Transformer Architecture

model = VisionTransformer(
    patch_size=patch_size,
    max_len=max_len,
    embed_dim=embed_dim,
    classes=classes,
    layers=layers,
    channels=channels,
    heads=heads).to(device)

resnet_features = None

Recommended Values for the following Architecture

- patch_size = 7
- max_len = 100
- embed_dim = 512
- classes = According to Dataset
- layers = 12
- channels = 64 (Resnet Features Channels)
- heads = 16

In [None]:
# Hybrid Vision Transformer Architecture

model = VisionTransformer(
    patch_size=patch_size,
    max_len=max_len,
    embed_dim=embed_dim,
    classes=classes,
    layers=layers,
    channels=resnet_features_channels,
    heads=heads).to(device)

resnet_features = ResNetFeatures().to(device).eval()



##### Model - ResNet50 or ResNet34

Recommended Values for the following Architecture

- input_channels = 3
- classes = According to Dataset


In [None]:
# ResNet34 Architecture

model = ResNet34(
    input_channels=3,
    classes=classes).to(device)

resnet_features = None

Recommended Values for the following Architecture

- input_channels = 3
- classes = According to Dataset


In [None]:
# ResNet50 Architecture

model = ResNet50(
    input_channels=3,
    classes=classes).to(device)

resnet_features = None

#### Model Training and Evaluation

##### CIFAR100 Dataset

In [39]:
tenmohinh = 'm1'

In [40]:
train_dataloader, val_dataloader, test_dataloader = load_cifar100_data(batch_size=32,
                                                           num_workers=2,
                                                           size='32',
                                                           normalize='standard',
                                                           train_size=45000,
                                                           val_size=5000) # mặc định test = 10000

criterion = nn.CrossEntropyLoss().to(device)
optimizer = optim.Adam(model.parameters(), lr=lr)
scheduler = optim.lr_scheduler.OneCycleLR(optimizer, max_lr=lr, steps_per_epoch=len(train_dataloader), epochs=epochs)

train_accs = []
test_accs = []

checkpoint_dir = os.path.join(work_dir, 'checkpoints')
os.makedirs(checkpoint_dir, exist_ok=True)

Downloading https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz to ./data/cifar-100-python.tar.gz


100%|██████████| 169001437/169001437 [01:18<00:00, 2147510.76it/s]


Extracting ./data/cifar-100-python.tar.gz to ./data
Files already downloaded and verified
Files already downloaded and verified
Dataset sizes:
Training:   45,000 images
Validation: 5,000 images
Test:       10,000 images


In [None]:
start_time = time.time()

try:
    for epoch in range(epochs):
        # Training phase
        running_loss, running_accuracy = train(model, train_dataloader, criterion,
                                             optimizer, scheduler, resnet_features)
        print(f"Epoch : {epoch+1} - acc: {running_accuracy:.4f} - loss : {running_loss:.4f}")
        train_accs.append(running_accuracy)

        # Evaluation phase
        test_loss, test_accuracy = evaluation(model, test_dataloader, criterion, resnet_features)
        print(f"test acc: {test_accuracy:.4f} - test loss : {test_loss:.4f}\n")
        test_accs.append(test_accuracy)

        # Save checkpoint every 10 epochs
        if (epoch+1) % 10 == 0:
            checkpoint_path = os.path.join(checkpoint_dir,
                                         f'{model.name}_{tenmohinh}_CIFAR100_epoch_{epoch+1}_checkpoint.pt')
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'scheduler_state_dict': scheduler.state_dict(),
                'train_acc': train_accs,
                'test_acc': test_accs
            }, checkpoint_path)
            print(f"Saved checkpoint at epoch {epoch+1}")

except KeyboardInterrupt:
    print("\nTraining interrupted by user. Saving final checkpoint...")
    final_checkpoint_path = os.path.join(checkpoint_dir,
                                       f'{model.name}_{tenmohinh}_CIFAR100_interrupted_checkpoint.pt')
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(), # Chỉ lưu các tham số (weights và biases) của mô hình
        'optimizer_state_dict': optimizer.state_dict(),
        'scheduler_state_dict': scheduler.state_dict(),
        'train_acc': train_accs,
        'test_acc': test_accs
    }, final_checkpoint_path)
    print("Final checkpoint saved.")

except Exception as e:
    print(f"An error occurred during training: {str(e)}")

finally:
    # Tính tổng thời gian chạy
    end_time = time.time()
    total_time = end_time - start_time
print(f"Total training time: {str(timedelta(seconds=int(total_time)))}")


100%|██████████| 1407/1407 [01:13<00:00, 19.18it/s]


Epoch : 1 - acc: 0.0312 - loss : 4.4426


100%|██████████| 313/313 [00:04<00:00, 74.47it/s]


test acc: 0.0509 - test loss : 4.2500



100%|██████████| 1407/1407 [01:15<00:00, 18.70it/s]


Epoch : 2 - acc: 0.0665 - loss : 4.1298


100%|██████████| 313/313 [00:04<00:00, 72.98it/s]


test acc: 0.0835 - test loss : 4.0075



100%|██████████| 1407/1407 [01:14<00:00, 18.94it/s]


Epoch : 3 - acc: 0.0918 - loss : 3.9383


100%|██████████| 313/313 [00:04<00:00, 73.67it/s]


test acc: 0.1055 - test loss : 3.8439



100%|██████████| 1407/1407 [01:13<00:00, 19.21it/s]


Epoch : 4 - acc: 0.1069 - loss : 3.8384


100%|██████████| 313/313 [00:04<00:00, 71.99it/s]


test acc: 0.1211 - test loss : 3.7583



100%|██████████| 1407/1407 [01:12<00:00, 19.35it/s]


Epoch : 5 - acc: 0.1177 - loss : 3.7787


100%|██████████| 313/313 [00:04<00:00, 75.32it/s]


test acc: 0.1302 - test loss : 3.7154



100%|██████████| 1407/1407 [01:12<00:00, 19.42it/s]


Epoch : 6 - acc: 0.1207 - loss : 3.7437


100%|██████████| 313/313 [00:04<00:00, 67.77it/s]


test acc: 0.1234 - test loss : 3.7580



100%|██████████| 1407/1407 [01:11<00:00, 19.71it/s]


Epoch : 7 - acc: 0.1294 - loss : 3.6985


100%|██████████| 313/313 [00:04<00:00, 73.27it/s]


test acc: 0.1275 - test loss : 3.7217



100%|██████████| 1407/1407 [01:12<00:00, 19.45it/s]


Epoch : 8 - acc: 0.1312 - loss : 3.6799


100%|██████████| 313/313 [00:04<00:00, 73.32it/s]


test acc: 0.1333 - test loss : 3.7002



100%|██████████| 1407/1407 [01:13<00:00, 19.19it/s]


Epoch : 9 - acc: 0.1362 - loss : 3.6487


100%|██████████| 313/313 [00:04<00:00, 72.03it/s]


test acc: 0.1315 - test loss : 3.6990



100%|██████████| 1407/1407 [01:14<00:00, 18.78it/s]


Epoch : 10 - acc: 0.1390 - loss : 3.6277


100%|██████████| 313/313 [00:04<00:00, 71.41it/s]


test acc: 0.1365 - test loss : 3.6855

Saved checkpoint at epoch 10


100%|██████████| 1407/1407 [01:13<00:00, 19.06it/s]


Epoch : 11 - acc: 0.1468 - loss : 3.6077


100%|██████████| 313/313 [00:04<00:00, 70.00it/s]


test acc: 0.1406 - test loss : 3.6844



100%|██████████| 1407/1407 [01:11<00:00, 19.60it/s]


Epoch : 12 - acc: 0.1445 - loss : 3.6016


100%|██████████| 313/313 [00:04<00:00, 72.59it/s]


test acc: 0.1456 - test loss : 3.6306



100%|██████████| 1407/1407 [01:12<00:00, 19.46it/s]


Epoch : 13 - acc: 0.1507 - loss : 3.5745


100%|██████████| 313/313 [00:04<00:00, 75.51it/s]


test acc: 0.1498 - test loss : 3.6089



100%|██████████| 1407/1407 [01:12<00:00, 19.31it/s]


Epoch : 14 - acc: 0.1505 - loss : 3.5759


100%|██████████| 313/313 [00:04<00:00, 75.64it/s]


test acc: 0.1489 - test loss : 3.6067



100%|██████████| 1407/1407 [01:12<00:00, 19.48it/s]


Epoch : 15 - acc: 0.1539 - loss : 3.5504


100%|██████████| 313/313 [00:04<00:00, 70.66it/s]


test acc: 0.1458 - test loss : 3.6552



100%|██████████| 1407/1407 [01:12<00:00, 19.49it/s]


Epoch : 16 - acc: 0.1564 - loss : 3.5504


100%|██████████| 313/313 [00:04<00:00, 73.29it/s]


test acc: 0.1406 - test loss : 3.6378



100%|██████████| 1407/1407 [01:12<00:00, 19.36it/s]


Epoch : 17 - acc: 0.1567 - loss : 3.5397


100%|██████████| 313/313 [00:04<00:00, 74.29it/s]


test acc: 0.1388 - test loss : 3.6920



100%|██████████| 1407/1407 [01:12<00:00, 19.45it/s]


Epoch : 18 - acc: 0.1514 - loss : 3.5516


100%|██████████| 313/313 [00:04<00:00, 72.47it/s]


test acc: 0.1505 - test loss : 3.6285



100%|██████████| 1407/1407 [01:11<00:00, 19.71it/s]


Epoch : 19 - acc: 0.1539 - loss : 3.5402


100%|██████████| 313/313 [00:04<00:00, 74.34it/s]


test acc: 0.1525 - test loss : 3.5924



100%|██████████| 1407/1407 [01:12<00:00, 19.29it/s]


Epoch : 20 - acc: 0.1553 - loss : 3.5333


100%|██████████| 313/313 [00:04<00:00, 74.67it/s]


test acc: 0.1537 - test loss : 3.6282

Saved checkpoint at epoch 20


100%|██████████| 1407/1407 [01:11<00:00, 19.55it/s]


Epoch : 21 - acc: 0.1601 - loss : 3.5170


100%|██████████| 313/313 [00:04<00:00, 75.83it/s]


test acc: 0.1548 - test loss : 3.5989



100%|██████████| 1407/1407 [01:12<00:00, 19.49it/s]


Epoch : 22 - acc: 0.1617 - loss : 3.5114


100%|██████████| 313/313 [00:04<00:00, 72.89it/s]


test acc: 0.1505 - test loss : 3.6544



100%|██████████| 1407/1407 [01:14<00:00, 19.01it/s]


Epoch : 23 - acc: 0.1592 - loss : 3.5087


100%|██████████| 313/313 [00:04<00:00, 66.71it/s]


test acc: 0.1517 - test loss : 3.6377



100%|██████████| 1407/1407 [01:17<00:00, 18.06it/s]


Epoch : 24 - acc: 0.1588 - loss : 3.5263


100%|██████████| 313/313 [00:04<00:00, 67.02it/s]


test acc: 0.1384 - test loss : 3.6628



100%|██████████| 1407/1407 [01:18<00:00, 18.01it/s]


Epoch : 25 - acc: 0.1566 - loss : 3.5378


100%|██████████| 313/313 [00:04<00:00, 64.32it/s]


test acc: 0.1544 - test loss : 3.6173



100%|██████████| 1407/1407 [01:17<00:00, 18.04it/s]


Epoch : 26 - acc: 0.1570 - loss : 3.5217


100%|██████████| 313/313 [00:04<00:00, 70.11it/s]


test acc: 0.1499 - test loss : 3.6421



100%|██████████| 1407/1407 [01:19<00:00, 17.67it/s]


Epoch : 27 - acc: 0.1612 - loss : 3.5229


100%|██████████| 313/313 [00:04<00:00, 69.51it/s]


test acc: 0.1509 - test loss : 3.6274



100%|██████████| 1407/1407 [01:19<00:00, 17.79it/s]


Epoch : 28 - acc: 0.1574 - loss : 3.5249


100%|██████████| 313/313 [00:04<00:00, 67.45it/s]


test acc: 0.1448 - test loss : 3.6342



100%|██████████| 1407/1407 [01:19<00:00, 17.73it/s]


Epoch : 29 - acc: 0.1535 - loss : 3.5428


100%|██████████| 313/313 [00:04<00:00, 64.68it/s]


test acc: 0.1424 - test loss : 3.6730



100%|██████████| 1407/1407 [01:18<00:00, 17.92it/s]


Epoch : 30 - acc: 0.1526 - loss : 3.5663


100%|██████████| 313/313 [00:04<00:00, 63.65it/s]


test acc: 0.1340 - test loss : 3.7761

Saved checkpoint at epoch 30


100%|██████████| 1407/1407 [01:19<00:00, 17.78it/s]


Epoch : 31 - acc: 0.1491 - loss : 3.5835


100%|██████████| 313/313 [00:04<00:00, 65.52it/s]


test acc: 0.1389 - test loss : 3.6919



100%|██████████| 1407/1407 [01:17<00:00, 18.08it/s]


Epoch : 32 - acc: 0.1564 - loss : 3.5435


100%|██████████| 313/313 [00:04<00:00, 68.29it/s]


test acc: 0.1473 - test loss : 3.6508



100%|██████████| 1407/1407 [01:18<00:00, 17.95it/s]


Epoch : 33 - acc: 0.1599 - loss : 3.5154


100%|██████████| 313/313 [00:04<00:00, 67.27it/s]


test acc: 0.1512 - test loss : 3.6168



100%|██████████| 1407/1407 [01:18<00:00, 18.00it/s]


Epoch : 34 - acc: 0.1606 - loss : 3.5273


100%|██████████| 313/313 [00:04<00:00, 65.66it/s]


test acc: 0.1432 - test loss : 3.6793



100%|██████████| 1407/1407 [01:18<00:00, 17.82it/s]


Epoch : 35 - acc: 0.1584 - loss : 3.5384


100%|██████████| 313/313 [00:04<00:00, 64.14it/s]


test acc: 0.1224 - test loss : 3.8141



100%|██████████| 1407/1407 [01:19<00:00, 17.80it/s]


Epoch : 36 - acc: 0.1569 - loss : 3.5496


100%|██████████| 313/313 [00:04<00:00, 68.31it/s]


test acc: 0.1496 - test loss : 3.6791



100%|██████████| 1407/1407 [01:17<00:00, 18.04it/s]


Epoch : 37 - acc: 0.1589 - loss : 3.5226


100%|██████████| 313/313 [00:04<00:00, 66.64it/s]


test acc: 0.1483 - test loss : 3.6574



100%|██████████| 1407/1407 [01:14<00:00, 18.93it/s]


Epoch : 38 - acc: 0.1581 - loss : 3.5373


100%|██████████| 313/313 [00:04<00:00, 75.73it/s]


test acc: 0.1489 - test loss : 3.6635



100%|██████████| 1407/1407 [01:12<00:00, 19.30it/s]


Epoch : 39 - acc: 0.1580 - loss : 3.5287


100%|██████████| 313/313 [00:04<00:00, 74.55it/s]


test acc: 0.1514 - test loss : 3.6524



100%|██████████| 1407/1407 [01:11<00:00, 19.67it/s]


Epoch : 40 - acc: 0.1560 - loss : 3.5392


100%|██████████| 313/313 [00:04<00:00, 70.26it/s]


test acc: 0.1514 - test loss : 3.6484

Saved checkpoint at epoch 40


100%|██████████| 1407/1407 [01:11<00:00, 19.66it/s]


Epoch : 41 - acc: 0.1506 - loss : 3.5645


100%|██████████| 313/313 [00:04<00:00, 72.37it/s]


test acc: 0.1434 - test loss : 3.6915



100%|██████████| 1407/1407 [01:06<00:00, 21.19it/s]


Epoch : 42 - acc: 0.1554 - loss : 3.5551


100%|██████████| 313/313 [00:03<00:00, 78.34it/s]


test acc: 0.1372 - test loss : 3.7130



100%|██████████| 1407/1407 [01:09<00:00, 20.29it/s]


Epoch : 43 - acc: 0.1441 - loss : 3.6117


100%|██████████| 313/313 [00:03<00:00, 82.91it/s]


test acc: 0.1268 - test loss : 3.7978



100%|██████████| 1407/1407 [01:06<00:00, 21.05it/s]


Epoch : 44 - acc: 0.1455 - loss : 3.6230


100%|██████████| 313/313 [00:03<00:00, 89.29it/s] 


test acc: 0.1351 - test loss : 3.7367



100%|██████████| 1407/1407 [00:58<00:00, 23.95it/s]


Epoch : 45 - acc: 0.1512 - loss : 3.5659


100%|██████████| 313/313 [00:03<00:00, 100.62it/s]


test acc: 0.1349 - test loss : 3.7125



100%|██████████| 1407/1407 [00:55<00:00, 25.18it/s]


Epoch : 46 - acc: 0.1520 - loss : 3.5691


100%|██████████| 313/313 [00:03<00:00, 87.09it/s] 


test acc: 0.1365 - test loss : 3.7200



100%|██████████| 1407/1407 [00:57<00:00, 24.51it/s]


Epoch : 47 - acc: 0.1490 - loss : 3.5858


100%|██████████| 313/313 [00:03<00:00, 94.13it/s] 


test acc: 0.1317 - test loss : 3.7383



100%|██████████| 1407/1407 [00:57<00:00, 24.57it/s]


Epoch : 48 - acc: 0.1532 - loss : 3.5705


100%|██████████| 313/313 [00:03<00:00, 98.95it/s] 


test acc: 0.1315 - test loss : 3.7347



100%|██████████| 1407/1407 [01:01<00:00, 22.88it/s]


Epoch : 49 - acc: 0.1525 - loss : 3.5732


100%|██████████| 313/313 [00:03<00:00, 87.54it/s] 


test acc: 0.1404 - test loss : 3.7136



100%|██████████| 1407/1407 [01:16<00:00, 18.34it/s]


Epoch : 50 - acc: 0.1545 - loss : 3.5526


100%|██████████| 313/313 [00:04<00:00, 74.02it/s]


test acc: 0.1335 - test loss : 3.7737

Saved checkpoint at epoch 50


100%|██████████| 1407/1407 [01:10<00:00, 19.92it/s]


Epoch : 51 - acc: 0.1534 - loss : 3.5645


100%|██████████| 313/313 [00:04<00:00, 74.55it/s]


test acc: 0.1361 - test loss : 3.7427



100%|██████████| 1407/1407 [01:11<00:00, 19.62it/s]


Epoch : 52 - acc: 0.1544 - loss : 3.5580


100%|██████████| 313/313 [00:04<00:00, 71.24it/s]


test acc: 0.1334 - test loss : 3.7486



100%|██████████| 1407/1407 [01:11<00:00, 19.59it/s]


Epoch : 53 - acc: 0.1525 - loss : 3.5578


100%|██████████| 313/313 [00:04<00:00, 75.74it/s]


test acc: 0.1343 - test loss : 3.7515



100%|██████████| 1407/1407 [01:11<00:00, 19.66it/s]


Epoch : 54 - acc: 0.1507 - loss : 3.5807


100%|██████████| 313/313 [00:04<00:00, 74.57it/s]


test acc: 0.1421 - test loss : 3.7008



100%|██████████| 1407/1407 [01:09<00:00, 20.30it/s]


Epoch : 55 - acc: 0.1470 - loss : 3.6016


100%|██████████| 313/313 [00:04<00:00, 76.43it/s]


test acc: 0.1249 - test loss : 3.7965



100%|██████████| 1407/1407 [01:11<00:00, 19.74it/s]


Epoch : 56 - acc: 0.1531 - loss : 3.5728


100%|██████████| 313/313 [00:04<00:00, 74.92it/s]


test acc: 0.1429 - test loss : 3.6793



100%|██████████| 1407/1407 [01:10<00:00, 20.09it/s]


Epoch : 57 - acc: 0.1547 - loss : 3.5517


100%|██████████| 313/313 [00:04<00:00, 78.08it/s]


test acc: 0.1474 - test loss : 3.6930



100%|██████████| 1407/1407 [01:09<00:00, 20.18it/s]


Epoch : 58 - acc: 0.1555 - loss : 3.5486


100%|██████████| 313/313 [00:04<00:00, 74.87it/s]


test acc: 0.1415 - test loss : 3.7438



100%|██████████| 1407/1407 [01:12<00:00, 19.51it/s]


Epoch : 59 - acc: 0.1533 - loss : 3.5778


100%|██████████| 313/313 [00:04<00:00, 75.16it/s]


test acc: 0.1405 - test loss : 3.7446



100%|██████████| 1407/1407 [01:09<00:00, 20.30it/s]


Epoch : 60 - acc: 0.1529 - loss : 3.5842


100%|██████████| 313/313 [00:04<00:00, 76.12it/s]


test acc: 0.1314 - test loss : 3.8293

Saved checkpoint at epoch 60


100%|██████████| 1407/1407 [01:09<00:00, 20.27it/s]


Epoch : 61 - acc: 0.1440 - loss : 3.6138


100%|██████████| 313/313 [00:04<00:00, 75.68it/s]


test acc: 0.1303 - test loss : 3.8075



100%|██████████| 1407/1407 [01:10<00:00, 20.08it/s]


Epoch : 62 - acc: 0.1459 - loss : 3.6070


100%|██████████| 313/313 [00:04<00:00, 76.99it/s]


test acc: 0.1219 - test loss : 3.8237



100%|██████████| 1407/1407 [01:09<00:00, 20.15it/s]


Epoch : 63 - acc: 0.1402 - loss : 3.6462


100%|██████████| 313/313 [00:04<00:00, 70.98it/s]


test acc: 0.1325 - test loss : 3.7476



100%|██████████| 1407/1407 [01:10<00:00, 20.00it/s]


Epoch : 64 - acc: 0.1455 - loss : 3.6177


100%|██████████| 313/313 [00:03<00:00, 78.35it/s]


test acc: 0.1255 - test loss : 3.7820



100%|██████████| 1407/1407 [01:10<00:00, 20.03it/s]


Epoch : 65 - acc: 0.1425 - loss : 3.6402


100%|██████████| 313/313 [00:04<00:00, 73.07it/s]


test acc: 0.1212 - test loss : 3.8071



100%|██████████| 1407/1407 [01:11<00:00, 19.74it/s]


Epoch : 66 - acc: 0.1431 - loss : 3.6369


100%|██████████| 313/313 [00:04<00:00, 73.80it/s]


test acc: 0.1171 - test loss : 3.8430



100%|██████████| 1407/1407 [01:10<00:00, 19.93it/s]


Epoch : 67 - acc: 0.1384 - loss : 3.6570


100%|██████████| 313/313 [00:04<00:00, 74.62it/s]


test acc: 0.1239 - test loss : 3.8345



100%|██████████| 1407/1407 [01:11<00:00, 19.70it/s]


Epoch : 68 - acc: 0.1389 - loss : 3.6377


100%|██████████| 313/313 [00:04<00:00, 76.38it/s]


test acc: 0.1093 - test loss : 3.9232



100%|██████████| 1407/1407 [01:10<00:00, 19.83it/s]


Epoch : 69 - acc: 0.1401 - loss : 3.6437


100%|██████████| 313/313 [00:04<00:00, 77.88it/s]


test acc: 0.1332 - test loss : 3.7582



100%|██████████| 1407/1407 [01:09<00:00, 20.24it/s]


Epoch : 70 - acc: 0.1408 - loss : 3.6504


100%|██████████| 313/313 [00:04<00:00, 76.04it/s]


test acc: 0.1254 - test loss : 3.7742

Saved checkpoint at epoch 70


100%|██████████| 1407/1407 [01:11<00:00, 19.81it/s]


Epoch : 71 - acc: 0.1418 - loss : 3.6357


100%|██████████| 313/313 [00:04<00:00, 74.92it/s]


test acc: 0.1191 - test loss : 3.8253



100%|██████████| 1407/1407 [01:09<00:00, 20.18it/s]


Epoch : 72 - acc: 0.1367 - loss : 3.6662


100%|██████████| 313/313 [00:04<00:00, 75.39it/s]


test acc: 0.1137 - test loss : 3.8784



 19%|█▉        | 265/1407 [00:13<00:58, 19.60it/s]

In [None]:
## Có thể load weights vào model có cấu trúc tương tự nhưng khác tên
# class ModelV1(nn.Module):
#     def __init__(self):
#         super().__init__()
#         self.conv1 = nn.Conv2d(3, 64, 3)

# class ModelV2(nn.Module):
#     def __init__(self):
#         super().__init__()
#         self.conv1 = nn.Conv2d(3, 64, 3)

# #Load weights từ ModelV1 sang ModelV2
# model_v1_checkpoint = torch.load('model_v1_checkpoint.pt')
# model_v2 = ModelV2()
# model_v2.load_state_dict(model_v1_checkpoint['model_state_dict'])

##### Plotting Results

In [None]:
train_accs = [acc.cpu().item() for acc in train_accs]
test_accs = [acc.cpu().item() for acc in test_accs]
# print(train_accs)
# print(test_accs)
plt.style.use('seaborn')
plt.plot(range(1, 101), train_accs, label='Train Accuracy')
plt.plot(range(1, 101), test_accs, label='Test Accuracy')

plt.xlabel("Epochs")
plt.ylabel("Accuracy")

plt.title("Train vs Test Accuracy")
plt.legend(loc='lower right')