In [1]:

import os
import sys

import matplotlib.pyplot as plt
import numpy as pd
import timm
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from einops import repeat
from einops.layers.torch import Rearrange
from torch import Tensor
from torch.utils.data import Dataset, DataLoader
from torchvision.datasets import ImageFolder
from tqdm.notebook import tqdm

# Global PyTorch backend switches (process-wide side effects).
# cuDNN autotuner: lets cuDNN benchmark candidate algorithms and reuse the fastest.
# NOTE(review): this mainly benefits convolution workloads; the model defined in the
# visible cells is built from Linear/attention layers — confirm it is still wanted.
torch.backends.cudnn.benchmark = True
# Allow TF32 in CUDA matmuls (faster on supporting GPUs, reduced mantissa precision).
torch.backends.cuda.matmul.allow_tf32 = True
# Allow TF32 inside cuDNN operations as well.
torch.backends.cudnn.allow_tf32 = True


In [2]:
class CardDataset(Dataset):
    """Thin ``Dataset`` adapter that delegates everything to an ``ImageFolder``.

    Args:
        dir_path: Root directory handed to ``ImageFolder``.
        transform: Optional callable forwarded unchanged to ``ImageFolder``.

    Attributes:
        data: The wrapped ``ImageFolder`` instance; length and indexing are
            both answered by it.
    """

    def __init__(self, dir_path, transform=None):
        image_folder = ImageFolder(dir_path, transform=transform)
        self.data = image_folder

    def __len__(self):
        # The dataset is exactly as long as the wrapped folder dataset.
        n_samples = len(self.data)
        return n_samples

    def __getitem__(self, idx):
        # Indexing is delegated as-is; the sample is returned untouched.
        sample = self.data[idx]
        return sample

In [3]:
# Preprocessing shared by every split: resize to a fixed square, then convert to a tensor.
IMAGE_SIZE = (224, 224)
preprocessing_steps = [
    transforms.Resize(IMAGE_SIZE),
    transforms.ToTensor(),
]
transform = transforms.Compose(preprocessing_steps)

In [4]:
# Dataset locations.
# NOTE(review): these are absolute, machine-specific paths; consider a configurable data root.
train_path = '/aul/homes/amaha038/DeepLearning/Datasets/Card_Dataset_Kaggle/train/'
val_path = '/aul/homes/amaha038/DeepLearning/Datasets/Card_Dataset_Kaggle/valid/'

# Both splits go through the same preprocessing pipeline.
train_data, val_data = (
    CardDataset(split_dir, transform=transform)
    for split_dir in (train_path, val_path)
)

# Settings common to both loaders; only the training split is shuffled.
loader_kwargs = dict(batch_size=32, num_workers=4)
train_loader = DataLoader(train_data, shuffle=True, **loader_kwargs)
val_loader = DataLoader(val_data, shuffle=False, **loader_kwargs)

In [6]:
class PatchEmbedding(nn.Module):
    """Cut an image batch into square patches and linearly embed each patch.

    Args:
        in_channels: Number of channels ``c`` of the input images.
        patch_size: Side length of each square patch (used for both ``p1`` and ``p2``).
        emb_size: Output feature size of the linear projection.

    The ``projection`` attribute is a two-stage ``nn.Sequential``: an einops
    ``Rearrange`` that flattens every patch into a vector of length
    ``patch_size * patch_size * in_channels``, followed by an ``nn.Linear``
    mapping that vector to ``emb_size`` features.
    """

    def __init__(self, in_channels, patch_size, emb_size):
        super().__init__()
        self.patch_size = patch_size

        # Length of one flattened patch (pixels of the patch times channels).
        patch_vector_size = patch_size * patch_size * in_channels
        to_patches = Rearrange(
            'b c (h p1) (w p2) -> b (h w) (p1 p2 c)',
            p1=patch_size,
            p2=patch_size,
        )
        to_embedding = nn.Linear(patch_vector_size, emb_size)
        self.projection = nn.Sequential(to_patches, to_embedding)

    def forward(self, x: Tensor) -> Tensor:
        """Map ``[b, c, H, W]`` images to ``[b, num_patches, emb_size]`` tokens."""
        return self.projection(x)

"""Note on Rearrange from einops
Input size: x is [B, C, H, W]; say H = W = 224 and C = 3.
1. Say patch_size = 4.
   Then h = 224 // 4 = 56 and w = 224 // 4 = 56.
2. B C H W = b c (h p1) (w p2), because b = B, c = C, h*p1 = 224, w*p2 = 224.
3. Now reshape: b c (h p1) (w p2) -> b (h w) (p1 p2 c)
   What we get: (h w) = number of patches (flattened to one dimension)
                (p1 p2 c) = patch pixels and channels flattened into a vector = **PatchVectorSize**
4. So we get [B, 56*56, 4*4*C] = [B, 3136, 48] = [B, NumPatches, PatchVectorSize].
   We now have a sequence of 3136 flattened patches.
5. We have to project each patch to an embedding.
   Let emb_size be the embedding dimension.
   We apply nn.Linear(PatchVectorSize, emb_size).
   Finally, the output tensor has shape [Batch, NumPatches, emb_size].
"""

"Note on Rearrange from Einops\nInputsize = x: [B, C, H, W], let's say H = 224 and C = 3.\n1. Say, patch_size = 4\nNow, h = 224//4 = 56, w = 224//4 = 56\n2. B C H W = b c (h p1) (w p2), because b = B, c = C, h*p1 = 224, w*p2 = 224\n3. Now, Reshaping: b c (h p1) (w p2) -> b (h w) (p1 p2 c)\nWhat we get: (h w) = number of patches (flattened to one dimension)\n             (p1 p2 c): patch pixels + channels flattened into a vector = **PatchVectorSize**\n4. So, we get, [B, 3136, 4*4*C] = [B, NumPatches, PatchVectorSize]. \n    Now, we have sequence of 3136 flattened patches\n5. We have to project each patch to certain embedding.\n   Let's say the emb_size = embedding dimension\n   Now, we apply, nn.Linear(PatchVectorSize, emb_size).\n   Finally, each tensor will be of dimension = [Batch, NumPatches, emb_size]\n"

In [None]:
# NOTE: abandoned stub cell. A class header without a body is a SyntaxError on
# "Restart & Run All"; the working Attention module is defined in the following cell.

In [7]:
class Attention(nn.Module):
    """Multi-head self-attention over a batch-first token sequence.

    The input is passed through three separate linear maps (``q``, ``k``, ``v``)
    and the results are fed to an ``nn.MultiheadAttention`` module as query, key
    and value.

    Args:
        dim: Feature size of each token (also the attention embedding size).
        n_heads: Number of attention heads.
        dropout: Dropout probability handed to ``nn.MultiheadAttention``.
    """

    def __init__(self, dim, n_heads, dropout):
        super().__init__()
        self.n_heads = n_heads
        # Constructed without batch_first, so it consumes [seq_len, batch, dim].
        self.att = nn.MultiheadAttention(
            embed_dim=dim,
            num_heads=n_heads,
            dropout=dropout,
        )
        # Per-role linear maps applied before the attention module.
        self.q = nn.Linear(dim, dim)
        self.k = nn.Linear(dim, dim)
        self.v = nn.Linear(dim, dim)

    def forward(self, x):
        """Attend over ``x`` of shape ``[batch, seq_len, dim]``; output has the same layout."""
        # Swap to sequence-first layout for nn.MultiheadAttention.
        seq_first = x.transpose(0, 1)
        query = self.q(seq_first)
        key = self.k(seq_first)
        value = self.v(seq_first)
        # Index 0 is the attention output; the second element is discarded.
        attended = self.att(query, key, value)[0]
        # Back to batch-first for the caller.
        return attended.transpose(0, 1)


In [None]:
#Better readable multihead attention
"""
class Attention(nn.Module):
    def __init__(self, dim, n_heads, dropout):
        super().__init__()
        self.att = nn.MultiheadAttention(embed_dim=dim,
                                         num_heads=n_heads,
                                         dropout=dropout,
                                         batch_first=True) 

    def forward(self, x):
        # x: [batch, seq_len, dim] because batch_first=True
        attn_output, _ = self.att(x, x, x)  # Q=K=V=x
        return attn_output
"""

In [8]:
class PreNorm(nn.Module):
    """Apply ``nn.LayerNorm(dim)`` to the input, then call the wrapped ``fn``.

    Args:
        dim: Size of the normalised (last) feature dimension.
        fn: Callable or module invoked on the normalised tensor.
    """

    def __init__(self, dim, fn):
        super().__init__()
        self.norm = nn.LayerNorm(dim)
        self.fn = fn

    def forward(self, x, **kwargs):
        """Normalise ``x`` and forward it, with any keyword arguments, to ``fn``."""
        normalized = self.norm(x)
        return self.fn(normalized, **kwargs)

In [9]:
class FeedForward(nn.Sequential):
    """Two-layer MLP: Linear -> GELU -> Dropout -> Linear -> Dropout.

    Args:
        dim: Input and output feature size.
        hidden_dim: Feature size between the two linear layers.
        dropout: Dropout probability used after the activation and after the
            second linear layer (default ``0.``).
    """

    def __init__(self, dim, hidden_dim, dropout = 0.):
        # Build the stages first (same creation order), then register them in sequence.
        expand = nn.Linear(dim, hidden_dim)
        activation = nn.GELU()
        hidden_drop = nn.Dropout(dropout)
        contract = nn.Linear(hidden_dim, dim)
        output_drop = nn.Dropout(dropout)
        super().__init__(expand, activation, hidden_drop, contract, output_drop)

In [10]:
class ResidualAdd(nn.Module):
    """Residual (skip) connection around ``fn``: returns ``fn(x) + x``.

    Args:
        fn: Callable or module applied to the input; its output must be
            broadcast-compatible with the input for the addition.
    """

    def __init__(self, fn):
        super().__init__()
        self.fn = fn

    def forward(self, x, **kwargs):
        """Apply ``fn`` (forwarding keyword arguments) and add the input back.

        The sum is computed out of place. An in-place ``+=`` would write into
        the tensor returned by ``fn``; when ``fn`` hands back its own input
        (e.g. a pass-through module) that silently doubles the caller's tensor,
        and it can also invalidate values autograd saved for the backward pass.
        """
        return self.fn(x, **kwargs) + x

In [11]:
#credit: https://www.youtube.com/watch?v=j3VNqtJUoz0&list=PLcpLsgRAryqx-dwIuJ9tT6BxJu8__LUUW&index=3

class ViT(nn.Module):
    """Vision Transformer classifier assembled from the blocks defined in this notebook.

    Pipeline: patch embedding -> prepend a learnable class token -> add learnable
    positional embeddings -> ``n_layers`` pre-norm transformer blocks -> LayerNorm
    and Linear head applied to the class-token position.

    Args:
        ch: Number of input image channels.
        img_size: Image side length used to size the positional embedding.
        patch_size: Side length of each square patch.
        emb_dim: Token feature size used throughout the network.
        n_layers: Number of transformer blocks.
        out_dim: Size of the output logits (default 53, noted in the original
            code as matching the number of classes).
        dropout: Dropout probability for the attention and feed-forward blocks.
        heads: Number of attention heads per block.
    """

    def __init__(self, ch=3, img_size=224, patch_size=4, emb_dim=32,
                n_layers=6, out_dim=53, dropout=0.1, heads=2):
        super().__init__()

        # Bookkeeping attributes
        self.channels = ch
        self.height = img_size
        self.width = img_size
        self.patch_size = patch_size
        self.n_layers = n_layers

        # Image -> sequence of embedded patches
        self.patch_embedding = PatchEmbedding(in_channels=ch,
                                              patch_size=patch_size,
                                              emb_size=emb_dim)

        # Learnable parameters: one positional vector per patch plus one for the
        # class token (normal init), and the class token itself (uniform [0, 1)
        # init via torch.rand).
        patches_per_side = img_size // patch_size
        num_patches = patches_per_side ** 2
        self.pos_embedding = nn.Parameter(
            torch.randn(1, num_patches + 1, emb_dim))
        self.cls_token = nn.Parameter(torch.rand(1, 1, emb_dim))

        def make_block():
            # One encoder block: residual pre-norm attention, then residual
            # pre-norm feed-forward (hidden size equals emb_dim).
            attention = ResidualAdd(
                PreNorm(emb_dim, Attention(emb_dim, n_heads=heads, dropout=dropout)))
            feed_forward = ResidualAdd(
                PreNorm(emb_dim, FeedForward(emb_dim, emb_dim, dropout=dropout)))
            return nn.Sequential(attention, feed_forward)

        # Transformer encoder
        self.layers = nn.ModuleList([make_block() for _ in range(n_layers)])

        # Classification head
        self.head = nn.Sequential(nn.LayerNorm(emb_dim), nn.Linear(emb_dim, out_dim))


    def forward(self, img):
        """Return logits of shape ``[batch, out_dim]`` for an image batch ``img``."""
        # Patch embedding vectors: [batch, n_patches, emb_dim]
        tokens = self.patch_embedding(img)
        batch_size, n_patches, _ = tokens.shape

        # Prepend one copy of the class token per sample, then add positions.
        cls_tokens = repeat(self.cls_token, '1 1 d -> b 1 d', b=batch_size)
        tokens = torch.cat([cls_tokens, tokens], dim=1)
        tokens += self.pos_embedding[:, :(n_patches + 1)]

        # Transformer layers
        for block in self.layers:
            tokens = block(tokens)

        # Classify from the class-token position (index 0 of the sequence).
        cls_out = tokens[:, 0, :]
        return self.head(cls_out)

In [12]:
# Use the GPU when one is visible to PyTorch, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ViT with its default hyper-parameters; the trailing expression moves it to
# `device` and displays the module summary as the cell output.
model = ViT()
model.to(device)

ViT(
  (patch_embedding): PatchEmbedding(
    (projection): Sequential(
      (0): Rearrange('b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1=4, p2=4)
      (1): Linear(in_features=48, out_features=32, bias=True)
    )
  )
  (layers): ModuleList(
    (0-5): 6 x Sequential(
      (0): ResidualAdd(
        (fn): PreNorm(
          (norm): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
          (fn): Attention(
            (att): MultiheadAttention(
              (out_proj): NonDynamicallyQuantizableLinear(in_features=32, out_features=32, bias=True)
            )
            (q): Linear(in_features=32, out_features=32, bias=True)
            (k): Linear(in_features=32, out_features=32, bias=True)
            (v): Linear(in_features=32, out_features=32, bias=True)
          )
        )
      )
      (1): ResidualAdd(
        (fn): PreNorm(
          (norm): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
          (fn): FeedForward(
            (0): Linear(in_features=32, out_featu

In [13]:
# Training objective and optimiser.
LEARNING_RATE = 0.001
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE)

In [None]:
# Train for a fixed number of epochs, validating after each one and keeping two
# checkpoints: the weights with the best validation accuracy and the weights
# with the lowest validation loss.
epochs = 150
train_losses, val_losses = [], []
best_val_acc = 0.0
best_val_loss = float('inf')

# torch.save does not create missing parent directories; without this the run
# would fail at the first checkpoint, after a full training epoch.
os.makedirs('./weights', exist_ok=True)

for epoch in range(epochs):

    # ---- training phase ----
    model.train()
    running_loss = 0.0
    correct_train = 0
    total_train = 0

    for images, labels in tqdm(train_loader, desc='Training loop'):
        images, labels = images.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)

        # Predicted class = index of the largest logit.
        preds = outputs.argmax(dim=1)
        correct_train += (preds == labels).sum().item()
        total_train += labels.size(0)

        loss.backward()
        optimizer.step()
        # loss.item() is the batch mean; weight it by the batch size so the
        # epoch figure is a per-sample average even with a smaller last batch.
        running_loss += loss.item() * labels.size(0)
    train_loss = running_loss / len(train_loader.dataset)
    train_losses.append(train_loss)
    train_acc = correct_train / total_train

    # ---- validation phase ----
    model.eval()
    running_loss = 0.0
    correct_val = 0
    total_val = 0

    with torch.no_grad():
        for images, labels in tqdm(val_loader, desc='Validation loop'):
            images, labels = images.to(device), labels.to(device)

            outputs = model(images)
            loss = criterion(outputs, labels)

            preds = outputs.argmax(dim=1)
            correct_val += (preds == labels).sum().item()
            total_val += labels.size(0)
            running_loss += loss.item() * labels.size(0)
    val_loss = running_loss / len(val_loader.dataset)
    val_losses.append(val_loss)
    val_acc = correct_val / total_val

    print(f"Epoch {epoch+1}/{epochs} - Train loss: {train_loss}, Val Loss: {val_loss}. Train Acc.: {train_acc}, Val Acc.: {val_acc}")

    # Checkpoint on a strict improvement in validation accuracy.
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save(model.state_dict(), './weights/best_val_acc.pth')
        print(f"The model weight is saved based on val_acc: {best_val_acc:.4f}")

    # Checkpoint, independently, on a strict improvement in validation loss.
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), './weights/best_val_loss.pth')
        print(f"The model weight is saved based on val_loss: {best_val_loss:.4f}")

Training loop:   0%|          | 0/239 [00:00<?, ?it/s]

Validation loop:   0%|          | 0/9 [00:00<?, ?it/s]

Epoch 1/150 - Train loss: 3.946616685653158, Val Loss: 3.683026289490034. Train Acc.: 0.026888772298006295, Val Acc.: 0.05660377358490566
The model weight is saved based on val_acc: 0.0566
The model weight is saved based on val_loss: 3.6830


Training loop:   0%|          | 0/239 [00:00<?, ?it/s]

Validation loop:   0%|          | 0/9 [00:00<?, ?it/s]

Epoch 2/150 - Train loss: 3.3724719097081413, Val Loss: 2.833555831549303. Train Acc.: 0.12631164742917103, Val Acc.: 0.2528301886792453
The model weight is saved based on val_acc: 0.2528
The model weight is saved based on val_loss: 2.8336


Training loop:   0%|          | 0/239 [00:00<?, ?it/s]

Validation loop:   0%|          | 0/9 [00:00<?, ?it/s]

Epoch 3/150 - Train loss: 2.8426398863197, Val Loss: 2.4806122761852336. Train Acc.: 0.2296694648478489, Val Acc.: 0.27169811320754716
The model weight is saved based on val_acc: 0.2717
The model weight is saved based on val_loss: 2.4806


Training loop:   0%|          | 0/239 [00:00<?, ?it/s]

Validation loop:   0%|          | 0/9 [00:00<?, ?it/s]

Epoch 4/150 - Train loss: 2.586635190067111, Val Loss: 2.2428677536406605. Train Acc.: 0.26285414480587616, Val Acc.: 0.3320754716981132
The model weight is saved based on val_acc: 0.3321
The model weight is saved based on val_loss: 2.2429


Training loop:   0%|          | 0/239 [00:00<?, ?it/s]

Validation loop:   0%|          | 0/9 [00:00<?, ?it/s]

Epoch 5/150 - Train loss: 2.4479025858772014, Val Loss: 2.2033963743245826. Train Acc.: 0.287906610703043, Val Acc.: 0.2943396226415094
The model weight is saved based on val_loss: 2.2034


Training loop:   0%|          | 0/239 [00:00<?, ?it/s]

Validation loop:   0%|          | 0/9 [00:00<?, ?it/s]

Epoch 6/150 - Train loss: 2.342922113374548, Val Loss: 2.015141246453771. Train Acc.: 0.3074501573976915, Val Acc.: 0.30566037735849055
The model weight is saved based on val_loss: 2.0151


Training loop:   0%|          | 0/239 [00:00<?, ?it/s]

Validation loop:   0%|          | 0/9 [00:00<?, ?it/s]

Epoch 7/150 - Train loss: 2.256357228268106, Val Loss: 1.897922255408089. Train Acc.: 0.3277806925498426, Val Acc.: 0.3660377358490566
The model weight is saved based on val_acc: 0.3660
The model weight is saved based on val_loss: 1.8979


Training loop:   0%|          | 0/239 [00:00<?, ?it/s]

Validation loop:   0%|          | 0/9 [00:00<?, ?it/s]

Epoch 8/150 - Train loss: 2.1687494646464667, Val Loss: 1.869799207291513. Train Acc.: 0.34942287513116477, Val Acc.: 0.37358490566037733
The model weight is saved based on val_acc: 0.3736
The model weight is saved based on val_loss: 1.8698


Training loop:   0%|          | 0/239 [00:00<?, ?it/s]

Validation loop:   0%|          | 0/9 [00:00<?, ?it/s]

Epoch 9/150 - Train loss: 2.107718061925482, Val Loss: 1.8639586543137172. Train Acc.: 0.36621196222455404, Val Acc.: 0.3849056603773585
The model weight is saved based on val_acc: 0.3849
The model weight is saved based on val_loss: 1.8640


Training loop:   0%|          | 0/239 [00:00<?, ?it/s]

Validation loop:   0%|          | 0/9 [00:00<?, ?it/s]

Epoch 10/150 - Train loss: 2.059309301946746, Val Loss: 1.831354436334574. Train Acc.: 0.37985309548793283, Val Acc.: 0.4
The model weight is saved based on val_acc: 0.4000
The model weight is saved based on val_loss: 1.8314


Training loop:   0%|          | 0/239 [00:00<?, ?it/s]

Validation loop:   0%|          | 0/9 [00:00<?, ?it/s]

Epoch 11/150 - Train loss: 2.010642581571437, Val Loss: 1.7183841583863744. Train Acc.: 0.38903462749213014, Val Acc.: 0.39622641509433965
The model weight is saved based on val_loss: 1.7184


Training loop:   0%|          | 0/239 [00:00<?, ?it/s]

Validation loop:   0%|          | 0/9 [00:00<?, ?it/s]

Epoch 12/150 - Train loss: 1.9593258444434822, Val Loss: 1.7156534968682056. Train Acc.: 0.40752885624344176, Val Acc.: 0.41509433962264153
The model weight is saved based on val_acc: 0.4151
The model weight is saved based on val_loss: 1.7157


Training loop:   0%|          | 0/239 [00:00<?, ?it/s]

Validation loop:   0%|          | 0/9 [00:00<?, ?it/s]

Epoch 13/150 - Train loss: 1.9083947620011577, Val Loss: 1.6898194061135345. Train Acc.: 0.41408709338929695, Val Acc.: 0.4188679245283019
The model weight is saved based on val_acc: 0.4189
The model weight is saved based on val_loss: 1.6898


Training loop:   0%|          | 0/239 [00:00<?, ?it/s]

Validation loop:   0%|          | 0/9 [00:00<?, ?it/s]

Epoch 14/150 - Train loss: 1.8791028967934416, Val Loss: 1.7081445630991234. Train Acc.: 0.4226128016789087, Val Acc.: 0.4226415094339623
The model weight is saved based on val_acc: 0.4226


Training loop:   0%|          | 0/239 [00:00<?, ?it/s]

Validation loop:   0%|          | 0/9 [00:00<?, ?it/s]

Epoch 15/150 - Train loss: 1.8200002754847875, Val Loss: 1.6730261361823893. Train Acc.: 0.4420251836306401, Val Acc.: 0.4830188679245283
The model weight is saved based on val_acc: 0.4830
The model weight is saved based on val_loss: 1.6730


Training loop:   0%|          | 0/239 [00:00<?, ?it/s]

Validation loop:   0%|          | 0/9 [00:00<?, ?it/s]

Epoch 16/150 - Train loss: 1.787022522839269, Val Loss: 1.6448621286536163. Train Acc.: 0.4530430220356768, Val Acc.: 0.45660377358490567
The model weight is saved based on val_loss: 1.6449


Training loop:   0%|          | 0/239 [00:00<?, ?it/s]

Validation loop:   0%|          | 0/9 [00:00<?, ?it/s]

Epoch 17/150 - Train loss: 1.7497663985769747, Val Loss: 1.6406634182300208. Train Acc.: 0.4695697796432319, Val Acc.: 0.46037735849056605
The model weight is saved based on val_loss: 1.6407


Training loop:   0%|          | 0/239 [00:00<?, ?it/s]