In [26]:
import os
import random
from PIL import Image
from pathlib import Path
from timeit import default_timer as Timer
import matplotlib.pyplot as plt
import torch
from torch import nn
import torchvision
from torchvision import datasets, transforms
from torch.utils.data import Subset, DataLoader, ConcatDataset, Dataset
from torchinfo import summary
import wandb
import onnx
from tqdm.auto import tqdm
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

# Record the library versions this notebook was executed with.
print(torch.__version__, torchvision.__version__, sep="\n")

2.6.0+cpu
0.21.0+cpu


In [27]:
# Use the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

'cpu'

In [28]:
# Dataset root and its three split sub-folders (train / test / val).
data_path = Path("dataset/")
train_dataset_path, test_dataset_path, val_dataset_path = (
    data_path / split_name for split_name in ("train", "test", "val")
)

In [None]:
# Random augmentation belongs to the training split only. Applying random
# flips/rotations to the validation and test images makes their metrics noisy
# and different on every run, so those splits get a deterministic pipeline.
train_transform = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(15),
    transforms.ToTensor()
])

# Evaluation pipeline: tensor conversion only, no randomness.
eval_transform = transforms.Compose([
    transforms.ToTensor()
])

# Original name kept as an alias of the training pipeline for any code that
# still refers to `transform`.
transform = train_transform

train_data = datasets.ImageFolder(root= train_dataset_path,
                                  transform= train_transform,
                                  target_transform= None)
val_data = datasets.ImageFolder(root= val_dataset_path,
                                transform= eval_transform,
                                target_transform= None)
test_data = datasets.ImageFolder(root= test_dataset_path,
                                 transform= eval_transform)

train_data, val_data, test_data

(Dataset ImageFolder
     Number of datapoints: 84000
     Root location: dataset\train
     StandardTransform
 Transform: ToTensor(),
 Dataset ImageFolder
     Number of datapoints: 24000
     Root location: dataset\val
     StandardTransform
 Transform: ToTensor(),
 Dataset ImageFolder
     Number of datapoints: 12000
     Root location: dataset\test
     StandardTransform
 Transform: ToTensor())

In [39]:
# Look at one training sample to check the image tensor layout.
sample_image, _sample_label = train_data[1]
sample_image.shape

torch.Size([3, 32, 32])

In [55]:
class CNNBlock(nn.Module):
    """Convolutional feature extractor placed in front of the transformer.

    The stack is: 3x3 conv -> BN -> ReLU -> 3x3 conv -> BN -> ReLU ->
    2x2 max-pool -> 5x5 conv -> BN -> ReLU. No convolution uses padding, so
    the spatial size shrinks at every conv.

    Args:
        input_shape: Number of channels of the input image.
        hidden_units: Number of channels produced by the first two convs.
        output_shape: Number of channels produced by the last conv.
    """

    def __init__(self, input_shape: int, hidden_units: int, output_shape: int):
        super().__init__()

        def conv_bn_relu(channels_in, channels_out, kernel):
            # One un-padded convolution followed by batch norm and ReLU.
            return [
                nn.Conv2d(channels_in, channels_out, kernel_size=kernel),
                nn.BatchNorm2d(channels_out),
                nn.ReLU(),
            ]

        stages = []
        stages += conv_bn_relu(input_shape, hidden_units, (3, 3))
        stages += conv_bn_relu(hidden_units, hidden_units, (3, 3))
        stages.append(nn.MaxPool2d(kernel_size=(2, 2)))
        stages += conv_bn_relu(hidden_units, output_shape, (5, 5))
        self.Layer = torch.nn.Sequential(*stages)

    def forward(self, x):
        """Run the image batch through the convolutional stack."""
        features = self.Layer(x)
        return features

In [56]:
## CNNBlock output is (64, 10, 10)
# H, W, C: feature-map height, width and channels; P: patch edge length.
H, W, C, P = 10, 10, 64, 2
patch_area = P ** 2
# N: number of P x P patches tiling the map; D: flattened size of one patch.
N = int(H * W / patch_area)
D = patch_area * C
N, D

(25, 256)

In [65]:
class PatchEmbedding(nn.Module):
    """Split a feature map into non-overlapping patches and embed each one.

    A convolution whose kernel size and stride both equal ``patch_size``
    produces one ``embedding_dim``-sized vector per patch; the two spatial
    dimensions are then flattened into a single sequence dimension.

    Args:
        in_channels: Number of channels of the incoming feature map.
        patch_size: Edge length of one square patch.
        embedding_dim: Size of the vector produced for each patch.
    """

    def __init__(self,
                 in_channels: int,
                 patch_size: int,
                 embedding_dim: int) -> None:
        super().__init__()
        self.in_channels = in_channels
        self.patch_size = patch_size
        self.embedding_dim = embedding_dim
        self.patcher = nn.Conv2d(in_channels= in_channels,
                                 out_channels= embedding_dim,
                                 stride= patch_size,
                                 kernel_size= patch_size,
                                 padding= 0)
        self.flatten = nn.Flatten(start_dim= 2,
                                  end_dim= 3)

    def forward(self, x):
        """Return patch embeddings of shape (batch, num_patches, embedding_dim).

        Raises:
            AssertionError: If the height or the width of ``x`` is not a
                multiple of ``patch_size``.
        """
        height, width = x.shape[-2:]
        # Both spatial dimensions must be checked: the strided convolution
        # would otherwise silently drop the trailing rows or columns.
        assert (height % self.patch_size == 0 and width % self.patch_size == 0), \
            "image resolution should be divisible by patch size"
        x_patched = self.patcher(x)          # (batch, embedding_dim, H/P, W/P)
        x_flattened = self.flatten(x_patched)  # (batch, embedding_dim, num_patches)
        return x_flattened.permute(0,2,1)

In [66]:
class MultiHeadSelfAttentionBlock(nn.Module):
    """Layer norm followed by multi-head self-attention.

    The residual connection is not part of this block; the caller adds it.

    Args:
        embedding_dim: Size of each token vector.
        num_heads: Number of attention heads.
        att_dropout: Dropout probability passed to ``nn.MultiheadAttention``.
    """

    def __init__(self,
                 embedding_dim : int,
                 num_heads : int,
                 att_dropout : float):
        super().__init__()
        self.LayerNorm = nn.LayerNorm(normalized_shape=embedding_dim)
        self.MultiHeadAttention = nn.MultiheadAttention(
            embed_dim=embedding_dim,
            num_heads=num_heads,
            dropout=att_dropout,
            batch_first=True,
        )

    def forward(self, x):
        """Normalise ``x`` and attend over it, using it as query, key and value."""
        normed = self.LayerNorm(x)
        # Index 0 is the attended sequence; the weights slot is unused
        # because need_weights is False.
        return self.MultiHeadAttention(query=normed,
                                       key=normed,
                                       value=normed,
                                       need_weights=False)[0]

In [67]:
class MultiLayerPreceptronBlock(nn.Module):
    """Layer norm followed by a two-layer feed-forward network.

    The network is Linear -> GELU -> Dropout -> Linear -> Dropout and maps
    ``embedding_dim`` back to ``embedding_dim`` through a hidden width of
    ``mlp_size``. The residual connection is added by the caller.

    Args:
        embedding_dim: Size of each token vector (input and output width).
        mlp_size: Width of the hidden layer.
        dropout: Dropout probability used after each linear layer.
    """

    def __init__(self,
                 embedding_dim: int,
                 mlp_size: int,
                 dropout: float):
        super().__init__()
        self.LayerNorm = nn.LayerNorm(normalized_shape=embedding_dim)

        expand = nn.Linear(in_features=embedding_dim, out_features=mlp_size)
        project = nn.Linear(in_features=mlp_size, out_features=embedding_dim)
        self.MLP = nn.Sequential(
            expand,
            nn.GELU(),
            nn.Dropout(p=dropout),
            project,
            nn.Dropout(p=dropout),
        )

    def forward(self, x):
        """Normalise ``x`` and pass it through the feed-forward network."""
        return self.MLP(self.LayerNorm(x))

In [68]:
class TransformerEncoder(nn.Module):
    """One transformer encoder layer: self-attention then MLP, each with a residual.

    Args:
        embedding_dim: Size of each token vector.
        num_heads: Number of attention heads.
        mlp_size: Hidden width of the feed-forward network.
        attn_dropout: Dropout probability inside the attention block.
        mlp_dropout: Dropout probability inside the feed-forward block.
    """

    def __init__(self,
                 embedding_dim: int,
                 num_heads: int,
                 mlp_size: int,
                 attn_dropout: float,
                 mlp_dropout: float):
        super().__init__()
        self.MSA_Block = MultiHeadSelfAttentionBlock(embedding_dim= embedding_dim,
                                               num_heads= num_heads,
                                               att_dropout= attn_dropout)
        self.MLP_Block = MultiLayerPreceptronBlock(embedding_dim= embedding_dim,
                                             mlp_size= mlp_size,
                                             dropout= mlp_dropout)

    def forward(self, x):
        """Apply attention and MLP sub-layers, each wrapped in a skip connection.

        Each sub-layer runs exactly once per encoder layer. The previous
        version applied the same ``MSA_Block`` (same weights) a second time
        after the MLP, which is not part of a standard encoder layer.
        """
        x = self.MSA_Block(x) + x
        x = self.MLP_Block(x) + x
        return x

In [69]:
class ViTBlock(nn.Module):
    """Vision-transformer body: patch embedding, class token, positions, encoder stack.

    The block returns the full encoded token sequence; it has no
    classification head of its own.

    Args:
        image_size: Edge length of the (square) input feature map.
        in_channels: Number of channels of the input feature map.
        patch_size: Edge length of one square patch.
        num_transformer_layers: Number of ``TransformerEncoder`` layers stacked.
        embedding_dim: Size of each token vector.
        mlp_size: Hidden width of the feed-forward network in each layer.
        num_heads: Number of attention heads in each layer.
        attn_dropout: Dropout probability inside attention.
        mlp_dropout: Dropout probability inside the feed-forward network.
        embedding_dropout: Dropout probability applied to the embedded tokens.
        num_classes: Accepted for interface compatibility; not used by this block.

    Raises:
        AssertionError: If ``image_size`` is not a multiple of ``patch_size``.
    """

    def __init__(self,
                 image_size: int,
                 in_channels: int,
                 patch_size: int,
                 num_transformer_layers: int,
                 embedding_dim: int,
                 mlp_size: int,
                 num_heads: int,
                 attn_dropout: float,
                 mlp_dropout: float,
                 embedding_dropout: float,
                 num_classes: int = 2):
        super().__init__()

        # The message states the requirement that failed (it previously read
        # as if the condition held, and with the operands swapped).
        assert image_size % patch_size == 0, "image size must be divisible by patch size"

        # Exact integer arithmetic; divisibility is guaranteed by the assert above.
        self.num_patches = (image_size // patch_size) ** 2

        # Learnable token prepended to every sequence.
        self.class_embedding = nn.Parameter(torch.randn(1, 1, embedding_dim),
                                            requires_grad= True)

        # One learnable position vector per patch plus one for the class token.
        self.position_embedding = nn.Parameter(torch.randn(1, self.num_patches + 1, embedding_dim),
                                               requires_grad= True)

        self.patch_embedding = PatchEmbedding(in_channels= in_channels,
                                              patch_size= patch_size,
                                              embedding_dim= embedding_dim)

        self.embedding_dropout = nn.Dropout(p = embedding_dropout)

        self.transformerencoder = nn.Sequential(* [TransformerEncoder(embedding_dim= embedding_dim,
                                                     num_heads= num_heads,
                                                     mlp_size= mlp_size,
                                                     attn_dropout= attn_dropout,
                                                     mlp_dropout= mlp_dropout) for _ in range(num_transformer_layers)])


    def forward(self, x):
        """Encode ``x`` into a sequence of shape (batch, num_patches + 1, embedding_dim)."""
        batch_size = x.shape[0]

        # Broadcast the single class token to every item of the batch.
        class_token = self.class_embedding.expand(batch_size, -1, -1)

        x = self.patch_embedding(x)

        x = torch.cat((class_token, x), dim = 1)

        x = self.position_embedding + x

        x = self.embedding_dropout(x)

        x = self.transformerencoder(x)

        return x

In [74]:
class AttentionMechBlock(nn.Module):
    """Single-head scaled dot-product attention that pools a sequence to one vector.

    Queries, keys and values are linear projections of the input to ``units``
    features. The attended sequence is averaged over the sequence dimension
    and layer-normalised.

    Args:
        dim: Feature size of the incoming tokens.
        units: Feature size of the query/key/value projections and of the output.
    """

    def __init__(self, dim, units=128):
        super().__init__()
        self.query = nn.Linear(dim, units)
        self.key = nn.Linear(dim, units)
        self.value = nn.Linear(dim, units)
        self.LayerNorm = nn.LayerNorm(normalized_shape= units)

    def forward(self, x):
        """Return a (batch, units) summary of a (batch, seq, dim) input."""
        Q = self.query(x)
        K = self.key(x)
        V = self.value(x)
        # Scale by sqrt(d_k), the width of the keys (units). The input width
        # `dim` was used before, which only matches when dim == units.
        scale = K.size(-1) ** 0.5
        attn = torch.softmax(Q @ K.transpose(1,2) / scale, dim=-1)
        # Mean over the sequence dimension collapses the tokens to one vector.
        return self.LayerNorm((attn @ V).mean(dim=1))

In [75]:
class HybridModel(nn.Module):
    """CNN front-end, ViT encoder, attention pooling and a linear classifier.

    Args:
        image_size: Edge length of the feature map handed to the ViT block.
        in_channels: Channel count handed to the ViT block's patch embedding;
            the CNN block emits ``output_shape`` channels, so the two must agree.
        hidden_units: Channel count of the CNN block's first two convolutions.
        output_shape: Channel count produced by the CNN block.
        patch_size: Edge length of one square patch in the ViT block.
        num_transformer_layers: Number of encoder layers in the ViT block.
        embedding_dim: Token vector size inside the ViT block.
        mlp_size: Hidden width of the encoder feed-forward networks.
        num_heads: Number of attention heads per encoder layer.
        attn_dropout: Dropout probability inside encoder attention.
        mlp_dropout: Dropout probability inside the encoder MLPs and in the
            classifier head.
        embedding_dropout: Dropout probability applied to embedded tokens.
        units: Output width of the attention-pooling block.
        num_classes: Number of output logits.
    """

    def __init__(self,
                 image_size: int,
                 in_channels: int,
                 hidden_units: int,
                 output_shape: int,
                 patch_size: int,
                 num_transformer_layers: int,
                 embedding_dim: int,
                 mlp_size: int,
                 num_heads: int,
                 attn_dropout: float,
                 mlp_dropout: float,
                 embedding_dropout: float,
                 units: int = 128,
                 num_classes: int = 2):
        super().__init__()

        # The CNN front-end always takes 3-channel images.
        self.CNNBlock = CNNBlock(input_shape=3,
                                 hidden_units=hidden_units,
                                 output_shape=output_shape)

        vit_settings = {
            "image_size": image_size,
            "in_channels": in_channels,
            "patch_size": patch_size,
            "num_transformer_layers": num_transformer_layers,
            "embedding_dim": embedding_dim,
            "mlp_size": mlp_size,
            "num_heads": num_heads,
            "attn_dropout": attn_dropout,
            "mlp_dropout": mlp_dropout,
            "embedding_dropout": embedding_dropout,
            "num_classes": num_classes,
        }
        self.ViTBlock = ViTBlock(**vit_settings)

        self.AttentionMechBlock = AttentionMechBlock(dim=embedding_dim, units=units)

        head_layers = [
            nn.Flatten(),
            nn.Dropout(p=mlp_dropout),
            nn.Linear(in_features=units, out_features=num_classes),
        ]
        self.classifier = torch.nn.Sequential(*head_layers)

    def forward(self, x):
        """Run the four stages in order and return the classifier logits."""
        pipeline = (self.CNNBlock,
                    self.ViTBlock,
                    self.AttentionMechBlock,
                    self.classifier)
        for stage in pipeline:
            x = stage(x)
        return x



In [76]:
# Hyper-parameters of the hybrid network. image_size and in_channels describe
# the CNN block's (64, 10, 10) output, which is what the ViT block receives.
hybrid_model_kwargs = dict(
    image_size=10,
    in_channels=64,
    hidden_units=32,
    output_shape=64,
    patch_size=2,
    num_transformer_layers=5,
    embedding_dim=256,
    mlp_size=1024,
    num_heads=128,
    attn_dropout=0,
    mlp_dropout=0.1,
    embedding_dropout=0.1,
    units=128,
    num_classes=2,
)
Model = HybridModel(**hybrid_model_kwargs)

In [77]:
# Layer-by-layer summary for a batch of 32 three-channel 32x32 images.
summary_input_size = (32, 3, 32, 32)
summary(Model, input_size=summary_input_size)

Layer (type:depth-idx)                                  Output Shape              Param #
HybridModel                                             [32, 2]                   --
├─CNNBlock: 1-1                                         [32, 64, 10, 10]          --
│    └─Sequential: 2-1                                  [32, 64, 10, 10]          --
│    │    └─Conv2d: 3-1                                 [32, 32, 30, 30]          896
│    │    └─BatchNorm2d: 3-2                            [32, 32, 30, 30]          64
│    │    └─ReLU: 3-3                                   [32, 32, 30, 30]          --
│    │    └─Conv2d: 3-4                                 [32, 32, 28, 28]          9,248
│    │    └─BatchNorm2d: 3-5                            [32, 32, 28, 28]          64
│    │    └─ReLU: 3-6                                   [32, 32, 28, 28]          --
│    │    └─MaxPool2d: 3-7                              [32, 32, 14, 14]          --
│    │    └─Conv2d: 3-8                                 

In [78]:
# Adam over every model parameter.
optimizer = torch.optim.Adam(Model.parameters(), lr = 0.001)

# Halve the learning rate once the monitored loss has not improved for 5
# epochs. The scheduler only acts when `scheduler.step(<monitored loss>)` is
# called, once per epoch. `verbose=True` is no longer passed: the argument is
# deprecated since PyTorch 2.2; read the current rate with
# `scheduler.get_last_lr()` instead.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer= optimizer,
    mode= "min",
    factor= 0.5,
    patience= 5
)

# Cross-entropy over the raw logits produced by the classifier head.
loss_func = torch.nn.CrossEntropyLoss()



In [None]:
from Scripts import engine

wandb.init(project="AI_Image_Classification", name="Hybrid_model_01", settings=wandb.Settings(symlink=False))
wandb.config.epochs = 10
wandb.config.batch_size = 32

config = wandb.config

# Per-epoch history. The "test ..." entries are computed on the validation
# loader; the key names are kept as they were.
results = {
    "train loss": [],
    "train acc": [],
    "test loss": [],
    "test acc": []
}

train_dataloader = DataLoader(train_data,
                              batch_size= config.batch_size,
                              shuffle= True)
val_dataloader = DataLoader(val_data,
                            batch_size= config.batch_size)
test_dataloader = DataLoader(test_data,
                             batch_size= config.batch_size)

# One file name shared by the export and the artifact upload. It is built with
# pathlib so there are no backslash escapes and the path works on any OS.
onnx_path = Path("Models") / "hybridmodel.onnx"
onnx_path.parent.mkdir(parents=True, exist_ok=True)


def _to_scalar(value):
    """Return a plain Python number for a 0-dim tensor; pass other values through."""
    return value.item() if isinstance(value, torch.Tensor) else value


# Make sure the parameters live on the device the loops are told to use
# (a no-op when they already do).
Model.to(device)

for epoch in tqdm(range(config.epochs)):
    train_loss, train_acc, y_train_actual, y_train_predicted = engine.train_loop(model= Model,
                                                                                 train_dataloader= train_dataloader,
                                                                                 loss_fn= loss_func,
                                                                                 optimizer= optimizer,
                                                                                 device= device)
    test_loss, test_acc, y_test_actual, y_test_predicted = engine.test_loop(model= Model,
                                                                            test_dataloader= val_dataloader,
                                                                            loss_fn= loss_func,
                                                                            device= device)

    # Convert once so history, logging and the scheduler all see plain numbers.
    train_loss, train_acc, test_loss, test_acc = (
        _to_scalar(metric) for metric in (train_loss, train_acc, test_loss, test_acc)
    )

    # ReduceLROnPlateau does nothing unless it is fed the monitored loss.
    scheduler.step(test_loss)

    results["train loss"].append(train_loss)
    results["train acc"].append(train_acc)
    results["test loss"].append(test_loss)
    results["test acc"].append(test_acc)

    wandb.log({
        "epoch" : epoch + 1,
        "train_loss" : train_loss,
        "train_accuracy" : train_acc,
        "test_loss" : test_loss,
        "test_accuracy" : test_acc,
    })

    print(f"Epoch {epoch + 1}/{config.epochs}: train loss: {train_loss:.4f} |\ntrain accuracy: {train_acc:.4f} |\ntest loss: {test_loss:.4f} |\ntest accuracy: {test_acc:.4f}")

    # Overwrite the exported model after every epoch. The dummy input must be
    # 32x32 (the training resolution): the position embedding has a fixed
    # length, so any other resolution fails with a shape mismatch.
    torch.onnx.export(
        Model,
        torch.randn(1, 3, 32, 32, device=device),
        str(onnx_path),
        input_names = ["input"],
        output_names = ["output"],
    )

wandb.log_artifact(str(onnx_path), name= "Hybrid_model_01", type= "model")

print("Model training completed.")

In [None]:
# The report is a multi-line string: print it so the table renders instead of
# showing a repr full of "\n". The arrays are the ones left by the last epoch.
print(classification_report(y_train_actual, y_train_predicted))

In [None]:
# The report is a multi-line string: print it so the table renders instead of
# showing a repr full of "\n". The "test" arrays come from the last epoch's
# evaluation pass, which the training cell runs on the validation loader.
print(classification_report(y_test_actual, y_test_predicted))

In [None]:
# Confusion matrix for the training labels and predictions of the last epoch.
cm = confusion_matrix(y_true=y_train_actual, y_pred=y_train_predicted)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot();

In [None]:
# Confusion matrix for the last epoch's evaluation labels and predictions.
cm = confusion_matrix(y_true=y_test_actual, y_pred=y_test_predicted)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot();