In [10]:
 import torch
import torchvision
from pathlib import Path

In [3]:
import matplotlib.pyplot as plt
import torch
import torchvision
import torchinfo
from torch import nn
from torchvision import transforms

In [11]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device
     

'cuda'

In [12]:
data_path = Path("data2/")

In [13]:
image_path = data_path / "pizza_steak_sushi"
train_dir = image_path / "train"
test_dir = image_path / "test"

In [14]:
#Transforms
data_transform = transforms.Compose([
    transforms.Resize(size=(224,224)),
    # transforms.RandomHorizontalFlip(p=0.5),
    transforms.ToTensor()
])

In [15]:
from torchvision import datasets
train_data = datasets.ImageFolder(train_dir, transform=data_transform, target_transform=None)
test_data = datasets.ImageFolder(test_dir, transform=data_transform, target_transform=None)


In [16]:
from torch.utils.data import DataLoader
import os

In [17]:
train_dataloader = DataLoader(dataset=train_data, num_workers=os.cpu_count(), batch_size=32, shuffle=True)
test_dataloader = DataLoader(dataset=test_data, num_workers=os.cpu_count(), batch_size=32, shuffle=True)

In [18]:
#Patch Embeddings

In [29]:
class PatchEmbeddings(nn.Module):
    
    def __init__(self, 
                 in_channels: int=3, 
                 embeddings_dimensions: int=768,
                 patch_size: int=16):
        super().__init__()
        
        self.patch_size = patch_size
        self.patched_embeddings = nn.Conv2d(in_channels=in_channels, out_channels=embeddings_dimensions, stride=patch_size, padding=0, kernel_size=patch_size)
        
        self.flatten_embeddings = nn.Flatten(self.patched_embeddings, start_dim=2, end_dim=3)
        
        def forward(self, x):
            image_resolution = x.shape[-1]
            assert image_resolution % patch_size == 0, f"Input image size must be divisible by patch size, image shape: {image_resolution}, patch size: {self.patch_size}"

            x_patched = self.patched_embeddings(x)
            x_flatten = self.flatten(x_patched)
            return x_flatten.permute(0,2,1)

In [34]:
class MultiHeadSelfAttentionBlock(nn.Module):
    
    def __init__(
        self,
        num_heads: int=12,
        embeddings_dimension: int=768,
        attn_dropout: int=0
    ):
        super().__init__()
        
        self.layer_norm = nn.LayerNorm(embeddings_dimension)
        self.multi_attn_layer = nn.MultiheadAttention(num_heads=num_heads, attn_dropout=attn_dropout, batch_first=True)
        
    def forward(self, x):
        x = self.layer_norm(x)
        attn_output, _  = self.multi_attn_layer(query=x, key=x, value=x, need_weights=False)
        return attn_output
        
        

In [35]:
class MLPBlock(nn.Module):
    def __init__(
        self,
        embeddings_dimension: int=768,
        dropout: int=0.1,
        mlp_size:  int=3072
    ):
        super().__init__()
        
        self.layer_norm = nn.LayerNorm(embeddings_dimension=embeddings_dimension),
        
        self.mlp = nn.Sequential(
            
        nn.Linear(in_features=embeddings_dimension, out_features=mlp_size),
        nn.GELU(),
        nn.Dropout(dropout=dropout),
        
        nn.Linear(in_features=mlp_size, out_features=embeddings_dimension),
        nn.GELU(),
        nn.Dropout(dropout=dropout)
        )
        
    def forward(self, x):
        x = self.layer_norm(x),
        x = self.mlp(x)
        
        return x

In [36]:
class TransfornmerEncoderBlock(nn.Module):
    def __init__(
        self,
        num_heads: int=12,
        embeddings_dimension: int=768,
        dropout: int=0.1,
        mlp_size:  int=3072,
        attn_dropout: int=0
    ):
        super().__init__()
        
        self.msa_layer = MultiHeadSelfAttentionBlock(num_heads=num_heads, embeddings_dimension=embeddings_dimension, attn_dropout=attn_dropout)
        
        self.mlp_block = MLPBlock(dropout=dropout, embeddings_dimension=embeddings_dimension, mlp_size=mlp_size)
        
    def forward(self, x):
        x = self.msa_layer(x) + x
        x = self.mlp_block(x) + x
        
        return x

In [37]:
encoder = TransfornmerEncoderBlock()


TypeError: MultiheadAttention.__init__() got an unexpected keyword argument 'attn_dropout'

In [28]:
from torchinfo import summary

summary(model=encoder,
        input_size=(1, 197, 768), # (batch_size, number_of_patches, embedding_dimension)
        col_names=["input_size", "output_size", "num_params", "trainable"],
        col_width=20,
        row_settings=["var_names"])

NameError: name 'encoder' is not defined