In [None]:
#!/usr/bin/env python
# coding: utf-8

import numpy as np
import matplotlib.pyplot as plt
import os
import torch
import pathlib
from torch.utils.data import Dataset, DataLoader
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from torch.optim.lr_scheduler import ReduceLROnPlateau




: 

In [None]:
# Pick the compute device: CUDA when a GPU is visible to torch, otherwise the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# Fetch the brain-tumor MRI dataset via kagglehub and wrap the returned location in a Path
import kagglehub
path = kagglehub.dataset_download("masoudnickparvar/brain-tumor-mri-dataset")
data_dir = pathlib.Path(path)
print(f"Path to dataset files: {data_dir}")

# Preprocessing pipeline applied to every image, in order:
#   1. replicate the grayscale image over 3 channels (the ViT below defaults to in_channels=3)
#   2. resize to 224x224 pixels
#   3. convert to a tensor with values in [0, 1]
#   4. shift/scale each channel with (x - 0.5) / 0.5, i.e. to [-1, 1]
transform = transforms.Compose(
    [
        transforms.Grayscale(num_output_channels=3),
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
    ]
)

# Image-folder datasets: 'Training' is used for fitting, 'Testing' serves as the validation split
train_set = torchvision.datasets.ImageFolder(data_dir / 'Training', transform=transform)
val_set = torchvision.datasets.ImageFolder(data_dir / 'Testing', transform=transform)

# Batched loaders; only the training data is reshuffled
train_loader = DataLoader(dataset=train_set, batch_size=64, shuffle=True, num_workers=2)
val_loader = DataLoader(dataset=val_set, batch_size=64, shuffle=False, num_workers=2)


# Vision Transformer (ViT) Implementation
class PatchEmbedding(nn.Module):
    """Turn a batch of images into a sequence of patch embeddings.

    A single strided convolution both cuts the image into non-overlapping
    ``patch_size`` x ``patch_size`` tiles and linearly projects every tile to
    an ``embed_dim``-dimensional vector.

    Args:
        img_size: Side length of the (square) input image in pixels.
        patch_size: Side length of each square patch in pixels.
        in_channels: Number of channels of the input image.
        embed_dim: Length of the embedding vector produced for each patch.
    """

    def __init__(self, img_size=224, patch_size=16, in_channels=3, embed_dim=768):
        super().__init__()
        patches_per_side = img_size // patch_size  # 14 with the defaults
        self.num_patches = patches_per_side * patches_per_side  # 196 with the defaults
        # kernel_size == stride, so neighbouring kernel windows never overlap:
        # each output position corresponds to exactly one patch.
        self.projection = nn.Conv2d(
            in_channels=in_channels,
            out_channels=embed_dim,
            kernel_size=patch_size,
            stride=patch_size,
        )

    def forward(self, x):
        """Embed ``x`` and return a tensor of shape [batch, num_patches, embed_dim]."""
        # Split into patches and project them in one step:
        # [batch, embed_dim, rows_of_patches, cols_of_patches]
        feature_map = self.projection(x)
        # Collapse the two spatial axes into one patch axis: [batch, embed_dim, num_patches]
        channel_major = feature_map.flatten(start_dim=2)
        # Put the patch axis before the embedding axis so patches form a sequence
        return channel_major.transpose(1, 2)
    
# Step-by-step process implemented by PatchEmbedding:
#   (i)   Split the image into non-overlapping patches.
#   (ii)  Flatten each patch into a 1-D vector.
#   (iii) Project each patch to an embedding vector.
#   (iv)  Stack the patch embeddings into a sequence.


class PositionEmbedding(nn.Module):
    """Prepend a learnable [CLS] token and add learnable position embeddings.

    Args:
        num_patches: Number of patch tokens per image. The output sequence is
            one element longer because of the prepended [CLS] token.
        embed_dim: Size of each token embedding.
    """

    def __init__(self, num_patches, embed_dim):
        super().__init__()
        self.position_embedding = nn.Parameter(torch.randn(1, num_patches + 1, embed_dim))
        # The [CLS] token must be a registered parameter. Creating it inside
        # forward() produced a brand-new random tensor on every call, so it was
        # never optimized, never saved in the state_dict, and made inference
        # non-deterministic. As a parameter it also follows the module through
        # .to(device) instead of relying on a global `device` variable.
        self.cls_token = nn.Parameter(torch.randn(1, 1, embed_dim))

    def forward(self, x):
        """Return ``x`` with [CLS] prepended and position embeddings added.

        Args:
            x: Patch embeddings of shape [batch, num_patches, embed_dim].

        Returns:
            Tensor of shape [batch, num_patches + 1, embed_dim].
        """
        batch_size = x.size(0)
        # expand() shares the single token across the batch without copying it
        cls_token = self.cls_token.expand(batch_size, -1, -1)
        x = torch.cat((cls_token, x), dim=1)
        # The leading 1 of position_embedding broadcasts over the batch
        return x + self.position_embedding


class TransformerEncoderBlock(nn.Module):
    """Pre-norm Transformer encoder block.

    Applies multi-head self-attention and then a two-layer MLP, each preceded
    by LayerNorm and wrapped in a residual connection.

    Args:
        embed_dim: Size of each token embedding.
        num_heads: Number of attention heads.
        mlp_dim: Hidden width of the MLP.
        dropout: Dropout probability used inside attention and after the MLP.
    """

    def __init__(self, embed_dim, num_heads, mlp_dim, dropout=0.1):
        super().__init__()
        self.layernorm1 = nn.LayerNorm(embed_dim)
        # batch_first=True is required: the tokens arrive as
        # [batch, sequence, embed]. With the default (batch_first=False) the
        # layer reads the input as [sequence, batch, embed] and would attend
        # across the samples of a batch instead of across the patches of an image.
        self.attention = nn.MultiheadAttention(embed_dim, num_heads, dropout=dropout, batch_first=True)
        self.layernorm2 = nn.LayerNorm(embed_dim)

        self.mlp = nn.Sequential(
            nn.Linear(embed_dim, mlp_dim),
            nn.GELU(),
            nn.Linear(mlp_dim, embed_dim),
            nn.Dropout(dropout)
        )

    def forward(self, x):
        """Run the block on ``x`` of shape [batch, sequence, embed_dim]; the shape is preserved."""
        # Normalize once and reuse the result as query, key and value
        normed = self.layernorm1(x)
        # The attention weights are discarded, so do not ask for them
        attn_out, _ = self.attention(normed, normed, normed, need_weights=False)
        x = x + attn_out
        x = x + self.mlp(self.layernorm2(x))
        return x


class VisionTransformer(nn.Module):
    """Vision Transformer classifier.

    Pipeline: patch embedding -> [CLS] token + position embedding ->
    ``num_layers`` encoder blocks -> LayerNorm -> linear head applied to the
    first token of the sequence.

    Args:
        img_size: Side length of the (square) input image in pixels.
        patch_size: Side length of each square patch in pixels.
        in_channels: Number of input image channels.
        num_classes: Number of output logits.
        embed_dim: Token embedding size.
        num_heads: Attention heads per encoder block.
        mlp_dim: Hidden width of each encoder block's MLP.
        num_layers: Number of stacked encoder blocks.
        dropout: Dropout probability passed to every encoder block.
    """

    def __init__(self, img_size=224, patch_size=16, in_channels=3, num_classes=4, embed_dim=768, num_heads=8, mlp_dim=2048, num_layers=6, dropout=0.1):
        super().__init__()
        self.patch_embedding = PatchEmbedding(img_size, patch_size, in_channels, embed_dim)

        patches_per_side = img_size // patch_size
        self.num_patches = patches_per_side * patches_per_side
        self.position_embedding = PositionEmbedding(self.num_patches, embed_dim)

        # Stack of identical encoder blocks, applied one after another in forward()
        encoder_blocks = []
        for _ in range(num_layers):
            encoder_blocks.append(TransformerEncoderBlock(embed_dim, num_heads, mlp_dim, dropout))
        self.encoder = nn.ModuleList(encoder_blocks)

        self.layernorm = nn.LayerNorm(embed_dim)
        self.classifier = nn.Linear(embed_dim, num_classes)

    def forward(self, x):
        """Return class logits of shape [batch, num_classes] for the image batch ``x``."""
        tokens = self.position_embedding(self.patch_embedding(x))

        for encoder_block in self.encoder:
            tokens = encoder_block(tokens)

        normalized = self.layernorm(tokens)
        # Index 0 along the sequence axis is the token used for classification
        return self.classifier(normalized[:, 0])




In [None]:
# Build the ViT with its default hyperparameters and place it on the selected device
model = VisionTransformer()
model = model.to(device)

# Cross-entropy loss over the class logits, optimized with Adam
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

# Halve the learning rate once the monitored (minimized) metric has stalled for 5 epochs
scheduler = ReduceLROnPlateau(optimizer=optimizer, mode='min', factor=0.5, patience=5)


# Training Loop
num_epochs = 10
train_loss_list = []   # mean training loss per epoch
val_loss_list = []     # mean validation loss per epoch
accuracy_list = []     # validation accuracy (%) per epoch

for epoch in range(num_epochs):
    # ---- Training phase ----
    model.train()
    train_loss_sum = 0.0
    train_samples = 0
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()

        outputs = model(images)
        loss = criterion(outputs, labels)

        loss.backward()
        optimizer.step()

        # criterion returns the batch mean; weight it by the batch size so a
        # smaller final batch does not count as much as a full one.
        batch_size = labels.size(0)
        train_loss_sum += loss.item() * batch_size
        train_samples += batch_size

    # max(..., 1) avoids a ZeroDivisionError if the loader yielded nothing
    train_loss = train_loss_sum / max(train_samples, 1)
    train_loss_list.append(train_loss)

    # ---- Validation phase ----
    model.eval()
    correct = 0
    total = 0
    val_loss_sum = 0.0
    with torch.no_grad():
        for images, labels in val_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            loss = criterion(outputs, labels)

            batch_size = labels.size(0)
            val_loss_sum += loss.item() * batch_size

            # Predicted class = index of the largest logit. No `.data` needed:
            # gradients are already disabled inside torch.no_grad().
            predicted = outputs.argmax(dim=1)
            total += batch_size
            correct += (predicted == labels).sum().item()

    val_loss = val_loss_sum / max(total, 1)
    accuracy = 100 * correct / max(total, 1)
    val_loss_list.append(val_loss)
    accuracy_list.append(accuracy)

    # Let the plateau scheduler react to this epoch's validation loss
    scheduler.step(val_loss)

    print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Accuracy: {accuracy:.2f}%')


# Plot Loss and Accuracy
# Explicit figure/axes objects instead of the implicit pyplot state, and
# labelled axes so the figure can be read on its own.
fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(12, 4))

# Left panel: training vs. validation loss, one point per epoch
loss_ax.plot(train_loss_list, label='Train Loss')
loss_ax.plot(val_loss_list, label='Validation Loss')
loss_ax.set(title='Loss', xlabel='Epoch', ylabel='Loss')
loss_ax.legend()

# Right panel: validation accuracy in percent, one point per epoch
acc_ax.plot(accuracy_list, label='Accuracy')
acc_ax.set(title='Accuracy', xlabel='Epoch', ylabel='Accuracy (%)')
acc_ax.legend()

fig.tight_layout()
plt.show()
