In [1]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested per read() call while streaming a download.
CHUNK_SIZE = 40960
# Comma-separated "<directory>:<URL-encoded download URL>" entries, parsed by the
# download loop further down in this cell.
# NOTE(review): this is a signed link carrying X-Goog-Date=20240629T065839Z and
# X-Goog-Expires=259200, so it has presumably expired -- regenerate it before re-running.
DATA_SOURCE_MAPPING = 'tomatoleaf:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F619181%2F1105687%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240629%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240629T065839Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D4f407181187cab0d90fbef2ad0522f6f1f810c51102bb24fb1203bf7cdb7ba9118993269112a83a69e3c77f030104ce7e37987e1322f33b92d47deef8a2eb5688ef4f2139afa827532baf3f63a8b7d28d0d523f4990c020786d5a02444af2ff32c0aa6293ddb598e51e4d1cff1ccf84e7bf3fa0106091b43018760e8565f8b4ca732dc5632e7749c56b560caccf94ab6880b88887a5d9d1ffe34dbd2d5704df8daa1f0fb00c2719823ed9485b7a19e2434c5ab63221b8c813e794159daedc3a5e5c1953d5b5cb1933c6b5e509b27cb2625b271ce5bb253adebe6d6e35c3a9f69401cfd6a7685d8f4e4d0d0e662ea8767f96bb70db00b7f92c106ccb7a18b1c4a'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every configured data source and unpack it under KAGGLE_INPUT_PATH.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Each entry is "<directory>:<URL-encoded URL>"; split on the first ':' only so
    # that a literal colon inside the URL part cannot break the unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            # The header may be absent (e.g. chunked transfer); in that case the
            # progress bar simply stays empty instead of raising on int(None).
            total_bytes = int(total_length) if total_length else 0
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = min(50, int(50 * dl / total_bytes)) if total_bytes else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # Push buffered bytes to disk: the tar branch reopens the file by name
            # and would otherwise see a truncated archive.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Use a distinct name so the `tarfile` module is not rebound to the
                # opened archive object.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


Downloading tomatoleaf, 187559775 bytes compressed


Downloaded and uncompressed: tomatoleaf

Data source import complete.


In [2]:
!pip install einops --trusted-host pypi.org --trusted-host files.pythonhosted.org




In [3]:
import tensorflow as tf

In [4]:
# Report the GPU device name TensorFlow detects. This only prints the name; the
# PyTorch device actually used for training is selected separately.
device_name = tf.test.gpu_device_name()
print(f'Using device: {device_name}')

Using device: /device:GPU:0


In [5]:
import torch
# Select the PyTorch device: CUDA when torch reports it as available, otherwise CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [6]:
import torch
from torch import nn, optim
from torch.utils.data import DataLoader
from torchvision.datasets import ImageFolder
from torchvision import transforms
from torchvision.models import resnet50
from einops.layers.torch import Rearrange
from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, confusion_matrix, precision_recall_fscore_support

class HardDistillationLoss(nn.Module):
    """Hard-label distillation loss for a student with two output heads.

    The loss is the equally weighted sum of
      * cross-entropy between the student's first output and the true labels, and
      * cross-entropy between the student's second output and the teacher's
        arg-max prediction for the same inputs.

    Args:
        teacher: model whose predictions supply the hard labels. It is moved to
            ``device`` and kept in eval mode.
        n_classes: number of classes the student predicts.
        device: device on which the teacher and the projection layer live.

    NOTE(review): when the teacher does not already emit ``n_classes`` logits, its
    output is mapped through ``self.projection``, a randomly initialised linear
    layer that is only ever evaluated under ``torch.no_grad()`` and therefore never
    trained. The resulting teacher labels are an arbitrary (but fixed) function of
    the teacher logits; use a teacher fine-tuned on the target classes for
    meaningful distillation.
    """

    def __init__(self, teacher, n_classes=10, device='cuda'):
        super().__init__()
        self.teacher = teacher.to(device)
        # The teacher only provides targets, so keep dropout/batch-norm in inference mode.
        self.teacher.eval()
        self.criterion = nn.CrossEntropyLoss()
        self.n_classes = n_classes
        # Fallback mapping from 1000 teacher logits to the student's class range.
        self.projection = nn.Linear(1000, n_classes).to(device)

    def train(self, mode=True):
        """Switch the loss module's mode while leaving the teacher in eval mode."""
        super().train(mode)
        self.teacher.eval()
        return self

    def forward(self, inputs, outputs, labels):
        """Compute the distillation loss.

        Args:
            inputs: batch fed to the teacher.
            outputs: pair ``(class_logits, distillation_logits)`` from the student.
            labels: ground-truth class indices.

        Returns:
            Scalar tensor ``0.5 * base_loss + 0.5 * teacher_loss``.
        """
        class_logits, dist_logits = outputs[0], outputs[1]
        base_loss = self.criterion(class_logits, labels)

        with torch.no_grad():
            teacher_logits = self.teacher(inputs)
            # A teacher that already predicts the student's classes needs no
            # projection (and the 1000-input projection could not accept it).
            if teacher_logits.shape[-1] != self.n_classes:
                teacher_logits = self.projection(teacher_logits)
            teacher_labels = torch.argmax(teacher_logits, dim=1)
        teacher_loss = self.criterion(dist_logits, teacher_labels)

        return 0.5 * base_loss + 0.5 * teacher_loss



Patch Embedding with Class and Distillation Tokens

In [24]:
class PatchEmbedding(nn.Module):
    """Turn an image batch into a token sequence for the transformer.

    A strided convolution embeds each ``patch_size`` x ``patch_size`` patch, the
    result is flattened into a sequence, a class token and a distillation token
    are prepended, and learned position embeddings are added.

    Args:
        in_channels: number of input image channels.
        patch_size: side length of a patch (also the convolution stride).
        emb_size: embedding dimension of every token.
        img_size: side length of the input image; sizes the position table.
        device: device on which the layers and parameters are created.
    """

    def __init__(self, in_channels, patch_size, emb_size, img_size, device='cuda'):
        super().__init__()
        patchify = nn.Conv2d(in_channels, emb_size, kernel_size=patch_size, stride=patch_size)
        to_sequence = Rearrange('b e (h) (w) -> b (h w) e')
        self.projection = nn.Sequential(patchify, to_sequence).to(device)

        self.cls_token = nn.Parameter(torch.randn(1, 1, emb_size).to(device))
        self.dist_token = nn.Parameter(torch.randn(1, 1, emb_size).to(device))

        # One position per patch plus two for the class and distillation tokens.
        num_patches = (img_size // patch_size) ** 2
        self.positions = nn.Parameter(torch.randn(num_patches + 2, emb_size).to(device))

    def forward(self, x):
        """Return the embedded sequence: two special tokens followed by the patches."""
        batch = x.shape[0]
        patches = self.projection(x)
        special_tokens = (
            self.cls_token.expand(batch, -1, -1),
            self.dist_token.expand(batch, -1, -1),
        )
        tokens = torch.cat((*special_tokens, patches), dim=1)
        tokens += self.positions
        return tokens



In [25]:
class ResidualAdd(nn.Module):
    """Wrap a callable ``fn`` so that its output is added to its input."""

    def __init__(self, fn):
        super().__init__()
        self.fn = fn

    def forward(self, x, **kwargs):
        """Return ``x + fn(x, **kwargs)``."""
        branch_out = self.fn(x, **kwargs)
        return x + branch_out


In [26]:
class FeedForwardBlock(nn.Sequential):
    """Two-layer MLP: expand to ``expansion * emb_size``, GELU, dropout, project back.

    Args:
        emb_size: input and output feature size.
        expansion: multiplier for the hidden width.
        drop_p: dropout probability applied after the activation.
    """

    def __init__(self, emb_size, expansion=4, drop_p=0.):
        hidden_size = expansion * emb_size
        layers = [
            nn.Linear(emb_size, hidden_size),
            nn.GELU(),
            nn.Dropout(drop_p),
            nn.Linear(hidden_size, emb_size),
        ]
        super().__init__(*layers)


In [27]:
class TransformerEncoderBlock(nn.Sequential):
    """One pre-norm encoder layer: residual attention followed by a residual MLP.

    Args:
        emb_size: token embedding dimension.
        drop_p: dropout probability applied at the end of each residual branch.
        forward_expansion: hidden-width multiplier of the feed-forward block.
        forward_drop_p: dropout probability inside the feed-forward block.
        **kwargs: forwarded to ``MultiHeadAttention``.
    """

    def __init__(self, emb_size=768, drop_p=0., forward_expansion=4, forward_drop_p=0., **kwargs):
        # Build the attention branch first so parameters are created in the same order.
        attention_branch = nn.Sequential(
            nn.LayerNorm(emb_size),
            MultiHeadAttention(emb_size, **kwargs),
            nn.Dropout(drop_p),
        )
        feed_forward_branch = nn.Sequential(
            nn.LayerNorm(emb_size),
            FeedForwardBlock(emb_size, expansion=forward_expansion, drop_p=forward_drop_p),
            nn.Dropout(drop_p),
        )
        super().__init__(ResidualAdd(attention_branch), ResidualAdd(feed_forward_branch))


In [36]:
class DeiT(nn.Module):
    """Vision transformer student with a class head and a distillation head.

    The model chains ``PatchEmbedding`` -> ``TransformerEncoder`` ->
    ``ClassificationHead``. Whatever ``ClassificationHead`` returns is returned
    unchanged (a pair of logits in training mode, their average in eval mode).

    Args:
        in_channels: number of input image channels.
        patch_size: side length of each image patch.
        emb_size: token embedding dimension.
        img_size: side length of the input images.
        depth: number of encoder blocks.
        n_classes: number of output classes.
        device: device on which every sub-module is created.
    """

    def __init__(self, in_channels, patch_size, emb_size, img_size, depth, n_classes, device):
        super().__init__()
        # Forward `device` explicitly: PatchEmbedding otherwise falls back to its
        # own 'cuda' default and fails while constructing on CPU-only machines.
        self.patch_embedding = PatchEmbedding(
            in_channels, patch_size, emb_size, img_size, device=device
        ).to(device)
        # emb_size is passed by keyword so it reaches every TransformerEncoderBlock.
        self.transformer_encoder = TransformerEncoder(depth, emb_size=emb_size).to(device)
        self.classification_head = ClassificationHead(emb_size, n_classes, device).to(device)

    def forward(self, x):
        """Embed the images, run the encoder, and apply the classification head."""
        x = self.patch_embedding(x)
        x = self.transformer_encoder(x)
        return self.classification_head(x)


In [37]:
# Model and data hyperparameters shared by the cells below.
img_size = 224      # images are resized to img_size x img_size
batch_size = 16     # samples per DataLoader batch
in_channels = 3     # input channels passed to the patch-embedding convolution
patch_size = 16     # side length of one image patch
emb_size = 768      # token embedding dimension
depth = 12          # number of transformer encoder blocks
n_classes = 10      # number of classes predicted by the student

# Check the range of your labels
def check_labels(labels, num_classes=None):
    """Return True when every label lies in ``[0, num_classes)``.

    Args:
        labels: tensor of integer class indices.
        num_classes: exclusive upper bound for valid labels; defaults to the
            notebook-level ``n_classes`` when omitted.

    Returns:
        ``True`` if all labels are valid. Otherwise the offending tensor is
        printed and ``False`` is returned.
    """
    if num_classes is None:
        num_classes = n_classes
    if torch.any(labels < 0) or torch.any(labels >= num_classes):
        print("Error: Found labels out of range!")
        print(f"Labels: {labels}")
        return False
    return True

def _build_transform():
    """Resize to the model input size, convert to a tensor, and normalize per channel."""
    steps = [
        transforms.Resize((img_size, img_size)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]
    return transforms.Compose(steps)


# Training and validation currently share the same preprocessing pipeline.
train_transform = _build_transform()
val_transform = _build_transform()

# One sub-folder per class under each split directory (ImageFolder layout).
train_dataset = ImageFolder('/kaggle/input/tomatoleaf/tomato/train', transform=train_transform)
val_dataset = ImageFolder('/kaggle/input/tomatoleaf/tomato/val', transform=val_transform)

# Only the training loader shuffles; both load in the main process.
_loader_options = dict(batch_size=batch_size, num_workers=0)
train_dataloader = DataLoader(train_dataset, shuffle=True, **_loader_options)
val_dataloader = DataLoader(val_dataset, shuffle=False, **_loader_options)


In [38]:
class TransformerEncoder(nn.Sequential):
    """Stack of ``depth`` encoder blocks, each built with the same ``**kwargs``."""

    def __init__(self, depth=12, **kwargs):
        blocks = []
        for _ in range(depth):
            blocks.append(TransformerEncoderBlock(**kwargs))
        super().__init__(*blocks)


Multi-Head Attention and Classification Head

In [39]:
import torch.nn.functional as F

class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product self-attention.

    Args:
        emb_size: token embedding dimension; must be divisible by ``num_heads``.
        num_heads: number of attention heads.
        dropout: dropout probability applied to the attention weights.

    Raises:
        ValueError: if ``emb_size`` is not divisible by ``num_heads``.
    """

    def __init__(self, emb_size=768, num_heads=8, dropout=0.):
        super().__init__()
        if emb_size % num_heads != 0:
            raise ValueError(
                f"emb_size ({emb_size}) must be divisible by num_heads ({num_heads})"
            )
        self.emb_size = emb_size
        self.num_heads = num_heads
        self.head_dim = emb_size // num_heads
        # A single linear layer produces queries, keys and values in one pass.
        self.qkv = nn.Linear(emb_size, emb_size * 3)
        self.att_drop = nn.Dropout(dropout)
        self.projection = nn.Linear(emb_size, emb_size)

    def forward(self, x, mask=None):
        """Apply self-attention to ``x`` of shape ``(batch, tokens, emb_size)``.

        Args:
            x: input token sequence.
            mask: optional tensor broadcastable to the attention scores; positions
                where ``mask == 0`` are excluded from attention.

        Returns:
            Tensor with the same shape as ``x``.
        """
        batch, seq_len = x.shape[0], x.shape[1]

        def split_heads(t):
            # (batch, tokens, emb) -> (batch, heads, tokens, head_dim)
            return t.view(batch, seq_len, self.num_heads, self.head_dim).transpose(1, 2)

        queries, keys, values = (split_heads(t) for t in self.qkv(x).chunk(3, dim=-1))

        # Scale by sqrt(head_dim): the dot product is taken over head_dim features.
        scores = torch.einsum('bhqd, bhkd -> bhqk', queries, keys) / (self.head_dim ** 0.5)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, float('-inf'))
        attn = self.att_drop(F.softmax(scores, dim=-1))

        # The key axis of the weights must be contracted against the token axis of
        # the values (same index letter). Using two different letters would sum
        # them independently and discard the attention weights entirely.
        out = torch.einsum('bhqk, bhkd -> bhqd', attn, values)
        out = out.transpose(1, 2).contiguous().view(batch, seq_len, self.emb_size)
        return self.projection(out)


In [40]:
class ClassificationHead(nn.Module):
    """Two linear heads reading the first two tokens of the encoded sequence.

    Token 0 feeds ``head`` and token 1 feeds ``dist_head``.

    Args:
        emb_size: token embedding dimension.
        n_classes: number of output classes.
        device: device on which both linear layers are created.
    """

    def __init__(self, emb_size=768, n_classes=10, device='cuda'):
        super().__init__()
        self.head = nn.Linear(emb_size, n_classes).to(device)
        self.dist_head = nn.Linear(emb_size, n_classes).to(device)

    def forward(self, x):
        """Return both logits while training, or their mean in eval mode."""
        class_logits = self.head(x[:, 0])
        distill_logits = self.dist_head(x[:, 1])
        if not self.training:
            return (class_logits + distill_logits) / 2
        return class_logits, distill_logits



In [41]:
import gc
from torchvision.models import resnet50, ResNet50_Weights

# Optimisation hyperparameters.
learning_rate = 1e-4
num_epochs = 10

# Define device first to be used in model instantiation
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Release memory left over from earlier runs of this cell.
gc.collect()
torch.cuda.empty_cache()

# The teacher only supplies distillation targets: keep it in eval mode and freeze
# its weights so no gradients are ever tracked for it.
# NOTE(review): these are the stock pretrained weights, not fine-tuned on this dataset.
teacher = resnet50(weights=ResNet50_Weights.DEFAULT)
teacher.eval()
for _teacher_param in teacher.parameters():
    _teacher_param.requires_grad_(False)
teacher.to(device)  # Move the teacher model to the selected device

# Pass device to the DeiT constructor so model is created on the correct device
student = DeiT(in_channels, patch_size, emb_size, img_size, depth, n_classes, device)
optimizer = optim.Adam(student.parameters(), lr=learning_rate)
# Pass n_classes explicitly so the loss stays in sync with the student instead of
# relying on the loss's default of 10.
criterion = HardDistillationLoss(teacher, n_classes=n_classes, device=device)

In [35]:
import torch
# Print the installed PyTorch version so the run environment is recorded.
print(torch.__version__)


2.3.0+cu121


Training Loop

In [None]:
# Training loop
# Largest number of samples pushed through the student in a single forward pass.
MICRO_BATCH_SIZE = 16

for epoch in range(num_epochs):
    student.train()
    total_loss = 0.0
    total_correct = 0

    for inputs, labels in tqdm(train_dataloader, desc=f'Epoch {epoch + 1}/{num_epochs}'):
        inputs, labels = inputs.to(device), labels.to(device)

        # The teacher is evaluated inside `criterion`, so no separate teacher
        # forward pass is needed here.
        if inputs.size(0) > MICRO_BATCH_SIZE:
            # Run oversized batches in chunks and stitch the per-head outputs together.
            chunk_outputs = [student(chunk) for chunk in torch.split(inputs, MICRO_BATCH_SIZE)]
            outputs = tuple(torch.cat(parts, dim=0) for parts in zip(*chunk_outputs))
        else:
            # In training mode the student returns (class_logits, distillation_logits).
            outputs = tuple(student(inputs))

        # Compute loss
        loss = criterion(inputs, outputs, labels)

        # Backward pass and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        # Accuracy is measured on the class head only.
        _, predicted = torch.max(outputs[0], 1)
        total_correct += (predicted == labels).sum().item()

    # Print average loss and accuracy for the epoch
    avg_loss = total_loss / len(train_dataloader)
    accuracy = total_correct / len(train_dataset)
    print(f'Epoch {epoch + 1}/{num_epochs}, Average Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}')

# Save the trained model
torch.save(student.state_dict(), 'deit_student.pth')

Epoch 1/10:  45%|████▍     | 279/625 [02:59<03:39,  1.58it/s]

Evaluation

In [None]:
# Evaluate the student on the validation split; in eval mode it returns the mean
# of its two heads' logits.
student.eval()

y_true, y_pred = [], []

with torch.no_grad():
    for inputs, labels in tqdm(val_dataloader, desc='Evaluation'):
        inputs = inputs.to(device)
        labels = labels.to(device)

        outputs = student(inputs)
        predicted = torch.max(outputs, 1)[1]

        y_true.extend(labels.cpu().numpy())
        y_pred.extend(predicted.cpu().numpy())

# Aggregate metrics; precision, recall and F1 are macro-averaged over classes.
accuracy = accuracy_score(y_true, y_pred)
cm = confusion_matrix(y_true, y_pred)
precision, recall, f1_score, _ = precision_recall_fscore_support(
    y_true, y_pred, average='macro'
)

print(f'Validation Accuracy: {accuracy:.4f}')
print(f'Confusion Matrix:\n{cm}')
print(f'Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1_score:.4f}')


# Confusion-matrix heatmap (rows: actual class, columns: predicted class).
fig, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')
plt.show()

