In [None]:
# Install necessary packages
!pip install ftfy regex tqdm
!pip install git+https://github.com/openai/CLIP.git
!pip install scipy

import clip
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from torchvision import transforms
from PIL import Image
import os
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import shutil

Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-chkumfvc
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-chkumfvc
  Resolved https://github.com/openai/CLIP.git to commit dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1
  Preparing metadata (setup.py) ... [?25l[?25hdone


In [None]:
# Check PyTorch version
import packaging
version = packaging.version.parse(torch.__version__)
if version > packaging.version.parse('1.7.0'):
    print("Pytorch version is above 1.7.0")
    print("It is version:", version)
else:
    print("PyTorch version is not above 1.7.0. Please Upgrade")

Pytorch version is above 1.7.0
It is version: 2.6.0+cu124


In [None]:
# Load CLIP model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model, preprocess = clip.load("ViT-B/32", device=device)

In [None]:
# Unfreeze more layers from CLIP
for name, param in model.named_parameters():
    if "visual" in name:
        param.requires_grad = True

In [None]:
class TransformerHead(nn.Module):
    def __init__(self, input_dim, output_dim=512, num_heads=8, num_layers=2, dropout_prob=0.2):
        super(TransformerHead, self).__init__()
        self.attn_layer = nn.MultiheadAttention(embed_dim=input_dim, num_heads=num_heads, batch_first=True)
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=input_dim, nhead=num_heads, dropout=dropout_prob),
            num_layers=num_layers
        )
        self.fc_img = nn.Linear(input_dim, output_dim)
        self.fc_txt = nn.Linear(input_dim, output_dim)
        self.layer_norm = nn.LayerNorm(input_dim)
        self.dropout = nn.Dropout(dropout_prob)

    def forward(self, image_features, text_features):
        features = torch.cat((image_features, text_features), dim=-1)  # Concatenate image + text features
        features = features.unsqueeze(1)  # Add batch dimension for attention
        attn_output, _ = self.attn_layer(features, features, features)  # Attention layer
        attn_output = self.dropout(attn_output)  # Apply dropout to attention output
        transformer_output = self.transformer(attn_output)  # Transformer layer
        transformer_output = transformer_output.squeeze(1)  # Remove sequence length dimension

        transformer_output = self.dropout(transformer_output)  # Dropout applied on the transformer output
        img_out = self.fc_img(transformer_output)  # Image branch
        txt_out = self.fc_txt(transformer_output)  # Text branch
        return img_out, txt_out

    def encode_image_only(self, image_features):
        # Combine with dummy text input
        dummy_text = torch.zeros_like(image_features)  # (B, 512)
        features = torch.cat((image_features, dummy_text), dim=-1)  # (B, 1024)
        features = features.unsqueeze(1)  # (B, 1, 1024)

        # Pass through attention and transformer
        attn_output, _ = self.attn_layer(features, features, features)
        attn_output = self.dropout(attn_output)  # Apply dropout to attention output
        transformer_output = self.transformer(attn_output).squeeze(1)
        transformer_output = self.layer_norm(transformer_output)  # (B, 1024)

        # Output 512-dim embedding from the image branch
        img_out = self.fc_img(transformer_output)  # (B, 512)
        return img_out


# DATA HERE!!!!

In [None]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("amirmakir/dogs-dataset")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/dogs-dataset


In [None]:
dataset_root = os.path.join(path, 'dogs', 'train')

In [None]:
import os
import shutil
from sklearn.model_selection import train_test_split

# Define paths
dataset_root = os.path.join(path, 'dogs', 'train')  # Path to your "train" folder
val_root = "/kaggle/working/dogs/val"  # Path to save the validation split (working directory)

# Create the validation root folder if it doesn't exist
if not os.path.exists(val_root):
    os.makedirs(val_root)

# Split data within each class folder
for class_name in os.listdir(dataset_root):
    class_folder = os.path.join(dataset_root, class_name)
    if os.path.isdir(class_folder):
        # Get list of image files in the class folder
        image_files = [f for f in os.listdir(class_folder) if os.path.isfile(os.path.join(class_folder, f))]

        # Split the images into training and validation sets
        train_files, val_files = train_test_split(image_files, test_size=0.2, random_state=42)

        # Create corresponding folders in the validation directory
        val_class_folder = os.path.join(val_root, class_name)
        if not os.path.exists(val_class_folder):
            os.makedirs(val_class_folder)

        # Copy validation images to the validation folder
        for val_image in val_files:
            src = os.path.join(class_folder, val_image)
            dst = os.path.join(val_class_folder, val_image)
            shutil.copy(src, dst)  # Use copy instead of move

# After this, the validation set should be created in /kaggle/working/dogs/val


In [None]:
from torchvision import datasets
from torch.utils.data import DataLoader
import clip
import os

# Define the transformation for CLIP preprocessing (same as when we loaded the model)
# CLIP preprocess automatically resizes, normalizes, and converts to tensor
train_transform = preprocess
val_transform = preprocess

# Create datasets for train and validation using ImageFolder
train_dataset = datasets.ImageFolder(root=dataset_root, transform=train_transform)
val_dataset = datasets.ImageFolder(root=val_root, transform=val_transform)

# Create DataLoaders for train and validation sets
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

In [None]:
class_names = train_dataset.classes
print(class_names)

['Afghan_hound', 'Blenheim_spaniel', 'Chihuahua', 'Japanese_spaniel', 'Maltese_dog', 'Pekinese', 'Rhodesian_ridgeback', 'Shih_Tzu', 'papillon', 'toy_terrier']


In [None]:
def generate_text_prompts(labels, class_names):
    return [f"A photo of a {class_names[label]}" for label in labels]

In [None]:
input_dim = 512 * 2
transformer_head = TransformerHead(input_dim=input_dim, output_dim=512).to(device)

In [None]:
# Initialize optimizer, scaler, and scheduler
optimizer = torch.optim.AdamW(transformer_head.parameters(), lr=1e-6, weight_decay=1e-4)
scaler = torch.cuda.amp.GradScaler(enabled=True)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=20, eta_min=1e-6)

# CLIP loss function
def clip_loss(image_features, text_features):
    logits_per_image = image_features @ text_features.T
    logits_per_text = logits_per_image.T
    labels = torch.arange(image_features.size(0), device=image_features.device)
    loss_img = F.cross_entropy(logits_per_image, labels)
    loss_txt = F.cross_entropy(logits_per_text, labels)
    return (loss_img + loss_txt) / 2

# Training loop with early stopping and learning rate scheduler
num_epochs = 10
best_val_loss = float('inf')
patience = 5
epochs_without_improvement = 0

for epoch in range(num_epochs):
    transformer_head.train()
    running_train_loss = 0.0

    for images, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs} (Train)"):
        images = images.to(device)
        labels = labels.to(device)
        texts = [f"A photo of a {train_dataset.classes[label]}" for label in labels]
        texts = clip.tokenize(texts).to(device)

        with torch.cuda.amp.autocast():
            image_features = model.encode_image(images)
            text_features = model.encode_text(texts)
            img_out, txt_out = transformer_head(image_features, text_features)
            loss = clip_loss(img_out, txt_out)

        optimizer.zero_grad()
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        running_train_loss += loss.item()

    avg_train_loss = running_train_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{num_epochs} - Training Loss: {avg_train_loss:.4f}")

    # Validation phase
    transformer_head.eval()
    running_val_loss = 0.0

    with torch.no_grad():
        for images, labels in tqdm(val_loader, desc=f"Epoch {epoch+1}/{num_epochs} (Val)"):
            images = images.to(device)
            labels = labels.to(device)
            texts = [f"A photo of a {train_dataset.classes[label]}" for label in labels]
            texts = clip.tokenize(texts).to(device)

            image_features = model.encode_image(images)
            text_features = model.encode_text(texts)
            loss = clip_loss(image_features, text_features)
            running_val_loss += loss.item()

    avg_val_loss = running_val_loss / len(val_loader)
    print(f"Epoch {epoch+1}/{num_epochs} - Validation Loss: {avg_val_loss:.4f}")

    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        epochs_without_improvement = 0
        print(f"Validation loss improved. Saving model...")
        torch.save(transformer_head.state_dict(), 'best_transformer_head.pth')
    else:
        epochs_without_improvement += 1

    if epochs_without_improvement >= patience:
        print("Early stopping triggered. Training stopped.")
        break

    scheduler.step()

# Load best model and save the fine-tuned model
transformer_head.load_state_dict(torch.load('best_transformer_head.pth'))
torch.save(transformer_head.state_dict(), 'transformer_head_finetuned.pth')

  scaler = torch.cuda.amp.GradScaler(enabled=True)
  with torch.cuda.amp.autocast():
Epoch 1/10 (Train): 100%|██████████| 51/51 [00:17<00:00,  2.98it/s]


Epoch 1/10 - Training Loss: 12.6524


Epoch 1/10 (Val): 100%|██████████| 11/11 [00:02<00:00,  4.94it/s]


Epoch 1/10 - Validation Loss: 3.9354
Validation loss improved. Saving model...


Epoch 2/10 (Train): 100%|██████████| 51/51 [00:17<00:00,  2.88it/s]


Epoch 2/10 - Training Loss: 11.2631


Epoch 2/10 (Val): 100%|██████████| 11/11 [00:02<00:00,  5.39it/s]


Epoch 2/10 - Validation Loss: 3.9354


Epoch 3/10 (Train): 100%|██████████| 51/51 [00:16<00:00,  3.02it/s]


Epoch 3/10 - Training Loss: 11.0106


Epoch 3/10 (Val): 100%|██████████| 11/11 [00:02<00:00,  5.42it/s]


Epoch 3/10 - Validation Loss: 3.9354


Epoch 4/10 (Train): 100%|██████████| 51/51 [00:17<00:00,  2.98it/s]


Epoch 4/10 - Training Loss: 9.8913


Epoch 4/10 (Val): 100%|██████████| 11/11 [00:02<00:00,  5.33it/s]


Epoch 4/10 - Validation Loss: 3.9354


Epoch 5/10 (Train): 100%|██████████| 51/51 [00:16<00:00,  3.03it/s]


Epoch 5/10 - Training Loss: 9.3664


Epoch 5/10 (Val): 100%|██████████| 11/11 [00:02<00:00,  3.93it/s]


Epoch 5/10 - Validation Loss: 3.9354


Epoch 6/10 (Train): 100%|██████████| 51/51 [00:16<00:00,  3.02it/s]


Epoch 6/10 - Training Loss: 8.4260


Epoch 6/10 (Val): 100%|██████████| 11/11 [00:02<00:00,  5.41it/s]


Epoch 6/10 - Validation Loss: 3.9354
Early stopping triggered. Training stopped.


## Compute Accuracy with Newly Trained Model

In [None]:
from torchvision import datasets
from torch.utils.data import DataLoader

# Paths
test_root = os.path.join(path, 'dogs', 'test')

# Load test set
test_dataset = datasets.ImageFolder(root=test_root, transform=preprocess)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Get all class names
class_names = test_dataset.classes
print("Class names:", class_names)

# Generate text features for all classes once
with torch.no_grad():
    all_texts = [f"A photo of a {classname}" for classname in class_names]
    tokenized_texts = clip.tokenize(all_texts).to(device)
    text_features_all = model.encode_text(tokenized_texts)  # Shape: (num_classes, 512)

Class names: ['Afghan_hound', 'Blenheim_spaniel', 'Chihuahua', 'Japanese_spaniel', 'Maltese_dog', 'Pekinese', 'Rhodesian_ridgeback', 'Shih_Tzu', 'papillon', 'toy_terrier']


In [None]:
def compute_topk_accuracy(image_features, text_features_all, labels, topk=(1, 3, 5)):
    # Compute similarity scores
    logits = image_features @ text_features_all.T  # (B, num_classes)
    _, topk_indices = logits.topk(max(topk), dim=1, largest=True, sorted=True)

    results = {}
    for k in topk:
        correct = topk_indices[:, :k].eq(labels.view(-1, 1)).sum().item()
        results[f"top{k}"] = correct
    return results

In [None]:
# Normalize text features for cosine similarity (ensure both are float32)
text_features_all_norm = F.normalize(text_features_all, dim=-1).float()

with torch.no_grad():
    for images, labels in tqdm(test_loader, desc="Evaluating"):
        images = images.to(device)
        labels = labels.to(device)

        # Get image features and cast to float32
        image_features = model.encode_image(images).float()

        # Get 512-dim image embedding from your transformer
        img_out = transformer_head.encode_image_only(image_features)

        # Normalize both image and text features
        img_out = F.normalize(img_out, dim=-1)
        text_features_all_norm = F.normalize(text_features_all, dim=-1).float()  # Ensures dtype match

        # Compare embeddings via dot product (cosine sim)
        accs = compute_topk_accuracy(img_out, text_features_all_norm, labels)

        print("img_out shape:", img_out.shape)  # Should be (B, 512)
        print("text_features_all shape:", text_features_all.shape)  # Should be (num_classes, 512)

        top1_total += accs['top1']
        top3_total += accs['top3']
        top5_total += accs['top5']
        num_samples += images.size(0)

# Final results
print(f"\nTop-1 Accuracy: {top1_total / num_samples * 100:.2f}%")
print(f"Top-3 Accuracy: {top3_total / num_samples * 100:.2f}%")
print(f"Top-5 Accuracy: {top5_total / num_samples * 100:.2f}%")


Evaluating:  10%|█         | 1/10 [00:00<00:02,  4.17it/s]

img_out shape: torch.Size([32, 512])
text_features_all shape: torch.Size([10, 512])


Evaluating:  20%|██        | 2/10 [00:00<00:01,  4.14it/s]

img_out shape: torch.Size([32, 512])
text_features_all shape: torch.Size([10, 512])


Evaluating:  30%|███       | 3/10 [00:00<00:01,  4.40it/s]

img_out shape: torch.Size([32, 512])
text_features_all shape: torch.Size([10, 512])


Evaluating:  40%|████      | 4/10 [00:00<00:01,  4.57it/s]

img_out shape: torch.Size([32, 512])
text_features_all shape: torch.Size([10, 512])


Evaluating:  50%|█████     | 5/10 [00:01<00:01,  4.65it/s]

img_out shape: torch.Size([32, 512])
text_features_all shape: torch.Size([10, 512])


Evaluating:  60%|██████    | 6/10 [00:01<00:00,  4.73it/s]

img_out shape: torch.Size([32, 512])
text_features_all shape: torch.Size([10, 512])


Evaluating:  70%|███████   | 7/10 [00:01<00:00,  4.68it/s]

img_out shape: torch.Size([32, 512])
text_features_all shape: torch.Size([10, 512])


Evaluating:  90%|█████████ | 9/10 [00:01<00:00,  4.83it/s]

img_out shape: torch.Size([32, 512])
text_features_all shape: torch.Size([10, 512])
img_out shape: torch.Size([32, 512])
text_features_all shape: torch.Size([10, 512])


Evaluating: 100%|██████████| 10/10 [00:02<00:00,  4.57it/s]

img_out shape: torch.Size([29, 512])
text_features_all shape: torch.Size([10, 512])

Top-1 Accuracy: 12.18%
Top-3 Accuracy: 32.30%
Top-5 Accuracy: 54.20%





In [None]:
def compute_zero_shot_accuracy(model, image_loader, text_features_all, device, topk=(1, 3, 5)):
    top1_total = 0
    top3_total = 0
    top5_total = 0
    num_samples = 0

    # Ensure text features are normalized
    text_features_all_norm = F.normalize(text_features_all, dim=-1).float()

    with torch.no_grad():
        for images, labels in tqdm(image_loader, desc="Zero-shot Evaluation"):
            images = images.to(device)
            labels = labels.to(device)

            # Get image features (using zero-shot CLIP model)
            image_features = model.encode_image(images).float()

            # Normalize the image features
            image_features = F.normalize(image_features, dim=-1)

            # Compute cosine similarity between image features and text features
            logits = image_features @ text_features_all_norm.T  # (B, num_classes)
            _, topk_indices = logits.topk(max(topk), dim=1, largest=True, sorted=True)

            # Compute the top-k accuracy
            for k in topk:
                correct = topk_indices[:, :k].eq(labels.view(-1, 1)).sum().item()
                if k == 1:
                    top1_total += correct
                elif k == 3:
                    top3_total += correct
                elif k == 5:
                    top5_total += correct
            num_samples += images.size(0)

    top1_accuracy = top1_total / num_samples * 100
    top3_accuracy = top3_total / num_samples * 100
    top5_accuracy = top5_total / num_samples * 100

    return top1_accuracy, top3_accuracy, top5_accuracy


In [None]:
# Load text features for all classes once
with torch.no_grad():
    all_texts = [f"A photo of a {classname}" for classname in class_names]
    tokenized_texts = clip.tokenize(all_texts).to(device)
    text_features_all = model.encode_text(tokenized_texts)  # Shape: (num_classes, 512)

# Compute zero-shot accuracy on the test set using the zero-shot CLIP model
top1_zero_shot, top3_zero_shot, top5_zero_shot = compute_zero_shot_accuracy(model, test_loader, text_features_all, device)

# Output zero-shot accuracy
print(f"Zero-shot Top-1 Accuracy: {top1_zero_shot:.2f}%")
print(f"Zero-shot Top-3 Accuracy: {top3_zero_shot:.2f}%")
print(f"Zero-shot Top-5 Accuracy: {top5_zero_shot:.2f}%")


Zero-shot Evaluation: 100%|██████████| 10/10 [00:02<00:00,  4.76it/s]

Zero-shot Top-1 Accuracy: 74.45%
Zero-shot Top-3 Accuracy: 91.17%
Zero-shot Top-5 Accuracy: 97.48%



