In [4]:
# Mount Google Drive and load data
from google.colab import drive
drive.mount('/content/drive')

# Unzip images (comment out after first run)
!unzip -q "/content/drive/MyDrive/csproj/landmark_images.zip" -d .

# Fix nested folder structure if needed
!if [ -d "landmark_images/landmark_images" ]; then mv landmark_images/landmark_images/* landmark_images/ && rm -rf landmark_images/__MACOSX landmark_images/landmark_images; fi

# Copy CSVs
!cp "/content/drive/MyDrive/csproj/train.csv" .
!cp "/content/drive/MyDrive/csproj/val.csv" .
!cp "/content/drive/MyDrive/csproj/test.csv" .


print("Files ready")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
replace ./__MACOSX/._landmark_images? [y]es, [n]o, [A]ll, [N]one, [r]ename: Files ready


## Imports

In [5]:
import os
import csv
import random
import unicodedata
import gc

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.amp import autocast, GradScaler
from transformers import CLIPProcessor, CLIPModel
import pandas as pd
import numpy as np
from PIL import Image
from pathlib import Path
from tqdm import tqdm
from torchvision import transforms
from peft import LoraConfig, get_peft_model

print("Imports successful")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

Imports successful
PyTorch version: 2.9.0+cu126
CUDA available: True


##Config

In [6]:
OUTPUT_DIR = "clip_lora"
CHECKPOINT_DIR = "checkpoints_lora"

MODEL_NAME = "openai/clip-vit-base-patch32"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Training hyperparameters
BATCH_SIZE = 128
ACCUMULATION_STEPS = 1
LEARNING_RATE = 2e-4
NUM_EPOCHS = 10
WEIGHT_DECAY = 1e-2

# LoRA settings
LORA_R = 32
LORA_ALPHA = 64
LORA_DROPOUT = 0.05
WEIGHT_DECAY = 1e-3

# Data sampling (per landmark)
TRAIN_IMAGES_PER_LANDMARK = 11
VAL_IMAGES_PER_LANDMARK = 3
TEST_IMAGES_PER_LANDMARK = 2

# Regularization
EARLY_STOPPING_PATIENCE = 3
GRADIENT_CLIP_MAX_NORM = 1.0

SEED = 42

Path(OUTPUT_DIR).mkdir(exist_ok=True)
Path(CHECKPOINT_DIR).mkdir(exist_ok=True)

print(f"Device: {DEVICE}")

Device: cuda


## Util

In [9]:
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

def normalize_path(path):
    return unicodedata.normalize('NFD', path)

def load_csv_safe(filepath):
    rows = []
    with open(filepath, 'r') as f:
        reader = csv.reader(f)
        header = next(reader)
        for row in reader:
            rows.append(row)
    return pd.DataFrame(rows, columns=header)

set_seed(SEED)
print(f"Random seed set to {SEED}")

Random seed set to 42


## Sample Data

In [10]:
def sample_images_per_landmark(df, images_per_landmark, seed=42):
    sampled_rows = []

    for landmark, group in df.groupby('landmark_name'):
        # Get unique images for this landmark
        unique_images = group.drop_duplicates('image_path')

        # Sample up to N images
        n_sample = min(images_per_landmark, len(unique_images))
        sampled_images = unique_images.sample(n=n_sample, random_state=seed)

        # Keep all rows (captions) for these sampled images
        sampled_image_paths = sampled_images['image_path'].tolist()
        landmark_rows = group[group['image_path'].isin(sampled_image_paths)]
        sampled_rows.append(landmark_rows)

    return pd.concat(sampled_rows, ignore_index=True)

# Load CSVs
print("Loading CSVs...")
train_df = load_csv_safe('train.csv')
val_df = load_csv_safe('val.csv')
test_df = load_csv_safe('test.csv')

# Normalize paths
train_df['image_path'] = train_df['image_path'].apply(normalize_path)
val_df['image_path'] = val_df['image_path'].apply(normalize_path)
test_df['image_path'] = test_df['image_path'].apply(normalize_path)

print(f"Original sizes:")
print(f"Train: {len(train_df):,} pairs, {train_df['image_path'].nunique():,} images")
print(f"Val: {len(val_df):,} pairs, {val_df['image_path'].nunique():,} images")
print(f"Test: {len(test_df):,} pairs, {test_df['image_path'].nunique():,} images")

# Sample images per landmark
train_df = sample_images_per_landmark(train_df, TRAIN_IMAGES_PER_LANDMARK, seed=SEED)
val_df = sample_images_per_landmark(val_df, VAL_IMAGES_PER_LANDMARK, seed=SEED)
test_df = sample_images_per_landmark(test_df, TEST_IMAGES_PER_LANDMARK, seed=SEED)

# Limit to 5 captions per image (train only)
train_df = train_df.groupby('image_path').apply(
    lambda x: x.sample(min(5, len(x)), random_state=SEED)
).reset_index(drop=True)

print(f"\nAfter sampling:")
print(f"Train: {len(train_df):,} pairs, {train_df['image_path'].nunique():,} images, {train_df['landmark_name'].nunique()} landmarks")
print(f"Val: {len(val_df):,} pairs, {val_df['image_path'].nunique():,} images, {val_df['landmark_name'].nunique()} landmarks")
print(f"Test: {len(test_df):,} pairs, {test_df['image_path'].nunique():,} images, {test_df['landmark_name'].nunique()} landmarks")

# Save sampled CSVs
train_df.to_csv('train_sampled.csv', index=False)
val_df.to_csv('val_sampled.csv', index=False)
test_df.to_csv('test_sampled.csv', index=False)
print("\nSaved sampled CSVs")

Loading CSVs...
Original sizes:
Train: 262,640 pairs, 26,264 images
Val: 5,628 pairs, 5,628 images
Test: 5,628 pairs, 5,628 images


  train_df = train_df.groupby('image_path').apply(



After sampling:
Train: 51,590 pairs, 10,318 images, 938 landmarks
Val: 2,814 pairs, 2,814 images, 938 landmarks
Test: 1,876 pairs, 1,876 images, 938 landmarks

Saved sampled CSVs


## Dataset Class

In [11]:
class LandmarkDataset(Dataset):

    def __init__(self, csv_path, processor):
        self.df = pd.read_csv(csv_path)
        self.processor = processor
        self.resize = transforms.Resize((224, 224))

        # Filter out missing images
        before = len(self.df)
        self.df = self.df[self.df['image_path'].apply(lambda p: Path(p).exists())]
        after = len(self.df)

        if before != after:
            print(f"  Warning: {before - after} missing images filtered out")

        print(f"Loaded {len(self.df):,} pairs from {csv_path}")

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        image = Image.open(row['image_path']).convert('RGB')
        image = self.resize(image)

        return {
            'image': image,
            'text': row['description'],
            'landmark_name': row['landmark_name']
        }

print("Dataset class defined")

Dataset class defined


## Load Model with LoRA

In [13]:
torch.cuda.empty_cache()
gc.collect()

print("Loading CLIP model...")
processor = CLIPProcessor.from_pretrained(MODEL_NAME)
base_model = CLIPModel.from_pretrained(MODEL_NAME)

# Configure LoRA - target the attention layers
lora_config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "out_proj",  # Attention projections
    ],
    lora_dropout=LORA_DROPOUT,
    bias="none",
)

# Apply LoRA
model = get_peft_model(base_model, lora_config)
model = model.to(DEVICE)

# Print trainable parameters
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
total_params = sum(p.numel() for p in model.parameters())
print(f"\nLoRA model loaded")
print(f"Trainable params: {trainable_params:,} ({100*trainable_params/total_params:.2f}%)")
print(f"Total params: {total_params:,}")

Loading CLIP model...

LoRA model loaded
Trainable params: 3,932,160 (2.53%)
Total params: 155,209,473


## Training Functions

In [14]:
def contrastive_loss(image_embeds, text_embeds, logit_scale):
    image_embeds = image_embeds / image_embeds.norm(dim=-1, keepdim=True)
    text_embeds = text_embeds / text_embeds.norm(dim=-1, keepdim=True)

    logits_per_image = logit_scale * (image_embeds @ text_embeds.t())
    logits_per_text = logits_per_image.t()

    batch_size = len(image_embeds)
    labels = torch.arange(batch_size, device=image_embeds.device)

    loss_i2t = nn.functional.cross_entropy(logits_per_image, labels)
    loss_t2i = nn.functional.cross_entropy(logits_per_text, labels)

    return (loss_i2t + loss_t2i) / 2


def train_epoch_lora(model, dataloader, optimizer, scheduler, scaler, device):
    model.train()
    total_loss = 0
    optimizer.zero_grad()

    pbar = tqdm(dataloader, desc="Training")
    for batch_idx, batch in enumerate(pbar):
        pixel_values = batch['pixel_values'].to(device)
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        with autocast('cuda', enabled=(device == 'cuda')):
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                pixel_values=pixel_values,
                return_loss=False
            )
            image_embeds = outputs.image_embeds
            text_embeds = outputs.text_embeds
            logit_scale = model.base_model.logit_scale.exp()

            loss = contrastive_loss(image_embeds, text_embeds, logit_scale)
            loss = loss / ACCUMULATION_STEPS

        scaler.scale(loss).backward()

        if (batch_idx + 1) % ACCUMULATION_STEPS == 0 or (batch_idx + 1) == len(dataloader):
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), GRADIENT_CLIP_MAX_NORM)
            scaler.step(optimizer)
            scaler.update()
            scheduler.step()
            optimizer.zero_grad()

        total_loss += loss.item() * ACCUMULATION_STEPS
        pbar.set_postfix({'loss': f'{loss.item() * ACCUMULATION_STEPS:.4f}'})

    return total_loss / len(dataloader)


def evaluate_lora(model, dataloader, device):
    model.eval()
    total_loss = 0
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            pixel_values = batch['pixel_values'].to(device)
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                pixel_values=pixel_values,
                return_loss=False
            )
            image_embeds = outputs.image_embeds
            text_embeds = outputs.text_embeds
            logit_scale = model.base_model.logit_scale.exp()

            loss = contrastive_loss(image_embeds, text_embeds, logit_scale)
            total_loss += loss.item()

            image_embeds = image_embeds / image_embeds.norm(dim=-1, keepdim=True)
            text_embeds = text_embeds / text_embeds.norm(dim=-1, keepdim=True)
            similarity = logit_scale * (image_embeds @ text_embeds.t())
            predictions = similarity.argmax(dim=1)
            labels = torch.arange(len(predictions), device=device)
            correct += (predictions == labels).sum().item()
            total += len(predictions)

    return total_loss / len(dataloader), correct / total


class EarlyStopping:
    def __init__(self, patience=2):
        self.patience = patience
        self.counter = 0
        self.best_loss = None

    def __call__(self, val_loss):
        if self.best_loss is None:
            self.best_loss = val_loss
        elif val_loss >= self.best_loss:
            self.counter += 1
            if self.counter >= self.patience:
                return True
        else:
            self.best_loss = val_loss
            self.counter = 0
        return False

print("Training functions defined")

Training functions defined


## Dataloaders

In [18]:
def collate_fn(batch):
    images = [item['image'] for item in batch]
    texts = [item['text'] for item in batch]

    inputs = processor(
        text=texts,
        images=images,
        return_tensors="pt",
        padding=True,
        truncation=True
    )

    return {
        'pixel_values': inputs['pixel_values'],
        'input_ids': inputs['input_ids'],
        'attention_mask': inputs['attention_mask'],
    }

print("Creating datasets...")
train_dataset = LandmarkDataset('train_sampled.csv', processor)
val_dataset = LandmarkDataset('val_sampled.csv', processor)

train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=8,
    collate_fn=collate_fn,
    pin_memory=(DEVICE == 'cuda')
)

val_loader = DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=8,
    collate_fn=collate_fn,
    pin_memory=(DEVICE == 'cuda')
)

print(f"\nDataLoaders created")
print(f"Train batches: {len(train_loader)}")
print(f"Val batches: {len(val_loader)}")

Creating datasets...
Loaded 51,590 pairs from train_sampled.csv
Loaded 2,814 pairs from val_sampled.csv

DataLoaders created
Train batches: 404
Val batches: 22


## Setup Training

In [19]:
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY
)

scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
    optimizer,
    T_max=len(train_loader) * NUM_EPOCHS
)

scaler = GradScaler('cuda', enabled=(DEVICE == 'cuda'))
early_stopping = EarlyStopping(patience=EARLY_STOPPING_PATIENCE)

print("Training components initialized")

Training components initialized


## Training Loop

In [None]:
print("\n" + "="*60)
print("Starting LoRA training...")
print("="*60)

best_val_loss = float('inf')
history = {'train_loss': [], 'val_loss': [], 'val_accuracy': []}

for epoch in range(NUM_EPOCHS):
    print(f"\nEpoch {epoch + 1}/{NUM_EPOCHS}")
    print("-" * 40)

    train_loss = train_epoch_lora(model, train_loader, optimizer, scheduler, scaler, DEVICE)
    val_loss, val_accuracy = evaluate_lora(model, val_loader, DEVICE)

    history['train_loss'].append(train_loss)
    history['val_loss'].append(val_loss)
    history['val_accuracy'].append(val_accuracy)

    print(f"\nResults:")
    print(f"  Train Loss: {train_loss:.4f}")
    print(f"  Val Loss: {val_loss:.4f}")
    print(f"  Val Accuracy: {val_accuracy:.4f}")

    # Save checkpoint
    checkpoint = {
        'epoch': epoch + 1,
        'val_loss': val_loss,
        'val_accuracy': val_accuracy,
    }
    model.save_pretrained(f"{CHECKPOINT_DIR}/checkpoint_epoch_{epoch + 1}")

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        model.save_pretrained(f"{OUTPUT_DIR}/best_model")
        print(f"  ✓ New best model saved!")

    if early_stopping(val_loss):
        print(f"\n{'='*60}")
        print(f"Early stopping triggered at epoch {epoch + 1}")
        print(f"{'='*60}")
        break

print("\n" + "="*60)
print("Training complete!")
print(f"Best validation loss: {best_val_loss:.4f}")
print("="*60)


Starting LoRA training...

Epoch 1/10
----------------------------------------


Training: 100%|██████████| 404/404 [04:10<00:00,  1.61it/s, loss=0.0499]
Evaluating: 100%|██████████| 22/22 [00:17<00:00,  1.29it/s]



Results:
  Train Loss: 1.3672
  Val Loss: 3.1319
  Val Accuracy: 0.2697
  ✓ New best model saved!

Epoch 2/10
----------------------------------------


Training: 100%|██████████| 404/404 [04:09<00:00,  1.62it/s, loss=0.0000]
Evaluating: 100%|██████████| 22/22 [00:15<00:00,  1.40it/s]



Results:
  Train Loss: 0.4830
  Val Loss: 3.6525
  Val Accuracy: 0.2537

Epoch 3/10
----------------------------------------


Training: 100%|██████████| 404/404 [04:11<00:00,  1.60it/s, loss=0.0001]
Evaluating: 100%|██████████| 22/22 [00:15<00:00,  1.43it/s]



Results:
  Train Loss: 0.3062
  Val Loss: 3.7981
  Val Accuracy: 0.2576

Epoch 4/10
----------------------------------------


Training: 100%|██████████| 404/404 [04:05<00:00,  1.65it/s, loss=0.0026]
Evaluating: 100%|██████████| 22/22 [00:15<00:00,  1.44it/s]



Results:
  Train Loss: 0.2445
  Val Loss: 3.8597
  Val Accuracy: 0.2665

Early stopping triggered at epoch 4

Training complete!
Best validation loss: 3.1319


## Save to drive


In [None]:
!cp -r {OUTPUT_DIR} "/content/drive/MyDrive/csproj/"
!cp -r {CHECKPOINT_DIR} "/content/drive/MyDrive/csproj/"
print("✓ LoRA model saved to Google Drive")

✓ LoRA model saved to Google Drive
