In [8]:
import kagglehub

# Fetch the newest published revision of the PaliGemma 2 (3B, 224px) checkpoint
path = kagglehub.model_download(
    "google/paligemma-2/transformers/paligemma2-3b-pt-224"
)

# Show where the downloaded files were placed
print(f"Path to model files: {path}")

Path to model files: /kaggle/input/paligemma-2/transformers/paligemma2-3b-pt-224/1


In [1]:
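# Optional installs (left commented out). Quote the version spec so the shell does not treat ">" as a redirect.
# %pip install "torch>=2.0.0" torchvision --index-url https://download.pytorch.org/whl/cu118
# %pip install -q peft  # imported by the training cell; not preinstalled in every runtime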

In [1]:
# Sanity check: show the installed PyTorch build, then whether CUDA is usable
import torch

print(torch.__version__, torch.cuda.is_available(), sep="\n")

2.5.1+cu124
True


In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Attached datasets are exposed under the read-only "../input/" directory;
# load the caption table from there and preview its first rows.

data = pd.read_csv(filepath_or_buffer="/kaggle/input/riscmm/RISCM/captions.csv")
data.head(n=5)

Unnamed: 0,source,split,image,caption_1,caption_2,caption_3,caption_4,caption_5
0,NWPU,test,NWPU_31430.jpg,A gray plane on the runway and the lawn beside .,A grey plane is on the runway by the lawn .,There is an airplane on the runway with a larg...,A plane is parked on the runway next to the gr...,There is a plane on the runway beside the grass .
1,NWPU,test,NWPU_31431.jpg,Three small planes parked in a line on the air...,"There are four aircraft on the open ground, Th...",There are many planes of different sizes in a ...,Four planes are parked on the runway .,Four planes of different sizes were on the mar...
2,NWPU,test,NWPU_31432.jpg,A plane parked in a line on the airport with s...,A white plane was parked on the instruction li...,An airplane parked in an open area with many c...,A plane is parked on the open space .,There is 1 plane on the ground marked .
3,NWPU,test,NWPU_31433.jpg,A small plane and a big plane parked next to b...,A white plane and a gray plane parked at the b...,Two planes of different sizes are neatly parke...,A large plane and a small plane are parked nea...,Two planes are on the marked ground .
4,NWPU,test,NWPU_31434.jpg,Two planes parked next to boarding bridges .,Two aircraft were parked at the departure gates .,Two planes of different sizes are neatly parke...,Two planes are parked next to the terminal .,Two planes are on the marked ground .


In [5]:
# Imported here so this cell also works on a fresh kernel (Restart & Run All).
import os

import wandb

# SECURITY: never hard-code the API key in the notebook. Provide it through the
# WANDB_API_KEY environment variable (e.g. populated from the platform's secret
# store). Any key that was previously committed in this cell should be revoked.
# With key=None, wandb falls back to its own lookup (environment / netrc / prompt).
wandb.login(key=os.environ.get("WANDB_API_KEY"))


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33madigew[0m ([33madigew-middle-east-technical-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [1]:
import torch
import pandas as pd
from transformers import PaliGemmaForConditionalGeneration, PaliGemmaProcessor
from peft import LoraConfig, get_peft_model
from PIL import Image
import wandb
import os
from torch.amp import autocast, GradScaler
from torch.utils.data import Dataset, DataLoader

class RISCDataset(Dataset):
    """Image/caption dataset that yields processor-encoded training examples.

    Each item pairs the image named in the row's ``image`` column with the
    text in its ``caption_1`` column.

    Args:
        image_dir: Directory containing the image files.
        df: DataFrame with at least the ``image`` and ``caption_1`` columns.
        processor: Callable accepting ``text``, ``images``, ``suffix``,
            ``return_tensors`` and ``padding`` keyword arguments
            (e.g. a ``PaliGemmaProcessor``).
    """

    # Prompt (prefix) shown to the model; the caption itself is the target.
    PROMPT = "<image> caption"

    def __init__(self, image_dir, df, processor):
        self.image_dir = image_dir
        self.df = df
        self.processor = processor

    def __len__(self):
        """Return the number of rows (examples) in the DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Encode the ``idx``-th row into model inputs.

        Returns:
            The processor output for a single example. Because
            ``return_tensors="pt"`` is requested for one example, the tensors
            are expected to keep a leading dimension of size 1.

        Raises:
            FileNotFoundError: If the image file does not exist.
            ValueError: If ``caption_1`` is missing for the row.
        """
        row = self.df.iloc[idx]
        image_path = os.path.join(self.image_dir, row.image)
        if not os.path.exists(image_path):
            raise FileNotFoundError(f"Image not found: {image_path}")

        # Validate the caption before decoding the image so that a bad row
        # does not cost an image load.
        caption = row.caption_1
        if pd.isna(caption):
            raise ValueError(f"Missing caption for image {row.image}")

        # convert() returns a new, fully loaded image, so the underlying file
        # handle can be released as soon as the conversion is done.
        with Image.open(image_path) as img:
            image = img.convert('RGB')

        # Pass the caption as `suffix` instead of concatenating it into the
        # prompt, so the processor can distinguish prompt from target.
        # NOTE(review): per the Hugging Face PaliGemma docs the processor then
        # also returns `labels` (prompt positions ignored) and
        # `token_type_ids`; confirm for the installed transformers version.
        inputs = self.processor(
            text=self.PROMPT,
            images=image,
            suffix=str(caption),
            return_tensors="pt",
            padding="longest"
        )
        return inputs

def load_dataset(image_dir, caption_file):
    """Read the caption CSV and partition its rows by the ``split`` column.

    Args:
        image_dir: Accepted for signature symmetry; not used by this function.
        caption_file: Path to the CSV file containing a ``split`` column.

    Returns:
        A ``(train_df, val_df)`` tuple: rows whose split is ``'train'`` and
        rows whose split is ``'test'`` (the latter serve as validation data).
    """
    captions = pd.read_csv(caption_file)
    split_column = captions['split']

    is_train = split_column == 'train'
    is_val = split_column == 'test'

    return captions.loc[is_train], captions.loc[is_val]

def _prepare_batch(batch, device):
    """Merge each item's own leading dim into the loader batch dim and move to ``device``."""
    # Assumes every dataset item already carries a leading dim of size 1 (as
    # produced by return_tensors="pt"), so collated tensors are (B, 1, ...).
    # flatten(0, 1) equals squeeze(0) for B == 1 and also yields (B, ...) for
    # B > 1 when the items could be stacked (i.e. have equal lengths).
    return {k: v.flatten(0, 1).to(device) for k, v in batch.items()}


def _forward_loss(model, batch):
    """Run one mixed-precision forward pass and return the (unscaled) loss tensor."""
    model_inputs = {
        "input_ids": batch["input_ids"],
        "attention_mask": batch["attention_mask"],
        "pixel_values": batch["pixel_values"],
        # Prefer processor-provided labels when the dataset supplies them;
        # otherwise fall back to supervising every input token.
        "labels": batch.get("labels", batch["input_ids"]),
    }
    # NOTE(review): in the Hugging Face PaliGemma implementation the
    # prompt/target split appears to be communicated via token_type_ids;
    # forward it whenever the dataset provides it. Confirm against the
    # installed transformers version.
    if "token_type_ids" in batch:
        model_inputs["token_type_ids"] = batch["token_type_ids"]
    with autocast('cuda'):
        return model(**model_inputs).loss


def _train_one_epoch(model, loader, optimizer, scaler, device, accum_steps, epoch):
    """Train for one epoch with gradient accumulation; return the mean batch loss."""
    total_loss = 0.0
    steps = 0      # batches that completed forward + backward
    pending = 0    # successful backward passes since the last optimizer step
    num_batches = len(loader)
    optimizer.zero_grad()

    for batch_idx, batch in enumerate(loader):
        try:
            batch = _prepare_batch(batch, device)
            loss = _forward_loss(model, batch) / accum_steps
            scaler.scale(loss).backward()
            pending += 1

            batch_loss = loss.item() * accum_steps
            total_loss += batch_loss
            steps += 1
            if steps % 10 == 0:
                print(f"Epoch {epoch+1}, Step {steps}, Loss: {batch_loss:.4f}")
        except Exception as e:
            # Best effort: skip a bad sample but keep training.
            print(f"Error in batch {batch_idx+1}: {e}")
        finally:
            torch.cuda.empty_cache()

        # Step on the number of *successful* backward passes so that a failed
        # batch at a window boundary (or at the very end) neither skips nor
        # delays the update of already accumulated gradients.
        is_last = (batch_idx + 1) == num_batches
        if pending and (pending >= accum_steps or is_last):
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            pending = 0

    if num_batches > 0 and steps == 0:
        # Every batch failed: continuing would log a fake loss and later save
        # an untrained adapter.
        raise RuntimeError(
            f"All {num_batches} training batches failed in epoch {epoch+1}; "
            "see the errors printed above."
        )
    return total_loss / steps if steps > 0 else float("nan")


def _evaluate(model, loader, device):
    """Compute the mean loss over ``loader`` without gradients; restores train mode."""
    model.eval()
    val_loss = 0.0
    val_steps = 0
    for batch_idx, batch in enumerate(loader):
        try:
            batch = _prepare_batch(batch, device)
            with torch.no_grad():
                val_loss += _forward_loss(model, batch).item()
            val_steps += 1
        except Exception as e:
            print(f"Validation error {batch_idx+1}: {e}")
        finally:
            torch.cuda.empty_cache()
    model.train()
    # NaN (not 0) when nothing could be evaluated, so the log is not misleading.
    return val_loss / val_steps if val_steps > 0 else float("nan")


def train_lora(model_name, image_dir, caption_file, output_dir,
               lora_rank=4, epochs=5, learning_rate=1e-5,
               max_train_samples=None, max_val_samples=None,
               batch_size=1, accum_steps=4):
    """Fine-tune a PaliGemma checkpoint with LoRA adapters and log to W&B.

    Args:
        model_name: Path or identifier passed to ``from_pretrained``.
        image_dir: Directory containing the images referenced by the CSV.
        caption_file: CSV with ``split``, ``image`` and caption columns.
        output_dir: Where the adapter and processor are saved after training.
        lora_rank: LoRA rank ``r``.
        epochs: Number of passes over the training split.
        learning_rate: AdamW learning rate.
        max_train_samples: If truthy, keep only the first N training rows.
        max_val_samples: If truthy, keep only the first N validation rows.
        batch_size: DataLoader batch size. Values above 1 require dataset
            items of equal length so that they can be stacked.
        accum_steps: Number of batches whose gradients are accumulated before
            each optimizer step.

    Raises:
        RuntimeError: If no CUDA device is available, or if every training
            batch of an epoch fails.
    """
    # Check the hardware first so that an unusable runtime does not create an
    # empty W&B run.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if device.type != "cuda":
        raise RuntimeError("GPU not available. Ensure a GPU accelerator is enabled for this runtime.")

    wandb.init(project="DI725_Phase2", config={
        "lora_rank": lora_rank,
        "epochs": epochs,
        "lr": learning_rate,
        "max_train_samples": max_train_samples,
        "max_val_samples": max_val_samples,
        "batch_size": batch_size,
        "accum_steps": accum_steps
    })

    try:
        model = PaliGemmaForConditionalGeneration.from_pretrained(
            model_name, torch_dtype=torch.float16).to(device)
        processor = PaliGemmaProcessor.from_pretrained(model_name, use_fast=True)

        lora_config = LoraConfig(
            r=lora_rank,
            lora_alpha=32,
            target_modules=["q_proj", "v_proj"],
            lora_dropout=0.1
        )
        model = get_peft_model(model, lora_config)

        # Only the adapter weights are trainable. Keep them in fp32:
        # GradScaler cannot unscale fp16 gradients, and the base model was
        # loaded in fp16. This is a no-op if they are already fp32.
        trainable_params = [p for p in model.parameters() if p.requires_grad]
        for param in trainable_params:
            param.data = param.data.float()

        train_df, val_df = load_dataset(image_dir, caption_file)
        if max_train_samples:
            train_df = train_df.iloc[:max_train_samples]
        if max_val_samples:
            val_df = val_df.iloc[:max_val_samples]
        print(f"Training on {len(train_df)} samples, validating on {len(val_df)} samples")

        train_dataset = RISCDataset(image_dir, train_df, processor)
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        val_dataset = RISCDataset(image_dir, val_df, processor)
        val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

        # Frozen base weights never receive gradients, so do not hand them to
        # the optimizer.
        optimizer = torch.optim.AdamW(trainable_params, lr=learning_rate)
        scaler = GradScaler()
        model.train()

        for epoch in range(epochs):
            avg_train_loss = _train_one_epoch(
                model, train_loader, optimizer, scaler, device, accum_steps, epoch)
            avg_val_loss = _evaluate(model, val_loader, device)

            # One log call per epoch keeps both metrics on the same W&B step.
            wandb.log({
                "epoch": epoch+1,
                "train_loss": avg_train_loss,
                "val_loss": avg_val_loss
            })
            print(f"Epoch {epoch+1}, Validation Loss: {avg_val_loss:.4f}")

        model.save_pretrained(output_dir)
        processor.save_pretrained(output_dir)
    finally:
        # Always close the W&B run, including when training aborts.
        wandb.finish()

if __name__ == "__main__":
    # Inputs: pretrained checkpoint plus the RISCM images and caption table
    model_name = "/kaggle/input/paligemma-2/transformers/paligemma2-3b-pt-224/1"
    image_dir = "/kaggle/input/riscmm/RISCM/resized"
    caption_file = "/kaggle/input/riscmm/RISCM/captions.csv"

    # Output: directory that receives the fine-tuned adapter and processor
    output_dir = "/kaggle/working/paligemma_lora"

    # Run on the full train/validation splits (no sample caps)
    train_lora(
        model_name,
        image_dir,
        caption_file,
        output_dir,
        max_train_samples=None,
        max_val_samples=None,
    )


  from .autonotebook import tqdm as notebook_tqdm
E0000 00:00:1746972490.527849      10 common_lib.cc:612] Could not set metric server port: INVALID_ARGUMENT: Could not find SliceBuilder port 8471 in any of the 0 ports provided in `tpu_process_addresses`="local"
=== Source Location Trace: ===
learning/45eac/tfrc/runtime/common_lib.cc:230


ModuleNotFoundError: No module named 'peft'