<a href="https://colab.research.google.com/github/ajaysuseel/MiniProject_AD/blob/main/finetuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!git clone https://github.com/ajaysuseel/MiniProject_AD.git

Cloning into 'MiniProject_AD'...
remote: Enumerating objects: 21, done.[K
remote: Counting objects: 100% (21/21), done.[K
remote: Compressing objects: 100% (20/20), done.[K
remote: Total 21 (delta 0), reused 15 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (21/21), 1.64 MiB | 6.01 MiB/s, done.


In [1]:
import torch
import json
import requests
from PIL import Image
from torch.utils.data import Dataset, DataLoader
from transformers import BlipProcessor, BlipForConditionalGeneration, TrainingArguments, Trainer

In [2]:
# ✅ CONFIGURABLE VARIABLES (Update these)
GITHUB_REPO = "https://raw.githubusercontent.com/ajaysuseel/MiniProject_AD/main/ajay/"
JSON_FILE = "captions.json"
IMAGES_FOLDER = "images/"

In [3]:
# ===========================
# ✅ Function: Load BLIP Model
# ===========================
def load_blip_model():
    model_name = "Salesforce/blip-image-captioning-base"
    print("Loading BLIP-1 model and processor...")
    processor = BlipProcessor.from_pretrained(model_name)
    model = BlipForConditionalGeneration.from_pretrained(model_name)
    return model, processor

In [4]:
# ===========================
# ✅ Function: Load Dataset from GitHub
# ===========================
def load_dataset_from_github():
    json_url = GITHUB_REPO + JSON_FILE
    try:
        response = requests.get(json_url)
        response.raise_for_status()  # Raises error for bad response
        data = response.json()
        print(f"Successfully loaded {len(data)} image-caption pairs.")
        return data
    except requests.exceptions.RequestException as e:
        print(f"Error loading dataset: {e}")
        return []


In [5]:
# ===========================
# ✅ Custom Dataset Class
# ===========================
class CaptionDataset(Dataset):
    def __init__(self, data, processor, images_dir):
        self.data = data
        self.processor = processor
        self.images_dir = images_dir

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        image_url = self.images_dir + item["filename"]

        try:
            image = Image.open(requests.get(image_url, stream=True).raw).convert("RGB")
        except Exception as e:
            print(f"Error loading image {item['filename']}: {e}")
            return None

        # Tokenize text & set labels
        encoding = self.processor(
            text=item["description"],
            images=image,
            return_tensors="pt",
            padding="max_length",
            truncation=True
        )

        encoding = {key: val.squeeze(0) for key, val in encoding.items()}  # Remove batch dim
        encoding["labels"] = encoding["input_ids"]  # ✅ Fix for missing labels

        return encoding

In [6]:
# ===========================
# ✅ Function: Create DataLoader
# ===========================
def create_dataloader(data, processor, batch_size=4):
    dataset = CaptionDataset(data, processor, GITHUB_REPO + IMAGES_FOLDER)

    def collate_fn(batch):
        batch = [b for b in batch if b is not None]  # Remove failed loads
        if len(batch) == 0:
            return None
        keys = batch[0].keys()
        return {key: torch.stack([b[key] for b in batch]) for key in keys}

    return DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

In [7]:
# ===========================
# ✅ Function: Train Model
# ===========================
from torch.optim import AdamW
from tqdm import tqdm
# import wandb

def train_blip_model(model, dataloader, num_epochs=3, learning_rate=5e-5):
    """
    Fine-tunes the BLIP-1 model with a progress bar and logging.

    Parameters:
        model: The BLIP-1 model.
        dataloader: The DataLoader with images and captions.
        num_epochs (int): Number of training epochs.
        learning_rate (float): Learning rate for optimizer.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.train()

    optimizer = AdamW(model.parameters(), lr=learning_rate)

    # Initialize Weights & Biases (W&B) logging (optional)
    # wandb.init(project="blip1_finetuning", name="blip1_small_dataset")

    print(f"🚀 Starting fine-tuning on {device} for {num_epochs} epochs...")

    for epoch in range(num_epochs):
        epoch_loss = 0.0
        progress_bar = tqdm(dataloader, desc=f"Epoch {epoch+1}/{num_epochs}")

        for batch in progress_bar:
            optimizer.zero_grad()

            # Move batch to device
            pixel_values = batch["pixel_values"].to(device)
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            # Forward pass
            outputs = model(pixel_values=pixel_values, input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()

            # Update progress bar with loss
            progress_bar.set_postfix(loss=loss.item())

            # Log to W&B
            # wandb.log({"loss": loss.item()})

        avg_loss = epoch_loss / len(dataloader)
        print(f"✅ Epoch {epoch+1} completed | Average Loss: {avg_loss:.4f}")

    # wandb.finish()
    print("🎯 Fine-tuning complete!")

    return model

In [8]:
import os
os.environ["WANDB_DISABLED"] = "true"


In [9]:
# ===========================
# ✅ Main Execution
# ===========================
if __name__ == "__main__":
    model, processor = load_blip_model()
    data = load_dataset_from_github()

    if not data:
        print("No data found. Exiting.")
    else:
        dataloader = create_dataloader(data, processor)
        if dataloader is None:
            print("Error: No valid data samples found. Exiting.")
        else:
            train_blip_model(model, dataloader)

Loading BLIP-1 model and processor...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/506 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.56k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

Successfully loaded 10 image-caption pairs.


model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

🚀 Starting fine-tuning on cuda for 3 epochs...



Epoch 1/3:   0%|          | 0/3 [00:00<?, ?it/s][A
Epoch 1/3:   0%|          | 0/3 [00:05<?, ?it/s, loss=12.9][A
Epoch 1/3:  33%|███▎      | 1/3 [00:05<00:11,  5.73s/it, loss=12.9][A
Epoch 1/3:  33%|███▎      | 1/3 [00:07<00:11,  5.73s/it, loss=10.2][A
Epoch 1/3:  67%|██████▋   | 2/3 [00:07<00:03,  3.59s/it, loss=10.2][A
Epoch 1/3:  67%|██████▋   | 2/3 [00:08<00:03,  3.59s/it, loss=9.56][A
Epoch 1/3: 100%|██████████| 3/3 [00:08<00:00,  2.96s/it, loss=9.56]


✅ Epoch 1 completed | Average Loss: 10.8676


Epoch 2/3: 100%|██████████| 3/3 [00:03<00:00,  1.20s/it, loss=8.48]


✅ Epoch 2 completed | Average Loss: 8.7630


Epoch 3/3: 100%|██████████| 3/3 [00:03<00:00,  1.20s/it, loss=7.99]

✅ Epoch 3 completed | Average Loss: 8.1333
🎯 Fine-tuning complete!





In [30]:
import requests

# Corrected RAW GitHub URL
GITHUB_JSON_URL = "https://raw.githubusercontent.com/ajaysuseel/MiniProject_AD/main/ajay/captions.json"

try:
    response = requests.get(GITHUB_JSON_URL)
    response.raise_for_status()  # Raise an error for bad response (e.g., 404)

    # Decode JSON
    data = response.json()
    print("JSON successfully loaded:", data[:2])  # Show first 2 entries

except requests.exceptions.RequestException as e:
    print("Error fetching JSON:", e)
except ValueError as e:
    print("Error parsing JSON:", e)


JSON successfully loaded: [{'filename': 'file41.jpg', 'description': 'A white truck parked on the street and two cars are beside it'}, {'filename': 'file42.jpg', 'description': 'A group of people walking down a sidewalk and two cars are parked by the side in front'}]
