<a href="https://colab.research.google.com/github/ambermanijha/Multimodal_Fine_Tuning/blob/main/BLIP_2_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# -------- Cell 1: Setup & Dataset Extraction --------
!pip install -q pillow tqdm transformers datasets accelerate peft bitsandbytes

# Mount Google Drive at /content/drive; Cell 3 writes the fine-tuned adapters
# under /content/drive/MyDrive. force_remount=False leaves an existing mount alone.
from google.colab import drive
drive.mount('/content/drive', force_remount=False)

import os, shutil, zipfile, glob
from pathlib import Path

DATA_ROOT = "/content/liver_ultrasound"
ZIP_PATH = "/content/ultrasound-liver.zip"


def dataset_tree_lines(root_dir, max_depth=2):
    """Describe the directory tree under ``root_dir`` as indented text lines.

    Args:
        root_dir: Directory to walk. A missing directory yields an empty list.
        max_depth: Deepest level (root = 0) that is listed; anything below it
            is pruned from the walk.

    Returns:
        One ``"<indent><name>/ (<n> files)"`` string per visited directory,
        in top-down order with sibling directories sorted by name.
    """
    root_dir = os.path.normpath(root_dir)
    lines = []
    for current, dirs, files in os.walk(root_dir):
        dirs.sort()  # deterministic sibling order
        rel = os.path.relpath(current, root_dir)
        depth = 0 if rel == os.curdir else rel.count(os.sep) + 1
        lines.append(f"{'  ' * depth}{os.path.basename(current)}/ ({len(files)} files)")
        if depth >= max_depth:
            # Prune in place so os.walk skips deeper levels but still visits
            # the remaining siblings (a `break` here would end the whole walk).
            dirs[:] = []
    return lines


# --- Extract uploaded ZIP file ---
if os.path.exists(ZIP_PATH):
    os.makedirs(DATA_ROOT, exist_ok=True)
    print(f"üì¶ Extracting {ZIP_PATH}...")
    with zipfile.ZipFile(ZIP_PATH, 'r') as zip_ref:
        zip_ref.extractall(DATA_ROOT)
    print("‚úÖ Dataset extracted successfully.")
else:
    # Name the file this cell actually looks for (derived from ZIP_PATH).
    print(f"‚ö†Ô∏è Please upload '{os.path.basename(ZIP_PATH)}' to {os.path.dirname(ZIP_PATH)} first.")

# --- Preview folder structure ---
print("\nüìÇ Dataset structure:")
for line in dataset_tree_lines(DATA_ROOT):
    print(line)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
üì¶ Extracting /content/ultrasound-liver.zip...
‚úÖ Dataset extracted successfully.

üìÇ Dataset structure:
liver_ultrasound/ (2 files)
  7272660/ (0 files)
    Malignant/ (0 files)


In [None]:
# -------- Cell 2: Generate Captions (Optional, Resumable) --------
import os, glob, json, torch
from tqdm import tqdm
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

DATA_ROOT = "/content/liver_ultrasound"
SAVE_PATH = "/content/liver_ultrasound/captions_progress.json"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", DEVICE)

# --- Find all image files ---
# Recursive glob per extension; the set removes duplicates and sorted() fixes the order.
image_paths = sorted({
    match
    for pattern in ("*.jpg", "*.jpeg", "*.png")
    for match in glob.glob(os.path.join(DATA_ROOT, "**", pattern), recursive=True)
})
print(f"Found {len(image_paths)} images.")

if not image_paths:
    raise SystemExit("No images found. Upload or set correct DATA_ROOT.")

# --- Load BLIP model ---
# Processor and captioning model come from the same checkpoint; the model is
# moved to DEVICE and switched to eval mode before captioning.
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
model = model.to(DEVICE)
model = model.eval()

# --- Safely load progress ---
# `progress` is the list of {"image_path", "caption"} records written so far;
# `processed` is the set of image paths already captioned (used to skip work).
progress, processed = [], set()
if os.path.exists(SAVE_PATH):
    try:
        if os.path.getsize(SAVE_PATH) > 0:  # only read non-empty file
            with open(SAVE_PATH, "r") as f:
                loaded = json.load(f)
            if not isinstance(loaded, list):
                raise TypeError("progress file must contain a JSON list")
            # Build the lookup before committing, so a malformed entry leaves
            # both variables in their fresh state.
            processed = {p["image_path"] for p in loaded}
            progress = loaded
            print(f"‚úÖ Loaded {len(progress)} previous captions.")
        else:
            print("‚ö†Ô∏è Progress file exists but is empty ‚Äî starting fresh.")
    except (ValueError, KeyError, TypeError):
        # ValueError covers json.JSONDecodeError (invalid JSON); KeyError and
        # TypeError cover files that parse but are not a list of records with
        # an "image_path" key.
        progress, processed = [], set()
        print("‚ö†Ô∏è Progress file is corrupted ‚Äî starting fresh.")

SAVE_EVERY = 25  # write progress to disk after this many newly generated captions


def save_progress(entries, path):
    """Write ``entries`` to ``path`` as JSON without ever exposing a partial file.

    The data goes to a sibling temp file first and is then moved over ``path``
    with os.replace, so an interruption mid-write cannot leave ``path`` empty
    or truncated.
    """
    tmp_path = f"{path}.tmp"
    with open(tmp_path, "w") as f:
        json.dump(entries, f, indent=2)
    os.replace(tmp_path, path)


# --- Generate captions ---
# NOTE(review): the sample caption printed by this cell shows heavy token
# repetition; generation options such as no_repeat_ngram_size may be worth evaluating.
unsaved = 0
for img_path in tqdm(image_paths):
    if img_path in processed:
        continue
    try:
        # Context manager closes the file handle; convert() returns an
        # independent in-memory copy.
        with Image.open(img_path) as raw:
            img = raw.convert("RGB")
        inputs = processor(images=img, return_tensors="pt").to(DEVICE)
        with torch.no_grad():
            out = model.generate(**inputs, max_new_tokens=48)
        caption = processor.decode(out[0], skip_special_tokens=True)
        caption = f"This is an ultrasound image of a liver. {caption}"
        progress.append({"image_path": img_path, "caption": caption})
        processed.add(img_path)
    except Exception as e:
        # Best effort: report which image failed and move on.
        print("Error:", img_path, e)
        continue

    # Save progress periodically, counted in new captions so skipped or
    # failed images cannot cause a checkpoint to be missed.
    unsaved += 1
    if unsaved >= SAVE_EVERY:
        save_progress(progress, SAVE_PATH)
        unsaved = 0

# --- Final save ---
save_progress(progress, SAVE_PATH)

print(f"‚úÖ Captions saved -> {SAVE_PATH}")
if progress:
    print(json.dumps(progress[0], indent=2))


Device: cuda
Found 735 images.
‚ö†Ô∏è Progress file exists but is empty ‚Äî starting fresh.


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 735/735 [05:19<00:00,  2.30it/s]

‚úÖ Captions saved -> /content/liver_ultrasound/captions_progress.json
{
  "image_path": "/content/liver_ultrasound/7272660/Benign/Benign/image/1.jpg",
  "caption": "This is an ultrasound image of a liver. a breast with a small, flat, flat, flat, flat, flat, flat, flat, flat"
}





In [None]:
# -------- Cell 3: Convert + Fine-tune BLIP-2 with LoRA + QLoRA (4-bit, memory-optimized) --------
# Run this cell after Cell 1 and Cell 2: it reads the captions_progress.json file that Cell 2 writes.

import os, json, torch, shutil
from datasets import load_dataset
from transformers import (
    Blip2Processor, Blip2ForConditionalGeneration,
    TrainingArguments, Trainer, BitsAndBytesConfig
)
from peft import LoraConfig, get_peft_model
import torch.nn as nn

# ======== Paths ========
DATA_ROOT = "/content/liver_ultrasound"
SAVE_DIR = "/content/drive/MyDrive/blip2_liver_finetuned_qlora"
# Both JSON files live directly under DATA_ROOT.
PROGRESS_PATH, OUTPUT_JSON = (
    os.path.join(DATA_ROOT, filename)
    for filename in ("captions_progress.json", "blip2_train.json")
)
os.makedirs(SAVE_DIR, exist_ok=True)

# cuDNN is switched off process-wide as a workaround for a type-mismatch error
# seen with its fused kernels; kept in case it helps with other issues.
torch.backends.cudnn.enabled = False

# ======== 1) Convert captions_progress.json -> BLIP-2 json ========
# The captions file is produced by Cell 2; fail with an actionable message if it is absent.
if not os.path.isfile(PROGRESS_PATH):
    raise FileNotFoundError(
        f"{PROGRESS_PATH} not found. Run Cell 2 first to generate the captions file."
    )
with open(PROGRESS_PATH, "r") as f:
    data = json.load(f)

# Limit sample count for initial run (increase later once stable).
# Set to None to train on every available entry.
MAX_TRAIN_SAMPLES = 100

# Keep only records whose image file still exists, so a stale path surfaces
# here instead of failing later during preprocessing.
blip_ready = [
    {"image": x["image_path"], "text": x["caption"]}
    for x in data
    if os.path.isfile(x["image_path"])
]
skipped = len(data) - len(blip_ready)
if skipped:
    print(f"Skipped {skipped} entries whose image file is missing.")
if MAX_TRAIN_SAMPLES is not None:
    blip_ready = blip_ready[:MAX_TRAIN_SAMPLES]
if not blip_ready:
    raise ValueError(f"No usable caption entries found in {PROGRESS_PATH}.")

with open(OUTPUT_JSON, "w") as f:
    json.dump(blip_ready, f, indent=2)
print(f"‚úÖ Converted data saved: {OUTPUT_JSON} | Entries: {len(blip_ready)}")

# ======== Checkpoint id + QLoRA (4-bit) quantization settings ========
# Smaller BLIP-2 variant (fits Colab).
MODEL_NAME = "Salesforce/blip2-flan-t5-xl"
# NF4 4-bit weights with double quantization; matmuls are computed in bfloat16.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

# ======== Load the converted JSON as a single "train" split ========
dataset = load_dataset("json", split="train", data_files=OUTPUT_JSON)
print("Loaded dataset entries:", len(dataset))

# ======== Processor + quantized model ========
processor = Blip2Processor.from_pretrained(MODEL_NAME)

print("Loading model with QLoRA 4-bit quantization (may take a few minutes)...")
model = Blip2ForConditionalGeneration.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    quantization_config=bnb_config,
)

# ======== 5) Attach LoRA adapters ========
# - No task_type: MODEL_NAME is a flan-T5 (encoder-decoder) BLIP-2, so the
#   causal-LM PEFT wrapper does not describe it; without task_type PEFT uses
#   its generic wrapper, which passes forward arguments through unchanged.
# - flan-T5 uses gated feed-forward blocks whose input projections are named
#   "wi_0"/"wi_1"; a bare "wi" target matches no module there, so those layers
#   previously received no adapter.
lora_cfg = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q", "k", "v", "o", "wi_0", "wi_1", "wo"],
    lora_dropout=0.05,
    bias="none",
)
model = get_peft_model(model, lora_cfg)
# Report trainable vs. total parameters to confirm the adapters were attached.
model.print_trainable_parameters()
print("‚úÖ LoRA adapters attached.")

# ======== Add custom dropout layer to handle potential type issues ========
class CustomDropout(nn.Module):
    """Dropout that casts unsupported input dtypes to float32 before dropping.

    Inputs in float32, float16 or bfloat16 go through
    ``torch.nn.functional.dropout`` unchanged and keep their dtype. Any other
    dtype is converted with ``.float()`` first, so the result is float32.

    Args:
        p: Probability of zeroing an element; must lie in [0, 1].
        inplace: Forwarded to ``torch.nn.functional.dropout``.

    Raises:
        ValueError: If ``p`` is outside [0, 1].
    """

    # dtypes handed to dropout as-is
    _PASSTHROUGH_DTYPES = (torch.float32, torch.float16, torch.bfloat16)

    def __init__(self, p: float = 0.5, inplace: bool = False):
        super().__init__()
        if p < 0 or p > 1:
            raise ValueError("dropout probability has to be between 0 and 1, but got {}".format(p))
        self.p = p
        self.inplace = inplace

    def forward(self, x):
        # Ensure the input is in a supported format for dropout.
        if x.dtype not in self._PASSTHROUGH_DTYPES:
            x = x.float()
        # No cast back afterwards: a supported input never changed dtype, and
        # an unsupported one is intentionally returned as float32.
        return torch.nn.functional.dropout(x, self.p, self.training, self.inplace)

    def extra_repr(self):
        # Show the configuration when the model is printed.
        return f"p={self.p}, inplace={self.inplace}"

def replace_dropout_layers(model, custom_dropout_layer):
    """Swap every ``nn.Dropout`` below ``model`` for ``custom_dropout_layer``, in place.

    Args:
        model: Module whose children are inspected recursively.
        custom_dropout_layer: Callable invoked as ``custom_dropout_layer(p, inplace)``
            with the settings of the dropout being replaced.
    """
    for child_name, child in model.named_children():
        if not isinstance(child, nn.Dropout):
            # Not a dropout itself: look for dropouts further down.
            replace_dropout_layers(child, custom_dropout_layer)
            continue
        replacement = custom_dropout_layer(child.p, child.inplace)
        setattr(model, child_name, replacement)

# Runs after get_peft_model, so the walk covers the PEFT-wrapped model as well
# as the base model's own nn.Dropout modules.
replace_dropout_layers(model, CustomDropout)
print("‚úÖ Replaced dropout layers with custom implementation.")


# ======== 6) Preprocessing ========
def preprocess(examples, prompt="Describe this liver ultrasound image.", max_length=64):
    """Turn a batch of (image path, caption) rows into BLIP-2 training features.

    Args:
        examples: Batched mapping with "image" (list of image file paths) and
            "text" (list of caption strings).
        prompt: Text given to the model together with each image. It is the
            same for every row, so the captions appear only in ``labels``.
        max_length: Token length that both the prompt and the labels are
            padded/truncated to (kept short to save VRAM).

    Returns:
        Dict of plain Python lists: everything the processor returns for the
        (image, prompt) pairs, plus "labels" holding the caption token ids
        with padding positions set to -100.
    """
    from PIL import Image  # local import: this cell's import block does not include PIL

    # Open each file explicitly and force RGB. The dataset stores paths, and
    # handing raw path strings to the processor depends on the installed
    # processor version accepting them -- TODO confirm for the pinned version.
    images = []
    for path in examples["image"]:
        with Image.open(path) as raw:
            images.append(raw.convert("RGB"))

    proc = processor(
        images=images,
        text=[prompt] * len(images),
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )

    # Tokenize the captions separately for the labels. Cloning the text
    # input_ids into labels (as before) gave the model its target as input.
    target = processor.tokenizer(
        examples["text"],
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )
    labels = target["input_ids"].clone()
    # -100 is the ignore index used by the loss (see SafeTrainer's fallback).
    labels[labels == processor.tokenizer.pad_token_id] = -100

    # datasets.map stores plain Python lists; set_format converts back to torch later.
    batch = {key: value.tolist() for key, value in proc.items()}
    batch["labels"] = labels.tolist()
    return batch

# Process in small batches. With the default batch size every image of this
# dataset is preprocessed in one call, and each pixel tensor is additionally
# expanded into nested Python lists, which becomes very memory-hungry once the
# sample limit is raised.
dataset = dataset.map(
    preprocess,
    batched=True,
    batch_size=8,
    remove_columns=dataset.column_names,
)
dataset.set_format(type="torch")
print("‚úÖ Dataset preprocessed and converted to torch tensors.")

# ======== 7) SafeTrainer ========
from transformers import Trainer
class SafeTrainer(Trainer):
    """Trainer that computes cross-entropy itself when the model output carries no loss."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the batch loss, optionally paired with the raw model outputs.

        ``num_items_in_batch`` is accepted but unused; a key of that name is
        also dropped from ``inputs`` before the forward pass.
        """
        inputs.pop("num_items_in_batch", None)
        model_out = model(**inputs)
        batch_loss = model_out.get("loss")
        if batch_loss is None:
            # Fallback: mean token-level cross-entropy over flattened logits,
            # skipping positions labelled -100.
            vocab_logits = model_out["logits"]
            flat_logits = vocab_logits.view(-1, vocab_logits.size(-1))
            flat_labels = inputs["labels"].view(-1)
            batch_loss = torch.nn.functional.cross_entropy(
                flat_logits, flat_labels, ignore_index=-100
            )
        if return_outputs:
            return (batch_loss, model_out)
        return batch_loss

# ======== 8) Training arguments ========
# Mixed precision follows the 4-bit compute dtype set in bnb_config. If that is
# bfloat16 and the GPU supports bf16 natively (compute capability >= 8), train
# with bf16 so autocast and the quantized matmuls use the same dtype.
# Otherwise keep fp16 autocast, which was the previous hard-coded behaviour.
_native_bf16 = torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8
use_bf16 = bnb_config.bnb_4bit_compute_dtype == torch.bfloat16 and _native_bf16

args = TrainingArguments(
    output_dir="./blip2_qlora",
    num_train_epochs=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,
    gradient_checkpointing=False,
    learning_rate=2e-5,
    bf16=use_bf16,
    fp16=not use_bf16,
    logging_steps=20,
    save_steps=200,
    save_total_limit=2,
    remove_unused_columns=False,  # keep pixel_values etc. that preprocess() produced
    dataloader_num_workers=0,
    dataloader_pin_memory=False,
    report_to="none"
)


# ======== 9) Train ========
trainer = SafeTrainer(train_dataset=dataset, args=args, model=model)
trainer.train()

# ======== 10) Save adapters + processor ========
# Model first, then processor, both into the same Drive directory.
print("Saving adapters and processor to:", SAVE_DIR)
for artifact in (model, processor):
    artifact.save_pretrained(SAVE_DIR)
print("‚úÖ Training finished and saved to:", SAVE_DIR)

‚úÖ Converted data saved: /content/liver_ultrasound/blip2_train.json | Entries: 100


Generating train split: 0 examples [00:00, ? examples/s]

Loaded dataset entries: 100
Loading model with QLoRA 4-bit quantization (may take a few minutes)...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ LoRA adapters attached.
‚úÖ Replaced dropout layers with custom implementation.


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

‚úÖ Dataset preprocessed and converted to torch tensors.


Step,Training Loss
20,120.1322
40,116.2764
60,108.7827
80,103.0107
100,97.494


Saving adapters and processor to: /content/drive/MyDrive/blip2_liver_finetuned_qlora
‚úÖ Training finished and saved to: /content/drive/MyDrive/blip2_liver_finetuned_qlora
