<a href="https://colab.research.google.com/github/ayyucedemirbas/BLIP-VQA-Rad_Instruction_Tuning/blob/main/blip_vqa_rad_instruction_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q datasets accelerate timm

In [None]:
import torch
from datasets import load_dataset
from transformers import (
    BlipProcessor,
    BlipForConditionalGeneration,
    TrainingArguments,
    Trainer,
    default_data_collator
)
import gc

In [None]:
# Authenticate the Hugging Face CLI and store the credential for git.
# NOTE(review): the literal `token` argument is presumably a placeholder for a
# personal access token - confirm, and never commit a real token in this cell.
!huggingface-cli login --token token --add-to-git-credential

In [None]:
import torch
from datasets import load_dataset
from transformers import (
    BlipProcessor,
    BlipForConditionalGeneration,
    TrainingArguments,
    Trainer
)

import gc
import os
import numpy as np
import torch
from PIL import Image
from datasets import load_dataset
from sklearn.model_selection import KFold

In [None]:
# Fetch the instruction-formatted VQA-RAD dataset and keep handles on both of
# its splits: "train" feeds the cross-validation folds, "test" is held out for
# the final evaluation.
dataset = load_dataset("ayyuce/vqa-rad-instructions")
full_dataset, test_dataset = dataset["train"], dataset["test"]

README.md:   0%|          | 0.00/100 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/42.3M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/12.8M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1793 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/451 [00:00<?, ? examples/s]

In [None]:
# Cross-validation setup: one positional index per training example, split into
# `num_folds` shuffled folds with a fixed seed so the partition is repeatable.
num_folds = 5
indices = np.arange(len(full_dataset))
kf = KFold(n_splits=num_folds, random_state=42, shuffle=True)

In [None]:
# Per-fold results filled in by the training loop: the saved checkpoint path of
# each fold and the validation loss measured for it (same order in both lists).
best_models, val_losses = [], []

In [None]:
def init_model(model_name="Salesforce/blip-vqa-base"):
    """Load a fresh processor/model pair from a pretrained checkpoint.

    Args:
        model_name: Identifier handed unchanged to both ``from_pretrained``
            calls. Defaults to "Salesforce/blip-vqa-base", the checkpoint this
            function previously hard-coded, so existing ``init_model()`` calls
            behave as before.

    Returns:
        A ``(processor, model)`` tuple: the ``BlipProcessor`` and the
        ``BlipForConditionalGeneration`` instance loaded from ``model_name``.
    """
    processor = BlipProcessor.from_pretrained(model_name)
    model = BlipForConditionalGeneration.from_pretrained(model_name)
    return processor, model

In [None]:
# K-fold training loop. For every fold: start from a fresh pretrained
# processor/model, preprocess the fold's train/validation subsets, fine-tune,
# evaluate, and save the resulting model AND processor under
# "<fold_output_dir>/best_model". The saved path and the validation loss are
# appended to `best_models` and `val_losses` for the summary cell.
for fold, (train_idx, val_idx) in enumerate(kf.split(indices)):
    print(f"\n{'='*50}")
    print(f"STARTING FOLD {fold+1}/{num_folds}")
    print(f"{'='*50}")

    # Force clear memory before each fold
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    processor, model = init_model()

    train_subset = full_dataset.select(train_idx.tolist())
    val_subset = full_dataset.select(val_idx.tolist())

    print(f"Train size: {len(train_subset)}, Validation size: {len(val_subset)}")

    def preprocess_function(examples):
        """Encode one batch of examples with this fold's processor.

        Reads the "image", "instruction" and "response" columns. Images and
        instructions go through the processor; responses are tokenized
        separately to build the labels. Both text encodings are padded or
        truncated to 64 tokens.

        Returns:
            Dict with "pixel_values", "input_ids", "attention_mask" and
            "labels", where padding positions in the labels are set to -100.
        """
        images = examples["image"]
        questions = examples["instruction"]
        answers = examples["response"]

        inputs = processor(
            images=images,
            text=questions,
            padding="max_length",
            truncation=True,
            max_length=64,
            return_tensors="pt"
        )

        target_encoding = processor.tokenizer(
            answers,
            padding="max_length",
            truncation=True,
            max_length=64,
            return_tensors="pt"
        )

        # Clone so the tokenizer output itself is not modified, then replace
        # padding ids with -100 (the default ignore_index of PyTorch's
        # cross-entropy loss).
        # NOTE(review): labels come from the answers while input_ids come from
        # the questions - confirm the model's loss is meant to pair two
        # different token sequences this way.
        labels = target_encoding.input_ids.clone()
        labels[labels == processor.tokenizer.pad_token_id] = -100

        batch = {
            "pixel_values": inputs.pixel_values,
            "input_ids": inputs.input_ids,
            "attention_mask": inputs.attention_mask,
            "labels": labels
        }

        return batch

    print(f"Processing training data for fold {fold+1}...")
    train_processed = train_subset.map(
        preprocess_function,
        batched=True,
        batch_size=4,
        num_proc=1,
        remove_columns=train_subset.column_names,
        load_from_cache_file=False,
        desc=f"Processing train fold {fold+1}"
    )

    print(f"Processing validation data for fold {fold+1}...")
    val_processed = val_subset.map(
        preprocess_function,
        batched=True,
        batch_size=4,
        num_proc=1,
        remove_columns=val_subset.column_names,
        load_from_cache_file=False,
        desc=f"Processing val fold {fold+1}"
    )

    fold_output_dir = f"blip-vqa-rad-fold-{fold+1}"
    # NOTE(review): recent transformers releases name the first strategy
    # argument `eval_strategy` - confirm against the installed version.
    training_args = TrainingArguments(
        output_dir=fold_output_dir,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        num_train_epochs=10,
        per_device_train_batch_size=2,
        per_device_eval_batch_size=2,
        learning_rate=3e-5,
        weight_decay=0.01,
        logging_steps=50,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        save_total_limit=1,
        push_to_hub=False,
        report_to="none",
        fp16=torch.cuda.is_available(),
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_processed,
        eval_dataset=val_processed,
        data_collator=default_data_collator,
    )

    print(f"Training fold {fold+1}...")
    trainer.train()

    print(f"Evaluating fold {fold+1}...")
    eval_results = trainer.evaluate()
    val_loss = eval_results["eval_loss"]
    val_losses.append(val_loss)
    print(f"Validation loss for fold {fold+1}: {val_loss}")

    best_model_path = f"{fold_output_dir}/best_model"
    trainer.save_model(best_model_path)
    # The Trainer was built without a tokenizer/processor, so save_model()
    # writes only the model. Save the processor next to it so that
    # BlipProcessor.from_pretrained(best_model_path) works later on.
    processor.save_pretrained(best_model_path)
    best_models.append(best_model_path)
    print(f"Best model for fold {fold+1} saved to {best_model_path}")

    # Drop every per-fold object before the next fold allocates its own.
    del train_subset, val_subset, train_processed, val_processed
    del trainer, model, processor
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    print(f"Fold {fold+1} complete!")


STARTING FOLD 1/5


preprocessor_config.json:   0%|          | 0.00/445 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.56k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.54G [00:00<?, ?B/s]

Train size: 1434, Validation size: 359
Processing training data for fold 1...


Processing train fold 1:   0%|          | 0/1434 [00:00<?, ? examples/s]

Processing validation data for fold 1...


Processing val fold 1:   0%|          | 0/359 [00:00<?, ? examples/s]



Training fold 1...


Epoch,Training Loss,Validation Loss
1,3.4825,3.186147


In [None]:
# Print one line per fold, then pick the fold with the lowest validation loss.
banner = "=" * 50
print("\n\n" + banner)
print("K-FOLD CROSS-VALIDATION SUMMARY")
print(banner)
for fold_no, (fold_path, fold_loss) in enumerate(zip(best_models, val_losses), start=1):
    print(f"Fold {fold_no}: Model path: {fold_path}, Validation Loss: {fold_loss}")

best_fold_idx = np.argmin(val_losses)
best_fold_model = best_models[best_fold_idx]
best_fold_loss = val_losses[best_fold_idx]
print(f"\nBest model is from fold {best_fold_idx+1} with validation loss {best_fold_loss}")
print(f"Best model path: {best_fold_model}")

# Reload the winning checkpoint (processor and model) from its saved directory
# for the held-out test evaluation.
print("\nProcessing test dataset for final evaluation...")
best_processor = BlipProcessor.from_pretrained(best_fold_model)
best_model = BlipForConditionalGeneration.from_pretrained(best_fold_model)

def test_preprocess_function(examples):
    images = examples["image"]
    questions = examples["instruction"]
    answers = examples["response"]

    inputs = best_processor(
        images=images,
        text=questions,
        padding="max_length",
        truncation=True,
        max_length=64,
        return_tensors="pt"
    )

    target_encoding = best_processor.tokenizer(
        answers,
        padding="max_length",
        truncation=True,
        max_length=64,
        return_tensors="pt"
    )

    labels = target_encoding.input_ids.clone()
    labels[labels == best_processor.tokenizer.pad_token_id] = -100

    batch = {
        "pixel_values": inputs.pixel_values,
        "input_ids": inputs.input_ids,
        "attention_mask": inputs.attention_mask,
        "labels": labels
    }

    return batch

# Evaluation-only configuration: small eval batches, no external reporting.
test_args = TrainingArguments(
    output_dir="./test_results",
    per_device_eval_batch_size=2,
    report_to="none",
)

# Encode the held-out test split with the best fold's processor, replacing the
# raw columns with the encoded features.
test_processed = test_dataset.map(
    test_preprocess_function,
    batched=True,
    batch_size=4,
    num_proc=1,
    remove_columns=test_dataset.column_names,
    load_from_cache_file=False,
    desc="Processing test dataset",
)

# A Trainer is used here purely as an evaluation harness for the best model.
test_trainer = Trainer(
    model=best_model,
    args=test_args,
    eval_dataset=test_processed,
    data_collator=default_data_collator,
)

print("Evaluating best model on test set...")
test_results = test_trainer.evaluate()
test_loss = test_results["eval_loss"]
print(f"Test Loss: {test_loss}")

def test_model(image_path, question, model_path=None):
    """
    Test a BLIP VQA model with a new image and question.

    Args:
        image_path: Path to the image file
        question: Question text
        model_path: Path to the model to use (defaults to best model)

    Returns:
        The model's answer
    """
    if model_path is None:
        model_path = best_fold_model

    processor = BlipProcessor.from_pretrained(model_path)
    model = BlipForConditionalGeneration.from_pretrained(model_path)

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    model.eval()

    image = Image.open(image_path)
    inputs = processor(images=image, text=question, return_tensors="pt")

    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_length=64,
            num_beams=5,
            min_length=1,
            do_sample=False,
            repetition_penalty=1.5
        )

    answer = processor.decode(outputs[0], skip_special_tokens=True)

    return answer

def ensemble_predict(image_path, question):
    """
    Ask every fold's saved model the same question about one image.

    Iterates over the module-level `best_models` paths in order, calls
    `test_model` for each, and prints progress plus each fold's answer.

    Args:
        image_path: Path to the image file
        question: Question text

    Returns:
        List with one prediction per entry of `best_models`, in the same order
    """
    answers = []

    for fold_no, checkpoint in enumerate(best_models, start=1):
        print(f"Getting prediction from fold {fold_no} model...")
        answer = test_model(image_path, question, checkpoint)
        answers.append(answer)
        print(f"Fold {fold_no} prediction: {answer}")

    return answers

# Closing status lines for the run; one print call, one message per line.
print(
    "\nTraining and evaluation complete!",
    "You can now use test_model() or ensemble_predict() to get predictions for new images.",
    sep="\n",
)