<a href="https://colab.research.google.com/github/ayyucedemirbas/BLIP-VQA-Rad_Instruction_Tuning/blob/main/blip_vqa_rad_instruction_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q datasets accelerate timm

In [1]:
import torch
from datasets import load_dataset
from transformers import (
    BlipProcessor,
    BlipForConditionalGeneration,
    TrainingArguments,
    Trainer
)

In [None]:
!huggingface-cli login

In [4]:
model_name = "Salesforce/blip-vqa-base"

In [5]:
dataset = load_dataset("ayyuce/vqa-rad-instructions")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [6]:
processor = BlipProcessor.from_pretrained(model_name)
model = BlipForConditionalGeneration.from_pretrained(model_name)

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.50, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [7]:
def preprocess_function(examples):
    images = examples["image"]
    questions = examples["instruction"]
    answers = examples["response"]

    encoding = processor(
        images,
        questions,
        padding="max_length",
        truncation=True,
        max_length=64,
    )


    labels = processor.tokenizer(
        answers,
        padding="max_length",
        truncation=True,
        max_length=64,
    ).input_ids

    encoding["labels"] = labels
    return encoding


train_dataset = dataset["train"].map(
    preprocess_function,
    batched=True,
    batch_size=8,
    remove_columns=dataset["train"].column_names,
    num_proc=4,
    load_from_cache_file=False
)


test_dataset = dataset["test"].map(
    preprocess_function,
    batched=True,
    batch_size=8,
    remove_columns=dataset["test"].column_names,
    num_proc=4,
    load_from_cache_file=False
)

Map (num_proc=4):   0%|          | 0/1793 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/451 [00:00<?, ? examples/s]

In [19]:
training_args = TrainingArguments(
    output_dir="blip-vqa-rad-checkpoints",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=2,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    learning_rate=5e-5,
    logging_steps=50,
    load_best_model_at_end=True,
    save_total_limit=2,
    push_to_hub=False,
    report_to="none",
)



In [20]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

In [21]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.3379,0.289737
2,0.2955,0.281151


There were missing keys in the checkpoint model loaded: ['text_decoder.cls.predictions.decoder.bias'].


TrainOutput(global_step=1794, training_loss=0.27869081709828797, metrics={'train_runtime': 2034.7594, 'train_samples_per_second': 1.762, 'train_steps_per_second': 0.882, 'total_flos': 2.128017216878936e+18, 'train_loss': 0.27869081709828797, 'epoch': 2.0})

In [15]:
trainer.push_to_hub("ayyuce/blip-vqa-rad")

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/5.37k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/ayyuce/blip-vqa-rad-checkpoints/commit/51b23f87c228f3da64884dfb88261a10e8185eed', commit_message='ayyuce/blip-vqa-rad', commit_description='', oid='51b23f87c228f3da64884dfb88261a10e8185eed', pr_url=None, repo_url=RepoUrl('https://huggingface.co/ayyuce/blip-vqa-rad-checkpoints', endpoint='https://huggingface.co', repo_type='model', repo_id='ayyuce/blip-vqa-rad-checkpoints'), pr_revision=None, pr_num=None)

In [22]:
trainer.push_to_hub("MLforHealthcare/blip-vqa-rad")

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/5.37k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/ayyuce/blip-vqa-rad-checkpoints/commit/758aa3ff40fe6d99921442107a5c8a7fc7be39d6', commit_message='MLforHealthcare/blip-vqa-rad', commit_description='', oid='758aa3ff40fe6d99921442107a5c8a7fc7be39d6', pr_url=None, repo_url=RepoUrl('https://huggingface.co/ayyuce/blip-vqa-rad-checkpoints', endpoint='https://huggingface.co', repo_type='model', repo_id='ayyuce/blip-vqa-rad-checkpoints'), pr_revision=None, pr_num=None)

In [None]:
from transformers import AutoTokenizer

In [None]:
tokenizer = AutoTokenizer.from_pretrained("Salesforce/blip-vqa-base")

In [None]:
tokenizer.save_pretrained("MLforHealthcare/blip-vqa-rad")
tokenizer.push_to_hub("MLforHealthcare/blip-vqa-rad")