In [1]:
!pip install --upgrade --quiet transformers datasets accelerate peft bitsandbytes pillow --no-deps

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m354.7/354.7 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m411.1/411.1 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m22.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.6/4.6 MB[0m [31m69.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import transformers, accelerate, peft
print("transformers:", transformers.__version__)
print("accelerate:   ", accelerate.__version__)
print("peft:         ", peft.__version__)

2025-05-14 05:49:54.761382: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747201795.236462      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747201795.367606      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


transformers: 4.51.3
accelerate:    1.6.0
peft:          0.15.2


In [3]:
import os
from datasets import load_dataset, Features, Value, Image
from transformers import (
    BlipProcessor,
    BlipForQuestionAnswering,
    TrainingArguments,
    Trainer,
    default_data_collator,
)
from peft import (
    prepare_model_for_kbit_training,
    LoraConfig,
    get_peft_model,
    TaskType,
)
import bitsandbytes as bnb

In [4]:
from datasets import load_dataset, concatenate_datasets
from transformers import BlipProcessor, BlipForQuestionAnswering, Seq2SeqTrainer, Seq2SeqTrainingArguments, EarlyStoppingCallback
from peft import LoraConfig, get_peft_model
from PIL import Image
import torch
from torch.nn.utils.rnn import pad_sequence

# ─── 1) Load & merge CSVs ───────────────────────────────────────────────
raw = load_dataset(
    "csv",
    data_files={
      "train":      "/kaggle/input/vr-data-curation1/qa_dataset2.csv",
      "validation": "/kaggle/input/vr-data-curation2/qa_dataset2.csv",
    },
)
raw = raw.remove_columns(["image_id", "product_type"])
full_ds = concatenate_datasets([raw["train"], raw["validation"]])
splits = full_ds.shuffle(seed=42).train_test_split(test_size=0.10)
train_ds, val_ds = splits["train"], splits["test"]
print(f"Train size: {len(train_ds)},  Validation size: {len(val_ds)}")

# ─── 2) Prepare processor & model config ───────────────────────────────
processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base", use_fast=True)
IMAGE_ROOT = "/kaggle/input/abo-small/images/small"
PAD_ID = processor.tokenizer.pad_token_id

config = BlipForQuestionAnswering.from_pretrained(
    "Salesforce/blip-vqa-base",
    low_cpu_mem_usage=True,
).config
DECODER_START = config.decoder_start_token_id or processor.tokenizer.cls_token_id

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Train size: 119059,  Validation size: 13229


preprocessor_config.json:   0%|          | 0.00/445 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.56k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.54G [00:00<?, ?B/s]

In [5]:
# ─── PRE-FILTER EMPTY ANSWERS ───────────────────────────────────────
# (so that preprocess_batch can assume every example has an answer)
train_ds = train_ds.filter(
    lambda ex: bool(ex["answer"] and ex["answer"].strip()),
    num_proc=1,
)
val_ds = val_ds.filter(
    lambda ex: bool(ex["answer"] and ex["answer"].strip()),
    num_proc=1,
)

# ─── UPDATED PREPROCESS_BATCH (no more 'valid' list needed) ─────────
def preprocess_batch(examples):
    # 1) tokenize questions
    enc = processor.tokenizer(
        examples["question"],
        truncation=True,
        max_length=128,
        padding=False,
    )
    # 2) tokenize answers
    tgt = processor.tokenizer(
        examples["answer"],
        truncation=True,
        max_length=32,
        padding=False,
    )
    labels = tgt["input_ids"]  # list of lists of ints

    # 3) build decoder inputs & mask
    decoder_input_ids = [
        [DECODER_START] + lab[:-1] for lab in labels
    ]
    decoder_attention_mask = [
        [1] * len(seq) for seq in decoder_input_ids
    ]

    return {
        "path":                    examples["path"],  # needed by collator
        "input_ids":               enc["input_ids"],
        "attention_mask":          enc["attention_mask"],
        "labels":                  labels,
        "decoder_input_ids":       decoder_input_ids,
        "decoder_attention_mask":  decoder_attention_mask,
    }

# ─── MAP TEXT-ONLY FIELDS ───────────────────────────────────────────
tokenized_train = train_ds.map(
    preprocess_batch,            # your updated function
    batched=True,
    batch_size=32,               # ← smaller to reduce peak mem
    num_proc=1,                  # ← single process, no duplicates
    remove_columns=["path","question","answer"],
    load_from_cache_file=True,
    keep_in_memory=False,        # ← crucial: no full-split cache
)

tokenized_val = val_ds.map(
    preprocess_batch,
    batched=True,
    batch_size=32,
    num_proc=1,
    remove_columns=["path","question","answer"],
    load_from_cache_file=True,
    keep_in_memory=False,
)

# ─── SET_FORMAT (text-only tensors) ────────────────────────────────
tokenized_train.set_format(
    type="torch",
    columns=[
        "path",                   # we still need the path for collator
        "input_ids","attention_mask",
        "labels",
        "decoder_input_ids","decoder_attention_mask",
    ],
)
tokenized_val.set_format(
    type="torch",
    columns=[
        "path",
        "input_ids","attention_mask",
        "labels",
        "decoder_input_ids","decoder_attention_mask",
    ],
)

Filter:   0%|          | 0/119059 [00:00<?, ? examples/s]

Filter:   0%|          | 0/13229 [00:00<?, ? examples/s]

Map:   0%|          | 0/119058 [00:00<?, ? examples/s]

Map:   0%|          | 0/13229 [00:00<?, ? examples/s]

In [6]:
# ── 5. Model + PEFT (LoRA) setup
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = BlipForQuestionAnswering.from_pretrained(
    "Salesforce/blip-vqa-base",
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,        # HuggingFace helper to reduce peak CPU RAM
).to(device)

# model = prepare_model_for_kbit_training(model)  
# model = model.to(device)
# 
peft_config = LoraConfig(
    # task_type=TaskType.QUESTION_ANS,
    inference_mode=False,
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=["query", "value","key","dense"],
)

peft_model = get_peft_model(model, peft_config)
# after peft_model = get_peft_model(model, peft_config)

# # 1) outer PEFT wrapper
# _orig_peft_forward = peft_model.forward
# def _clean_peft_forward(*args, **kwargs):
#     for k in ("inputs_embeds", "start_positions", "end_positions"):
#         kwargs.pop(k, None)
#     return _orig_peft_forward(*args, **kwargs)
# peft_model.forward = _clean_peft_forward

# # 2) inner/base BLIP wrapper
# _orig_base_forward = peft_model.base_model.forward
# def _clean_base_forward(*args, **kwargs):
#     for k in ("inputs_embeds", "start_positions", "end_positions"):
#         kwargs.pop(k, None)
#     return _orig_base_forward(*args, **kwargs)
# peft_model.base_model.forward = _clean_base_forward
peft_model = peft_model.to(device)
peft_model.print_trainable_parameters()

trainable params: 7,692,288 || all params: 392,364,860 || trainable%: 1.9605


In [7]:
# from datasets import load_metric

# metric = load_metric("exact_match")  # or any metric you like

# def compute_metrics(eval_pred):
#     generated_tokens, label_tokens = eval_pred
#     decoded_preds = processor.tokenizer.batch_decode(
#         generated_tokens, skip_special_tokens=True
#     )
#     decoded_labels = processor.tokenizer.batch_decode(
#         label_tokens, skip_special_tokens=True
#     )
#     return {"exact_match": metric.compute(predictions=decoded_preds, references=decoded_labels)}

# training_args = Seq2SeqTrainingArguments(
#     # … everything as before …
#     predict_with_generate=True,              # run .generate()
#     load_best_model_at_end=True,             # now safe
#     metric_for_best_model="exact_match",     # must match your compute_metrics key
# )
# trainer = Seq2SeqTrainer(
#     model=peft_model,
#     args=training_args,
#     train_dataset=tokenized["train"],
#     eval_dataset=tokenized["validation"],
#     data_collator=default_data_collator,
#     tokenizer=processor.tokenizer,
#     compute_metrics=compute_metrics,         # pass it in
# )

In [8]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(
  tokenizer=processor.tokenizer,
  model=peft_model,            # you’ve already provided decoder_input_ids
  label_pad_token_id=-100,
  padding="longest",           # dynamic pad each training batch
  return_tensors="pt",
)

In [9]:
from transformers import DataCollatorWithPadding

data_collator_pad = DataCollatorWithPadding(
    tokenizer=processor.tokenizer,
    padding="longest",      # pad each batch to its own max length
    return_tensors="pt",    # return PyTorch tensors
)

In [10]:
from torch.nn.utils.rnn import pad_sequence
from PIL import Image
import os

def collate_fn(batch):
    # A) load & preprocess *this batch* of images on CPU
    imgs = [
        Image.open(os.path.join(IMAGE_ROOT, ex["path"])).convert("RGB")
        for ex in batch
    ]
    # processor call returns CPU tensors
    pixel_values = processor(
        images=imgs,
        return_tensors="pt",
    ).pixel_values  # shape: (B, C, H, W), on CPU

    # B) gather & pad text fields (all on CPU)
    input_ids         = [torch.tensor(f["input_ids"],         dtype=torch.long) for f in batch]
    attention_mask    = [torch.tensor(f["attention_mask"],    dtype=torch.long) for f in batch]
    labels            = [torch.tensor(f["labels"],            dtype=torch.long) for f in batch]
    decoder_input_ids = [torch.tensor(f["decoder_input_ids"], dtype=torch.long) for f in batch]
    decoder_attn_mask = [torch.tensor(f["decoder_attention_mask"], dtype=torch.long) for f in batch]

    input_ids         = pad_sequence(input_ids,        batch_first=True, padding_value=PAD_ID)
    attention_mask    = pad_sequence(attention_mask,   batch_first=True, padding_value=0)
    labels            = pad_sequence(labels,           batch_first=True, padding_value=-100)
    decoder_input_ids = pad_sequence(decoder_input_ids, batch_first=True, padding_value=PAD_ID)
    decoder_attn_mask = pad_sequence(decoder_attn_mask, batch_first=True, padding_value=0)

    return {
        "pixel_values":           pixel_values,
        "input_ids":              input_ids,
        "attention_mask":         attention_mask,
        "labels":                 labels,
        "decoder_input_ids":      decoder_input_ids,
        "decoder_attention_mask": decoder_attn_mask,
    }

In [11]:
import os, warnings

# 1) Turn off tokenizers’ internal thread-pool when forked
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# 2) Suppress HF “attention_mask” reminder
warnings.filterwarnings(
    "ignore",
    message=r"We strongly recommend passing in an `attention_mask`"
)

# 3) Suppress the DataParallel gather warning
warnings.filterwarnings(
    "ignore",
    message=r"Was asked to gather along dimension 0, but all input tensors were scalars"
)

warnings.filterwarnings(
    "ignore",
    message="To copy construct from a tensor.*"
)


In [12]:
# import pandas as pd
# from tabulate import tabulate
# from transformers import TrainerCallback, TrainerState, TrainerControl

# class DataFrameLogCallback(TrainerCallback):
#     def __init__(self, max_rows: int = 5):
#         super().__init__()
#         self.max_rows = max_rows

#     def on_log(self, args, state: TrainerState, control: TrainerControl, logs=None, **kwargs):
#         # 1) Assemble the history into a DataFrame
#         df = pd.DataFrame(state.log_history)
#         # 2) Keep only the rows that have a loss
#         df = df[df["loss"].notna()][["step","loss","learning_rate","epoch"]]
#         # 3) Format numbers
#         df["loss"] = df["loss"].map("{:.4f}".format)
#         df["learning_rate"] = df["learning_rate"].map("{:.2e}".format)
#         df["epoch"] = df["epoch"].map("{:.2f}".format)
#         # 4) Print the last few rows as a Markdown table
#         print(tabulate(df.tail(self.max_rows), headers="keys", tablefmt="github"))


In [13]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, TrainerCallback, TrainerState, TrainerControl
from transformers import EarlyStoppingCallback

training_args = Seq2SeqTrainingArguments(
    output_dir="vqa_peft_out",

    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=2,

    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",

    num_train_epochs=10,

    # ← run eval & save every 500 steps (instead of only at epoch end)
    do_eval=True,

    eval_strategy="steps",
    eval_steps=200,
    save_strategy="steps",
    save_steps=200,
    save_total_limit=3,

    # ← logging every 20 steps
    logging_strategy="steps",
    logging_steps=200,
    logging_first_step=True,

    # ← rolling back to the best checkpoint by eval_loss
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,

    # ← actually generate answers to compute your metric
    predict_with_generate=True,
    generation_max_length=32,

    dataloader_num_workers=4,
    dataloader_pin_memory=True,
    remove_unused_columns=False,
    report_to=["none"],

    label_names=["labels"],
    dataloader_prefetch_factor=2,
    
    bf16=True,
)


# 3) Instantiate your Trainer, adding PrintLogsCallback *before* the bar redraws
trainer = Seq2SeqTrainer(
    model=peft_model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator=collate_fn,
    tokenizer=processor.tokenizer,
    callbacks=[ EarlyStoppingCallback(early_stopping_patience=4) ],
)

  trainer = Seq2SeqTrainer(


In [14]:
trainer.train()

Step,Training Loss,Validation Loss
200,1.7379,1.466797
400,1.2958,1.083984
600,1.0612,0.963867
800,0.9645,0.897949
1000,0.9017,0.855957
1200,0.8566,0.821777
1400,0.8266,0.793457
1600,0.8138,0.773438
1800,0.7921,0.762695
2000,0.7708,0.745605


TrainOutput(global_step=9200, training_loss=0.7291243445354959, metrics={'train_runtime': 31006.8104, 'train_samples_per_second': 38.397, 'train_steps_per_second': 0.3, 'total_flos': 2.493733570102109e+16, 'train_loss': 0.7291243445354959, 'epoch': 9.882321332616872})

In [15]:
tokenized_val[0]

{'path': '81/81705bb0.jpg',
 'input_ids': tensor([ 101, 2054, 2003, 1996, 4435, 1029,  102]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1]),
 'labels': tensor([  101,  9733, 22083,  6558,   102]),
 'decoder_input_ids': tensor([  101,   101,  9733, 22083,  6558]),
 'decoder_attention_mask': tensor([1, 1, 1, 1, 1])}

In [16]:
# 8) Merge your PEFT adapters & save a standalone model + processor
merged = trainer.model.merge_and_unload()
merged.save_pretrained("vqa-blip-base-final-model")
processor.save_pretrained("vqa-blip-base-final-model")


[]

In [17]:
# from transformers import pipeline, BlipForConditionalGeneration, BlipProcessor
# from PIL import Image
# import os
# import torch
# import matplotlib.pyplot as plt

# # --- Set up the VQA pipeline  -----------------------
# vqa_pipe = pipeline(
#     "visual-question-answering",
#     model="vqa-blip-base-final-model",
#     processor="vqa-blip-base-final-model",
#     device=0,                # or -1 for CPU
# )

# # --- Or load model + processor for manual generation ---
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model_gen = BlipForConditionalGeneration.from_pretrained(
#     "vqa-blip-base-final-model",
#     torch_dtype=torch.bfloat16,
# ).to(device)
# processor_gen = BlipProcessor.from_pretrained("vqa-blip-base-final-model")

# # Make sure IMAGE_ROOT is set to your image folder root
# IMAGE_ROOT = "/kaggle/input/abo-small/images/small"
# N = 100

# for i, ex in enumerate(val_ds.select(range(N))):
#     img_path  = os.path.join(IMAGE_ROOT, ex["path"])
#     image     = Image.open(img_path).convert("RGB")
#     question  = ex["question"]
#     gt_answer = ex["answer"]

#     # ==== display the image ====
#     plt.figure(figsize=(4,4))
#     plt.imshow(image)
#     plt.axis("off")
#     plt.show()

#     # ==== pipeline inference ====
#     pipe_out   = vqa_pipe({"image": image, "question": question})
#     pred_pipe  = pipe_out[0]["answer"]

#     # ==== manual generation inference ====
#     inputs       = processor_gen(
#         images=image,
#         text=question,
#         return_tensors="pt"
#     ).to(device)
#     generated_ids = model_gen.generate(**inputs, max_length=32)
#     pred_manual   = processor_gen.decode(
#         generated_ids[0], skip_special_tokens=True
#     )

#     # ==== print QA pair ====
#     print(f"Example {i+1}")
#     print(f"Question       : {question}")
#     print(f"Ground‐truth   : {gt_answer}")
#     print(f"Pred (pipeline): {pred_pipe}")
#     print(f"Pred (generate): {pred_manual}")
#     print("-" * 60)


In [18]:

# # 9) Inference test (manual generate)
# import torch
# from PIL import Image
# from transformers import BlipProcessor, BlipForConditionalGeneration
# import matplotlib.pyplot as plt

# # 9.1) Load the merged model & processor
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model = BlipForConditionalGeneration.from_pretrained(
#     "vqa-blip-base-final-model", torch_dtype=torch.float16
# ).to(device)
# processor = BlipProcessor.from_pretrained("vqa-blip-base-final-model")

# # 9.2) Pick an example from your validation set
# #    (we saved only the path there, so load the image again)
# sample = tokenized_val[0]
# path = os.path.join('/kaggle/input/abo-small/images/small',sample['path'])
# image  = Image.open(path).convert("RGB")
# plt.imshow(image)
# question = "What is shown in the picture?"

# from transformers import pipeline
# vqa = pipeline(
#     "visual-question-answering",
#     model="vqa-blip-base-final-model",
#     processor="vqa-blip-base-final-model",
#     device=0,                      # or -1 for CPU
# )
# print(vqa({"image": image, "question": question}))


In [19]:
# from PIL import Image
# from transformers import pipeline

# merged = trainer.model.merge_and_unload()
# merged.save_pretrained("vqa-blip-base-final-model")
# processor.save_pretrained("vqa-blip-base-final-model")

# # 9) Inference test (manual generate)
# import torch
# from PIL import Image
# from transformers import BlipProcessor, BlipForConditionalGeneration
# import matplotlib.pyplot as plt

# # 9.1) Load the merged model & processor
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model = BlipForConditionalGeneration.from_pretrained(
#     "vqa-blip-base-final-model", torch_dtype=torch.bfloat16
# ).to(device)
# processor = BlipProcessor.from_pretrained("vqa-blip-base-final-model")

# # 1) Reload your merged model + processor into a pipeline
# vqa = pipeline(
#     "visual-question-answering",
#     model="vqa-blip-base-final-model",
#     processor="vqa-blip-base-final-model",
#     device=0,  # or -1 for CPU
# )

# # 2) Pick a sample from your tokenized_val
# sample = tokenized_val[0]

# # 3) Load & preprocess the image
# path = os.path.join('/kaggle/input/abo-small/images/small',sample['path'])
# image  = Image.open(path).convert("RGB")
# plt.imshow(image)

# # 4) Define the question (could also grab from your raw val_ds if you kept it)
# question = "What is shown in the picture?"

# # 5) Decode the ground‑truth answer from the label IDs
# gt_answer = processor.tokenizer.decode(
#     sample["labels"],
#     skip_special_tokens=True,
# )

# # 6) Run inference
# result = vqa({"image": image, "question": question})

# # 7) Print everything
# print(f"Question:          {question}")
# print(f"Predicted Answer:  {result['answer'] if isinstance(result, dict) else result[0]['answer']}")
# print(f"Ground Truth:      {gt_answer}")
