In [1]:
!pip install --upgrade --quiet transformers datasets accelerate peft bitsandbytes pillow --no-deps

import transformers, accelerate, peft
print("transformers:", transformers.__version__)
print("accelerate:   ", accelerate.__version__)
print("peft:         ", peft.__version__)

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m354.7/354.7 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m411.1/411.1 kB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m23.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.6/4.6 MB[0m [31m91.4 MB/s[0m eta [36m0:00:00[0m
[?25h

2025-05-15 07:24:17.528661: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747293857.770961      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747293857.842378      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


transformers: 4.51.3
accelerate:    1.6.0
peft:          0.15.2


In [2]:
# ───────────────────────────────────────────────────────────────────────────────
# Qwen2.5-VL-3B-Instruct-AWQ + LoRA fine-tuning for VQA on Kaggle T4
# Copy-paste this into your notebook cell and run.
# ───────────────────────────────────────────────────────────────────────────────

# 0) Install & imports ────────────────────────────────────────────────────────
# !pip install --upgrade transformers accelerate peft bitsandbytes datasets pillow --no-deps

import os, warnings, torch
from PIL import Image
from datasets import load_dataset, concatenate_datasets
from datasets import ClassLabel


from transformers import (
    Qwen2_5_VLProcessor,
    Qwen2_5_VLForConditionalGeneration,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    EarlyStoppingCallback
)
from transformers import BitsAndBytesConfig, AutoModelForSeq2SeqLM

from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model

os.environ["TOKENIZERS_PARALLELISM"] = "false"
warnings.filterwarnings("ignore")

# ───────────────────────────────────────────────────────────────────────────────
# 1) Load & filter CSVs ───────────────────────────────────────────────────────
RAW_PATHS = {
    "train":      "/kaggle/input/vr-data-curation1/qa_dataset2.csv",
    "validation": "/kaggle/input/vr-data-curation2/qa_dataset2.csv",
}
raw = load_dataset("csv", data_files=RAW_PATHS)
for split in raw:
    raw[split] = raw[split].filter(lambda ex: bool((ex.get("answer") or "").strip()))

full = concatenate_datasets([raw["train"], raw["validation"]])

# 1) Get the sorted list of your product types
unique_types = full.unique("product_type")
unique_types.sort()  # optional, just to keep names in order

# 2) Build a ClassLabel feature with those names
product_type_feature = ClassLabel(names=unique_types)

# 3) Cast that column on your Dataset
full = full.cast_column("product_type", product_type_feature)

splits = full.shuffle(seed=42).train_test_split(test_size=0.10,seed = 42)
train_ds, val_ds = splits["train"], splits["test"]

def is_large_enough(ex):
    img = Image.open(f"/kaggle/input/abo-small/images/small/{ex['path']}").convert("RGB")
    return img.width >= 28 and img.height >= 28

train_ds = train_ds.filter(is_large_enough)
val_ds   = val_ds.filter(is_large_enough)

# small_val_split = train_ds.train_test_split(
#     test_size=1000,
#     seed=42,
# )
# train_ds = small_val_split["test"]

small_val_split = val_ds.train_test_split(
    shuffle = True,
    test_size=2000,
    seed=42,
)
val_ds = small_val_split["test"]

print(f"Train size: {len(train_ds)}, Validation size: {len(val_ds)}")

# ───────────────────────────────────────────────────────────────────────────────
# 2) Processor & model (bnb 4-bit via Qwen class, no Triton) ───────────────────
from transformers import BitsAndBytesConfig, Qwen2_5_VLForConditionalGeneration

# 2a) Keep your existing processor
processor = Qwen2_5_VLProcessor.from_pretrained(
    "Qwen/Qwen2.5-VL-3B-Instruct", trust_remote_code=True,
    min_pixels=224 * 224, #Since abo images in 256,we tried to keep a little lower than that.
    max_pixels=1280*28*28,
)


# 2b) Define bitsandbytes 4-bit config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

# 2c) Load with the Qwen seq-to-seq class in 4-bit
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2.5-VL-3B-Instruct",
    trust_remote_code=True,
    quantization_config=bnb_config, 
    # torch_dtype=torch.float16,# ← replace all load_in_4bit / bnb_4bit_* args
)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model.to(device)
# 2d) Wrap for k-bit training & LoRA
model = prepare_model_for_kbit_training(model)


# Grab the correct image‐placeholder token string
tokenizer = processor.tokenizer
# image_tok = tokenizer.convert_ids_to_tokens(tokenizer.image_token_id)

# ───────────────────────────────────────────────────────────────────────────────
# 3) LoRA configuration ────────────────────────────────────────────────────────
peft_config = LoraConfig(
    inference_mode=False,
    r=16, lora_alpha=32, lora_dropout=0.05,
    target_modules=[
        "q_proj","k_proj","v_proj","o_proj",
        "lm_head",
    ]
)

peft_model = get_peft_model(model, peft_config)
peft_model.to(device)
# ───────────────────────────────────────────────────────────────────────────────
# 4) Preprocessing function ────────────────────────────────────────────────────
def preprocess_batch(examples):
    enc = processor.tokenizer(examples["question"], truncation=True, max_length=128)
    tgt = processor.tokenizer(examples["answer"],   truncation=True, max_length=32)
    labels = tgt["input_ids"]
    return {
        "path": examples["path"],
        "input_ids": enc["input_ids"],
        "attention_mask": enc["attention_mask"],
        "labels": labels,
        "question": examples["question"],
    }

tokenized_train = train_ds.map(preprocess_batch, batched=True)
tokenized_val   = val_ds.map(  preprocess_batch, batched=True)

for tok in (tokenized_train, tokenized_val):
    tok.set_format(
        type="torch",
        columns=[
            "path","input_ids","attention_mask",
            "labels","question"
        ]
    )

# ───────────────────────────────────────────────────────────────────────────────
# 5) Custom collator ───────────────────────────────────────────────────────────
from torchvision.transforms import Resize
from torch.nn.utils.rnn import pad_sequence

# RESIZE = Resize((256, 256))

# def collate_fn(batch):
#     # 1) Load + resize images
#     imgs = [
#         RESIZE(
#             Image.open(f"/kaggle/input/abo-small/images/small/{ex['path']}")
#             .convert("RGB")
#         )
#         for ex in batch
#     ]

#     # 2) Build the multimodal prompts
#     samples = [
#         [
#             {"role":"user", "content":[{"type":"text","text":ex["question"]}]},
#             {"role":"user", "content":[{"type":"image","image":imgs[i]}]},
#         ]
#         for i, ex in enumerate(batch)
#     ]
#     prompts = processor.apply_chat_template(
#         samples,
#         tokenize=False,
#         add_generation_prompt=True
#     )

#     # 3) Tokenize prompt+image together
#     proc_out = processor(
#         text=prompts,
#         images=imgs,
#         padding="longest",
#         truncation=True,
    #     return_tensors="pt",
    # )
    # input_ids      = proc_out.input_ids        # (B, T)
    # attention_mask = proc_out.attention_mask

    # # 4) Tokenize *just* the answers
    # label_seqs = [torch.tensor(ex["labels"], dtype=torch.long) for ex in batch]

    # # 5) Build a full-(B, T) label tensor filled with -100
    # labels = input_ids.new_full(input_ids.shape, -100)
    # for i, lab in enumerate(label_seqs):
    #     L = lab.size(0)
    #     # right-align your answer at the end of the sequence:
    #     labels[i, -L:] = lab

    # # 6) Return everything
    # return {
    #     "pixel_values":    proc_out.pixel_values,
    #     "image_grid_thw":  proc_out.image_grid_thw,
    #     "input_ids":       input_ids,
    #     "attention_mask":  attention_mask,
    #     "labels":          labels,
    # }

def collate_fn(batch):
    # 1) Load + resize images, keep only the valid ones
    valid_examples, valid_imgs, bad = [], [], []
    for ex in batch:
        try:
            img = Image.open(f"/kaggle/input/abo-small/images/small/{ex['path']}").convert("RGB")
            valid_examples.append(ex)
            valid_imgs.append(img)
        except Exception:
            bad.append(ex)
    if bad:
        print(f"Skipping {len(bad)} bad images")

    # 2) Build the multimodal prompts only for valid_examples
    samples = [
        [
            {"role":"user", "content":[{"type":"text","text": f"Answer the question in exactly one word: {ex['question']}"}]},
            {"role":"user", "content":[{"type":"image","image":valid_imgs[i]}]},
        ]
        for i, ex in enumerate(valid_examples)
    ]
    prompts = processor.apply_chat_template(
        samples,
        tokenize=False,
        add_generation_prompt=True
    )

    # 3) Tokenize prompt+image together — note images must be list-of-lists
    image_lists = [[img] for img in valid_imgs]
    proc_out = processor(
        text=prompts,
        images=image_lists,
        padding="longest",
        truncation=True,
        return_tensors="pt",
    )

    # …then rebuild your labels exactly as before, using valid_examples …
    input_ids      = proc_out.input_ids
    attention_mask = proc_out.attention_mask
    label_seqs = [torch.tensor(ex["labels"], dtype=torch.long) for ex in valid_examples]
    labels = input_ids.new_full(input_ids.shape, -100)
    for i, lab in enumerate(label_seqs):
        L = lab.size(0)
        labels[i, -L:] = lab

    return {
        "pixel_values":    proc_out.pixel_values,
        "image_grid_thw":  proc_out.image_grid_thw,
        "input_ids":       input_ids,
        "attention_mask":  attention_mask,
        "labels":          labels,
    }



# ───────────────────────────────────────────────────────────────────────────────
# 6) Trainer setup ──────────────────────────────────────────────────────────────
# # 6) Trainer setup ──────────────────────────────────────────────────────────────
# training_args = Seq2SeqTrainingArguments(
#     output_dir="qwen_peft_out",
#     per_device_train_batch_size=16,
#     gradient_accumulation_steps=2,
#     per_device_eval_batch_size=16,
#     learning_rate=3e-5,
#     num_train_epochs=1,
#     eval_strategy="steps",
#     eval_steps=10,
#     save_steps=10,
#     save_total_limit=3,
#     load_best_model_at_end=True,
#     metric_for_best_model="eval_loss",
#     predict_with_generate=True,
#     logging_steps=50,
#     report_to=["none"],
#     remove_unused_columns=False,
#     label_names=["labels"],
#     dataloader_num_workers=4,
#     dataloader_prefetch_factor=2,
#     dataloader_pin_memory=True,
#     gradient_checkpointing=False,        # disable in HF Trainer too

# )
from transformers import Seq2SeqTrainingArguments

# small_val_ds = tokenized_val.shuffle(seed=42).select(range(2000))


training_args = Seq2SeqTrainingArguments(
    output_dir="qwen_peft_out",
    per_device_train_batch_size=16,
    gradient_accumulation_steps=1,
    per_device_eval_batch_size=16,
    # num_train_epochs=1,
    max_steps = 4800,

    # cosine decay with warmup
    learning_rate=1e-4,
    weight_decay=0.01,
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",            # warm up over first 10% of steps

    eval_strategy="steps",
    eval_steps=800,
    save_steps=800,
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    predict_with_generate=True,
    logging_steps=800,
    report_to=["none"],

    remove_unused_columns=False,
    label_names=["labels"],
    dataloader_num_workers=4,
    dataloader_prefetch_factor=2,
    dataloader_pin_memory=True,
    gradient_checkpointing=False,
    dataloader_drop_last=True,

    # fp16 = True,
    # bf16 = True,
)

trainer = Seq2SeqTrainer(
    model=peft_model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,      # ← just these 200 examples

    data_collator=collate_fn,
    tokenizer=processor.tokenizer,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

# ───────────────────────────────────────────────────────────────────────────────
# 7) Launch training ────────────────────────────────────────────────────────────
trainer.train()


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/66144 [00:00<?, ? examples/s]

Filter:   0%|          | 0/66144 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/132287 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/132287 [00:00<?, ? examples/s]

Filter:   0%|          | 0/119058 [00:00<?, ? examples/s]

Filter:   0%|          | 0/13229 [00:00<?, ? examples/s]

Train size: 118937, Validation size: 2000


preprocessor_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json:   0%|          | 0.00/5.70k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.37k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/65.4k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/3.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.53G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/216 [00:00<?, ?B/s]

Map:   0%|          | 0/118937 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss,Validation Loss
800,2.9523,1.890904
1600,1.8291,1.731933
2400,1.679,1.59419
3200,1.6175,1.524576
4000,1.5604,1.499502
4800,1.5017,1.492015


TrainOutput(global_step=4800, training_loss=1.8566721343994141, metrics={'train_runtime': 42045.3819, 'train_samples_per_second': 1.827, 'train_steps_per_second': 0.114, 'total_flos': 1.937308211600425e+17, 'train_loss': 1.8566721343994141, 'epoch': 0.6457688685591282})

In [3]:
save_dir = "/kaggle/working/"

# 8a) LoRA adapters only
peft_model.save_pretrained(f"{save_dir}/lora_adapters_qwen_2_5_4b")

# 8b) Base model (FP16 weights)
# peft_model.base_model.save_pretrained(f"{save_dir}/base_model_fp16")

# 8c) Processor / tokenizer
processor.save_pretrained(f"{save_dir}/qwen_2_5_processor")

[]

In [4]:
from peft import PeftModel

# assuming `peft_model` is your PeftModel-wrapped model:

# 1. Merge the adapters into the base model and unload the adapter code
merged_model = peft_model.merge_and_unload()

# 2. Save the merged model (no more LoRA hooks, just a standard Hugging Face model)
merged_model.save_pretrained(f"{save_dir}/merged_model_qwen_2_5_4b")