In [1]:
!pip install --upgrade --quiet transformers datasets accelerate peft bitsandbytes pillow --no-deps

import transformers, accelerate, peft
print("transformers:", transformers.__version__)
print("accelerate:   ", accelerate.__version__)
print("peft:         ", peft.__version__)

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m354.7/354.7 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m411.1/411.1 kB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m23.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.6/4.6 MB[0m [31m87.0 MB/s[0m eta [36m0:00:00[0m
[?25h

2025-05-13 23:39:37.296388: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747179577.720375      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747179577.840743      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


transformers: 4.51.3
accelerate:    1.6.0
peft:          0.15.2


In [2]:
# 0) Install & imports
# !pip install --upgrade transformers accelerate peft datasets pillow --no-deps

import os, warnings, torch
from PIL import Image
from datasets import load_dataset, concatenate_datasets, ClassLabel
from transformers import (
    AutoProcessor,
    Qwen2VLForConditionalGeneration,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    EarlyStoppingCallback
)
from peft import LoraConfig, get_peft_model

os.environ["TOKENIZERS_PARALLELISM"] = "false"
warnings.filterwarnings("ignore")

# pick your device once
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 1) Load & filter CSVs
RAW_PATHS = {
    "train":      "/kaggle/input/vr-data-curation1/qa_dataset2.csv",
    "validation": "/kaggle/input/vr-data-curation2/qa_dataset2.csv",
}
raw = load_dataset("csv", data_files=RAW_PATHS)
for split in raw:
    raw[split] = raw[split].filter(lambda ex: bool((ex.get("answer") or "").strip()))

full = concatenate_datasets([raw["train"], raw["validation"]])
types = full.unique("product_type"); types.sort()
full = full.cast_column("product_type", ClassLabel(names=types))

full = full.shuffle(seed=42)

splits = full.shuffle(seed=42).train_test_split(test_size=0.10, seed=42)
train_ds, val_ds = splits["train"], splits["test"]

# train_ds = train_ds.train_test_split(test_size=1000, seed=42)["test"]
val_ds   = val_ds.shuffle(seed = 42).train_test_split(test_size=0.6,  seed=42)["test"]

print(f"Train: {len(train_ds)}, Val: {len(val_ds)}")

# 2) Processor & model in FP16
processor = AutoProcessor.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    trust_remote_code=True,
    size = None,
    # resized_height=224,
    # resized_width=224,
)
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    trust_remote_code=True,
    torch_dtype=torch.float16
).to(device)

# 3) Attach LoRA (parameters already FP16)
peft_config = LoraConfig(
    inference_mode=False,
    r=16, lora_alpha=32, lora_dropout=0.05,
    target_modules=["q_proj","k_proj","v_proj","o_proj","lm_head"]
)
peft_model = get_peft_model(model, peft_config).to(device)

# 4) Preprocessing fn
def preprocess_batch(examples):
    enc = processor.tokenizer(
        examples["question"], truncation=True, max_length=128
    )
    tgt = processor.tokenizer(
        examples["answer"],   truncation=True, max_length=32
    )
    return {
        "path": examples["path"],
        "input_ids": enc.input_ids,
        "attention_mask": enc.attention_mask,
        "labels": tgt.input_ids,
        "question": examples["question"],
    }

tokenized_train = train_ds.map(preprocess_batch, batched=True)
tokenized_val   = val_ds.map(preprocess_batch,   batched=True)
for ds in (tokenized_train, tokenized_val):
    ds.set_format(
        type="torch",
        columns=["path","input_ids","attention_mask","labels","question"]
    )

# 5) Collator (unchanged)
from torchvision.transforms import Resize
RESIZE = Resize((224, 224))

def collate_fn(batch):
    valid_ex, imgs, bad = [], [], []
    for ex in batch:
        try:
            img = Image.open(f"/kaggle/input/abo-small/images/small/{ex['path']}")\
                       .convert("RGB")
            valid_ex.append(ex); imgs.append(RESIZE(img))
        except:
            bad.append(ex)
    if bad:
        print(f"Skipping {len(bad)} images")#Ensuring that bad images are not sent to the model as it will throw error.

    samples = [
        [
            {"role":"user","content":[{"type":"text","text":ex["question"]}]},
            {"role":"user","content":[{"type":"image","image":imgs[i]}]},
        ]
        for i, ex in enumerate(valid_ex)
    ]
    prompts = processor.apply_chat_template(
        samples, tokenize=False, add_generation_prompt=True
    )
    proc = processor(
        text=prompts,
        images=[[img] for img in imgs],
        padding="longest",
        truncation=True,
        return_tensors="pt",
        # use_fast = True
    )
    input_ids, attention_mask = proc.input_ids, proc.attention_mask

    labels = input_ids.new_full(input_ids.shape, -100)
    for i, ex in enumerate(valid_ex):
        L = torch.tensor(ex["labels"]).size(0)
        labels[i, -L:] = torch.tensor(ex["labels"])

    return {
        "pixel_values":   proc.pixel_values,
        "image_grid_thw": proc.image_grid_thw,
        "input_ids":      input_ids,
        "attention_mask": attention_mask,
        "labels":         labels,
    }

# 6) Training args & trainer with AMP
training_args = Seq2SeqTrainingArguments(
    output_dir="qwen2vl_fp16_peft",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=8,
    # num_train_epochs=5,
    # fp16=True,                      # ← enable mixed precision
    # bf16 = True,
    max_steps = 4000,
    learning_rate=1e-4,
    weight_decay=0.01,
    warmup_ratio=0.1,
    lr_scheduler_type="cosine", 

    eval_strategy="steps",
    eval_steps=200,
    save_steps=200,
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    predict_with_generate=True,

    logging_steps=200,
    report_to=["none"],

    remove_unused_columns=False,
    label_names=["labels"],
    dataloader_num_workers=4,
    dataloader_prefetch_factor=2,
    dataloader_pin_memory=True,
    gradient_checkpointing=False,
    dataloader_drop_last=True,
    # eval_dataloader_drop_last=True
    # drop_last = True,
)

from torch.utils.data import DataLoader
from transformers import Seq2SeqTrainer

class CustomTrainer(Seq2SeqTrainer):
    def get_eval_dataloader(self, eval_dataset=None):
        eval_dataset = eval_dataset if eval_dataset is not None else self.eval_dataset
        return DataLoader(
            eval_dataset,
            batch_size=self.args.per_device_eval_batch_size,
            shuffle=False,
            collate_fn=self.data_collator,
            drop_last=True,                          # ← drop that last partial batch
            num_workers=self.args.dataloader_num_workers,
            pin_memory=self.args.dataloader_pin_memory,
        )


trainer = Seq2SeqTrainer(
    model=peft_model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator=collate_fn,
    tokenizer=processor.tokenizer,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

# 7) Launch training
trainer.train()


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/66144 [00:00<?, ? examples/s]

Filter:   0%|          | 0/66144 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/132287 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/132287 [00:00<?, ? examples/s]

Train: 119058, Val: 7938


preprocessor_config.json:   0%|          | 0.00/347 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json:   0%|          | 0.00/4.19k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/56.4k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/429M [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/3.99G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/272 [00:00<?, ?B/s]

Map:   0%|          | 0/119058 [00:00<?, ? examples/s]

Map:   0%|          | 0/7938 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss
200,6.898,2.160806
400,1.8106,1.618875
600,1.5183,1.449295
800,1.4221,1.348134
1000,1.3487,1.298818
1200,1.2923,1.256351
1400,1.2343,1.220172
1600,1.1802,1.187163
1800,1.1732,1.159908
2000,1.0922,1.142382


TrainOutput(global_step=4000, training_loss=1.4511924858093261, metrics={'train_runtime': 38865.1798, 'train_samples_per_second': 6.587, 'train_steps_per_second': 0.103, 'total_flos': 2.95799468310528e+17, 'train_loss': 1.4511924858093261, 'epoch': 2.14944227926354})

In [3]:
# 8) Save the fine-tuned LoRA adapters + the base model + processor/tokenizer
save_dir = "/kaggle/working/"

# 8a) LoRA adapters only
peft_model.save_pretrained(f"{save_dir}/lora_adapters_fp16")

# 8b) Base model (FP16 weights)
# peft_model.base_model.save_pretrained(f"{save_dir}/base_model_fp16")

# 8c) Processor / tokenizer
processor.save_pretrained(f"{save_dir}/processor")


[]

In [4]:
from peft import PeftModel

# assuming `peft_model` is your PeftModel-wrapped model:

# 1. Merge the adapters into the base model and unload the adapter code
merged_model = peft_model.merge_and_unload()

# 2. Save the merged model (no more LoRA hooks, just a standard Hugging Face model)
merged_model.save_pretrained(f"{save_dir}/merged_model_fp16")
