In [1]:
!pip -q install --upgrade "transformers>=4.41.0" peft datasets bitsandbytes accelerate timm pillow

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [2]:
import os, pathlib

# Persist all Hugging Face artefacts under /workspace
HF_CACHE = "/workspace/hf_cache"
pathlib.Path(f"{HF_CACHE}/transformers").mkdir(parents=True, exist_ok=True)
pathlib.Path(f"{HF_CACHE}/datasets").mkdir(parents=True, exist_ok=True)

os.environ["HF_HOME"] = HF_CACHE                   # covers both libs by default
os.environ["TRANSFORMERS_CACHE"] = f"{HF_CACHE}/transformers"
os.environ["HF_DATASETS_CACHE"] = f"{HF_CACHE}/datasets"

In [3]:
from transformers import LlavaForConditionalGeneration, LlavaProcessor
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training




In [4]:
MODEL_NAME = "llava-hf/llava-1.5-7b-hf"

model = LlavaForConditionalGeneration.from_pretrained(
    MODEL_NAME,
    load_in_4bit=True,
    trust_remote_code=True,
    device_map="auto",
    cache_dir=HF_CACHE,          # ← caches to /workspace
)
model = prepare_model_for_kbit_training(model)

# right after you load your processor:
processor = LlavaProcessor.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
    cache_dir=HF_CACHE,
)

# ──────────────────────────────────────
# 1) Tell the processor how to split each image into tokens
# ──────────────────────────────────────
processor.patch_size = model.config.vision_config.patch_size
# If your vision backbone adds a CLS token, this should be 1, otherwise 0.
# Most LLaVA models use 1:
processor.num_additional_image_tokens = 1  
processor.vision_feature_select_strategy = model.config.vision_feature_select_strategy


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [5]:
from peft import TaskType
lora_cfg = LoraConfig(
    r=8, lora_alpha=16, lora_dropout=0.05,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"],
    bias="none", task_type=TaskType.CAUSAL_LM)
model = get_peft_model(model, lora_cfg)
model.print_trainable_parameters()

trainable params: 21,168,128 || all params: 7,084,595,200 || trainable%: 0.2988


In [None]:
from datasets import load_dataset, DownloadConfig
def preprocess(example):
    question = example["question"]
    answer   = example["answers"][0]["answer"]

    # single string: <image> token + question + assistant prompt + answer
    prompt = f"USER: <image>\nQUESTION: {question}\nASSISTANT:"
    full   = prompt + " " + answer

    enc = processor(
        text=full,
        images=example["image"],
        padding="max_length",
        truncation=False,
        max_length=512,                # or bump to model.config.max_position_embeddings
        return_tensors="pt",
    )

    input_ids      = enc["input_ids"].squeeze(0)
    attention_mask = enc["attention_mask"].squeeze(0)
    pixel_values   = enc["pixel_values"].squeeze(0)

    # mask out the prompt portion so only the answer contributes to loss
    prompt_tokens = processor.tokenizer(prompt, add_special_tokens=False)["input_ids"]
    prompt_len    = len(prompt_tokens)

    labels = input_ids.clone()
    labels[:prompt_len] = -100

    return {
        "input_ids":      input_ids,
        "attention_mask": attention_mask,
        "pixel_values":   pixel_values,
        "labels":         labels,
    }

# ────────────────────────────────────────────────────────────────
# 2) Re-tokenize (force re-processing; drop the raw "image" column)
# ────────────────────────────────────────────────────────────────
download_cfg = DownloadConfig(resume_download=True, max_retries=5)

tokenised_ds = (
    load_dataset(
        "HuggingFaceM4/VQAv2",
        split="train",
        cache_dir=f"{HF_CACHE}/datasets",
        trust_remote_code=True,
        download_config=download_cfg,
        streaming=False,
    )
    .map(
        preprocess,
        batched=False,
        num_proc=4,
        remove_columns=[
            "question_id",
            "question",
            "answers",
            "multiple_choice_answer",
            "image_id",
            "question_type",
            "answer_type",
            "image",            # drop the raw PIL image
        ],
        load_from_cache_file=False,  # force re-map with new preprocess()
        desc="Tokenising VQAv2",
    )
)

print("Columns after preprocessing:", tokenised_ds.column_names)

Repo card metadata block was not found. Setting CardData to empty.


Tokenising VQAv2 (num_proc=4):   0%|          | 0/443757 [00:00<?, ? examples/s]

In [None]:
from transformers import TrainingArguments, Trainer, default_data_collator

# 1) make sure pad token is set
processor.tokenizer.pad_token = processor.tokenizer.eos_token
model.config.pad_token_id      = processor.tokenizer.eos_token_id

# 2) TrainingArguments for 5-epoch fine-tune on one A40 (≈84% free)
training_args = TrainingArguments(
    output_dir="./llava_vqav2_lora",
    num_train_epochs=5,
    per_device_train_batch_size=1,    # fit in memory
    gradient_accumulation_steps=8,    # effective batch size = 8
    learning_rate=2e-4,
    fp16=True,                        # half-precision
    gradient_checkpointing=True,      # save memory
    logging_steps=50,
    save_steps=500,
    save_total_limit=3,
    remove_unused_columns=False,      # keep our “labels” field
)

# 3) build Trainer (no `label_names` arg)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenised_ds,
    data_collator=default_data_collator,
)

# 4) launch training
trainer.train()

# 5) save LoRA adapters + tokenizer
model.save_pretrained(training_args.output_dir)
processor.save_pretrained(training_args.output_dir)
