In [1]:
import torch

# Report whether PyTorch can see a CUDA device in this kernel.
print("CUDA is available!" if torch.cuda.is_available() else "CUDA is not available.")

CUDA is available!


In [2]:
import sys
!{sys.executable} -m pip install accelerate



In [3]:
!{sys.executable} -m pip install accelerate git+https://github.com/huggingface/trl.git bitsandbytes peft qwen-vl-utils trackio

Collecting git+https://github.com/huggingface/trl.git
  Cloning https://github.com/huggingface/trl.git to /JOBs/tmpdir/pbs.14506003.spcc-adm1/pip-req-build-amvij7rw
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/trl.git /JOBs/tmpdir/pbs.14506003.spcc-adm1/pip-req-build-amvij7rw
  Resolved https://github.com/huggingface/trl.git to commit 07b4a84e0a3c8f37a2508fe177615af019782946
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone


In [4]:
!{sys.executable} -m pip install peft



In [5]:
import os
from typing import List, Dict, Any
from transformers import TrainingArguments


import torch
from torch.utils.data import Dataset
from PIL import Image
import pandas as pd

from transformers import (
    AutoProcessor,
    AutoModelForVision2Seq,
    TrainingArguments,
    Trainer,
)
from peft import LoraConfig, TaskType, get_peft_model
from first_prompt_template import *
from utils import *
# --- Run configuration -------------------------------------------------
MODEL_NAME = "Qwen/Qwen3-VL-2B-Instruct"  # Hugging Face model id
TRAIN_CSV = "custom_dataset/custom_dataset/new_train_labels.csv"  # CSV with 'file', 'question', 'label'
IMAGE_TRAIN_ROOT = "custom_dataset/custom_dataset/train"  # root dir that relative image paths are joined to
MAX_LENGTH = 512  # NOTE(review): not referenced by the visible cells (SFTConfig is given max_length=None)

OUTPUT_DIR = "./qwen3_vl_lora_clevr_1_12_V4"
# bfloat16 needs hardware support, not merely a CUDA device, so check for it
# explicitly; when it is unavailable the model-loading cell falls back to float16.
USE_BF16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()


  from .autonotebook import tqdm as notebook_tqdm


In [11]:
# bfloat16 needs hardware support, not merely a CUDA device, so check for it
# explicitly instead of assuming every GPU can run bf16.
USE_BF16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()


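# ----------------------------------------------------------
# Dataset that yields samples in the {"images": [...], "messages": [...]} format
# and builds each conversation with the star-imported prompt_*_expl helpers
# (presumably from first_prompt_template.py; no prompt_template module is imported)
# ----------------------------------------------------------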
class CLEVRVLTrainDataset(Dataset):
    """
    Map-style dataset that turns one CSV row into a chat-formatted
    vision-language training sample: ``{"images": [image], "messages": [...]}``.

    CSV required columns:
      - file: image path (absolute, or relative to ``image_root``)
      - question: CLEVR-X question
      - label: '<explanation> -> <answer>'
      - qtype: (optional) 'binary' | 'counting' | 'attribute'

    Args:
        csv_path: Path of the CSV file, read once with ``pd.read_csv``.
        image_root: Directory that relative ``file`` entries are joined to.
        num_shots: Number of few-shot demonstrations requested from the
            prompt builders. The default of 6 preserves the value that was
            previously hard-coded (an older inline note claiming 0 shots did
            not match the code); pass 0 to train without demonstrations.
    """

    def __init__(self, csv_path: str, image_root: str, num_shots: int = 6):
        super().__init__()
        self.df = pd.read_csv(csv_path)
        self.image_root = image_root
        self.num_shots = num_shots

    def __len__(self) -> int:
        """Return the number of rows in the CSV."""
        return len(self.df)

    def _get_qtype(self, row) -> str:
        """
        Return the lower-cased question type for ``row``.

        Uses the ``qtype`` column when it exists and holds a value; a missing
        or blank cell falls back to ``classify_clevr_question`` so that NaN is
        never turned into the bogus type ``"nan"``.
        """
        if "qtype" in self.df.columns and pd.notna(row["qtype"]):
            qtype = str(row["qtype"]).strip().lower()
            if qtype:
                return qtype
        return classify_clevr_question(str(row["question"]))

    def __getitem__(self, idx: int) -> Dict[str, Any]:
        """Build the ``{"images", "messages"}`` sample for row ``idx``."""
        row = self.df.iloc[idx]

        # 1) Load image. The context manager closes the underlying file as
        #    soon as convert() has produced an in-memory RGB copy.
        file_path = str(row["file"])
        if not os.path.isabs(file_path):
            file_path = os.path.join(self.image_root, file_path)
        with Image.open(file_path) as raw_image:
            image = raw_image.convert("RGB")

        question = str(row["question"])
        label = str(row["label"])  # "<explanation> -> <answer>"
        qtype = self._get_qtype(row)

        # 2) Build the conversation with the prompt_*_expl helper that matches
        #    the question type; anything that is not binary/counting is
        #    treated as an attribute question.
        prompt_builders = {
            "binary": prompt_binary_expl,
            "counting": prompt_counting_expl,
        }
        build_prompt = prompt_builders.get(qtype, prompt_attribute_expl)
        # Copy the returned list so that appending the answer below never
        # mutates a list the prompt builder might reuse between calls.
        messages = list(
            build_prompt(
                image=image,
                question=question,
                num_shots=self.num_shots,
                # Per the original author's note: the main image is still
                # included; with_image only affects the few-shot examples.
                with_image=False,
            )
        )

        # 3) Append assistant answer: '<explanation> -> <answer>'
        messages.append({
            "role": "assistant",
            "content": [
                {"type": "text", "text": label}
            ],
        })

        # 4) Return the structure the trainer consumes for VLM samples:
        #    sample = {"images": [image], "messages": [...]}
        return {
            "images": [image],
            "messages": messages,
        }

In [12]:
import gc
import time

def clear_memory(names=("inputs", "model", "processor", "trainer", "bnb_config"), pause=2.0):
    """
    Drop large notebook-level objects and release cached GPU memory.

    Args:
        names: Global variable names to delete if they exist. The default
            matches the names this notebook uses for its heavy objects.
        pause: Seconds to sleep between the clean-up stages. The default of
            2.0 keeps the original pacing; pass 0 to skip the waits.

    CUDA calls are only made when a CUDA device is available, so the function
    is also safe to run on a CPU-only machine (``torch.cuda.synchronize()``
    raises there).
    """
    # Remove the references from this module's global namespace so the
    # garbage collector is able to reclaim the objects.
    namespace = globals()
    for name in names:
        namespace.pop(name, None)
    time.sleep(pause)

    # Garbage collection and clearing CUDA memory
    gc.collect()
    time.sleep(pause)
    cuda_ready = torch.cuda.is_available()
    if cuda_ready:
        torch.cuda.empty_cache()
        torch.cuda.synchronize()
    time.sleep(pause)
    gc.collect()
    time.sleep(pause)

    if cuda_ready:
        print(f"GPU allocated memory: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
        print(f"GPU reserved memory: {torch.cuda.memory_reserved() / 1024**3:.2f} GB")
    else:
        print("CUDA is not available; no GPU memory to report.")

clear_memory()

GPU allocated memory: 0.00 GB
GPU reserved memory: 0.00 GB


In [13]:
from transformers import AutoProcessor, AutoModelForVision2Seq
from peft import LoraConfig
from trl import SFTTrainer, SFTConfig

In [None]:
# 1) Processor and base model
processor = AutoProcessor.from_pretrained(MODEL_NAME)
model = AutoModelForVision2Seq.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    torch_dtype=torch.bfloat16 if USE_BF16 else torch.float16,
)

# Pad token setup: fall back to EOS when the tokenizer defines no pad token,
# and keep the model config in sync with the tokenizer.
if processor.tokenizer.pad_token_id is None:
    processor.tokenizer.pad_token_id = processor.tokenizer.eos_token_id
model.config.pad_token_id = processor.tokenizer.pad_token_id
model.config.use_cache = False  # generation cache is not needed while training

# 2) LoRA config (no quantization)
lora_config = LoraConfig(
    r=32,
    lora_alpha=64,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
)

# 3) TRL SFTConfig (like fine_tuning_vlm_trl.ipynb style)
training_args = SFTConfig(
    output_dir=OUTPUT_DIR,  # Directory to save the model
    num_train_epochs=5,  # Number of training epochs
    per_device_train_batch_size=4,  # Batch size for training
    per_device_eval_batch_size=4,  # Batch size for evaluation
    gradient_accumulation_steps=8,  # Steps to accumulate gradients
    gradient_checkpointing_kwargs={"use_reentrant": False},  # Options for gradient checkpointing
    # No truncation. NOTE(review): presumably chosen so image placeholder
    # tokens are never cut off - confirm against the TRL VLM guidance.
    max_length=None,
    # Optimizer and scheduler settings
    optim="adamw_torch_fused",  # Optimizer type
    learning_rate=2e-4,  # Learning rate for training
    # Logging and evaluation
    logging_steps=10,  # Steps interval for logging
    eval_strategy="epoch",  # Strategy for evaluation
    save_strategy="epoch",  # Strategy for saving the model
    # Mixed precision and gradient settings
    bf16=USE_BF16,  # Follow the same flag that selected the model dtype above
    max_grad_norm=0.3,  # Maximum norm for gradient clipping
    warmup_ratio=0.03,  # Ratio of total steps for warmup
    # Hub and reporting
    push_to_hub=False,  # Whether to push model to Hugging Face Hub
    report_to="trackio",  # Reporting tool for tracking metrics
)

# 4) Attach the LoRA adapters to the base model.
model = get_peft_model(model, lora_config)




In [15]:
import os

from huggingface_hub import login

# SECURITY: never hard-code an access token in a notebook. The token that was
# previously pasted here has been exposed and must be revoked in the
# Hugging Face account settings.
# Read the token from the HF_TOKEN environment variable instead; when it is
# not set, fall back to the interactive login prompt.
hf_token = os.environ.get("HF_TOKEN")
if hf_token:
    login(token=hf_token)
else:
    login()

In [16]:
# 3) Dataset(s)
# Full labelled dataset built from the training CSV and image directory.
dataset = CLEVRVLTrainDataset(csv_path=TRAIN_CSV, image_root=IMAGE_TRAIN_ROOT)
# Placeholder; may be replaced by another CLEVRVLTrainDataset used for validation.
eval_dataset = None



In [17]:
from torch.utils.data import random_split

# Fraction of the samples held out for evaluation (10%); adjust as needed.
eval_ratio = 0.1
n_total = len(dataset)
# Always keep at least one evaluation sample.
n_eval = max(1, int(n_total * eval_ratio))
n_train = n_total - n_eval

# A generator with a fixed seed makes the split reproducible.
split_rng = torch.Generator().manual_seed(42)
train_dataset, eval_dataset = random_split(dataset, [n_train, n_eval], generator=split_rng)

print(f"Total: {n_total}, Train: {n_train}, Eval: {n_eval}")

Total: 5000, Train: 4500, Eval: 500


In [18]:
train_dataset[0]

{'images': [<PIL.Image.Image image mode=RGB size=480x320>],
 'messages': [{'role': 'system',
   'content': [{'type': 'text',
     'text': "\nYou are a visual reasoning assistant for synthetic 3D scenes. Think step-by-step but return only the final answer and a short explanation.\nEach image contains objects with the 4 attributes (shape, color, size, material)\nGiven an IMAGE and a QUESTION, your task is to answer 'yes' or 'no' strictly based on the image.\nYour final answer must always follow this format:\n<explanation> -> <answer>\nRules:\n- <answer> is exactly 'yes' or 'no' in lowercase.\n- <explanation> is concise and directly supports the final answer.\n"}]},
  {'role': 'user',
   'content': [{'type': 'text',
     'text': 'Please respond to the questions based on the given instructions and follow the format from demonstrations below.\n'}]},
  {'role': 'user',
   'content': [{'type': 'text',
     'text': 'QUESTION: What number of things are matte things that are in front of the ball

In [19]:
def print_trainable_params(model):
    """
    Report how many of the model's parameters are trainable.

    Prints the total parameter count, the count of parameters whose
    ``requires_grad`` flag is set, and the trainable share as a percentage
    (0.0 when the model has no parameters).
    """
    # (element count, trainable flag) for every parameter tensor.
    sizes = [(param.numel(), param.requires_grad) for param in model.parameters()]

    n_all = sum(count for count, _ in sizes)
    n_trainable = sum(count for count, trainable in sizes if trainable)

    if n_all > 0:
        share = 100.0 * n_trainable / n_all
    else:
        share = 0.0

    print(f"Total parameters:     {n_all:,}")
    print(f"Trainable parameters: {n_trainable:,}")
    print(f"Trainable ratio:      {share:.4f}%")


In [20]:
# `model` has already been wrapped with the LoRA adapters by
# get_peft_model(model, lora_config), so it is handed to the trainer as-is.
# Passing `peft_config` as well would ask TRL to configure adapters a second
# time on an existing PeftModel, which is redundant at best and handled
# differently across TRL versions.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    processing_class=processor,
)

# Sanity check: only the LoRA weights should be reported as trainable.
print_trainable_params(trainer.model)


The model is already on multiple devices. Skipping the move to device specified in `args`.


Total parameters:     2,162,397,184
Trainable parameters: 34,865,152
Trainable ratio:      1.6123%


In [None]:
trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 151645, 'bos_token_id': None}.


* Trackio project initialized: huggingface
* Trackio metrics will be synced to Hugging Face Dataset: aubamedang14/trackio-dataset
* Found existing space: https://huggingface.co/spaces/aubamedang14/trackio
* View dashboard by going to: https://aubamedang14-trackio.hf.space/


* Created new run: aubamedang14-1764647313


Epoch,Training Loss,Validation Loss,Entropy,Num Tokens,Mean Token Accuracy
1,1.6739,1.672187,1.690409,3194091.0,0.775424
2,1.6669,1.670065,1.687309,6388182.0,0.77568
3,1.669,1.669654,1.686558,9582273.0,0.77592
4,1.6649,1.669188,1.685156,12776364.0,0.776133


In [None]:
trainer.save_model(training_args.output_dir)

In [None]:
# Take the model back from the trainer and switch it to evaluation mode
# before running inference.
model = trainer.model
model.eval()

In [None]:
# Paths used by the inference cells that follow.
dataset_root = "custom_dataset/custom_dataset"
test_csv  = f"{dataset_root}/test_non_labels.csv"  # CSV handed to the loading / evaluation calls
output_csv = f"{dataset_root}/test_question_types.csv"  # NOTE(review): not referenced by the visible cells
train_path = f"{dataset_root}/train_labels.csv"  # NOTE(review): not referenced by the visible cells; training reads TRAIN_CSV

In [None]:
dataset = load_custom_clevr(dataset_root, test_csv, False)

In [None]:
from run import *

In [None]:
# Run the fine-tuned model over the test CSV. num_k = 6 presumably selects the
# number of few-shot examples (same value the training prompts use) - confirm
# against run_clevrx_task. The bare `results` displays the returned value.
results = run_clevrx_task(model = model, processor = processor, dataset_root= dataset_root ,csv_path= test_csv, num_k = 6)
results

Evaluating CLEVR-X: 201it [07:18,  2.41s/it]

Processed 200/500


Evaluating CLEVR-X: 401it [14:28,  2.24s/it]

Processed 400/500


Evaluating CLEVR-X: 500it [18:03,  2.17s/it]


CLEVR-X: No ground-truth answers available for evaluation.





In [None]:
# Imported here explicitly: this cell must not rely on `re` leaking into the
# notebook namespace through one of the star imports.
import re

# Matches the '->' separator together with any surrounding whitespace.
_ARROW_RE = re.compile(r"\s*->\s*")

def split_explanation_answers(text: str):
    """
    Split model output of the form '<explanation> -> <answer>'.

    Args:
        text: Raw model output; ``None`` or an empty string is accepted.

    Returns:
        A tuple ``(explanation, answer)`` of stripped strings. The split
        happens at the first '->'. If '->' is missing, the whole text is
        returned as the explanation and the answer is empty. Empty input
        yields ``("", "")``.
    """
    text = (text or "").strip()
    if not text:
        return "", ""

    parts = _ARROW_RE.split(text, maxsplit=1)
    if len(parts) == 1:
        # no arrow found
        return parts[0].strip(), ""

    explanation, answer = parts
    return explanation.strip(), answer.strip()

In [None]:
import pandas as pd

# Single source of truth for the output file, so the file that is written and
# the file named in the confirmation message can never disagree.
RESULTS_CSV = "results_01_12v4.csv"

# `results` is the sequence of dicts produced by the evaluation run; the
# "pred_full" entry of each one is expected to hold the raw
# '<explanation> -> <answer>' model output (missing entries become empty).
rows = []
for i, r in enumerate(results):
    explanation, answer = split_explanation_answers(r.get("pred_full", ""))
    rows.append({
        "id": i,  # new incremental id: 0,1,2,...
        "answer": answer,
        "explanation": str(explanation).strip(),
    })

# Explicit columns keep the CSV header intact even when `results` is empty.
df = pd.DataFrame(rows, columns=["id", "answer", "explanation"])
df.to_csv(RESULTS_CSV, index=False)

print("Saved", len(df), "rows to", RESULTS_CSV)