## Testing out performance pre fine-tuning

First we load a sample image from the data and plot it.

In [None]:
from PIL import Image
from transformers.image_utils import load_image
import matplotlib.pyplot as plt
import numpy as np

image1 = load_image("visual-spatial-reasoning/images/000000262118.jpg")
plt.imshow(image1)

Now we can load the SmolVLM-Instruct model, which is the Base model finetuned for handling structured prompts/questions.

In [2]:
from transformers import AutoProcessor, AutoModelForVision2Seq
import torch
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM-Instruct")
model = AutoModelForVision2Seq.from_pretrained("HuggingFaceTB/SmolVLM-Instruct",
                                                torch_dtype=torch.bfloat16,
                                                _attn_implementation="flash_attention_2" if DEVICE == "cuda" else "eager").to(DEVICE)

2025-02-13 04:56:02.205623: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1739422562.222897    2174 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1739422562.228201    2174 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Some kwargs in processor config are unused and will not have any effect: image_seq_len. 
You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.


In [3]:
# Load images
image1 = load_image("visual-spatial-reasoning/images/000000262118.jpg")

# Create input messages
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "The bed is behind the bench, is that true or false?"}
        ]
    },
]

# Prepare inputs
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor(text=prompt, images=[image1], return_tensors="pt")
inputs = inputs.to(DEVICE)

# Generate outputs
generated_ids = model.generate(**inputs, max_new_tokens=500)
generated_texts = processor.batch_decode(
    generated_ids,
    skip_special_tokens=True,
)

print(generated_texts[0])

User:<image>The bed is right of the bench, is that true or false?
Assistant: True.


## Prelim Run on Test Set

In [None]:
from transformers import AutoProcessor, AutoModelForVision2Seq
import torch
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
from PIL import Image
from transformers.image_utils import load_image

processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM-Instruct")
model = AutoModelForVision2Seq.from_pretrained("HuggingFaceTB/SmolVLM-Instruct",
                                                torch_dtype=torch.bfloat16,
                                                _attn_implementation="flash_attention_2" if DEVICE == "cuda" else "eager").to(DEVICE)

In [34]:
def run_inference(image, query):
  # Load images
  image1 = load_image(image)

  # Create input messages
  messages = [
      {
          "role": "user",
          "content": [
              {"type": "image"},
              {"type": "text", "text": f"{query.rstrip('.')}, true or false?"}
          ]
      },
  ]

  # Prepare inputs
  prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
  inputs = processor(text=prompt, images=[image1], return_tensors="pt")
  inputs = inputs.to(DEVICE)

  # Generate outputs
  generated_ids = model.generate(**inputs, max_new_tokens=500)
  generated_texts = processor.batch_decode(
      generated_ids,
      skip_special_tokens=True,
  )

  return generated_texts[0]

In [None]:
import re
import os
from tqdm import tqdm
from datasets import load_dataset

data_files = {"train": "train.jsonl", "dev": "dev.jsonl", "test": "test.jsonl"}
dataset = load_dataset("cambridgeltl/vsr_random", data_files=data_files)

# Load dataset from JSONL file
test_data = dataset['test']

# Regular expression pattern to extract assistant response
answer_pattern = re.compile(r"Assistant:\s*(\w+)")

correct = 0
preds = []
total = len(test_data)

# Function to process each entry
def evaluate(entry):
    global correct
    global preds
    image_path = os.path.join("images", entry["image"])
    output = run_inference(image_path, entry["caption"])
    # print(output)

    match = answer_pattern.search(output)
    answer = match.group(1) if match else None

    if answer == "True":
      preds.append(1)
    else:
      preds.append(0)

    # Validate answer
    if answer in {"True", "False"} and (answer == "True") == (entry["label"] == 1):
        correct += 1

# Process dataset with tqdm for progress tracking
for entry in tqdm(test_data, desc="Processing images"):
    evaluate(entry)

# save preds
with open("preds.txt", "w") as f:
    for i in range(len(preds)):
        f.write(str(preds[i])+"\n")

# Print results
print(f"Total images: {total}")
print(f"Correct: {correct} ({correct / total:.2%} accuracy)")

## Training

In [4]:
from datasets import load_dataset

data_files = {"train": "train.jsonl", "dev": "dev.jsonl", "test": "test.jsonl"}
dataset = load_dataset("cambridgeltl/vsr_random", data_files=data_files)

train_ds = dataset["train"]
test_ds = dataset["test"]
val_ds = dataset["dev"]

In [5]:
import torch
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
from transformers import AutoProcessor, BitsAndBytesConfig, AutoModelForVision2Seq

USE_QLORA = True
model_id = "HuggingFaceTB/SmolVLM-Instruct"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

processor = AutoProcessor.from_pretrained(model_id)

lora_config = LoraConfig(
        r=8,
        lora_alpha=8,
        lora_dropout=0.1,
        target_modules=['down_proj','o_proj','k_proj','q_proj','gate_proj','up_proj','v_proj'],
        use_dora=False if USE_QLORA else True,
        init_lora_weights="gaussian"
    )

lora_config.inference_mode = False

bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16
        )

model = AutoModelForVision2Seq.from_pretrained(
        model_id,
        quantization_config=bnb_config if USE_QLORA else None,
        _attn_implementation="flash_attention_2",
        device_map='auto'
    )

model.add_adapter(lora_config)
model.enable_adapters()
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)
print(model.get_nb_trainable_parameters())

Some kwargs in processor config are unused and will not have any effect: image_seq_len. 


(10536960, 2256809840)


In [31]:
from transformers.image_utils import load_image
import os

image_token_id = processor.tokenizer.additional_special_tokens_ids[
            processor.tokenizer.additional_special_tokens.index("<image>")]

def collate_fn(examples):
  # print("In collate_fn")
    
  texts = []
  images = []

  for example in examples:

      image_path = os.path.join("visual-spatial-reasoning/images", example["image"])

      if not os.path.exists(image_path):
        raise FileNotFoundError(f"Image not found: {image_path}")

      image = load_image(image_path)

      if image.mode != 'RGB':
        image = image.convert('RGB')

      caption = example["caption"]
      label = "True." if example["label"] == 1 else "False."

      messages = [
          {
              "role": "user",
              "content": [
                  {"type": "image"},
                  {"type": "text", "text": f"{caption.rstrip('.')}, true or false?"}
              ]
          },
          {
              "role": "assistant",
              "content": [
                  {"type": "text", "text": label}
              ]
          }
      ]
      text = processor.apply_chat_template(messages, add_generation_prompt=False)

      texts.append(text.strip())
      images.append([image])

  batch = processor(text=texts, images=images, return_tensors="pt", padding=True)
  labels = batch["input_ids"].clone()
  labels[labels == processor.tokenizer.pad_token_id] = -100
  labels[labels == image_token_id] = -100
  batch["labels"] = labels

  return batch

In [32]:
from transformers import TrainingArguments, Trainer

model_name = model_id.split("/")[-1]

training_args = TrainingArguments(
    num_train_epochs=1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    warmup_steps=50,
    learning_rate=1e-4,
    weight_decay=0.01,
    logging_steps=25,
    save_strategy="steps",
    save_steps=250,
    save_total_limit=1,
    optim="paged_adamw_8bit", # for 8-bit, keep this, else adamw_hf
    bf16=True, # underlying precision for 8bit
    output_dir=f"./{model_name}-vsr",
    hub_model_id=f"{model_name}-vsr",
    report_to="tensorboard",
    remove_unused_columns=False,
    gradient_checkpointing=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=collate_fn,
    train_dataset=train_ds,
)

In [43]:
model.load()

AttributeError: 'Idefics3ForConditionalGeneration' object has no attribute 'load'

In [33]:
trainer.train()

Step,Training Loss
25,0.7556
50,0.2721
75,0.0994
100,0.0689
125,0.0648
150,0.059
175,0.0554
200,0.0541
225,0.0496
250,0.0464


TrainOutput(global_step=480, training_loss=0.10079757341494162, metrics={'train_runtime': 6832.6476, 'train_samples_per_second': 1.124, 'train_steps_per_second': 0.07, 'total_flos': 1.3671337188088934e+17, 'train_loss': 0.10079757341494162, 'epoch': 1.0})

## Re-Test and push to hub

In [42]:
import re
import os
from tqdm import tqdm
from datasets import load_dataset

data_files = {"train": "train.jsonl", "dev": "dev.jsonl", "test": "test.jsonl"}
dataset = load_dataset("cambridgeltl/vsr_random", data_files=data_files)

# Load dataset from JSONL file
test_data = dataset['test']

# Regular expression pattern to extract assistant response
answer_pattern = re.compile(r"Assistant:\s*(\w+)")

correct = 0
preds = []
total = len(test_data)

# Function to process each entry
def evaluate(entry):
    global correct
    global preds
    image_path = os.path.join("visual-spatial-reasoning/images", entry["image"])
    output = run_inference(image_path, entry["caption"])
    # print(output)

    match = answer_pattern.search(output)
    answer = match.group(1) if match else None

    if answer == "True":
      preds.append(1)
    else:
      preds.append(0)

    # Validate answer
    if answer in {"True", "False"} and (answer == "True") == (entry["label"] == 1):
        correct += 1

# Process dataset with tqdm for progress tracking
for entry in tqdm(test_data, desc="Processing images"):
    evaluate(entry)

# save preds
with open("preds.txt", "w") as f:
    for i in range(len(preds)):
        f.write(str(preds[i])+"\n")

# Print results
print(f"Total images: {total}")
print(f"Correct: {correct} ({correct / total:.2%} accuracy)")

Processing images:   0%|          | 0/2195 [00:00<?, ?it/s]


RuntimeError: FlashAttention only support fp16 and bf16 data type
Exception raised from mha_fwd at /home/runner/work/flash-attention/flash-attention/csrc/flash_attn/flash_api.cpp:373 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >) + 0xac (0x76a2a3f91d5c in /usr/lib/python3/dist-packages/torch/lib/libc10.so)
frame #1: c10::detail::torchCheckFail(char const*, char const*, unsigned int, char const*) + 0x7d (0x76a2a3f365a9 in /usr/lib/python3/dist-packages/torch/lib/libc10.so)
frame #2: flash::mha_fwd(at::Tensor&, at::Tensor const&, at::Tensor const&, std::optional<at::Tensor>&, std::optional<at::Tensor>&, float, float, bool, int, int, float, bool, std::optional<at::Generator>) + 0x100e (0x769fecda308e in /home/ubuntu/.local/lib/python3.10/site-packages/flash_attn_2_cuda.cpython-310-x86_64-linux-gnu.so)
frame #3: <unknown function> + 0x1bfffe (0x769fecdbfffe in /home/ubuntu/.local/lib/python3.10/site-packages/flash_attn_2_cuda.cpython-310-x86_64-linux-gnu.so)
frame #4: <unknown function> + 0x1bc629 (0x769fecdbc629 in /home/ubuntu/.local/lib/python3.10/site-packages/flash_attn_2_cuda.cpython-310-x86_64-linux-gnu.so)
frame #5: <unknown function> + 0x18b282 (0x5bc816c3c282 in /usr/bin/python3)
frame #6: _PyObject_MakeTpCall + 0x25b (0x5bc816c32b4b in /usr/bin/python3)
frame #7: _PyEval_EvalFrameDefault + 0x61f8 (0x5bc816c2c4c8 in /usr/bin/python3)
frame #8: _PyFunction_Vectorcall + 0x7c (0x5bc816c3caec in /usr/bin/python3)
frame #9: _PyEval_EvalFrameDefault + 0x2c89 (0x5bc816c28f59 in /usr/bin/python3)
frame #10: _PyFunction_Vectorcall + 0x7c (0x5bc816c3caec in /usr/bin/python3)
frame #11: _PyEval_EvalFrameDefault + 0x2c89 (0x5bc816c28f59 in /usr/bin/python3)
frame #12: _PyFunction_Vectorcall + 0x7c (0x5bc816c3caec in /usr/bin/python3)
frame #13: _PyEval_EvalFrameDefault + 0x2c89 (0x5bc816c28f59 in /usr/bin/python3)
frame #14: _PyFunction_Vectorcall + 0x7c (0x5bc816c3caec in /usr/bin/python3)
frame #15: _PyEval_EvalFrameDefault + 0x2c89 (0x5bc816c28f59 in /usr/bin/python3)
frame #16: _PyFunction_Vectorcall + 0x7c (0x5bc816c3caec in /usr/bin/python3)
frame #17: <unknown function> + 0x98f37f (0x76a29618f37f in /usr/lib/python3/dist-packages/torch/lib/libtorch_python.so)
frame #18: <unknown function> + 0xd0481f (0x76a29650481f in /usr/lib/python3/dist-packages/torch/lib/libtorch_python.so)
frame #19: <unknown function> + 0xcf8ca1 (0x76a2964f8ca1 in /usr/lib/python3/dist-packages/torch/lib/libtorch_python.so)
frame #20: <unknown function> + 0xcf9078 (0x76a2964f9078 in /usr/lib/python3/dist-packages/torch/lib/libtorch_python.so)
frame #21: <unknown function> + 0x48dd3f (0x76a295c8dd3f in /usr/lib/python3/dist-packages/torch/lib/libtorch_python.so)
frame #22: <unknown function> + 0x18b282 (0x5bc816c3c282 in /usr/bin/python3)
frame #23: _PyObject_MakeTpCall + 0x25b (0x5bc816c32b4b in /usr/bin/python3)
frame #24: <unknown function> + 0x199010 (0x5bc816c4a010 in /usr/bin/python3)
frame #25: _PyEval_EvalFrameDefault + 0x2c89 (0x5bc816c28f59 in /usr/bin/python3)
frame #26: <unknown function> + 0x198d2e (0x5bc816c49d2e in /usr/bin/python3)
frame #27: _PyEval_EvalFrameDefault + 0x2c89 (0x5bc816c28f59 in /usr/bin/python3)
frame #28: _PyFunction_Vectorcall + 0x7c (0x5bc816c3caec in /usr/bin/python3)
frame #29: _PyEval_EvalFrameDefault + 0x2c89 (0x5bc816c28f59 in /usr/bin/python3)
frame #30: _PyFunction_Vectorcall + 0x7c (0x5bc816c3caec in /usr/bin/python3)
frame #31: <unknown function> + 0xd03fd1 (0x76a296503fd1 in /usr/lib/python3/dist-packages/torch/lib/libtorch_python.so)
frame #32: <unknown function> + 0xd047b6 (0x76a2965047b6 in /usr/lib/python3/dist-packages/torch/lib/libtorch_python.so)
frame #33: <unknown function> + 0x573aefd (0x76a26c93aefd in /usr/lib/python3/dist-packages/torch/lib/libtorch_cpu.so)
frame #34: torch::jit::invokeOperatorFromPython(std::vector<std::shared_ptr<torch::jit::Operator>, std::allocator<std::shared_ptr<torch::jit::Operator> > > const&, pybind11::args const&, pybind11::kwargs const&, std::optional<c10::DispatchKey>) + 0x1d4 (0x76a296262c44 in /usr/lib/python3/dist-packages/torch/lib/libtorch_python.so)
frame #35: torch::jit::_get_operation_for_overload_or_packet(std::vector<std::shared_ptr<torch::jit::Operator>, std::allocator<std::shared_ptr<torch::jit::Operator> > > const&, c10::Symbol, pybind11::args const&, pybind11::kwargs const&, bool, std::optional<c10::DispatchKey>) + 0x269 (0x76a296263059 in /usr/lib/python3/dist-packages/torch/lib/libtorch_python.so)
frame #36: <unknown function> + 0x9395a5 (0x76a2961395a5 in /usr/lib/python3/dist-packages/torch/lib/libtorch_python.so)
frame #37: <unknown function> + 0x48dd3f (0x76a295c8dd3f in /usr/lib/python3/dist-packages/torch/lib/libtorch_python.so)
frame #38: <unknown function> + 0x18b282 (0x5bc816c3c282 in /usr/bin/python3)
frame #39: PyObject_Call + 0xbb (0x5bc816c4a81b in /usr/bin/python3)
frame #40: _PyEval_EvalFrameDefault + 0x6c1d (0x5bc816c2ceed in /usr/bin/python3)
frame #41: _PyFunction_Vectorcall + 0x7c (0x5bc816c3caec in /usr/bin/python3)
frame #42: _PyObject_FastCallDictTstate + 0x16d (0x5bc816c31dbd in /usr/bin/python3)
frame #43: _PyObject_Call_Prepend + 0xc1 (0x5bc816c46db1 in /usr/bin/python3)
frame #44: <unknown function> + 0x29e054 (0x5bc816d4f054 in /usr/bin/python3)
frame #45: _PyObject_MakeTpCall + 0x25b (0x5bc816c32b4b in /usr/bin/python3)
frame #46: _PyEval_EvalFrameDefault + 0x671a (0x5bc816c2c9ea in /usr/bin/python3)
frame #47: _PyFunction_Vectorcall + 0x7c (0x5bc816c3caec in /usr/bin/python3)
frame #48: THPFunction_apply(_object*, _object*) + 0xd0c (0x76a296050e7c in /usr/lib/python3/dist-packages/torch/lib/libtorch_python.so)
frame #49: <unknown function> + 0x18b2a8 (0x5bc816c3c2a8 in /usr/bin/python3)
frame #50: PyObject_Call + 0xbb (0x5bc816c4a81b in /usr/bin/python3)
frame #51: _PyEval_EvalFrameDefault + 0x6c1d (0x5bc816c2ceed in /usr/bin/python3)
frame #52: <unknown function> + 0x198be1 (0x5bc816c49be1 in /usr/bin/python3)
frame #53: _PyEval_EvalFrameDefault + 0x58aa (0x5bc816c2bb7a in /usr/bin/python3)
frame #54: _PyFunction_Vectorcall + 0x7c (0x5bc816c3caec in /usr/bin/python3)
frame #55: PyObject_Call + 0x122 (0x5bc816c4a882 in /usr/bin/python3)
frame #56: _PyEval_EvalFrameDefault + 0x2c89 (0x5bc816c28f59 in /usr/bin/python3)
frame #57: _PyFunction_Vectorcall + 0x7c (0x5bc816c3caec in /usr/bin/python3)
frame #58: _PyEval_EvalFrameDefault + 0x1a22 (0x5bc816c27cf2 in /usr/bin/python3)
frame #59: <unknown function> + 0x198be1 (0x5bc816c49be1 in /usr/bin/python3)
frame #60: PyObject_Call + 0x122 (0x5bc816c4a882 in /usr/bin/python3)
frame #61: _PyEval_EvalFrameDefault + 0x2c89 (0x5bc816c28f59 in /usr/bin/python3)
frame #62: <unknown function> + 0x198be1 (0x5bc816c49be1 in /usr/bin/python3)


In [None]:
trainer.push_to_hub()