## Import libraries

In [None]:
import os
import json
import transformers
import datasets
import torch
import torchvision

from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
from PIL import Image

2025-04-09 11:32:23.255570: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-09 11:32:23.272034: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744187543.291575  219109 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744187543.297576  219109 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-04-09 11:32:23.319010: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

## Fine-tuning Florence-2 for VQA

In [None]:
from transformers import AutoModelForCausalLM, AutoProcessor

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Base Florence-2 checkpoint and its matching processor, both pinned to the
# same Hub revision.
# NOTE(review): trust_remote_code=True executes the modelling code shipped with
# the checkpoint — keep the revision pinned so that code cannot change silently.
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Florence-2-base-ft",
    revision="refs/pr/6",
    trust_remote_code=True,
).to(device)
processor = AutoProcessor.from_pretrained(
    "microsoft/Florence-2-base-ft",
    revision="refs/pr/6",
    trust_remote_code=True,
)

In [2]:
from datasets import load_dataset

# Read the VQA annotations with the generic JSON loader. With a single data
# file, everything lands in one "train" split (see the printed DatasetDict).
dataset = load_dataset(
    "json",
    data_files="finetuning_smolvlm/passengers_bus_vlm_dataset_modified.json",
)
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['image', 'question', 'answer'],
        num_rows: 309
    })
})


In [3]:
# Split the dataset into 80% train, 10% validation and 10% test.
# `dataset` is the DatasetDict loaded in the previous cell, so the JSON file
# does not need to be read a second time here.

# First cut: 80% train, 20% held out.
split_ds = dataset["train"].train_test_split(test_size=0.2, seed=42)
# Second cut: the held-out 20% is halved into validation and test.
validation_test_split = split_ds["test"].train_test_split(test_size=0.5, seed=42)

train = split_ds["train"]
val = validation_test_split["train"]
test = validation_test_split["test"]

# Every example must end up in exactly one of the three splits.
assert len(train) + len(val) + len(test) == len(dataset["train"])
print(f"Train size: {len(train)}, Validation size: {len(val)}, Test size: {len(test)}")

Train size: 247, Validation size: 31, Test size: 31


In [None]:
class VQADataset(Dataset):
    """Map-style dataset yielding ``(question, answer, image)`` triples for VQA.

    Each record in ``data`` must provide the keys ``'question'`` (a string, since
    it is concatenated with the task token), ``'answer'`` and ``'image'`` (anything
    ``PIL.Image.open`` accepts, e.g. a file path).
    """

    def __init__(self, data):
        """Store ``data``; it only needs to support ``len()`` and integer indexing."""
        self.data = data

    def __len__(self):
        """Return the number of records."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(question, answer, image)`` for the record at ``idx``.

        The question is prefixed with the ``<VQA>`` task token and the image is
        returned converted to RGB.
        """
        example = self.data[idx]
        question = "<VQA>" + example['question']
        answer = example['answer']
        # convert() produces a new, fully loaded image, so the source can be
        # closed immediately. The context manager guarantees the file handle is
        # released even if decoding fails or the format keeps the file open.
        with Image.open(example['image']) as source:
            image = source.convert("RGB")
        return question, answer, image

In [None]:
def collate_fn(batch):
    """Collate ``(question, answer, image)`` samples into processor inputs.

    Returns a pair ``(inputs, answers)``: the processor output for the batch's
    questions and images, moved to ``device``, and the answers as a tuple in
    batch order.
    """
    question_batch, answers, image_batch = zip(*batch)
    encoded = processor(
        text=list(question_batch),
        images=list(image_batch),
        return_tensors="pt",
        padding=True,
    )
    inputs = encoded.to(device)
    return inputs, answers

# Loader settings shared by every split. Samples are collated in the main
# process (num_workers = 0), one example per batch.
batch_size = 1
num_workers = 0

train_dataset = VQADataset(train)
val_dataset = VQADataset(val)

# Only the training loader shuffles; validation keeps the dataset order.
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers,
    collate_fn=collate_fn,
)
val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    num_workers=num_workers,
    collate_fn=collate_fn,
)

### Test finetuned model

In [None]:
from transformers import AutoModelForCausalLM, AutoProcessor

# Checkpoint directory to evaluate.
model_path = "./model_checkpoints/epoch_7"
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Rebind `model` and `processor` to the checkpoint stored under `model_path`.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    trust_remote_code=True,
).to(device)
processor = AutoProcessor.from_pretrained(
    model_path,
    trust_remote_code=True,
)

In [22]:
def run_example(image, task_prompt, max_new_tokens=1024, num_beams=3):
    """Generate and parse the model's answer for one image/prompt pair.

    Args:
        image: Image to query; its ``width`` and ``height`` are forwarded to
            the post-processor as ``image_size``.
        task_prompt: Full text prompt fed to the processor. It is also passed
            as ``task`` to the post-processor.
        max_new_tokens: Upper bound on generated tokens (default 1024).
        num_beams: Beam-search width (default 3).

    Returns:
        The value returned by ``processor.post_process_generation``.
    """
    # Follow the model's own device instead of hard-coding CUDA, so the same
    # function works on CPU-only machines and the inputs always sit next to
    # the weights.
    target_device = model.device
    inputs = processor(text=task_prompt, images=image, return_tensors="pt").to(target_device)
    generated_ids = model.generate(
        input_ids=inputs["input_ids"],
        pixel_values=inputs["pixel_values"],
        max_new_tokens=max_new_tokens,
        num_beams=num_beams,
    )
    # Special tokens are kept in the decoded text that is handed to the
    # post-processor.
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
    return processor.post_process_generation(
        generated_text,
        task=task_prompt,
        image_size=(image.width, image.height),
    )

In [23]:
# Held-out split wrapped like the train/validation data; order is preserved.
test_dataset = VQADataset(test)
test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    num_workers=num_workers,
    collate_fn=collate_fn,
)

In [25]:
# Preview the first test sample: (prompt with the <VQA> prefix, reference answer, RGB image).
test_dataset[0]

('<VQA>How many people are sitting?',
 'The number of people sitting in the bus is 23.',
 <PIL.Image.Image image mode=RGB size=2560x1440>)

In [24]:
# Fetch the first test sample once: every `test_dataset[0]` lookup reopens and
# converts the image, so unpack all fields from a single access.
task_prompt, expected_answer, image = test_dataset[0]

run_example(image, task_prompt)

{'<VQA>How many people are sitting?': 'The number of people sitting in the bus is 22.'}