In [None]:
# !pip install peft accelerate bitsandbytes
!pip install --upgrade datasets fsspec trl wandb transformers

In [None]:
import pandas as pd
import numpy as np
import torch
from datasets import load_dataset
import torch
from datasets import features
from PIL import Image

In [None]:
dataset_name = "openbmb/RLAIF-V-Dataset"
# model_name = "liuhaotian/llava-v1.6-mistral-7b"
# model_name = 'llava-hf/llava-v1.6-mistral-7b-hf'
# model_name = 'SurfaceData/llava-v1.6-mistral-7b-sglang'
# model_name = 'google/paligemma-3b-pt-224'

In [None]:
# Load the first 10% of the training split of `dataset_name`.
try:
    ds = load_dataset(dataset_name, split="train[:10%]")
except ValueError as e:
    # Report the failure, then re-raise: every later cell needs `ds`, so
    # carrying on would only resurface as a confusing NameError further down.
    print(f"Failed to load dataset: {e}")
    raise
else:
    print("Dataset loaded successfully!")

In [None]:
ds

Dataset({
    features: ['ds_name', 'image', 'question', 'chosen', 'rejected', 'origin_dataset', 'origin_split', 'idx', 'image_path'],
    num_rows: 8313
})

In [None]:
ds[1]

{'ds_name': 'RLAIF-V',
 'image': <PIL.JpegImagePlugin.JpegImageFile image mode=L size=1024x848>,
 'question': 'how many families?',
 'chosen': 'The image shows a Union Organization table setup with 18,000 families.',
 'rejected': 'The image does not provide any information about families.',
 'origin_dataset': 'TextVQA',
 'origin_split': '{"model": "OmniLMM-12B", "feedback_model": "OmniLMM-12B", "type": "question_answering"}',
 'idx': 'OmniLMM-12B_OmniLMM-12B_1',
 'image_path': 'TextVQA/train_images/8733d0a1351be922.jpg'}

In [None]:
from transformers import AutoProcessor
from transformers import AutoModelForVision2Seq

In [None]:
# from transformers import BitsAndBytesConfig
# from peft import get_peft_model, LoraConfig

# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_use_double_quant=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch.bfloat16,
# )

In [None]:
# Checkpoint to fine-tune; the processor and the model come from the same id.
model_name = "HuggingFaceTB/SmolVLM-256M-Instruct"

# Image splitting is switched off in the processor.
processor = AutoProcessor.from_pretrained(
    model_name,
    do_image_splitting=False,
)

# Weights are loaded in bfloat16 with automatic device placement.
model = AutoModelForVision2Seq.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

In [None]:
# lora_config = LoraConfig(
#     r=8,
#     lora_alpha=32,
#     lora_dropout=0.1,
#     bias="none",
#     target_modules="all-linear" # Depends on model
# )

# model = get_peft_model(model, lora_config)
# model.print_trainable_parameters()

In [None]:
def format(example):
    """Turn one raw preference record into the fields used for training.

    Reads ``question``, ``chosen``, ``rejected`` and ``image`` from ``example``
    and returns a dict holding the chat-templated ``prompt``, ``chosen`` and
    ``rejected`` strings, plus the image wrapped in a one-element ``images``
    list. The image is shrunk in place to the processor's longest-edge limit.
    """

    def render(role, content):
        # Render a single-turn conversation to text (no tokenization).
        return processor.apply_chat_template(
            [{"role": role, "content": content}], tokenize=False
        )

    def text_part(value):
        return {"type": "text", "text": value}

    # The user turn carries an image placeholder followed by the question;
    # each assistant turn carries one candidate answer.
    prompt_text = render("user", [{"type": "image"}, text_part(example["question"])])
    chosen_text = render("assistant", [text_part(example["chosen"])])
    rejected_text = render("assistant", [text_part(example["rejected"])])

    # Cap both sides at the processor's longest edge to prevent OOM errors;
    # thumbnail() resizes the image object itself.
    picture = example["image"]
    longest_edge = processor.image_processor.size["longest_edge"]
    picture.thumbnail((longest_edge, longest_edge))

    return {
        "images": [picture],
        "prompt": prompt_text,
        "chosen": chosen_text,
        "rejected": rejected_text,
    }

dataset = ds.map(format, remove_columns=ds.column_names)


In [None]:
print(dataset[1])

{'chosen': '<|im_start|>Assistant: The image shows a Union Organization table setup with 18,000 families.<end_of_utterance>\n', 'rejected': '<|im_start|>Assistant: The image does not provide any information about families.<end_of_utterance>\n', 'images': [<PIL.JpegImagePlugin.JpegImageFile image mode=L size=1024x848 at 0x7D7F7604A890>], 'prompt': '<|im_start|>User:<image>how many families?<end_of_utterance>\n'}


In [None]:
# Declare the `images` column as a sequence of decoded images so the dataset
# does not end up storing raw bytes. Work on a copy of the feature spec so the
# current dataset's metadata is not edited in place before the cast.
image_features = dataset.features.copy()
image_features["images"] = features.Sequence(features.Image(decode=True))
dataset = dataset.cast(image_features)

# Hold out 10% for evaluation. The seed is fixed so that re-running the
# notebook yields the same train/test partition.
splits = dataset.train_test_split(test_size=0.1, seed=42)

# Access the splits
train_dataset = splits["train"]
test_dataset = splits["test"]

print(f"Train dataset size: {len(train_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")

In [None]:
from trl import ORPOConfig, ORPOTrainer

In [None]:
!wandb login

In [None]:
# Training configuration for the ORPO run.
config = ORPOConfig(
    output_dir="smolvlm_finetuned",
    # Batching: 8 samples per device x 4 accumulation steps per optimizer update.
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,
    # Optimisation schedule: cosine decay with a 3% warmup, 5 epochs.
    learning_rate=1e-5,
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    num_train_epochs=5,
    # Precision and memory.
    bf16=True,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": True},
    # Sequence limits.
    max_prompt_length=512,
    max_length=1024,
    # Keep every dataset column (including `images`) when batches are built.
    remove_unused_columns=False,
    # Logging goes to Weights & Biases every 50 steps.
    logging_steps=50,
    report_to="wandb",
    # Evaluate every 50 steps and restore the checkpoint with the best loss.
    do_eval=True,
    eval_strategy="steps",
    eval_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model="loss",
)

In [None]:
train_dataset, test_dataset

(Dataset({
     features: ['chosen', 'rejected', 'images', 'prompt'],
     num_rows: 7481
 }),
 Dataset({
     features: ['chosen', 'rejected', 'images', 'prompt'],
     num_rows: 832
 }))

In [None]:
# Build the ORPO trainer from the model, the config and the train/eval splits.
# NOTE(review): only `processor.tokenizer` is handed over, not the full
# processor — confirm that the `images` column is actually consumed during
# training rather than ignored.
trainer = ORPOTrainer(
    model=model,
    args=config,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    processing_class=processor.tokenizer,
)

In [None]:
trainer.train()

In [None]:
# Directory that receives the fine-tuned artifacts; the quantization section
# below reloads the model and processor from this same path.
save_dir = '/content/ft_smolvlm'

# Write the model and the processor side by side into `save_dir`.
model.save_pretrained(save_dir)
processor.save_pretrained(save_dir)

['/content/ft_smolvlm/processor_config.json']

Quantization

In [None]:
!pip install quanto
!pip install git+https://github.com/huggingface/accelerate.git
!pip install git+https://github.com/huggingface/transformers.git
!pip install optimum-quanto

In [None]:
from transformers import AutoProcessor
from transformers import AutoModelForVision2Seq, QuantoConfig
import torch

In [None]:
# Device that the model inputs are moved to before generation.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Location of the fine-tuned checkpoint written by the training section.
model_id = "/content/ft_smolvlm"

ft_processor = AutoProcessor.from_pretrained(model_id)

# Reload the fine-tuned model with its weights quantized to int8.
quantization_config = QuantoConfig(weights="int8")
quantized_model = AutoModelForVision2Seq.from_pretrained(
    model_id,
    quantization_config=quantization_config,
    device_map="auto",
)

In [None]:
print(quantized_model)

In [None]:
ds

Dataset({
    features: ['ds_name', 'image', 'question', 'chosen', 'rejected', 'origin_dataset', 'origin_split', 'idx', 'image_path'],
    num_rows: 8313
})

In [None]:
# Pick one random row and keep its question and image for the demo below.
s = np.random.randint(0, len(ds))
sample = ds[s]  # fetch the row once instead of once per field

user_message, image = sample["question"], sample["image"]

print("Sample number: ", s, "\nQuestion: " + user_message)

Sample number:  7971 
Question: How many signs are there?


In [None]:
image

In [None]:
# One user turn: an image placeholder followed by the sampled question.
question_turn = {
    "role": "user",
    "content": [
        {"type": "image"},
        {"type": "text", "text": user_message},
    ],
}
messages = [question_turn]

# Render the conversation to a prompt that ends with the assistant cue.
prompt = ft_processor.apply_chat_template(messages, add_generation_prompt=True)
print("Prompt : ", prompt, "\n")

# Turn the prompt and image into model inputs and move them to DEVICE.
inputs = ft_processor(text=prompt, images=[image], return_tensors="pt").to(DEVICE)

# Generate up to 250 new tokens and decode them without special tokens.
generated_ids = quantized_model.generate(**inputs, max_new_tokens=250)
generated_texts = ft_processor.batch_decode(generated_ids, skip_special_tokens=True)

Prompt :  <|im_start|>User:<image>How many signs are there?<end_of_utterance>
Assistant: 



In [None]:
print(generated_texts[0])

User:How many signs are there?
Assistant: There are four signs.


## Hosting

In [None]:
!pip install gradio

In [None]:
import gradio as gr

In [None]:
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

def inference(image, user_message):
    """Answer a question about an image with the quantized fine-tuned model.

    Args:
        image: PIL image supplied by the UI; ``None`` when nothing was uploaded.
        user_message: The question to ask about the image.

    Returns:
        The model's reply as a string, without the echoed prompt.

    Raises:
        gr.Error: If no image was provided.
    """
    # The UI passes None when the image slot is empty; fail with a readable
    # message instead of an obscure error from inside the processor.
    if image is None:
        raise gr.Error("Please upload an image before asking a question.")

    # Compose the multimodal prompt: an image placeholder plus the question.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": user_message},
            ],
        },
    ]

    # Apply the chat template, ending with the assistant cue.
    prompt = ft_processor.apply_chat_template(messages, add_generation_prompt=True)

    # Process the prompt and image into model inputs on the target device.
    inputs = ft_processor(text=prompt, images=[image], return_tensors="pt").to(DEVICE)

    # Generate outputs from the quantized model.
    generated_ids = quantized_model.generate(**inputs, max_new_tokens=250)

    # The generated sequence starts with the prompt tokens; drop them so only
    # the model's answer is shown in the response box.
    prompt_length = inputs["input_ids"].shape[1]
    answer_ids = generated_ids[:, prompt_length:]

    # Decode the answer tokens to text.
    generated_texts = ft_processor.batch_decode(answer_ids, skip_special_tokens=True)

    return generated_texts[0].strip()


In [None]:
# Input widgets: an image upload delivered as a PIL image, and a question box.
image_input = gr.Image(type="pil", label="Input Image")
question_input = gr.Textbox(
    lines=2,
    placeholder="Ask something about the image...",
    label="User Message",
)

# Output widget that shows whatever `inference` returns.
response_output = gr.Textbox(label="Model Response")

iface = gr.Interface(
    fn=inference,
    inputs=[image_input, question_input],
    outputs=response_output,
    title="(Quantized) Multimodal Chat",
    description="Upload an image and ask a question to get a response from the model.",
    allow_flagging="never",
)


In [None]:
iface.launch(debug=True)