In [1]:
import transformers
transformers.__version__

'4.50.0.dev0'

In [2]:
from transformers import AutoProcessor, AutoModelForImageTextToText
import torch

# Checkpoint id; the processor and the model weights are loaded from the same repo.
model_path = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
processor = AutoProcessor.from_pretrained(model_path)

# Fall back to CPU so the cell still runs on machines without a CUDA device.
device = "cuda" if torch.cuda.is_available() else "cpu"

model = AutoModelForImageTextToText.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,
    # Documented keyword for choosing the attention backend (the underscore
    # variant is a private config attribute).
    attn_implementation="eager",
).to(device)

model.safetensors.index.json:   0%|          | 0.00/63.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.03G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/136 [00:00<?, ?B/s]

In [37]:
# Single-turn conversation: one user message carrying an image (referenced by
# URL) followed by a text question.
messages = [
    dict(
        role="user",
        content=[
            dict(
                type="image",
                url="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg",
            ),
            dict(type="text", text="Can you describe this image?"),
        ],
    ),
]

# Plain-text answer string; it is tokenized on its own in a later cell.
answer = "It is an image of a bee."

In [38]:
# Render the conversation through the chat template WITHOUT tokenizing, so the
# resulting prompt string can be inspected directly.
inputs = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

In [39]:
inputs

'<|im_start|>User:<image>Can you describe this image?<end_of_utterance>\nAssistant:'

In [40]:
# Same template call as before, but this time tokenized and returned as a
# dict of PyTorch tensors.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
    return_dict=True,
)
# Move the encoded batch next to the model weights.
inputs = inputs.to(model.device, dtype=torch.bfloat16)


In [41]:
inputs.keys()

dict_keys(['pixel_values', 'pixel_attention_mask', 'input_ids', 'attention_mask'])

In [43]:
# Tokenize the answer string by itself (tokenizer only, no chat template).
answer_tokenized = processor.tokenizer(
    answer,
    return_tensors="pt",
    padding=True,
)

In [36]:
processor.tokenizer.batch_decode(inputs['input_ids'])

['<|im_start|>User:<fake_token_around_image><row_1_col_1><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><fake_token_around_image><row_1_col_2><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><i

In [44]:
answer_tokenized

{'input_ids': tensor([[ 1589,   314,   354,  2443,   282,   253, 11426,    30]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]])}

In [ ]:
# Deterministic decoding (sampling disabled), limited to 64 newly generated tokens.
generated_ids = model.generate(max_new_tokens=64, do_sample=False, **inputs)

# Turn the generated ids back into text, dropping special tokens.
generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)
print(generated_texts[0])


In [45]:
from dataclasses import dataclass
import torch
from transformers import ProcessorMixin
from typing import Dict, List, Optional, Union

@dataclass
class DataCollatorForVL:
    """Collate per-example vision/text features into a single padded batch.

    Every example is a mapping that must provide ``pixel_values``,
    ``pixel_attention_mask`` and ``input_ids``. If the FIRST example also has an
    ``answer`` key, a ``labels`` tensor is added to the batch and every example
    must additionally provide ``messages``.

    ``padding``, ``max_length`` and ``pad_to_multiple_of`` are forwarded
    unchanged to ``processor.tokenizer``.

    NOTE(review): ``example["input_ids"]`` and ``example["messages"][0]["content"]``
    are both passed straight to the tokenizer, so they are presumably plain
    strings rather than token ids / structured content lists -- confirm against
    the dataset that feeds this collator.
    """

    # Forward reference: the annotation is not evaluated when the class is
    # created, so the class does not need transformers at definition time.
    processor: "ProcessorMixin"
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, examples: List[Dict]) -> Dict[str, torch.Tensor]:
        """Build the batch dict for ``examples``.

        Returns a dict with ``pixel_values``, ``pixel_attention_mask``,
        ``input_ids``, ``attention_mask`` and, for training examples, ``labels``.

        Raises:
            ValueError: if ``examples`` is empty.
        """
        if not examples:
            raise ValueError("DataCollatorForVL received an empty list of examples")

        # Images: stack along a new leading batch dimension.
        image_batch = {
            "pixel_values": torch.stack([example["pixel_values"] for example in examples]),
            "pixel_attention_mask": torch.stack(
                [example["pixel_attention_mask"] for example in examples]
            ),
        }

        # Texts: tokenize and pad the whole batch in one call.
        text_batch = self.processor.tokenizer(
            [example["input_ids"] for example in examples],
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
            add_special_tokens=False,
        )
        input_ids = text_batch["input_ids"]
        attention_mask = text_batch["attention_mask"]

        batch = {
            **image_batch,
            "input_ids": input_ids,
            "attention_mask": attention_mask,
        }

        # Labels are only produced when the batch carries answers.
        if "answer" in examples[0]:
            batch["labels"] = self._build_labels(examples, input_ids, attention_mask)

        return batch

    def _build_labels(
        self,
        examples: List[Dict],
        input_ids: torch.Tensor,
        attention_mask: torch.Tensor,
    ) -> torch.Tensor:
        """Copy ``input_ids`` and set prompt and padding positions to -100."""
        labels = input_ids.clone()

        # Count the padding positions in FRONT of each row: 0 for right-padded
        # rows, the pad width for left-padded rows. The prompt starts there, so
        # masking from this offset works for either padding side.
        leading_pad = (attention_mask.cumsum(dim=1) == 0).sum(dim=1).tolist()

        for row, (example, start) in enumerate(zip(examples, leading_pad)):
            # Length of the first message's content when tokenized on its own.
            prompt_length = len(
                self.processor.tokenizer(
                    example["messages"][0]["content"],
                    add_special_tokens=False,
                )["input_ids"]
            )
            labels[row, start:start + prompt_length] = -100

        # Padding never contributes to the loss.
        labels[attention_mask == 0] = -100
        return labels


In [46]:
# Collator with default padding settings, bound to the loaded processor.
collator = DataCollatorForVL(processor=processor)

In [44]:
collator

In [53]:
# `import PIL` alone does not load the `Image` submodule; import it explicitly
# so `PIL.Image` does not depend on some other library having loaded it first.
import PIL.Image
import requests
from io import BytesIO

image_path = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg"

# Bound the wait and fail loudly on HTTP errors instead of handing an error
# page to the image decoder.
response = requests.get(image_path, timeout=30)
response.raise_for_status()

# Decode from memory, shrink to 256x256, and force three RGB channels.
image = PIL.Image.open(BytesIO(response.content)).resize((256, 256)).convert('RGB')

In [54]:
# Toy question/answer pair used to build a training conversation.
question, answer = "What is your name?", "I am jonathan"

In [59]:
# Two-turn conversation: the user turn holds an image placeholder plus the
# question, the assistant turn holds the answer text.
messages = [
    dict(
        role="user",
        content=[dict(type="image"), dict(type="text", text=question)],
    ),
    dict(
        role="assistant",
        content=[dict(type="text", text=answer)],
    ),
]

In [69]:
# Id of the "<image>" placeholder, looked up among the tokenizer's additional
# special tokens (position in the token list -> same position in the id list).
image_token_id = processor.tokenizer.additional_special_tokens_ids[
    processor.tokenizer.additional_special_tokens.index("<image>")
]

# Render the full conversation (no generation prompt) and wrap text and image
# as a one-example batch.
text = processor.apply_chat_template(messages, add_generation_prompt=False)
texts = [text.strip()]
images = [[image]]
batch = processor(text=texts, images=images, padding=True, return_tensors="pt")

# Labels are the input ids with padding and image-placeholder positions set to -100.
labels = batch["input_ids"].masked_fill(
    (batch["input_ids"] == processor.tokenizer.pad_token_id)
    | (batch["input_ids"] == image_token_id),
    -100,
)
batch["labels"] = labels

In [71]:
batch.keys()

dict_keys(['pixel_values', 'pixel_attention_mask', 'input_ids', 'attention_mask', 'labels'])

In [68]:
image_token_id

49190

In [ ]:
# Rebuild the one-example training batch. `image_token_id` is not computed in
# this cell, so it must already exist in the kernel.
text = processor.apply_chat_template(messages, add_generation_prompt=False)
texts = [text.strip()]
images = [[image]]
batch = processor(text=texts, images=images, padding=True, return_tensors="pt")

# Padding and image-placeholder positions are set to -100 in the labels.
labels = batch["input_ids"].masked_fill(
    (batch["input_ids"] == processor.tokenizer.pad_token_id)
    | (batch["input_ids"] == image_token_id),
    -100,
)
batch["labels"] = labels

In [72]:
from transformers import AutoTokenizer

def build_message(question, answer=None):
    """Return a chat-format message list for one image question.

    The list always starts with a user turn made of an image placeholder and
    the ``question`` text. When ``answer`` is not ``None``, an assistant turn
    containing ``answer`` is appended.
    """
    user_turn = {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": question},
        ],
    }

    # Prompt-only form: just the user turn.
    if answer is None:
        return [user_turn]

    assistant_turn = {
        "role": "assistant",
        "content": [{"type": "text", "text": answer}],
    }
    return [user_turn, assistant_turn]


In [74]:
# Two toy question/answer pairs sharing the same image.
question = "What is your name?"
answer = "I am jonathan"

question2 = "What is your name 2?"
answer2 = "I am jonathan the best pilote"

# One record per pair; both are flagged as training examples.
data = [
    {"question": q, "answer": a, "images": image, "is_train": True}
    for q, a in ((question, answer), (question2, answer2))
]

In [142]:

# The first record decides whether the whole batch is treated as training data.
is_train = data[0]['is_train']

# `texts`  : full conversations (question + answer), no generation prompt.
# `texts2` : prompt-only conversations ending with the generation prompt.
texts = [processor.apply_chat_template(build_message(item["question"], item["answer"]), add_generation_prompt=False).strip() for item in data]
texts2 = [processor.apply_chat_template(build_message(item["question"], None), add_generation_prompt=True).strip() for item in data]
images = [[item["images"]] for item in data]

# The prompt-only batch is needed in both modes (label masking when training,
# model input otherwise), so build it before branching.
batch_2 = processor(text=texts2, images=images, return_tensors="pt", padding=True)
inputs_max = batch_2["input_ids"].shape[1]

if is_train:
    batch = processor(text=texts, images=images, return_tensors="pt", padding=True)
    labels = batch["input_ids"].clone()

    # Mask each row's prompt using that row's OWN prompt length. A single cutoff
    # at the longest prompt (`inputs_max`) would also hide the first answer
    # tokens of every shorter prompt.
    # Assumes the tokenized prompt is a prefix of the tokenized full
    # conversation -- TODO confirm for this chat template.
    prompt_lens = batch_2["attention_mask"].sum(dim=1, keepdim=True)
    # Pad positions in front of each row (0 when the batch is right-padded).
    leading_pad = (batch["attention_mask"].cumsum(dim=1) == 0).sum(dim=1, keepdim=True)
    positions = torch.arange(labels.shape[1], device=labels.device).unsqueeze(0)
    labels[positions < leading_pad + prompt_lens] = -100

    # Ignore padding via the attention mask rather than by token id, so a real
    # token that happens to share the pad id is not dropped from the loss.
    labels[batch["attention_mask"] == 0] = -100
    batch["labels"] = labels
else:
    batch = batch_2

# `batch` is the result of this cell; a top-level `return` is a SyntaxError
# outside a function, so the value is simply left in the namespace.

# for item in data:
#     if item['is_train']:
#         text = 
#         text2 = processor.apply_chat_template(build_message(item["question"], None), add_generation_prompt=False)
#     else:
#         text = processor.apply_chat_template(build_message(item["question"], None), add_generation_prompt=False)
#     texts.append(text.strip())
#     images.append([item["images"]])
#     
#     
#     
#     
# 
# 
# labels = batch["input_ids"].clone()
# labels[labels == processor.tokenizer.pad_token_id] = -100
# labels[labels == image_token_id] = -100
# batch["labels"] = labels
# batch['answer'] = [item['answer'] for item in data]


In [143]:
texts2

['<|im_start|>User:<image>What is your name?<end_of_utterance>\nAssistant:',
 '<|im_start|>User:<image>What is your name 2?<end_of_utterance>\nAssistant:']

In [144]:
texts

['<|im_start|>User:<image>What is your name?<end_of_utterance>\nAssistant: I am jonathan<end_of_utterance>',
 '<|im_start|>User:<image>What is your name 2?<end_of_utterance>\nAssistant: I am jonathan the best pilote<end_of_utterance>']

In [145]:
batch["input_ids"].shape

torch.Size([2, 1441])

In [147]:
# Padded width (sequence dimension) of the prompt-only batch.
inputs_max = batch_2["input_ids"].size(1)


In [153]:
processor.tokenizer.batch_decode(batch["input_ids"][:,:inputs_max])

['<|im_start|>User:<fake_token_around_image><row_1_col_1><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><fake_token_around_image><row_1_col_2><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><i

In [156]:
batch["input_ids"].shape

torch.Size([2, 1441])

In [160]:
labels = batch["input_ids"].clone()

# Mask the prompt row by row: each row's prompt length is the number of real
# tokens in the matching prompt-only row. One shared cutoff at the padded
# prompt width would also mask the first answer tokens of shorter prompts.
prompt_lens = batch_2["attention_mask"].sum(dim=1, keepdim=True)
# Pad positions in front of each row (0 when the batch is right-padded).
leading_pad = (batch["attention_mask"].cumsum(dim=1) == 0).sum(dim=1, keepdim=True)
positions = torch.arange(labels.shape[1], device=labels.device).unsqueeze(0)
labels[positions < leading_pad + prompt_lens] = -100

# Padding positions are identified by the attention mask, not by token id.
labels[batch["attention_mask"] == 0] = -100

In [161]:
labels

tensor([[ -100,  -100,  -100,  ...,  -100,  -100,  -100],
        [ -100,  -100,  -100,  ...,  6451,  1520, 49279]])