# Setup

In [1]:
import os
os.environ['HF_HOME'] = "/workspace/.cache/huggingface"
from dataclasses import dataclass
from open_flamingo.src.factory import create_model_and_transforms
from open_flamingo.train.sft_data_utils import LazySupervisedDataset
from open_flamingo.train.train_utils import random_seed
import torch
from open_flamingo.train.sft_data_utils import DataCollatorForSupervisedDataset
from torch.utils.data import DataLoader

[2025-05-13 18:58:35,877] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [2]:
@dataclass
class QvhDataArgs:
    image_aspect_ratio = "anyres"
    conv_template_name = "phi_3"
    anyres_grids = [(1, 2), (2, 1), (2, 2), (3, 1), (1, 3)]
    data_config = {
        "qvhighlights": {
            "train": {
                "annotations": {
                    "../datasets/qvhighlights-sample/annotations/processed/train.json": 3,
                },
                "videos": "../datasets/qvhighlights/videos/processed",
            },
            "val": {
                "annotations": {
                    "../datasets/qvhighlights-sample/annotations/processed/train.json": 3,
                },
                "videos": "../datasets/qvhighlights/videos/processed",
            },
        }
    }

In [3]:
model, image_processor, text_tokenizer = create_model_and_transforms(
    vision_encoder_path="google/siglip-so400m-patch14-384",
    lang_model_path="microsoft/Phi-3-mini-4k-instruct",
    anyres_grids=[(1, 2), (2, 1), (2, 2), (3, 1), (1, 3)],
    tokenizer_path="microsoft/Phi-3-mini-4k-instruct",
    model_family="xgenmm_v1",
    pretrained_vision_tokenizer=None,
    use_local_files=False,
    verbose=True,
    use_flash_attention_2=True,
    image_aspect_ratio="anyres",
    num_vision_tokens=128,
    anyres_patch_sampling=True,
    gradient_checkpointing=True,
)
random_seed(42)
# model = model.to("cuda", dtype=torch.bfloat16)

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

xgenmm_v1 model initialized with 3,931,031,619 trainable parameters
Vision encoder: 0 trainable parameters
Vision tokenizer: 109,901,568 trainable parameters
Language model: 3,821,130,051 trainable parameters
Vision encoder: 428,225,600 parameters
Vision tokenizer: 109,901,568 parameters
Language model: 3,821,130,051 parameters


In [4]:
text_tokenizer

LlamaTokenizer(name_or_path='microsoft/Phi-3-mini-4k-instruct', vocab_size=32000, model_max_length=4096, is_fast=False, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '<|endoftext|>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'additional_special_tokens': ['<image>', '<image placeholder>', '<|endofchunk|>']}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=True, lstrip=False, single_word=False, normalized=False, special=False),
	32000: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	32001: AddedToken("<|assistant|>", rstrip=True, lstrip=False, single_word=False, normalized=False, special=True),
	32002: AddedToken("<|placeholder1|>", rs

In [5]:
dataset_name = "qvhighlights"

In [6]:
text_tokenizer.is_fast

False

In [7]:
data_args = QvhDataArgs() if dataset_name == "qvhighlights" else "charades-sta"
train_dataset = LazySupervisedDataset(
    tokenizer=text_tokenizer,
    image_processor=image_processor,
    split="train",
    data_args=data_args,
    data_config=data_args.data_config,
)

val_dataset = LazySupervisedDataset(
    tokenizer=text_tokenizer,
    image_processor=image_processor,
    split="val",
    data_args=data_args,
    data_config=data_args.data_config,
)

In [8]:
(text_tokenizer.batch_decode(val_dataset[0]['input_ids'].unsqueeze(0)),
text_tokenizer.batch_decode(val_dataset[1]['input_ids'].unsqueeze(0)),
text_tokenizer.batch_decode(val_dataset[2]['input_ids'].unsqueeze(0))
)

(["<|system|> A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. <|end|> <|user|> <image> \nDescribe the woman in the image <|end|> <|assistant|>"],
 ["<|system|> A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. <|end|> <|user|> <image> \nWhat can be seen in the image? <|end|> <|assistant|>"],
 ["<|system|> A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. <|end|> <|user|> <image> \nWhat happens in the image? <|end|> <|assistant|>"])

In [9]:
val_dataset.split

'val'

In [10]:
train_sample = train_dataset[0]
print(train_sample.keys())
for k, v in train_sample.items():
    print(f" {k=}, {type(v)=}")

dict_keys(['input_ids', 'labels', 'qid', 'image', 'image_size'])
 k='input_ids', type(v)=<class 'torch.Tensor'>
 k='labels', type(v)=<class 'torch.Tensor'>
 k='qid', type(v)=<class 'int'>
 k='image', type(v)=<class 'list'>
 k='image_size', type(v)=<class 'list'>


In [11]:
train_sample['input_ids']

tensor([29871, 32006, 29909, 13563,  1546,   263, 12758,  1404,   322,   385,
        23116, 21082, 20255, 29889,   450, 20255,  4076,  8444, 29892, 13173,
        29892,   322,  1248,   568,  6089,   304,   278,  1404, 29915, 29879,
         5155, 29889, 32007, 32010, 32012,    13,  4002, 29581,   278,  6114,
          297,   278,  1967, 32007, 32001, 29956,  2480,   297,  2174,   333,
          528,  2728,   269,  1169,  1603,   491,   263,  2654,   274,  3222,
        29889, 32007])

In [12]:
train_sample['labels']

tensor([ -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100, 29956,  2480,   297,  2174,   333,
          528,  2728,   269,  1169,  1603,   491,   263,  2654,   274,  3222,
        29889, 32007])

In [13]:
val_sample = val_dataset[2]
print(val_sample.keys())
for k, v in val_sample.items():
    print(f" {k=}, {type(v)=}")

dict_keys(['input_ids', 'labels', 'duration', 'vid', 'qid', 'image', 'image_size'])
 k='input_ids', type(v)=<class 'torch.Tensor'>
 k='labels', type(v)=<class 'str'>
 k='duration', type(v)=<class 'int'>
 k='vid', type(v)=<class 'str'>
 k='qid', type(v)=<class 'int'>
 k='image', type(v)=<class 'list'>
 k='image_size', type(v)=<class 'list'>


In [14]:
val_sample['labels']

'It looks like two person is cooking with facetime'

In [15]:
train_collator = DataCollatorForSupervisedDataset(
    tokenizer=text_tokenizer, image_aspect_ratio="anyres", split="train"
)

val_collator = DataCollatorForSupervisedDataset(
    tokenizer=text_tokenizer, image_aspect_ratio="anyres", split="val"
)

In [23]:
train_dataloader = DataLoader(
    train_dataset,
    batch_size=2,
    num_workers=2,
    pin_memory=True,
    shuffle=True,
    collate_fn=train_collator,
    persistent_workers=False,
)

val_dataloader = DataLoader(
    val_dataset,
    batch_size=2,
    num_workers=2,
    pin_memory=True,
    shuffle=True,
    collate_fn=val_collator,
    persistent_workers=False,
)

# Test Dataloader

In [25]:
batch = next(iter(val_dataloader))

val
val




val
val




In [26]:
batch.keys()

dict_keys(['input_ids', 'labels', 'attention_mask', 'metadata', 'image_size', 'images'])

In [27]:
(
    batch['input_ids'],
    batch['attention_mask']
)

(tensor([[29871, 32006, 29909, 13563,  1546,   263, 12758,  1404,   322,   385,
          23116, 21082, 20255, 29889,   450, 20255,  4076,  8444, 29892, 13173,
          29892,   322,  1248,   568,  6089,   304,   278,  1404, 29915, 29879,
           5155, 29889, 32007, 32010, 32012,    13,  5618,   508,   367,  3595,
            297,   278,  1967, 29973, 32007, 32001],
         [32011, 32011, 29871, 32006, 29909, 13563,  1546,   263, 12758,  1404,
            322,   385, 23116, 21082, 20255, 29889,   450, 20255,  4076,  8444,
          29892, 13173, 29892,   322,  1248,   568,  6089,   304,   278,  1404,
          29915, 29879,  5155, 29889, 32007, 32010, 32012,    13,  5618,  5930,
            297,   278,  1967, 29973, 32007, 32001]]),
 tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
         [0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1

In [28]:
batch['labels']

["Words that says 'Even though I still live close by,'",
 'It looks like two person is cooking with facetime']

In [29]:
text_tokenizer.batch_decode(batch['input_ids'])

["<|system|> A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. <|end|> <|user|> <image> \nWhat can be seen in the image? <|end|> <|assistant|>",
 "<pad><pad>  <|system|> A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. <|end|> <|user|> <image> \nWhat happens in the image? <|end|> <|assistant|>"]

In [30]:
batch['input_ids'] = batch['input_ids'].to('cuda')
batch['attention_mask'] = batch['attention_mask'].to('cuda')
batch['images'][0][0] = batch['images'][0][0].to('cuda')
batch['images'][1][0] = batch['images'][1][0].to('cuda')

In [None]:
val_dataset[0]['labels']

In [None]:
batch['labels']

In [None]:
batch['input_ids']

In [None]:
batch.keys()

In [None]:
batch['input_ids'].shape, batch['labels'].shape, batch['attention_mask'].shape

In [None]:
batch['image_size']

In [None]:
len(batch['images']), len(batch['images'][0])

In [None]:
batch['images'][0][0].shape

In [None]:
text_tokenizer.batch_decode(batch['input_ids'])

In [None]:
batch['input_ids']

In [None]:
batch['labels']

In [None]:
pad_labels = torch.where(batch['labels'] == -100, torch.ones_like(batch['labels']) * 32011, batch['labels'])

In [None]:
text_tokenizer.batch_decode(
    pad_labels, skip_special_tokens=True
)

In [31]:
ckpt = torch.load("../base_model_weight/xgen-mm-phi3-mini-base-r-v1.5.pt")
model.load_state_dict(ckpt, strict=True)
torch.cuda.empty_cache()
model = model.to("cuda", dtype=torch.bfloat16)
vars(model)

  ckpt = torch.load("../base_model_weight/xgen-mm-phi3-mini-base-r-v1.5.pt")


{'_special_tokens': {'media_token': '<image>',
  'image_placeholder_token': '<image placeholder>',
  'end_of_trunk_token': '<|endofchunk|>'},
 'training': True,
 '_parameters': OrderedDict(),
 '_buffers': OrderedDict(),
 '_non_persistent_buffers_set': set(),
 '_backward_pre_hooks': OrderedDict(),
 '_backward_hooks': OrderedDict(),
 '_is_full_backward_hook': None,
 '_forward_hooks': OrderedDict(),
 '_forward_hooks_with_kwargs': OrderedDict(),
 '_forward_hooks_always_called': OrderedDict(),
 '_forward_pre_hooks': OrderedDict(),
 '_forward_pre_hooks_with_kwargs': OrderedDict(),
 '_state_dict_hooks': OrderedDict(),
 '_state_dict_pre_hooks': OrderedDict(),
 '_load_state_dict_pre_hooks': OrderedDict(),
 '_load_state_dict_post_hooks': OrderedDict(),
 '_modules': OrderedDict([('vision_encoder',
               SiglipVisionTransformer(
                 (embeddings): SiglipVisionEmbeddings(
                   (patch_embedding): Conv2d(3, 1152, kernel_size=(14, 14), stride=(14, 14), padding=valid)

In [32]:
!nvidia-smi

Tue May 13 19:04:57 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 565.57.01              Driver Version: 565.57.01      CUDA Version: 12.7     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA RTX A5000               On  |   00000000:D1:00.0 Off |                  Off |
| 30%   34C    P0             79W /  230W |    8610MiB /  24564MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
kwargs_default = dict(do_sample=False, max_new_tokens=1024, top_p=None, num_beams=1)

with torch.no_grad():
    generated_text = model.generate(
        vision_x=batch['images'],
        lang_x=batch['input_ids'],
        image_size=batch['image_size'],
        attention_mask=batch['attention_mask'],
        **kwargs_default
    )

In [40]:
text_tokenizer.batch_decode(generated_text, skip_special_tokens=True)

['The image presents a minimalist design with a white background and black text. The text, centered and in a sans-serif font, reads, "Even though I still live close by." This design, with its clean and uncluttered appearance, effectively conveys a sense of simplicity and clarity.',
 'The image appears to be a screenshot of a video call or a live stream. On the left side, there is a video feed showing a person cooking in a kitchen. The person seems to be frying or boiling potatoes in a skillet. On the right side, there is a smaller video feed showing a close-up of the same skillet with the potatoes. The person in the kitchen is wearing a pink sweater and has a watch on their wrist. There are also some icons and controls at the bottom of the image, suggesting that this might be a screenshot from a video conferencing application. The image contains poster (in the center), text (above the center), text (below the center)']