In [1]:
%%bash
export HF_HOME="/workspace/.cache/huggingface"

In [2]:
import os

from omegaconf import OmegaConf
from functools import partial
from PIL import Image
import torch

from open_flamingo import create_model_and_transforms 
from open_flamingo.train.any_res_data_utils import process_images
from open_flamingo.train.sft_data_utils import make_supervised_data_module
from dataclasses import dataclass, field

In [3]:
from typing import Optional, List, Tuple

In [38]:
@dataclass
class TrainingConfig:
    # Data args
    data_path = {'/workspace/detail_23k.json': 2000}
    batch_size: int = 1
    workers: int = 2
    data_sampler_group_by_length: bool = True
    is_multimodal: bool = True
    mm_use_im_start_end: bool = False
    conv_template_name: Optional[str] = "phi_3"
    image_aspect_ratio: str = "pad"
    anyres_patch_sampling: bool = False
    anyres_grids: List[Tuple[int, int]] = None
    num_vision_tokens = 128
    image_aspect_ratio = "anyres"
    anyres_patch_sampling = True
    batch_size = 2
    world_size = 1
    gradient_accumulation_steps = 8
    rank = 0
    anyres_grids: List[List[int]] = field(default_factory=lambda: [[1,2],[2,1],[2,2],[3,1],[1,3]])

## Inference code

In [5]:
# Set model configs.
model_ckpt="base_model_weight/xgen-mm-phi3-mini-base-r-v1.5.pt"
cfg = dict(
    model_family = 'xgenmm_v1',
    lm_path = 'microsoft/Phi-3-mini-4k-instruct',
    vision_encoder_path = 'google/siglip-so400m-patch14-384',
    vision_encoder_pretrained = 'google',
    num_vision_tokens = 128,
    image_aspect_ratio = 'anyres',
    anyres_patch_sampling = True,
    anyres_grids = [(1,2),(2,1),(2,2),(3,1),(1,3)],
    ckpt_pth = model_ckpt,
)
cfg = OmegaConf.create(cfg)

additional_kwargs = {
    "num_vision_tokens": cfg.num_vision_tokens,
    "image_aspect_ratio": cfg.image_aspect_ratio,
    "anyres_patch_sampling": cfg.anyres_patch_sampling,
}

# Initialize the model.
model, image_processor, tokenizer = create_model_and_transforms(
    clip_vision_encoder_path=cfg.vision_encoder_path,
    clip_vision_encoder_pretrained=cfg.vision_encoder_pretrained,
    lang_model_path=cfg.lm_path,
    tokenizer_path=cfg.lm_path,
    model_family=cfg.model_family,
    **additional_kwargs)

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

xgenmm_v1 model initialized with 3,931,031,619 trainable parameters
Vision encoder: 0 trainable parameters
Vision tokenizer: 109,901,568 trainable parameters
Language model: 3,821,130,051 trainable parameters
Vision encoder: 428,225,600 parameters
Vision tokenizer: 109,901,568 parameters
Language model: 3,821,130,051 parameters


In [6]:
ckpt = torch.load(cfg.ckpt_pth)
model.load_state_dict(ckpt, strict=True)
torch.cuda.empty_cache()
model.train()
model = model.cuda().to(torch.bfloat16)

In [7]:
base_img_size = model.base_img_size
anyres_grids = []
for (m,n) in cfg.anyres_grids:
    anyres_grids.append([base_img_size*m, base_img_size*n])
model.anyres_grids = anyres_grids

In [8]:
model.anyres_grids

[[384, 768], [768, 384], [768, 768], [1152, 384], [384, 1152]]

In [9]:
# test dataloader
data_config = OmegaConf.load("/workspace/LAVIS/data_configs/detail_23k.yaml")
data_config

{'data_path': {'/workspace/detail_23k.json': 2000}}

In [10]:
data_path = dict(data_config.data_path)
data_path

{'/workspace/detail_23k.json': 2000}

In [39]:
args = TrainingConfig()
args.anyres_grids

[[1, 2], [2, 1], [2, 2], [3, 1], [1, 3]]

In [40]:
# train_dataloader, num_samples = make_supervised_data_module(
#     tokenizer=tokenizer,
#     image_processor=image_processor,
#     data_args=args
# ) # crashes

# one_data = next(iter(train_dataloader.dataloader))
# crashes

In [41]:
from open_flamingo.train.sft_data_utils import LazySupervisedDataset

train_dataset = LazySupervisedDataset(tokenizer=tokenizer,
                            data_path=data_path,
                            image_processor=image_processor,
                            data_args=args)
# data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer, 
#                                                     image_aspect_ratio=data_args.image_aspect_ratio)


In [42]:
train_dataset.tokenizer

LlamaTokenizer(name_or_path='microsoft/Phi-3-mini-4k-instruct', vocab_size=32000, model_max_length=4096, is_fast=False, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '<|endoftext|>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'additional_special_tokens': ['<image>', '<image placeholder>', '<|endofchunk|>']}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=True, lstrip=False, single_word=False, normalized=False, special=False),
	32000: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	32001: AddedToken("<|assistant|>", rstrip=True, lstrip=False, single_word=False, normalized=False, special=True),
	32002: AddedToken("<|placeholder1|>", rs

In [43]:
train_dataset.image_processor

Compose(
    Resize(size=(384, 384), interpolation=bicubic, max_size=None, antialias=True)
    Lambda()
    ToTensor()
    Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))
)

In [44]:
train_dataset.list_data_dict

[{'id': '000000504284',
  'image': '000000504284.jpg',
  'conversations': [{'from': 'human',
    'value': "What's happening in the scene?\n<image>"},
   {'from': 'gpt',
    'value': 'The image features a couple riding bicycles in a park, holding hands as they enjoy a leisurely ride together. The two bicycles and the couple are positioned next to each other, with the man on the left and the woman on the right. \n\nOther people can be seen around the park, but they appear to be farther away and not involved with the couple. A bench and a handbag are visible in the scene, indicating a relaxed and casual atmosphere in the park.'}]},
 {'id': '000000432727',
  'image': '000000432727.jpg',
  'conversations': [{'from': 'human',
    'value': '<image>\nWhat do you think is going on in this snapshot?'},
   {'from': 'gpt',
    'value': 'The image depicts a crowded room with a young boy at the center, holding a Wii remote in his hand. The boy is wearing glasses and a sports fan sweatshirt, which is

In [45]:
len(train_dataset.list_data_dict)

2000

In [46]:
train_dataset.data_args

TrainingConfig(batch_size=2, workers=2, data_sampler_group_by_length=True, is_multimodal=True, mm_use_im_start_end=False, conv_template_name='phi_3', image_aspect_ratio='anyres', anyres_patch_sampling=True, anyres_grids=[[1, 2], [2, 1], [2, 2], [3, 1], [1, 3]])

In [47]:
train_dataset.anyres_grids

[[384, 768], [768, 384], [768, 768], [1152, 384], [384, 1152]]

In [48]:
train_dataset.conv_template_name

'phi_3'

In [49]:
train_dataset[0]

{'input_ids': tensor([29871, 32006, 29909, 13563,  1546,   263, 12758,  1404,   322,   385,
         23116, 21082, 20255, 29889,   450, 20255,  4076,  8444, 29892, 13173,
         29892,   322,  1248,   568,  6089,   304,   278,  1404, 29915, 29879,
          5155, 29889, 32007, 32010,  5618, 29915, 29879, 10464,   297,   278,
          9088, 29973,    13, 32012, 32007, 32001,  1576,  1967,  5680,   263,
          7303,   364,  4821,   289,  4245,  7799,   297,   263, 14089, 29892,
         13587,  6567,   408,   896, 13389,   263,   454,   275,   545,   368,
         22203,  4208, 29889,   450,  1023,   289,  4245,  7799,   322,   278,
          7303,   526,  2602,   287,  2446,   304,  1269,   916, 29892,   411,
           278,   767,   373,   278,  2175,   322,   278,  6114,   373,   278,
          1492, 29889, 29871,    13,    13, 16107,  2305,   508,   367,  3595,
          2820,   278, 14089, 29892,   541,   896,  2615,   304,   367, 26645,
          3448,   322,   451,  9701,   

In [50]:
train_dataset[0]['input_ids'].shape

torch.Size([154])

In [37]:
tokenizer.batch_decode([train_dataset[0]['input_ids']])

["<|system|> A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. <|end|> <|user|> What's happening in the scene?\n <image> <|end|> <|assistant|> The image features a couple riding bicycles in a park, holding hands as they enjoy a leisurely ride together. The two bicycles and the couple are positioned next to each other, with the man on the left and the woman on the right. \n\nOther people can be seen around the park, but they appear to be farther away and not involved with the couple. A bench and a handbag are visible in the scene, indicating a relaxed and casual atmosphere in the park. <|end|>"]

In [51]:
from open_flamingo.train.sft_data_utils import DataCollatorForSupervisedDataset

data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer, 
                                                     image_aspect_ratio=args.image_aspect_ratio)
    

In [52]:
from torch.utils.data import DataLoader

data_loader = DataLoader(
    train_dataset,
    batch_size=args.batch_size,
    num_workers=args.workers,
    pin_memory=True, 
    collate_fn=data_collator,
)

In [54]:
input_ids = next(iter(data_loader))['input_ids']

In [55]:
input_ids.shape

torch.Size([2, 192])

In [None]:
# Preprocessing utils.

image_proc = partial(process_images, image_processor=image_processor, model_cfg=cfg)

def apply_prompt_template(prompt, cfg):
    if 'Phi-3' in cfg.lm_path:
        s = (
                '<|system|>\nA chat between a curious user and an artificial intelligence assistant. '
                "The assistant gives helpful, detailed, and polite answers to the user's questions.<|end|>\n"
                f'<|user|>\n{prompt}<|end|>\n<|assistant|>\n'
            )
    else:
        raise NotImplementedError
    return s

In [12]:
# Prep image input.
image_path_1 = 'example_images/image-1.jpeg'
image_path_2 = 'example_images/image-2.jpeg'

image_1 = Image.open(image_path_1).convert('RGB')
image_2 = Image.open(image_path_2).convert('RGB')
images = [image_1, image_2]
image_size = [image_1.size, image_2.size]
image_size = [image_size]
vision_x = [image_proc([img]) for img in images]
vision_x = [vision_x]

In [13]:
# Prep language input.
prompt = "Look at this image <image> and this image <image>. What is in the second image?"
prompt = apply_prompt_template(prompt, cfg)
lang_x = tokenizer([prompt], return_tensors="pt")

In [14]:
# Run inference.
kwargs_default = dict(do_sample=False, temperature=0, max_new_tokens=1024, top_p=None, num_beams=1)

generated_text = model.generate(
    vision_x=vision_x, 
    lang_x=lang_x['input_ids'].to(torch.device('cuda:0')), 
    image_size=image_size,
    attention_mask=lang_x['attention_mask'].to(torch.device('cuda:0')), 
    **kwargs_default)
    
generated_text = tokenizer.decode(generated_text[0], skip_special_tokens=True)
if 'Phi-3' in cfg.lm_path:
    text = generated_text.split('<|end|>')[0]
else:
    text=generated_text

print(text)

You are not running the flash-attention implementation, expect numerical differences.


A black and white cat. 
