In [10]:
from open_flamingo import create_model_and_transforms

model, image_processor, tokenizer = create_model_and_transforms(
    clip_vision_encoder_path="ViT-L-14",
    clip_vision_encoder_pretrained="openai",
    lang_encoder_path="anas-awadalla/mpt-1b-redpajama-200b",
    tokenizer_path="anas-awadalla/mpt-1b-redpajama-200b",
    cross_attn_every_n_layers=1
)

KeyboardInterrupt: 

In [None]:
# grab model checkpoint from huggingface hub
from huggingface_hub import hf_hub_download
import torch

checkpoint_path = hf_hub_download("openflamingo/OpenFlamingo-3B-vitl-mpt1b", "checkpoint.pt")
model.load_state_dict(torch.load(checkpoint_path), strict=False)

_IncompatibleKeys(missing_keys=['vision_encoder.class_embedding', 'vision_encoder.positional_embedding', 'vision_encoder.proj', 'vision_encoder.conv1.weight', 'vision_encoder.ln_pre.weight', 'vision_encoder.ln_pre.bias', 'vision_encoder.transformer.resblocks.0.ln_1.weight', 'vision_encoder.transformer.resblocks.0.ln_1.bias', 'vision_encoder.transformer.resblocks.0.attn.in_proj_weight', 'vision_encoder.transformer.resblocks.0.attn.in_proj_bias', 'vision_encoder.transformer.resblocks.0.attn.out_proj.weight', 'vision_encoder.transformer.resblocks.0.attn.out_proj.bias', 'vision_encoder.transformer.resblocks.0.ln_2.weight', 'vision_encoder.transformer.resblocks.0.ln_2.bias', 'vision_encoder.transformer.resblocks.0.mlp.c_fc.weight', 'vision_encoder.transformer.resblocks.0.mlp.c_fc.bias', 'vision_encoder.transformer.resblocks.0.mlp.c_proj.weight', 'vision_encoder.transformer.resblocks.0.mlp.c_proj.bias', 'vision_encoder.transformer.resblocks.1.ln_1.weight', 'vision_encoder.transformer.resbloc

In [18]:
from PIL import Image
import requests
import torch

"""
Step 1: Load images
"""
# demo_image_one = Image.open(
#     requests.get(
#         "http://images.cocodataset.org/val2017/000000039769.jpg", stream=True
#     ).raw
# )

# demo_image_two = Image.open(
#     requests.get(
#         "http://images.cocodataset.org/test-stuff2017/000000028137.jpg",
#         stream=True
#     ).raw
# )

# query_image = Image.open(
#     requests.get(
#         "http://images.cocodataset.org/test-stuff2017/000000028352.jpg", 
#         stream=True
#     ).raw
# )
demo_image_one = Image.open(
    requests.get(
        "https://condition-report-images.acvauctions.com/public/photo_555e7f93-05a6-443f-8e0f-5c0a6be08577.jpg", stream=True
    ).raw
)

demo_image_two = Image.open(
    requests.get(
        "https://condition-report-images.acvauctions.com/public/photo_565e79c5-9f94-432d-93c5-12d614fe2f30.jpg",
        stream=True
    ).raw
)

demo_image_three = Image.open(
    requests.get(
        "https://condition-report-images.acvauctions.com/public/photo_a326644b-3171-41da-bd3f-f546fd33db4f.jpg",
        stream=True
    ).raw
)


# query_image = Image.open(
#     requests.get(
#         "https://condition-report-images.acvauctions.com/public/photo_21844d91-ea58-47a7-8e99-02ad0cb181c8.jpg", 
#         stream=True
#     ).raw
# )
query_image = Image.open(
    requests.get(
        "https://condition-report-images.acvauctions.com/public/photo_3b34bb98-7ed0-46ca-9366-7f007f93cb4f.jpg", 
        stream=True
    ).raw
)

"""
Step 2: Preprocessing images
Details: For OpenFlamingo, we expect the image to be a torch tensor of shape 
 batch_size x num_media x num_frames x channels x height x width. 
 In this case batch_size = 1, num_media = 3, num_frames = 1,
 channels = 3, height = 224, width = 224.
"""
vision_x = [image_processor(demo_image_one).unsqueeze(0),
            image_processor(demo_image_two).unsqueeze(0),
            image_processor(demo_image_three).unsqueeze(0),
            image_processor(query_image).unsqueeze(0)]
vision_x = torch.cat(vision_x, dim=0)
vision_x = vision_x.unsqueeze(1).unsqueeze(0)

"""
Step 3: Preprocessing text
Details: In the text we expect an <image> special token to indicate where an image is.
 We also expect an <|endofchunk|> special token to indicate the end of the text 
 portion associated with an image.
"""
tokenizer.padding_side = "left" # For generation padding tokens should be on the left
# lang_x = tokenizer(
#     ["<image>An image of two cats.<|endofchunk|><image>An image of a bathroom sink.<|endofchunk|><image>An image of"],
#     return_tensors="pt",
# )
lang_x = tokenizer(
    ["<image>The seat has stains all over and the lower side bolster is worn and torn.<|endofchunk|>\
     <image>The seat does not have any damage.<|endofchunk|>\
     <image>The seat has cracked and dry leather with significant wear on the outside edge of the bolster.<|endofchunk|>\
     <image>The condition of the seat is "],
    return_tensors="pt",
)


"""
Step 4: Generate text
"""
generated_text = model.generate(
    vision_x=vision_x,
    lang_x=lang_x["input_ids"],
    attention_mask=lang_x["attention_mask"],
    max_new_tokens=40,
    num_beams=3,
)

print("Generated text: ", tokenizer.decode(generated_text[0]))

Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


Generated text:  <image>The seat has stains all over and the lower side bolster is worn and torn.<|endofchunk|>     <image>The seat does not have any damage.<|endofchunk|>     <image>The seat has cracked and dry leather with significant wear on the outside edge of the bolster.<|endofchunk|>     <image>The condition of the seat is icing on the cake.<|endofchunk|>
