In [1]:
pip install open_flamingo

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [3]:
from open_flamingo import create_model_and_transforms
from huggingface_hub import hf_hub_download
import torch

# Build the OpenFlamingo model together with its image preprocessor and
# tokenizer: CLIP ViT-L/14 (OpenAI weights) as the vision encoder and MPT-7B as
# the language model, with a cross-attention layer every 4 decoder layers.
model, image_processor, tokenizer = create_model_and_transforms(
    clip_vision_encoder_path="ViT-L-14",
    clip_vision_encoder_pretrained="openai",
    lang_encoder_path="anas-awadalla/mpt-7b",
    tokenizer_path="anas-awadalla/mpt-7b",
    cross_attn_every_n_layers=4
)

# grab model checkpoint from huggingface hub
checkpoint_path = hf_hub_download("openflamingo/OpenFlamingo-9B-vitl-mpt7b", "checkpoint.pt")

# map_location="cpu" keeps the load independent of the device the checkpoint
# was saved from (so it also works on CPU-only hosts) and avoids holding a
# second copy of the weights on the GPU while they are copied into the model.
# strict=False tolerates key mismatches between checkpoint and model
# (presumably the checkpoint only carries the Flamingo-specific weights —
# TODO confirm).
model.load_state_dict(torch.load(checkpoint_path, map_location="cpu"), strict=False)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# The notebook only runs inference, so switch off training-only behaviour
# such as dropout.
model.eval()

print(f"Using device: {device}")

Using pad_token, but it is not set yet.


You are using config.init_device='cpu', but you can also use config.init_device="meta" with Composer + FSDP for fast initialization.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Flamingo model initialized with 1384781840 trainable parameters
Using device: cuda


In [5]:
import json

# Locations of the CLEVR validation images and of the matching question file.
imagesPath = "/scratch/nmachav1/CLEVR_v1.0/images/val/"
quesPath = "/scratch/nmachav1/CLEVR_v1.0/questions/CLEVR_val_questions.json"

# Parse the question file once; the context manager closes the handle as soon
# as the JSON has been read instead of leaking it for the kernel's lifetime.
with open(quesPath, 'r') as jsonFile:
    questions = json.load(jsonFile)

In [22]:
from tqdm import tqdm
from PIL import Image
import requests

# Group the CLEVR questions by image file once, so that the per-image loop
# below does a dictionary lookup instead of rescanning the full question list
# for every image. Each entry is a list of (question, ground-truth answer)
# pairs in their original order.
questions_by_image = {}
for q in questions["questions"]:
    questions_by_image.setdefault(q["image_filename"], []).append(
        (q["question"], q["answer"])
    )

tokenizer.padding_side = "left"  # For generation, padding tokens should be on the left

# One pass per difficulty split (object-count buckets 1..3): read the list of
# image names for the split, ask the model every question attached to each
# image, and store the answers as a JSON list next to the ground truth.
for i in range(1,4):
    with open(f"/scratch/nmachav1/CLEVR_v1.0/datasetSplits/val_images_objectsNum{i}.txt", "r") as img_file:
        # strip() removes the line terminator without eating a real character
        # when the last line has no trailing newline; blank lines are skipped.
        img_names = [line.strip() for line in img_file if line.strip()]

    results = []
    # NOTE: only the first image of each split is processed ([:1]).
    for img_name in tqdm(img_names[:1]):
        with Image.open(imagesPath + img_name) as raw_img:
            img = raw_img.convert("RGB")

        # Arrange the single preprocessed image the way the model call below
        # is given it: three leading singleton axes are added in front of the
        # processor output.
        vision_x = [image_processor(img).unsqueeze(0)]
        vision_x = torch.cat(vision_x, dim=0).to(device)
        vision_x = vision_x.unsqueeze(1).unsqueeze(0)

        for iqs, ground_truth in questions_by_image.get(img_name, []):
            ques_for_model = "Question: " + iqs + " Answer: "
            lang_x = tokenizer(
                ["<image>" + ques_for_model],
                return_tensors="pt",
            ).to(device)

            # Inference only: make sure no autograd state is kept around.
            with torch.no_grad():
                generated_text = model.generate(
                    vision_x=vision_x,
                    lang_x=lang_x["input_ids"],
                    attention_mask=lang_x["attention_mask"],
                    max_new_tokens=50,  # You can adjust this to control the length of the generated text
                    num_beams=6,
                )

            model_output = tokenizer.decode(generated_text[0])
            print(model_output)

            # Keep the text after the last "Answer:" marker and cut it at the
            # first special token such as <|endofchunk|>, if one is present.
            answer_index = model_output.split("Answer:")
            answer = answer_index[-1].split("<|")[0].strip()
            print(answer)

            results.append({'image_id': img_name,
                            'Question': iqs,
                            'Ground truth': ground_truth,
                            'Model generated answer': answer
            })

    # Written with "w" only after the split has been fully processed:
    # appending a second JSON array to an existing file would make it
    # unparseable, and a crash mid-run no longer leaves a half-written file.
    with open(f"/scratch/nmachav1/CLEVR_v1.0/OpenFlamingo/answers_difficulty_based_val/num_{i}.txt", "w") as ansFile:
        json.dump(results, ansFile)

0it [00:00, ?it/s]
0it [00:00, ?it/s]
  0%|          | 0/1 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


<image>Question: What is the material of the thing that is left of the blue block and on the right side of the big green matte block? Answer:  The material of the thing that is left of the blue block and on the right side of the big green matte block is the material of the big green matte block.<|endofchunk|>
The material of the thing that is left of the blue block and on the right side of the big green matte block is the material of the big green matte block.


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


<image>Question: Is the shape of the small gray matte thing the same as the object behind the big green rubber object? Answer:  Yes.<|endofchunk|>
Yes.


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


<image>Question: What is the big thing that is in front of the block that is behind the block that is in front of the large shiny block made of? Answer:  The big thing that is in front of the block that is behind the block that is in front of the large shiny block made of.<|endofchunk|>
The big thing that is in front of the block that is behind the block that is in front of the large shiny block made of.


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


<image>Question: How many other objects are the same size as the green rubber object? Answer:  3<|endofchunk|>
3


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


<image>Question: Is the color of the large shiny object the same as the small ball? Answer:  Yes.<|endofchunk|>
Yes.


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


<image>Question: There is a cube to the left of the rubber thing that is on the right side of the large green matte block; how many big blue metallic objects are right of it? Answer:  2.<|endofchunk|>
2.


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


<image>Question: There is a large block that is behind the block to the left of the big blue thing; what is it made of? Answer:  It is made of the same material as the big blue thing.<|endofchunk|>
It is made of the same material as the big blue thing.


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


<image>Question: Is there a small gray thing that has the same shape as the big green object? Answer:  Yes.<|endofchunk|>
Yes.


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


<image>Question: What is the color of the metal object that is the same size as the green rubber block? Answer:  Blue<|endofchunk|>
Blue


100%|██████████| 1/1 [01:21<00:00, 81.18s/it]

<image>Question: What number of things are objects behind the big green matte cube or things that are in front of the big shiny thing? Answer:  1.<|endofchunk|>
1.





In [19]:
from PIL import Image
import requests

# Step 1: Load images
# A single image is used here (val_images_objectsNum1.txt is the related
# split list).
demo_image_one = Image.open("/scratch/nmachav1/CLEVR_v1.0/images/val/CLEVR_val_000006.png").convert("RGB")


# Step 2: Preprocessing images
# Details: For OpenFlamingo, we expect the image to be a torch tensor of shape
#  batch_size x num_media x num_frames x channels x height x width.
#  In this case batch_size = 1, num_media = 1 (one image), num_frames = 1;
#  channels, height and width come from image_processor (3 x 224 x 224
#  according to the upstream example — TODO confirm for this processor).
vision_x = [image_processor(demo_image_one).unsqueeze(0)]
vision_x = torch.cat(vision_x, dim=0).to(device)
vision_x = vision_x.unsqueeze(1).unsqueeze(0)

# Step 3: Preprocessing text
# Details: In the text we expect an <image> special token to indicate where an image is.
#  We also expect an <|endofchunk|> special token to indicate the end of the text
#  portion associated with an image.
question = "Question: What is the big thing that is in front of the block that is behind the block that is in front of the large shiny block made of? Answer: "
tokenizer.padding_side = "left"  # For generation, padding tokens should be on the left
lang_x = tokenizer(
    ["<image>" + question],
    return_tensors="pt",
).to(device)


# Step 4: Generate text
# Inference only, so run without autograd bookkeeping.
with torch.no_grad():
    generated_text = model.generate(
        vision_x=vision_x,
        lang_x=lang_x["input_ids"],
        attention_mask=lang_x["attention_mask"],
        max_new_tokens=50,  # You can adjust this to control the length of the generated text
        num_beams=6,
    )
model_output = tokenizer.decode(generated_text[0])
print(model_output)


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


<image>Question: What is the big thing that is in front of the block that is behind the block that is in front of the large shiny block made of? Answer:  The big thing that is in front of the block that is behind the block that is in front of the large shiny block made of.<|endofchunk|>


In [13]:
# Pull the model's answer out of the decoded generation held in `model_output`
# (set by whichever generation cell ran last): take the text after the final
# "Answer:" marker and drop everything from the first "<|" onwards, i.e. any
# trailing special token. partition() returns the whole string unchanged when
# "<|" does not occur, so no explicit membership test is needed.
answer_index = model_output.split("Answer:")
tail = answer_index[-1]
answer = tail.partition("<|")[0].strip()
print(answer)


The material of the thing that is left of the blue block and on the right side of the big green matte block is the material of the big green matte block.


In [12]:
# Reset the answer file of one difficulty split so that a new run starts from
# an empty file.
# NOTE: `i` is not defined in this cell; it is whatever value the split loop
# left behind, so run this only after that loop (or set `i` explicitly).
# "a+" creates the file if it is missing and never fails on a non-existent
# file; the context manager closes the handle, which the original left open.
with open(f"/scratch/nmachav1/CLEVR_v1.0/OpenFlamingo/answers_difficulty_based_val/num_{i}.txt", "a+") as ansFile:
    ansFile.seek(0)

    # Check if the file is empty (one character is enough to decide)
    is_empty = not bool(ansFile.read(1))

    # If the file is not empty, truncate it to remove existing content
    if not is_empty:
        ansFile.truncate(0)