In [1]:
import torch
import json
from PIL import Image
import requests
from tqdm import tqdm
from transformers import InstructBlipProcessor, InstructBlipForConditionalGeneration

In [2]:
model = InstructBlipForConditionalGeneration.from_pretrained("Salesforce/instructblip-vicuna-7b")


processor = InstructBlipProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b")
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
print(device)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

cuda


In [3]:
IMAGES_PATH = "/scratch/averma90/CLEVR_v1.0/images/val/"
QUES_PATH = "/home/averma90/CSE576/github/CLEVR_v1.0/questions/CLEVR_val_questions.json"

json_file = open(QUES_PATH, 'r')
questions  = json.load(json_file)['questions']

In [None]:
for i in range(1,4):
    img_file = open(f"/home/averma90/CSE576/github/MLLM_Hallucinations/CLEVR_v1/datasetSplits/val_images_objectsNum{i}.txt", "r") 
    img_names = img_file.readlines()
    
    ansFile = open(f"/home/averma90/CSE576/github/CLEVR_v1.0/answers/val/json_num_{i}.json", "a")
    object = []

    for img_name in tqdm(img_names[:100])
        image_ques = []
        image_ans = []
        count = 0
        img = Image.open(IMAGES_PATH+img_name[:-1]).convert("RGB")
        for q in questions:
            if q["image_filename"] == img_name[:-1]:
                image_ques.append(q["question"])
                image_ans.append(q["answer"])
        for iqs in image_ques:
            prompt=iqs
            inputs = processor(images=img, text=prompt, return_tensors="pt").to(device)
            outputs = model.generate(
                **inputs,
                do_sample=False,
                num_beams=5,
                max_length=256,
                min_length=1,
                top_p=0.9,
                repetition_penalty=1.5,
                length_penalty=1.0,
                temperature=1,
            )
            generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip()
            # print(generated_text)
            object.append({'image_id': img_name,
                          'Question': prompt,
                          'Ground truth': image_ans[count],
                          'Model generated answer': generated_text
            })
            # ansFile.write(f"{prompt} Answer: {image_ans[count]}. Output: {generated_text}\n")
            
            # ansFile.write(prompt +" " + image_ans[count] + ": " + generated_text+"\n")
            count+=1

    json.dump(object, ansFile)
    ansFile.close()

In [4]:
def prepareAllQuestionsAnswers(all_questions, image_file_name_list):
    print(len(all_questions))

    questions = []
    answers = []
    images = []
    
    for img_name in tqdm(image_file_name_list[:1]):                                                       ######## CHANGE THIS ###########
        for q in all_questions:
            if q["image_filename"] == img_name[:-1]:
                questions.append(q["question"])
                answers.append(q["answer"])
                images.append(q["image_filename"])

    return images, questions, answers

In [5]:
def prepareDataset(image_list_file, questions):
    image_file_name_list = open(image_list_file, "r").readlines()
    input_images_array, input_questions_array, input_answers_array = prepareAllQuestionsAnswers(questions, image_file_name_list)
    
    print(len(input_images_array), len(input_questions_array), len(input_answers_array))
    
    return input_images_array, input_questions_array, input_answers_array

In [6]:
def inference():
    batch_size = 1
    # for i in range(1,11):
    
    
    for i in range(1,4):
        answers = []
        image_array = []
        imgs_file = f"/home/averma90/CSE576/github/MLLM_Hallucinations/CLEVR_v1/datasetSplits/val_images_objectsNum{i}.txt"
        save_file = f"/home/averma90/CSE576/github/MLLM_Hallucinations/CLEVR_v1/answers/val/json_answers_{i}.json"

        answerObj = []
        
        input_images_array, input_questions_array, input_answers_array = prepareDataset(imgs_file, questions)

        for img_name in tqdm(input_images_array):                                                      
            image_array.append(Image.open(IMAGES_PATH+img_name).convert("RGB"))
            
        image_batches = [image_array[i:i+batch_size] for i in range(0, len(input_images_array), batch_size)]
        questions_batches = [input_questions_array[i:i+batch_size] for i in range(0, len(input_images_array), batch_size)]
        answers_batches = [input_answers_array[i:i+batch_size] for i in range(0, len(input_images_array), batch_size)]

        for index in tqdm(range(len(image_batches))):
            print(image_batches[index], questions_batches[index], answers_batches[index])
            # print(image_batches)
            inputs = processor(images=image_batches[index], text=questions_batches[index], return_tensors="pt", padding=True, truncation=True).to(device)
            print(len(inputs))
            
            outputs = model.generate(
                **inputs,
                do_sample=False,
                use_cache= True,
                num_beams=5,
                max_length=256,
                min_length=None,
                top_p=1.0,
                top_k= 50,
                repetition_penalty=1.0,
                length_penalty=1.0,
                max_padding_length= None,
                temperature=1e-05,
            )

            generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip()

            

        with open(save_file) as f:
            json.dump(answerObj, f)
            
        # print(answers)
    # return answers




In [7]:
inference()

149991


0it [00:00, ?it/s]


0 0 0


0it [00:00, ?it/s]
0it [00:00, ?it/s]


149991


0it [00:00, ?it/s]


0 0 0


0it [00:00, ?it/s]
0it [00:00, ?it/s]


149991


100%|██████████| 1/1 [00:00<00:00, 21.44it/s]


10 10 10


100%|██████████| 10/10 [00:00<00:00, 32.35it/s]
  0%|          | 0/5 [00:00<?, ?it/s]

[<PIL.Image.Image image mode=RGB size=480x320 at 0x154D0C57C990>, <PIL.Image.Image image mode=RGB size=480x320 at 0x154D0C57A4D0>] ['What is the material of the thing that is left of the blue block and on the right side of the big green matte block?', 'Is the shape of the small gray matte thing the same as the object behind the big green rubber object?'] ['rubber', 'no']
5


  0%|          | 0/5 [00:04<?, ?it/s]

tensor([[   2,  715, 6288,    2,    1],
        [   2,  694,    2,    1,   -1]], device='cuda:0')





OverflowError: out of range integral type conversion attempted

In [None]:
for iqs in image_ques[:1]:
    prompt=iqs
    inputs = processor(images=[img, img], text=[prompt, prompt], return_tensors="pt").to(device)
    outputs = model.generate(
        **inputs,
        do_sample=False,
        use_cache= True,
        num_beams=5,
        max\length=256,
        min_length=None,
        top_p=1.0,
        top_k= 50,
        repetition_penalty=1.0,
        length_penalty=1.0,
        max_padding_length= None,
        temperature=1e-05,
    )
    generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip()
    # print(generated_text)

In [14]:
print(processor.batch_decode(outputs, skip_special_tokens=True))

['plastic', 'plastic']


In [7]:
# for i in range(1,11):
    img_file = open(f"/home/averma90/CSE576/github/MLLM_Hallucinations/CLEVR_v1/datasetSplits/val_images_objectsNum{i}.txt", "r") 
    img_names = img_file.readlines()
    
    ansFile = open(f"/home/averma90/CSE576/github/CLEVR_v1.0/answers/val/num_{i}.txt", "a")
    
    for img_name in tqdm(img_names[:100]):
        image_ques = []
        image_ans = []
        count = 0
        img = Image.open(imagesPath+img_name[:-1]).convert("RGB")
        for q in questions["questions"]:
            if q["image_filename"] == img_name[:-1]:
                image_ques.append(q["question"])
                image_ans.append(q["answer"])
        for iqs in image_ques:
            prompt=iqs
            inputs = processor(images=img, text=prompt, return_tensors="pt").to(device)
            outputs = model.generate(
                **inputs,
                do_sample=False,
                num_beams=5,
                max_length=256,
                min_length=1,
                top_p=0.9,
                repetition_penalty=1.5,
                length_penalty=1.0,
                temperature=1,
            )
            generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip()
            # print(generated_text)
            ansFile.write(f"{prompt} Answer: {image_ans[count]}. Output: {generated_text}\n")
            # ansFile.write(prompt +" " + image_ans[count] + ": " + generated_text+"\n")
            count+=1
    ansFile.close()

0it [00:00, ?it/s]
0it [00:00, ?it/s]
100%|██████████| 100/100 [23:19<00:00, 14.00s/it]
100%|██████████| 100/100 [20:38<00:00, 12.38s/it]
100%|██████████| 100/100 [20:35<00:00, 12.35s/it]
100%|██████████| 100/100 [21:10<00:00, 12.71s/it]
100%|██████████| 100/100 [20:21<00:00, 12.22s/it]
100%|██████████| 100/100 [18:04<00:00, 10.84s/it]
100%|██████████| 100/100 [21:03<00:00, 12.64s/it]
100%|██████████| 100/100 [19:55<00:00, 11.96s/it]


In [6]:
# for objects_type in ['inter', 'intra']:
for objects_type in ['intra']:
    img_file = open(f"/home/averma90/CSE576/github/MLLM_Hallucinations/CLEVR_v1/datasetSplits/val_images_{objects_type}.txt", "r") 
    img_names = img_file.readlines()
    
    ansFile = open(f"/home/averma90/CSE576/github/CLEVR_v1.0/answers/val/type_{objects_type}.txt", "a")
    
    for img_name in tqdm(img_names[:315]):
        image_ques = []
        image_ans = []
        count = 0
        img = Image.open(imagesPath+img_name[:-1]).convert("RGB")
        for q in questions["questions"]:
            if q["image_filename"] == img_name[:-1]:
                image_ques.append(q["question"])
                image_ans.append(q["answer"])
        for iqs in image_ques:
            prompt=iqs
            inputs = processor(images=img, text=prompt, return_tensors="pt").to(device)
            outputs = model.generate(
                **inputs,
                do_sample=False,
                num_beams=5,
                max_length=256,
                min_length=1,
                top_p=0.9,
                repetition_penalty=1.5,
                length_penalty=1.0,
                temperature=1,
            )
            generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip()
            # print(generated_text)
            ansFile.write(f"{prompt} Answer: {image_ans[count]}. Output: {generated_text}\n")
            # ansFile.write(prompt +" " + image_ans[count] + ": " + generated_text+"\n")
            count+=1
    ansFile.close()

100%|██████████| 315/315 [1:04:35<00:00, 12.30s/it]


In [4]:
!nvidia-smi

Sat Nov  4 00:01:59 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 520.61.05    Driver Version: 520.61.05    CUDA Version: 11.8     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  On   | 00000000:41:00.0 Off |                    0 |
| N/A   30C    P0    71W / 500W |  72199MiB / 81920MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces