# Set-up Colab

In [None]:
!pip install git+https://github.com/huggingface/transformers.git
!pip install accelerate bitsandbytes

In [None]:
!git clone https://github.com/bliminate/Caption-Anything-CSCI-7000.git

In [None]:
!pip install -r Caption_Anything/requirements.txt

In [1]:
import tensorflow as tf
from PIL import Image
import torch
import numpy as np
from transformers import AutoConfig, AutoProcessor, AutoModelForVision2Seq, Blip2ForConditionalGeneration
from accelerate import infer_auto_device_map, init_empty_weights
from caption_anything.captioner import build_captioner, BaseCaptioner
from caption_anything.utils.parser import parse_augment

tf.config.list_physical_devices()

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'),
 PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [None]:
!pwd
!hostname
!ls
!mkdir data
!tar -xzf 100-images-dataset.tgz -C data/
!tar -xzf 100-images-bitmasks.tgz -C data/

/content
1e6185480f26
100-images-bitmasks.tgz  caption_anything  sample_data
100-images-dataset.tgz	 Caption_Anything


In [None]:
# Build BLIP-2 (OPT-2.7b) from its config alone, inside accelerate's
# init_empty_weights() context, and ask accelerate where each module would go.
# NOTE(review): presumably this avoids allocating the real weights - confirm.
config = AutoConfig.from_pretrained("Salesforce/blip2-opt-2.7b")
with init_empty_weights():
    model = AutoModelForVision2Seq.from_config(config)

model.tie_weights()

device_map = infer_auto_device_map(model)

# Informational only: the loading cell below passes a literal {"": 0} rather
# than this variable.
print(device_map)

config.json:   0%|          | 0.00/6.96k [00:00<?, ?B/s]

{'': 0}


In [None]:
# Load the BLIP-2 processor and an 8-bit model pinned to GPU 0.
# device_map={"": 0} places every module on CUDA device 0, so a GPU is required;
# fail early with a clear message instead of crashing inside mem_get_info().
if not torch.cuda.is_available():
    raise RuntimeError("This cell loads BLIP-2 in 8-bit on GPU 0, but no CUDA device is available.")
device = "cuda"

# Budget = currently free GPU memory minus ~4 GiB of headroom, never below 1 GiB
# (the unclamped value could become "0GB" or negative on a busy GPU).
free_gib = int(torch.cuda.mem_get_info()[0] / 1024**3)
max_mem = f'{max(free_gib - 4, 1)}GB'
n_gpus = torch.cuda.device_count()
print("Device: ", device)
print("Cuda Mem: ", max_mem)
max_memory = {i: max_mem for i in range(n_gpus)}
processor = AutoProcessor.from_pretrained("Salesforce/blip2-opt-2.7b")
captioning_model = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-opt-2.7b",
    device_map={"": 0},
    load_in_8bit=True,
    max_memory=max_memory,
    low_cpu_mem_usage=True,
    offload_folder="offload",
    offload_state_dict=True,
)


Device:  cuda
Cuda Mem:  10GB


preprocessor_config.json:   0%|          | 0.00/432 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/904 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/548 [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/122k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/10.0G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/5.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

# Experiments

In [2]:
import sys

# Replace the kernel's own command line in place with a placeholder program
# name plus the single flag we want (parse_augment() presumably reads sys.argv).
sys.argv[:] = ["colab_kernel_launcher.py", "--clip_filter"]

parsed_args = parse_augment()
captioner = build_captioner(
    parsed_args.captioner,
    parsed_args.device,
    parsed_args,
)

Initializing ImageCaptioning to cuda:0


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
# Single-image check: caption mask 0 of sa_223769 with an empty text prompt.
image = Image.open("data/sa_223769.jpg")
seg_mask = Image.open("data/sa_223769_0_bitmask.jpg")

# Options handed to the captioner as `caption_args`. This dict is a notebook
# global: later cells pass it on and rewrite or delete its 'text_prompt' entry.
cap_args = {
    'return_ppl': False,
    'clip_filter': True,
    'reference_caption': "",
    'text_prompt': "",  # empty here; when the key is absent the logs show 'Question: what does the image show? Answer:' being used
    'seg_crop_mode': 'w_bg',
    # alternative settings kept for reference:
    # 'text_prompt': "",
    # 'seg_crop_mode': 'wo_bg',
    'disable_regular_box': False,
    'topN': 1,
    'min_ppl_score': -1.8,
    'min_clip_score': 0.30,
    'min_mask_area': 2500,
}

# The crop mode and box flag are passed explicitly in addition to the dict itself.
result = captioner.inference_seg(image, seg_mask,
                                  crop_mode=cap_args['seg_crop_mode'],
                                  filter=True,
                                  disable_regular_box=cap_args['disable_regular_box'],
                                  verbose=False,
                                  caption_args=cap_args)

print(result)

Using text prompt: ...
reference caption: , caption: a statue of a woman standing in the dark with a starry sky behind her
Clip score of the caption is 0.341552734375

Processed ImageCaptioning by BLIP2Captioner, Output Text: a statue of a woman standing in the dark with a starry sky behind her
{'clip_score': 0.341552734375, 'caption': 'a statue of a woman standing in the dark with a starry sky behind her', 'crop_save_path': None}


In [6]:
def visual_chain_of_thought(image, seg_mask, cap_args, verb = False):
  """Caption a masked region in two stages (visual chain of thought).

  Stage 1 captions the region with crop_mode='wo_bg' and no 'text_prompt'
  entry, so the captioner falls back to its own default prompt. Stage 2
  captions it again with crop_mode='w_bg', prompted with
  "Describe the <stage-1 caption> in the picture. Answer:".

  Args:
    image: image passed straight through to captioner.inference_seg.
    seg_mask: segmentation mask passed straight through as well.
    cap_args: caption options; must contain 'disable_regular_box'. It is
      copied, so the caller's dict is left untouched.
    verb: forwarded as `verbose` to both captioner calls.

  Returns:
    (cot_cap, raw_cap): the stage-2 result followed by the stage-1 result,
    each exactly as returned by captioner.inference_seg.
  """
  # Private copy: the original code deleted and overwrote 'text_prompt' in the
  # caller's dict, leaking the last prompt into whatever cell ran next.
  stage_args = dict(cap_args)
  stage_args.pop('text_prompt', None)
  raw_cap = captioner.inference_seg(
      image, seg_mask,
      crop_mode='wo_bg',
      filter=True,
      disable_regular_box=stage_args['disable_regular_box'],
      verbose=verb,
      caption_args=stage_args)

  # The same dict object is reused for stage 2, as in the original flow.
  stage_args['text_prompt'] = "Describe the " + raw_cap['caption'] + " in the picture. Answer:"
  cot_cap = captioner.inference_seg(
      image, seg_mask,
      crop_mode='w_bg',
      filter=True,
      disable_regular_box=stage_args['disable_regular_box'],
      verbose=verb,
      caption_args=stage_args)
  return cot_cap, raw_cap

In [None]:
# Run both chain variants, with verbose output, on mask 1 of sa_224638.
# NOTE(review): visual_chain_of_thought_wo_crop is defined in a later cell of
# this notebook; that cell has to be executed before this one.
image = Image.open("data/sa_224638.jpg")
seg_mask = Image.open("data/sa_224638_1_bitmask.jpg")
caption = visual_chain_of_thought(image, seg_mask, cap_args, True)
print(caption)

uncrop_cap = visual_chain_of_thought_wo_crop(image, seg_mask, cap_args, True)
print(uncrop_cap)

croped image saved in result/crop_1702700569.9584138.png
Using text prompt: Question: what does the image show? Answer:...
reference caption: , caption: a man in a suit
Clip score of the caption is 0.27880859375

Processed ImageCaptioning by BLIP2Captioner, Output Text: a man in a suit
croped image saved in result/crop_1702700571.6884444.png
Using text prompt: Describe the a man in a suit in the picture. Answer:...
reference caption: , caption: a man in a suit
Clip score of the caption is 0.278076171875

Processed ImageCaptioning by BLIP2Captioner, Output Text: a man in a suit
({'clip_score': 0.278076171875, 'caption': 'a man in a suit', 'crop_save_path': 'result/crop_1702700571.6884444.png'}, {'clip_score': 0.27880859375, 'caption': 'a man in a suit', 'crop_save_path': 'result/crop_1702700569.9584138.png'})
croped image saved in result/crop_1702700573.526685.png
Using text prompt: Question: what does the image show? Answer:...
reference caption: , caption: a man in a suit
Clip score o

In [None]:
# Non-verbose run on whatever `image` / `seg_mask` are currently in the kernel.
# NOTE(review): the saved output shows a single dict, while
# visual_chain_of_thought as defined in this notebook returns a
# (cot_cap, raw_cap) tuple - the output looks stale.
caption = visual_chain_of_thought(image, seg_mask, cap_args)
print(caption)

reference caption: , caption: a white square with a black background
Clip score of the caption is 0.263427734375

Processed ImageCaptioning by BLIP2Captioner, Output Text: a white square with a black background
reference caption: , caption: the statue of the virgin mary
Clip score of the caption is 0.291259765625

Processed ImageCaptioning by BLIP2Captioner, Output Text: the statue of the virgin mary
{'clip_score': 0.291259765625, 'caption': 'the statue of the virgin mary', 'crop_save_path': None}


In [None]:
# Build one row per image: [image_path, mask_0, ..., mask_4].
# image_file_names.txt holds one path stem per line (e.g. "data/sa_223769").
MASKS_PER_IMAGE = 5
MASK_SUFFIX = "_bitmask.jpg"

images = []
with open("image_file_names.txt") as f:
  for line in f:
    stem = line.strip()
    if not stem:
      # A blank (e.g. trailing) line would otherwise become a bogus ".jpg" row.
      continue
    mask_paths = [f"{stem}_{k}{MASK_SUFFIX}" for k in range(MASKS_PER_IMAGE)]
    images.append([stem + ".jpg", *mask_paths])

print(images)

[['data/sa_223769.jpg', 'data/sa_223769_0_bitmask.jpg', 'data/sa_223769_1_bitmask.jpg', 'data/sa_223769_2_bitmask.jpg', 'data/sa_223769_3_bitmask.jpg', 'data/sa_223769_4_bitmask.jpg'], ['data/sa_223795.jpg', 'data/sa_223795_0_bitmask.jpg', 'data/sa_223795_1_bitmask.jpg', 'data/sa_223795_2_bitmask.jpg', 'data/sa_223795_3_bitmask.jpg', 'data/sa_223795_4_bitmask.jpg'], ['data/sa_223829.jpg', 'data/sa_223829_0_bitmask.jpg', 'data/sa_223829_1_bitmask.jpg', 'data/sa_223829_2_bitmask.jpg', 'data/sa_223829_3_bitmask.jpg', 'data/sa_223829_4_bitmask.jpg'], ['data/sa_223883.jpg', 'data/sa_223883_0_bitmask.jpg', 'data/sa_223883_1_bitmask.jpg', 'data/sa_223883_2_bitmask.jpg', 'data/sa_223883_3_bitmask.jpg', 'data/sa_223883_4_bitmask.jpg'], ['data/sa_224139.jpg', 'data/sa_224139_0_bitmask.jpg', 'data/sa_224139_1_bitmask.jpg', 'data/sa_224139_2_bitmask.jpg', 'data/sa_224139_3_bitmask.jpg', 'data/sa_224139_4_bitmask.jpg'], ['data/sa_224249.jpg', 'data/sa_224249_0_bitmask.jpg', 'data/sa_224249_1_bitmas

# Base VCT (Visual Chain of Thought) Implementation

In [None]:
#load numpy array from string?
!rm result/crop*

In [None]:
# Smoke test: run the VCT pipeline on every mask of the first image only and
# report the mean CLIP score of the final captions.
clip_sum = 0
count = 0
image = images[0]

print("generating captions for image: " +  str(count) + " " + image[0] )

# image[0] is the picture; image[1:] are its mask paths (numbered from 1 in the log).
mask_paths = image[1:]
for i, mask_path in enumerate(mask_paths, start=1):
  print("Generating caption for mask ", i)
  img = Image.open(image[0])
  seg_mask = Image.open(mask_path)

  # visual_chain_of_thought returns (final caption, intermediate caption);
  # indexing the tuple with "clip_score" raised a TypeError.
  result, raw_cap = visual_chain_of_thought(img, seg_mask, cap_args)
  clip_sum += result["clip_score"]

avg_clip = clip_sum/len(mask_paths)
print("Average clip score: ", avg_clip)




In [None]:
# Baseline VCT over the whole dataset: caption every (image, mask) pair with
# visual_chain_of_thought, log both stages to base_VCT_out.txt, and report the
# mean and maximum CLIP score of the final captions.
clip_sum = 0
count = 0
num_captions = 0  # captions actually scored; the averaging denominator
max_clip = 0
max_clip_img = ""

# `with` closes (and flushes) the log even if a caption call raises mid-run.
with open("base_VCT_out.txt", "w") as outfile:
  outfile.write("Caption 1: " + 'Question: what does the image show? Answer:' + "\n")
  outfile.write("Caption 2: " + "Describe the {} in the picture. Answer:" + "\n")

  for image in images:
    print("generating captions for image: " +  str(count) + " " + image[0] )
    outfile.write("Image: " + image[0] + "\n")
    count += 1
    # image[0] is the picture; image[1:] are its mask paths, numbered from 1.
    for i, mask_path in enumerate(image[1:], start=1):
      print("Generating caption for mask ", i)
      outfile.write("Mask: " + str(i) + "\n")
      img = Image.open(image[0])
      seg_mask = Image.open(mask_path)

      result, raw_cap = visual_chain_of_thought(img, seg_mask, cap_args)
      clip_sum += result["clip_score"]
      num_captions += 1
      outfile.writelines(["Final Result: ", str(result), '\n', "Intermediate Result: ", str(raw_cap), "\n"])
      if result["clip_score"] > max_clip:
        max_clip = result["clip_score"]
        max_clip_img = image[0]

  # Average over what was really processed instead of assuming 100 images x 5 masks.
  avg_clip = clip_sum/num_captions if num_captions else 0.0
  print("Average clip score: ", avg_clip)
  print(f"Max Clip score was {max_clip} for {max_clip_img}")
  outfile.write(f"Max Clip score was {max_clip} for {max_clip_img}\n")
  outfile.write("Average clip score: " + str(avg_clip))

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Clip score of the caption is 0.296142578125

Processed ImageCaptioning by BLIP2Captioner, Output Text: a white shirt with a red and blue belt
Generating caption for mask  5
Using text prompt: Question: what does the image show? Answer:...
reference caption: , caption: the image shows a pair of white pants with a yellow stripe
Clip score of the caption is 0.347900390625

Processed ImageCaptioning by BLIP2Captioner, Output Text: the image shows a pair of white pants with a yellow stripe
Using text prompt: Describe the the image shows a pair of white pants with a yellow stripe in the picture. Answer:...
reference caption: , caption: a pair of white pants with a yellow stripe
Clip score of the caption is 0.30224609375

Processed ImageCaptioning by BLIP2Captioner, Output Text: a pair of white pants with a yellow stripe
generating captions for image: 11 data/sa_224845.jpg
Generating caption for mask  1
Using text prompt: Questi

# Base prompted caption

In [None]:
# Single-stage baseline: caption every (image, mask) pair once with
# crop_mode='w_bg' and no 'text_prompt' entry (the captioner then uses its own
# default prompt), log to base_single_prompted_caption_out.txt, and report the
# mean and maximum CLIP score.
single_cap_clip_sum = 0
count = 0
num_captions = 0  # captions actually scored; the averaging denominator
max_clip = 0
max_clip_img = ""

# `with` closes (and flushes) the log even if a caption call raises mid-run.
with open("base_single_prompted_caption_out.txt", "w") as outfile:
  outfile.write("Caption 1: " + 'Question: what does the image show? Answer:' + "\n")

  for image in images:
    print("generating captions for image: " +  str(count) + " " + image[0] )
    outfile.write("Image: " + image[0] + "\n")
    count += 1
    # image[0] is the picture; image[1:] are its mask paths, numbered from 1.
    for i, mask_path in enumerate(image[1:], start=1):
      print("Generating caption for mask ", i)
      outfile.write("Mask: " + str(i) + "\n")
      img = Image.open(image[0])
      seg_mask = Image.open(mask_path)

      # Fresh prompt-free copy per call, so the global cap_args is not mutated
      # as a side effect of running this cell.
      prompt_free_args = {k: v for k, v in cap_args.items() if k != 'text_prompt'}
      result = captioner.inference_seg(img, seg_mask,
                                       crop_mode='w_bg',
                                       filter=True,
                                       disable_regular_box=prompt_free_args['disable_regular_box'],
                                       verbose=False,
                                       caption_args=prompt_free_args)

      single_cap_clip_sum += result["clip_score"]
      num_captions += 1
      outfile.writelines(["Final Result: ", str(result), '\n'])
      if result["clip_score"] > max_clip:
        max_clip = result["clip_score"]
        max_clip_img = image[0]

  # Average over what was really processed instead of assuming 100 images x 5 masks.
  single_cap_avg_clip = single_cap_clip_sum/num_captions if num_captions else 0.0
  print("Average clip score: ", single_cap_avg_clip)
  print(f"Max Clip score was {max_clip} for {max_clip_img}")
  outfile.write(f"Max Clip score was {max_clip} for {max_clip_img}\n")
  outfile.write("Average clip score: " + str(single_cap_avg_clip))

generating captions for image: 0 data/sa_223769.jpg
Generating caption for mask  1
Using text prompt: Question: what does the image show? Answer:...
reference caption: , caption: the statue of a woman holding a baby
Clip score of the caption is 0.275390625

Processed ImageCaptioning by BLIP2Captioner, Output Text: the statue of a woman holding a baby
Generating caption for mask  2
Using text prompt: Question: what does the image show? Answer:...
reference caption: , caption: the statue of a woman holding a child
Clip score of the caption is 0.270263671875

Processed ImageCaptioning by BLIP2Captioner, Output Text: the statue of a woman holding a child
Generating caption for mask  3
Using text prompt: Question: what does the image show? Answer:...
reference caption: , caption: a lion in a tree
Clip score of the caption is 0.22021484375

Processed ImageCaptioning by BLIP2Captioner, Output Text: a lion in a tree
Generating caption for mask  4
Using text prompt: Question: what does the imag

# VCT Without Cropping

In [5]:
def visual_chain_of_thought_wo_crop(image, seg_mask, cap_args, verb=False):
  """Two-stage caption whose second stage runs on the whole image.

  Stage 1 captions the masked region (crop_mode='wo_bg') with the prompt
  'Question: what does the image show? Answer:'. Stage 2 calls
  captioner.inference on the full image, prompted with
  "Describe the <stage-1 caption> in the picture. Answer:".

  Args:
    image: image passed through to the captioner.
    seg_mask: segmentation mask, used by stage 1 only.
    cap_args: caption options; must contain 'disable_regular_box'. It is
      copied, so the caller's dict is left untouched.
    verb: forwarded as `verbose` to the stage-1 call only.

  Returns:
    (cot_cap, raw_cap): the stage-2 result followed by the stage-1 result.
  """
  # Private copy: the original code overwrote 'text_prompt' in the caller's
  # dict, leaking the last prompt into whatever cell ran next.
  stage_args = dict(cap_args)
  stage_args['text_prompt'] = 'Question: what does the image show? Answer:'
  raw_cap = captioner.inference_seg(
      image, seg_mask,
      crop_mode='wo_bg',
      filter=True,
      disable_regular_box=stage_args['disable_regular_box'],
      verbose=verb,
      caption_args=stage_args)

  # The same dict object is reused for stage 2, as in the original flow.
  stage_args['text_prompt'] = "Describe the " + raw_cap['caption'] + " in the picture. Answer:"
  cot_cap = captioner.inference(image,
                                filter=True,
                                args=stage_args)
  return cot_cap, raw_cap

In [None]:
# VCT without cropping in the second stage, over the whole dataset: caption
# every (image, mask) pair with visual_chain_of_thought_wo_crop, log both
# stages to base_VCT_wo_crop_out.txt, and report the mean and maximum CLIP score.
clip_sum_nocrop = 0
count = 0
num_captions = 0  # captions actually scored; the averaging denominator
max_clip = 0
max_clip_img = ""

# `with` closes (and flushes) the log even if a caption call raises mid-run.
with open("base_VCT_wo_crop_out.txt", "w") as outfile:
  outfile.write("Caption 1: " + 'Question: what does the image show? Answer:' + "\n")
  outfile.write("Caption 2: " + "Describe the {} in the picture. Answer:" + "\n")

  for image in images:
    print("generating captions for image: " +  str(count) + " " + image[0] )
    outfile.write("Image: " + image[0] + "\n")
    count += 1
    # image[0] is the picture; image[1:] are its mask paths, numbered from 1.
    for i, mask_path in enumerate(image[1:], start=1):
      print("Generating caption for mask ", i)
      outfile.write("Mask: " + str(i) + "\n")
      img = Image.open(image[0])
      seg_mask = Image.open(mask_path)

      result, raw_cap = visual_chain_of_thought_wo_crop(img, seg_mask, cap_args)
      clip_sum_nocrop += result["clip_score"]
      num_captions += 1
      outfile.writelines(["Final Result: ", str(result), '\n', "Intermediate Result: ", str(raw_cap), "\n"])
      if result["clip_score"] > max_clip:
        max_clip = result["clip_score"]
        max_clip_img = image[0]

  # Average over what was really processed instead of assuming 100 images x 5 masks.
  avg_clip_nocrop = clip_sum_nocrop/num_captions if num_captions else 0.0
  print("Average clip score: ", avg_clip_nocrop)
  print(f"Max Clip score was {max_clip} for {max_clip_img}")
  outfile.write(f"Max Clip score was {max_clip} for {max_clip_img}\n")
  outfile.write("Average clip score: " + str(avg_clip_nocrop))

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Clip score of the caption is 0.24560546875

Processed ImageCaptioning by BLIP2Captioner, Output Text: the image shows a white shirt with a black collar in the picture
Generating caption for mask  5
Using text prompt: Question: what does the image show? Answer:...
reference caption: , caption: the image shows a pair of white pants with a yellow stripe
Clip score of the caption is 0.347900390625

Processed ImageCaptioning by BLIP2Captioner, Output Text: the image shows a pair of white pants with a yellow stripe
Using text prompt: Describe the the image shows a pair of white pants with a yellow stripe in the picture. Answer:...
reference caption: , caption: the image shows a pair of white pants with a yellow stripe in the picture
Clip score of the caption is 0.279541015625

Processed ImageCaptioning by BLIP2Captioner, Output Text: the image shows a pair of white pants with a yellow stripe in the picture
generating captions f

# VCT With Different Captions

In [4]:
def visual_chain_of_thought_alt_cap(image, seg_mask, cap_args, cap1, cap2):
  """Two-stage caption with caller-supplied prompts.

  Stage 1 captions the region with crop_mode='wo_bg' using prompt `cap1`.
  Stage 2 captions it with crop_mode='w_bg' using `cap2` with the stage-1
  caption substituted for its "{}" placeholder.

  Args:
    image: image passed through to captioner.inference_seg.
    seg_mask: segmentation mask passed through as well.
    cap_args: caption options; must contain 'disable_regular_box'. It is
      copied, so the caller's dict is left untouched.
    cap1: text prompt for stage 1.
    cap2: str.format template for the stage-2 prompt.

  Returns:
    (cot_cap, raw_cap): the stage-2 result followed by the stage-1 result.
  """
  # Private copy: the original code overwrote 'text_prompt' in the caller's
  # dict, leaking the last prompt into whatever cell ran next.
  stage_args = dict(cap_args)
  stage_args['text_prompt'] = cap1
  raw_cap = captioner.inference_seg(
      image, seg_mask,
      crop_mode='wo_bg',
      filter=True,
      disable_regular_box=stage_args['disable_regular_box'],
      verbose=False,
      caption_args=stage_args)

  # The same dict object is reused for stage 2, as in the original flow.
  stage_args['text_prompt'] = cap2.format(raw_cap['caption'])
  cot_cap = captioner.inference_seg(
      image, seg_mask,
      crop_mode='w_bg',
      filter=True,
      disable_regular_box=stage_args['disable_regular_box'],
      verbose=False,
      caption_args=stage_args)
  return cot_cap, raw_cap

In [None]:
# sa_224638 investigation (trombone player): compare prompt pairs on mask 1.
image = Image.open("data/sa_224638.jpg")
seg_mask = Image.open("data/sa_224638_1_bitmask.jpg")

# alt 4, then alt 1: both go through visual_chain_of_thought_alt_cap
for cap1, cap2 in (
    ('Question: Describe this object? Answer:', "Tell me about the {}. Answer:"),
    ('Question: What is this? Answer:', "Describe the {} in the picture. Answer:"),
):
  caption = visual_chain_of_thought_alt_cap(image, seg_mask, cap_args, cap1, cap2)
  print(caption)

# alt 5: the variant that uses crop_mode='w_bg' for both stages
cap1 = 'Question: what does the image show? Answer:'
cap2 = "Describe the {} in the picture. Answer:"
caption = visual_chain_of_thought_alt_cap_w_bg(image, seg_mask, cap_args, cap1, cap2)
print(caption)

Using text prompt: Question: Describe this object? Answer:...
reference caption: , caption: A man in a suit
Clip score of the caption is 0.27880859375

Processed ImageCaptioning by BLIP2Captioner, Output Text: A man in a suit
Using text prompt: Tell me about the A man in a suit. Answer:...
reference caption: , caption: he's a trombonist
Clip score of the caption is 0.2403564453125

Processed ImageCaptioning by BLIP2Captioner, Output Text: he's a trombonist
({'clip_score': 0.2403564453125, 'caption': "he's a trombonist", 'crop_save_path': None}, {'clip_score': 0.27880859375, 'caption': 'A man in a suit', 'crop_save_path': None})
Using text prompt: Question: What is this? Answer:...
reference caption: , caption: A man in a suit
Clip score of the caption is 0.27880859375

Processed ImageCaptioning by BLIP2Captioner, Output Text: A man in a suit
Using text prompt: Describe the A man in a suit in the picture. Answer:...
reference caption: , caption: A man in a suit
Clip score of the caption

In [8]:
# sa_224859 (facemask) investigation: run every chain variant on mask 2.

image = Image.open("data/sa_224859.jpg")
seg_mask = Image.open("data/sa_224859_2_bitmask.jpg")

# Baseline chain, verbose
caption = visual_chain_of_thought(image, seg_mask, cap_args, True)
print("Baseline VCT:")
print(caption)

# Baseline chain whose second stage sees the uncropped image, verbose
uncrop_cap = visual_chain_of_thought_wo_crop(image, seg_mask, cap_args, True)
print("VCT w/o crop:")
print(uncrop_cap)

# Alt 1-4: the same chain function with different (stage-1, stage-2) prompts
alt_prompt_pairs = [
    ("Alt 1", 'Question: What is this? Answer:', "Describe the {} in the picture. Answer:"),
    ("Alt 2", 'Question: what does the image show? Answer:', "What is the {} doing in the image. Answer:"),
    ("Alt 3", 'Question: What is this? Answer:', "What is the {} doing in the image. Answer:"),
    ("Alt 4", 'Question: Describe this object? Answer:', "Tell me about the {}. Answer:"),
]
for label, cap1, cap2 in alt_prompt_pairs:
  caption = visual_chain_of_thought_alt_cap(image, seg_mask, cap_args, cap1, cap2)
  print(label)
  print(caption)

# Alt 5: baseline prompts, but crop_mode='w_bg' in both stages
cap1 = 'Question: what does the image show? Answer:'
cap2 = "Describe the {} in the picture. Answer:"
caption = visual_chain_of_thought_alt_cap_w_bg(image, seg_mask, cap_args, cap1, cap2)
print("Alt 5")
print(caption)

croped image saved in result/crop_1702847055.1174333.png
Using text prompt: Question: what does the image show? Answer:...
reference caption: , caption: a cat
Clip score of the caption is 0.243408203125

Processed ImageCaptioning by BLIP2Captioner, Output Text: a cat
croped image saved in result/crop_1702847056.1493406.png
Using text prompt: Describe the a cat in the picture. Answer:...
reference caption: , caption: a cat
Clip score of the caption is 0.212158203125

Processed ImageCaptioning by BLIP2Captioner, Output Text: a cat
Baseline VCT:
({'clip_score': 0.212158203125, 'caption': 'a cat', 'crop_save_path': 'result/crop_1702847056.1493406.png'}, {'clip_score': 0.243408203125, 'caption': 'a cat', 'crop_save_path': 'result/crop_1702847055.1174333.png'})
croped image saved in result/crop_1702847057.2380269.png
Using text prompt: Question: what does the image show? Answer:...
reference caption: , caption: a cat
Clip score of the caption is 0.243408203125

Processed ImageCaptioning by B

In [None]:
# sa_233276 investigation (per the saved output, a Juventus shirt): compare
# prompt pairs on mask 2.
image = Image.open("data/sa_233276.jpg")
seg_mask = Image.open("data/sa_233276_2_bitmask.jpg")

# alt 4, then alt 1: both go through visual_chain_of_thought_alt_cap
for cap1, cap2 in (
    ('Question: Describe this object? Answer:', "Tell me about the {}. Answer:"),
    ('Question: What is this? Answer:', "Describe the {} in the picture. Answer:"),
):
  caption = visual_chain_of_thought_alt_cap(image, seg_mask, cap_args, cap1, cap2)
  print(caption)

# alt 5: the variant that uses crop_mode='w_bg' for both stages
cap1 = 'Question: what does the image show? Answer:'
cap2 = "Describe the {} in the picture. Answer:"
caption = visual_chain_of_thought_alt_cap_w_bg(image, seg_mask, cap_args, cap1, cap2)
print(caption)

Using text prompt: Question: Describe this object? Answer:...
reference caption: , caption: A black and pink tie dye shirt with the word juventus on the front
Clip score of the caption is 0.365478515625

Processed ImageCaptioning by BLIP2Captioner, Output Text: A black and pink tie dye shirt with the word juventus on the front
Using text prompt: Tell me about the A black and pink tie dye shirt with the word juventus on the front. Answer:...
reference caption: , caption: it's a juventus shirt
Clip score of the caption is 0.2734375

Processed ImageCaptioning by BLIP2Captioner, Output Text: it's a juventus shirt
({'clip_score': 0.2734375, 'caption': "it's a juventus shirt", 'crop_save_path': None}, {'clip_score': 0.365478515625, 'caption': 'A black and pink tie dye shirt with the word juventus on the front', 'crop_save_path': None})
Using text prompt: Question: What is this? Answer:...
reference caption: , caption: This is a shirt with a black and pink tie dye print
Clip score of the capt

In [None]:
# Alternate prompts #1 over the whole dataset: caption every (image, mask) pair
# with visual_chain_of_thought_alt_cap, log both stages to
# alternate_captions1_out.txt, and report the mean and maximum CLIP score.
clip_sum_altcap1 = 0
count = 0
num_captions = 0  # captions actually scored; the averaging denominator
cap1 = 'Question: What is this? Answer:'
cap2 = "Describe the {} in the picture. Answer:"
max_clip = 0
max_clip_img = ""

# `with` closes (and flushes) the log even if a caption call raises mid-run.
with open("alternate_captions1_out.txt", "w") as outfile:
  outfile.write("Caption 1: " + cap1 + "\n")
  outfile.write("Caption 2: " + cap2 + "\n")

  for image in images:
    print("generating captions for image: " +  str(count) + " " + image[0] )
    outfile.write("Image: " + image[0] + "\n")
    count += 1
    # image[0] is the picture; image[1:] are its mask paths, numbered from 1.
    for i, mask_path in enumerate(image[1:], start=1):
      print("Generating caption for mask ", i)
      outfile.write("Mask: " + str(i) + "\n")
      img = Image.open(image[0])
      seg_mask = Image.open(mask_path)

      result, raw_cap = visual_chain_of_thought_alt_cap(img, seg_mask, cap_args, cap1, cap2)
      clip_sum_altcap1 += result["clip_score"]
      num_captions += 1
      outfile.writelines(["Final Result: ", str(result), '\n', "Intermediate Result: ", str(raw_cap), "\n"])
      if result["clip_score"] > max_clip:
        max_clip = result["clip_score"]
        max_clip_img = image[0]

  # Average over what was really processed instead of assuming 100 images x 5 masks.
  avg_clip_altcap1 = clip_sum_altcap1/num_captions if num_captions else 0.0
  print("Average clip score: ", avg_clip_altcap1)
  print(f"Max Clip score was {max_clip} for {max_clip_img}")
  outfile.write(f"Max Clip score was {max_clip} for {max_clip_img}\n")
  outfile.write("Average clip score: " + str(avg_clip_altcap1))

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Clip score of the caption is 0.2166748046875

Processed ImageCaptioning by BLIP2Captioner, Output Text: a white shirt with a white collar and a white shirt with a white collar
Generating caption for mask  5
Using text prompt: Question: What is this? Answer:...
reference caption: , caption: This is a pair of pants that is made of a material that is very soft and comfortable.
Clip score of the caption is 0.3154296875

Processed ImageCaptioning by BLIP2Captioner, Output Text: This is a pair of pants that is made of a material that is very soft and comfortable.
Using text prompt: Describe the This is a pair of pants that is made of a material that is very soft and comfortable. in the picture. Answer:...
reference caption: , caption: 
Clip score of the caption is 0.2440185546875

Processed ImageCaptioning by BLIP2Captioner, Output Text: 
generating captions for image: 11 data/sa_224845.jpg
Generating caption for mask  1
Using 

In [None]:
# Alternate prompts #2 over the whole dataset: caption every (image, mask) pair
# with visual_chain_of_thought_alt_cap, log both stages to
# alternate_captions2_out.txt, and report the mean and maximum CLIP score.
clip_sum_altcap2 = 0
count = 0
num_captions = 0  # captions actually scored; the averaging denominator
cap1 = 'Question: what does the image show? Answer:'
cap2 = "What is the {} doing in the image. Answer:"
max_clip = 0
max_clip_img = ""

# `with` closes (and flushes) the log even if a caption call raises mid-run.
with open("alternate_captions2_out.txt", "w") as outfile:
  outfile.write("Caption 1: " + cap1 + "\n")
  outfile.write("Caption 2: " + cap2 + "\n")

  for image in images:
    print("generating captions for image: " +  str(count) + " " + image[0] )
    outfile.write("Image: " + image[0] + "\n")
    count += 1
    # image[0] is the picture; image[1:] are its mask paths, numbered from 1.
    for i, mask_path in enumerate(image[1:], start=1):
      print("Generating caption for mask ", i)
      outfile.write("Mask: " + str(i) + "\n")
      img = Image.open(image[0])
      seg_mask = Image.open(mask_path)

      result, raw_cap = visual_chain_of_thought_alt_cap(img, seg_mask, cap_args, cap1, cap2)
      clip_sum_altcap2 += result["clip_score"]
      num_captions += 1
      outfile.writelines(["Final Result: ", str(result), '\n', "Intermediate Result: ", str(raw_cap), "\n"])
      if result["clip_score"] > max_clip:
        max_clip = result["clip_score"]
        max_clip_img = image[0]

  # Average over what was really processed instead of assuming 100 images x 5 masks.
  avg_clip_altcap2 = clip_sum_altcap2/num_captions if num_captions else 0.0
  print("Average clip score: ", avg_clip_altcap2)
  print(f"Max Clip score was {max_clip} for {max_clip_img}")
  outfile.write(f"Max Clip score was {max_clip} for {max_clip_img}\n")
  outfile.write("Average clip score: " + str(avg_clip_altcap2))

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Clip score of the caption is 0.2471923828125

Processed ImageCaptioning by BLIP2Captioner, Output Text: the image shows a white shirt with a black collar doing in the image.
Generating caption for mask  5
Using text prompt: Question: what does the image show? Answer:...
reference caption: , caption: the image shows a pair of white pants with a yellow stripe
Clip score of the caption is 0.347900390625

Processed ImageCaptioning by BLIP2Captioner, Output Text: the image shows a pair of white pants with a yellow stripe
Using text prompt: What is the the image shows a pair of white pants with a yellow stripe doing in the image. Answer:...
reference caption: , caption: judo
Clip score of the caption is 0.2470703125

Processed ImageCaptioning by BLIP2Captioner, Output Text: judo
generating captions for image: 11 data/sa_224845.jpg
Generating caption for mask  1
Using text prompt: Question: what does the image show? Answer:...
r

In [None]:
# Alternate prompts #3 over the whole dataset: caption every (image, mask) pair
# with visual_chain_of_thought_alt_cap, log both stages to
# alternate_captions3_out.txt, and report the mean and maximum CLIP score.
clip_sum_altcap3 = 0
count = 0
num_captions = 0  # captions actually scored; the averaging denominator
cap1 = 'Question: What is this? Answer:'
cap2 = "What is the {} doing in the image. Answer:"
max_clip = 0
max_clip_img = ""

# `with` closes (and flushes) the log even if a caption call raises mid-run.
with open("alternate_captions3_out.txt", "w") as outfile:
  outfile.write("Caption 1: " + cap1 + "\n")
  outfile.write("Caption 2: " + cap2 + "\n")

  for image in images:
    print("generating captions for image: " +  str(count) + " " + image[0] )
    outfile.write("Image: " + image[0] + "\n")
    count += 1
    # image[0] is the picture; image[1:] are its mask paths, numbered from 1.
    for i, mask_path in enumerate(image[1:], start=1):
      print("Generating caption for mask ", i)
      outfile.write("Mask: " + str(i) + "\n")
      img = Image.open(image[0])
      seg_mask = Image.open(mask_path)

      result, raw_cap = visual_chain_of_thought_alt_cap(img, seg_mask, cap_args, cap1, cap2)
      clip_sum_altcap3 += result["clip_score"]
      num_captions += 1
      outfile.writelines(["Final Result: ", str(result), '\n', "Intermediate Result: ", str(raw_cap), "\n"])
      if result["clip_score"] > max_clip:
        max_clip = result["clip_score"]
        max_clip_img = image[0]

  # Average over what was really processed instead of assuming 100 images x 5 masks.
  avg_clip_altcap3 = clip_sum_altcap3/num_captions if num_captions else 0.0
  print("Average clip score: ", avg_clip_altcap3)
  print(f"Max Clip score was {max_clip} for {max_clip_img}")
  outfile.write(f"Max Clip score was {max_clip} for {max_clip_img}\n")
  outfile.write("Average clip score: " + str(avg_clip_altcap3))

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Clip score of the caption is 0.27001953125

Processed ImageCaptioning by BLIP2Captioner, Output Text: he is doing a kata
Generating caption for mask  5
Using text prompt: Question: What is this? Answer:...
reference caption: , caption: This is a pair of pants that is made of a material that is very soft and comfortable.
Clip score of the caption is 0.3154296875

Processed ImageCaptioning by BLIP2Captioner, Output Text: This is a pair of pants that is made of a material that is very soft and comfortable.
Using text prompt: What is the This is a pair of pants that is made of a material that is very soft and comfortable. doing in the image. Answer:...
reference caption: , caption: 
Clip score of the caption is 0.2440185546875

Processed ImageCaptioning by BLIP2Captioner, Output Text: 
generating captions for image: 11 data/sa_224845.jpg
Generating caption for mask  1
Using text prompt: Question: What is this? Answer:...
refe

In [None]:
# Alternate prompts #4 over the whole dataset: caption every (image, mask) pair
# with visual_chain_of_thought_alt_cap, log both stages to
# alternate_captions4_out.txt, and report the mean and maximum CLIP score.
clip_sum_altcap4 = 0
count = 0
num_captions = 0  # captions actually scored; the averaging denominator
cap1 = 'Question: Describe this object? Answer:'
cap2 = "Tell me about the {}. Answer:"
max_clip = 0
max_clip_img = ""

# `with` closes (and flushes) the log even if a caption call raises mid-run.
with open("alternate_captions4_out.txt", "w") as outfile:
  outfile.write("Caption 1: " + cap1 + "\n")
  outfile.write("Caption 2: " + cap2 + "\n")

  for image in images:
    print("generating captions for image: " +  str(count) + " " + image[0] )
    outfile.write("Image: " + image[0] + "\n")
    count += 1
    # image[0] is the picture; image[1:] are its mask paths, numbered from 1.
    for i, mask_path in enumerate(image[1:], start=1):
      print("Generating caption for mask ", i)
      outfile.write("Mask: " + str(i) + "\n")
      img = Image.open(image[0])
      seg_mask = Image.open(mask_path)

      result, raw_cap = visual_chain_of_thought_alt_cap(img, seg_mask, cap_args, cap1, cap2)
      clip_sum_altcap4 += result["clip_score"]
      num_captions += 1
      outfile.writelines(["Final Result: ", str(result), '\n', "Intermediate Result: ", str(raw_cap), "\n"])
      if result["clip_score"] > max_clip:
        max_clip = result["clip_score"]
        max_clip_img = image[0]

  # Average over what was really processed instead of assuming 100 images x 5 masks.
  avg_clip_altcap4 = clip_sum_altcap4/num_captions if num_captions else 0.0
  print("Average clip score: ", avg_clip_altcap4)
  print(f"Max Clip score was {max_clip} for {max_clip_img}")
  outfile.write(f"Max Clip score was {max_clip} for {max_clip_img}\n")
  outfile.write("Average clip score: " + str(avg_clip_altcap4))

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Clip score of the caption is 0.26318359375

Processed ImageCaptioning by BLIP2Captioner, Output Text: it's a karate uniform
Generating caption for mask  5
Using text prompt: Question: Describe this object? Answer:...
reference caption: , caption: A white pair of pants with a yellow stripe
Clip score of the caption is 0.34619140625

Processed ImageCaptioning by BLIP2Captioner, Output Text: A white pair of pants with a yellow stripe
Using text prompt: Tell me about the A white pair of pants with a yellow stripe. Answer:...
reference caption: , caption: karate
Clip score of the caption is 0.243896484375

Processed ImageCaptioning by BLIP2Captioner, Output Text: karate
generating captions for image: 11 data/sa_224845.jpg
Generating caption for mask  1
Using text prompt: Question: Describe this object? Answer:...
reference caption: , caption: A person's head
Clip score of the caption is 0.287353515625

Processed ImageCaptionin

Potential Experiments left:
- Try VCT with strongest captions w/o removing background of intermediate caption

In [3]:
def visual_chain_of_thought_alt_cap_w_bg(image, seg_mask, cap_args, cap1, cap2):
  """Caption a segmented region in two chained passes, both using crop_mode='w_bg'.

  The first pass captions the region with prompt ``cap1``. The caption it
  produces is substituted into the ``{}`` placeholder of ``cap2`` and the
  region is captioned a second time with that derived prompt.

  Args:
    image: Image passed straight through to ``captioner.inference_seg``.
    seg_mask: Segmentation mask passed straight through to
      ``captioner.inference_seg``.
    cap_args: Mapping of caption arguments. Must contain
      ``'disable_regular_box'``; ``'text_prompt'`` is overridden per pass
      on a private copy, so the caller's mapping is left untouched.
    cap1: Text prompt for the first (intermediate) pass.
    cap2: Format string for the second pass; ``cap2.format(...)`` is called
      with the first-pass caption as its only argument.

  Returns:
    Tuple ``(cot_cap, raw_cap)``: the second-pass result followed by the
    first-pass result, exactly as returned by ``captioner.inference_seg``.
  """
  # NOTE(review): `captioner` is a notebook-level global defined in an earlier
  # cell; 'w_bg' presumably means "with background" -- confirm against
  # captioner.inference_seg.
  def _caption_with_prompt(prompt):
    # Shallow-copy so the prompt override never leaks into the caller's
    # cap_args, which is shared by other experiment cells.
    pass_args = dict(cap_args)
    pass_args['text_prompt'] = prompt
    return captioner.inference_seg(image, seg_mask,
                                   crop_mode='w_bg',
                                   filter=True,
                                   disable_regular_box=pass_args['disable_regular_box'],
                                   verbose=False,
                                   caption_args=pass_args)

  raw_cap = _caption_with_prompt(cap1)
  # Feed the intermediate caption into the second prompt template.
  cot_cap = _caption_with_prompt(cap2.format(raw_cap['caption']))
  return cot_cap, raw_cap

In [None]:
# Calculate alt cap clip score
# Runs visual_chain_of_thought_alt_cap_w_bg over every (image, mask) pair,
# logs each result to alternate_captions5_out.txt, and reports the average and
# maximum CLIP score of the final (second-pass) captions.
clip_sum_altcap5 = 0
count = 0
num_scored = 0  # captions actually scored; denominator for the average
cap1 = 'Question: what does the image show? Answer:'
cap2 = "Describe the {} in the picture. Answer:"
max_clip = 0
max_clip_img = ""

# The context manager guarantees the log is flushed and closed even if
# captioning raises part-way through this long run.
with open("alternate_captions5_out.txt", "w") as outfile:
  outfile.write("Caption 1: " + cap1 + "\n")
  outfile.write("Caption 2: " + cap2 + "\n")

  # Each entry of `images` holds the image path at index 0 and the paths of
  # its five segmentation masks at indices 1..5.
  for image in images:
    print("generating captions for image: " +  str(count) + " " + image[0] )
    outfile.write("Image: " + image[0] + "\n")
    count += 1
    for i in range(1,6):
      print("Generating caption for mask ", i)
      outfile.write("Mask: " + str(i) + "\n")
      img = Image.open(image[0])
      seg_mask = Image.open(image[i])

      result, raw_cap = visual_chain_of_thought_alt_cap_w_bg(img, seg_mask, cap_args, cap1, cap2)
      clip_sum_altcap5 += result["clip_score"]
      num_scored += 1
      outfile.writelines(["Final Result: ", str(result), '\n', "Intermediate Result: ", str(raw_cap), "\n"])
      # Track the best-scoring final caption and the image it belongs to.
      if result["clip_score"] > max_clip:
        max_clip = result["clip_score"]
        max_clip_img = image[0]

  # Average over the captions that were really scored instead of assuming
  # exactly 100 images x 5 masks; fall back to 0.0 when nothing was scored.
  avg_clip_altcap5 = clip_sum_altcap5/num_scored if num_scored else 0.0
  print("Average clip score: ", avg_clip_altcap5)
  print(f"Max Clip score was {max_clip} for {max_clip_img}")
  # Record the max score in the log as well, matching the other experiment cells.
  outfile.write(f"Max Clip score was {max_clip} for {max_clip_img}\n")
  outfile.write("Average clip score: " + str(avg_clip_altcap5))

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Clip score of the caption is 0.30419921875

Processed ImageCaptioning by BLIP2Captioner, Output Text: a karate match is a competition in which two people fight each other
Generating caption for mask  5
Using text prompt: Question: what does the image show? Answer:...
reference caption: , caption: the image shows the karate player in the middle of a match
Clip score of the caption is 0.25

Processed ImageCaptioning by BLIP2Captioner, Output Text: the image shows the karate player in the middle of a match
Using text prompt: Describe the the image shows the karate player in the middle of a match in the picture. Answer:...
reference caption: , caption: the karate player is in the middle of a match
Clip score of the caption is 0.239501953125

Processed ImageCaptioning by BLIP2Captioner, Output Text: the karate player is in the middle of a match
generating captions for image: 11 data/sa_224845.jpg
Generating caption for mask  1