<a href="https://colab.research.google.com/github/baroodya/llm-captions/blob/main/llm_captions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!python -m pip install transformers
!python -m pip install pycocoevalcap
!python -m pip install openai

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m31.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m79.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.28.1
Looking in indexes: https://pypi.org/simple, https://

In [2]:
# import some common libraries
import numpy as np
import os, json
import cv2
from tqdm import tqdm
import random

import torch
from itertools import islice

from transformers import pipeline
from transformers import GenerationConfig
from transformers import AutoTokenizer, AutoModelForCausalLM

from pycocotools.coco import COCO
from pycocoevalcap.eval import COCOEvalCap

import os
import openai

## Set Up LLM

In [24]:
model_name = "gpt-3"
warmup = True
max_new_tokens = 50
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
if model_name == "gpt-3":
    openai.organization = "org-INV4GdfBST6aC9DFNHjcLtcT"
    openai.api_key = "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"
else:
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)

    print(f"On device: {device}")
    model.to(device)
    print(model.device)

    generation_config = GenerationConfig.from_pretrained(model_name)
    generation_config.pad_token_id = 50256
    generation_config.early_stopping = True
    generation_config.num_beams = 3
    generation_config.max_new_tokens = max_new_tokens

In [4]:
from google.colab import drive
drive.mount('/content/gdrive')

cat_file = "/content/gdrive/MyDrive/Semester 8/COS 485/Final Project/detectron2_val2017_categories.json"
outputs_file = "/content/gdrive/MyDrive/Semester 8/COS 485/Final Project/detectron2_val2017_outputs_captions.json"

with open(cat_file, "r") as f:
    cat_names = json.load(f)
with open(outputs_file, "r") as f:
    outputs_captions = json.load(f)

Mounted at /content/gdrive


In [14]:
def create_prompt_features(classes, boxes, k=5):
    features = ""

    for i in range(min(k, len(classes))):
        label = classes[i]
        box = boxes[i]

        center_x = abs(box[0])
        center_y = abs(box[1])

        width = abs(box[2])
        height = abs(box[3])
        
        features += f"A {width:.0f} by {height:.0f} \"{label}\" centered at ({center_x:.0f}, {center_y:.0f})."
        if i < k-1: features += "\n"
    return features

def create_background_features(stuff_list):
    stuff_list = ["\"" + x + "\"" for x in stuff_list]
    return "The background contains objects with the following labels: " + ", ".join(stuff_list)


def create_prompt(output_data, k=5):
    im_width, im_height, _ = output_data["im_dims"]
    
    background = ""
    if "background_stuff" in output_data:
        background = create_background_features(output_data["background_stuff"])

    features = create_prompt_features(output_data["detected_cats"], output_data["pred_boxes"], k)
    prompt = f"Caption a {im_width} pixel by {im_height} pixel image with the following features:\n{features}\n{background}. In your caption, make assumptions about what else could be in the image.\nA short, informative caption for this image is:"
    return prompt

def create_warmed_up_prompt(output_data, k=5):
    im_width, im_height, _ = output_data["im_dims"]

    background = ""
    if "background_stuff" in output_data:
        background = create_background_features(output_data["background_stuff"])

    features = create_prompt_features(output_data["detected_cats"], output_data["pred_boxes"], k)
    prompt = f"Based on those steps, caption a {im_width} pixel by {im_height} pixel image with the following features:\n{features}\nThe background contains objects with the following labels: {background}. In your caption, make assumptions about what else could be in the image. Do no output the steps you followed, only output the caption.\nA short, conceptual caption for this image is:"
    return prompt

def create_few_shot_prompt(output_data, examples, k=5):
    im_width, im_height, _ = output_data["im_dims"]
    test_features = create_prompt_features(output_data["detected_cats"], output_data["pred_boxes"], k)
    example_prompts = ""
    for example in examples:
        example_prompt = create_prompt(example, k=k)
        example_prompts += f"The prompt \"{example_prompt}\" would yield the caption \"{example['captions'][0]}\""
    prompt = f"Create a simple caption for a {im_width} pixel by {im_height} pixel image with the following features:\n{test_features}\n Use the following examples:\n{example_prompts}\n A good caption is:"
    return prompt

In [71]:
# Select a random image to use to test the code and warmup the model
id, output = random.choice(list(outputs_captions.items()))
print(id)
print(output["captions"])

prompt = create_warmed_up_prompt(output, k=5)
answer = ""
if model_name == "gpt-3":
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": prompt}],
        max_tokens=max_new_tokens,
        temperature=0.7
    )
    answer = response['choices'][0]['message']['content']
else:
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    llm_output = model.generate(**inputs, generation_config=generation_config)

    decoded_output = tokenizer.batch_decode(llm_output, skip_special_tokens=True)

    answer = decoded_output[0][len(prompt):]
    answer = answer.split("\n")[0]
print(f"""
Using {model_name}.
Example Prompt: {prompt}

Example Caption: {answer}""")

573626
['Wild animals standing in a  forest next to a river.', 'Two horses stand on rocks near a river. ', 'Two dogs walking on rocks next to a creek.', 'There are horses standing in the rocks near the water.', 'Two large animals walking on rocks along the side of river. ']

Using gpt-3.
Example Prompt: Based on those steps, caption a 375 pixel by 500 pixel image with the following features:
A 31 by 59 "bird" centered at (300, 175).
A 63 by 56 "bear" centered at (212, 156).
A 64 by 41 "bird" centered at (74, 233).

The background contains objects with the following labels: The background contains objects with the following labels: "river", "tree", "rock". In your caption, make assumptions about what else could be in the image. Do no output the steps you followed, only output the caption.
A short, conceptual caption for this image is:

Example Caption: "Nature's Harmony: A serene river flows amidst a lush green forest, where a majestic bear roams and two birds take flight in the distanc

In [20]:
messages = []
if model_name == "gpt-3" and warmup:
    interaction_guide = "give a step by step guide on how you, chatGPT, could figure out how two objects in an image could be determined to be interacting. You are given their width and height in pixels, their label, and their centerpoint in pixels."
    messages.append({"role": "user", "content": interaction_guide})
    guide_response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=messages
    )
    guide_answer = guide_response['choices'][0]['message']['content']
    messages.append({"role": "assistant", "content": guide_answer})

    im_width, im_height, _ = output["im_dims"]
    example_interactions = f"Caption a {im_width} pixel by {im_height} pixel image with the following features:\n{create_prompt_features(output['detected_cats'], output['pred_boxes'])}.\nGive a step by step guide to how you could determine the relative positions of each object, then give the relative positions of each object to each other object."
    messages.append({"role": "user", "content": example_interactions})
    example_response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=messages
    )
    example_answer = example_response['choices'][0]['message']['content']
    messages.append({"role": "assistant", "content": example_answer})
    print(messages)


[{'role': 'user', 'content': 'give a step by step guide on how you, chatGPT, could figure out how two objects in an image could be determined to be interacting. You are given their width and height in pixels, their label, and their centerpoint in pixels.'}, {'role': 'assistant', 'content': 'As an AI language model, I can give you a step-by-step guide on how you can determine if two objects in an image are interacting based on their width and height in pixels, label, and their center points in pixels:\n\n1. First, you need to visually inspect the image and identify the two objects you want to check for interaction. You can do this by looking for shapes that are close together, partially overlapping, or seem to be connected in some way.\n\n2. Once you have identified the objects, you can use their width and height in pixels to estimate their size and how much they occupy the image area.\n\n3. Next, check the center points of the objects in pixels. If the two center points are close toget

In [26]:
def prompting(imgs, device, num_examples=0, k=5, n=200, warmup=False):
    img_list = list(imgs.items())
    example_imgs = None
    prompt_func = create_prompt
    if warmup and model_name == "gpt-3":
        prompt_func = create_warmed_up_prompt
    if num_examples > 0:
        example_imgs = [img for _, img in img_list[:num_examples]]
        img_list = img_list[num_examples:]

        prompt_func = create_few_shot_prompt
    
    results = []
    for img_id, img in tqdm(img_list[:n]):
        prompt = ""
        if num_examples == 0:
            prompt = prompt_func(img, k=k)
        else:
            prompt = prompt_func(img, example_imgs, k=k)
        
        gen_cap = ""
        if model_name == "gpt-3":
            this_message = messages + [{"role": "user", "content": prompt}]
            response = openai.ChatCompletion.create(
                model="gpt-3.5-turbo",
                messages=this_message,
                max_tokens=max_new_tokens,
                temperature=0.7
            )
            gen_cap = response['choices'][0]['message']['content']
        else:
            llm_input = tokenizer(prompt, return_tensors="pt").to(device)
            llm_output = model.generate(**llm_input, generation_config=generation_config)
            decoded_llm_output = tokenizer.batch_decode(llm_output, skip_special_tokens=True)

            gen_cap = decoded_output[0][len(prompt):].split("\n")[0]

        results.append({"image_id": int(img_id), "caption": gen_cap})

    return results        


In [27]:
results = prompting(outputs_captions, device, n=100, num_examples=0, k=5, warmup=warmup)

res_file = f"/content/gdrive/MyDrive/Semester 8/COS 485/Final Project/captions_val2017_{model_name}.json"
with open(res_file, "w") as outfile:
    json.dump(results, outfile)

100%|██████████| 100/100 [02:46<00:00,  1.67s/it]


In [28]:
import locale
def getpreferredencoding(do_setlocale = True):
    return "UTF-8"
locale.getpreferredencoding = getpreferredencoding

In [29]:
#Download captions
!wget http://images.cocodataset.org/annotations/annotations_trainval2017.zip
#Extract archive 
!unzip annotations_trainval2017.zip
!rm annotations_trainval2017.zip

--2023-05-03 00:28:17--  http://images.cocodataset.org/annotations/annotations_trainval2017.zip
Resolving images.cocodataset.org (images.cocodataset.org)... 54.231.194.137, 3.5.28.242, 54.231.233.33, ...
Connecting to images.cocodataset.org (images.cocodataset.org)|54.231.194.137|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 252907541 (241M) [application/zip]
Saving to: ‘annotations_trainval2017.zip’


2023-05-03 00:28:35 (14.3 MB/s) - ‘annotations_trainval2017.zip’ saved [252907541/252907541]

Archive:  annotations_trainval2017.zip
  inflating: annotations/instances_train2017.json  
  inflating: annotations/instances_val2017.json  
  inflating: annotations/captions_train2017.json  
  inflating: annotations/captions_val2017.json  
  inflating: annotations/person_keypoints_train2017.json  
  inflating: annotations/person_keypoints_val2017.json  


In [30]:
requested_model_name = model_name
res_file = f"/content/gdrive/MyDrive/Semester 8/COS 485/Final Project/captions_val2017_{requested_model_name}.json"
with open(res_file, "r") as f:
    anns = json.load(f)
annsImgIds = [ann['image_id'] for ann in anns]

ann_file = './annotations/captions_val2017.json'
            
coco_caps=COCO(ann_file)
cocoRes = coco_caps.loadRes(res_file)
cocoEval = COCOEvalCap(coco_caps, cocoRes)
cocoEval.params['image_id'] = cocoRes.getImgIds()
cocoEval.evaluate()

loading annotations into memory...
Done (t=0.04s)
creating index...
index created!
Loading and preparing results...
DONE (t=0.00s)
creating index...
index created!
tokenization...
setting up scorers...
Downloading stanford-corenlp-3.6.0 for SPICE ...
Progress: 384.5M / 384.5M (100.0%)
Extracting stanford-corenlp-3.6.0 ...
Done.
computing Bleu score...
{'testlen': 2004, 'reflen': 1273, 'guess': [2004, 1904, 1804, 1704], 'correct': [743, 213, 47, 14]}
ratio: 1.574234092693186
Bleu_1: 0.371
Bleu_2: 0.204
Bleu_3: 0.103
Bleu_4: 0.055
computing METEOR score...
METEOR: 0.174
computing Rouge score...
ROUGE_L: 0.327
computing CIDEr score...
CIDEr: 0.195
computing SPICE score...
SPICE: 0.117
