## Captioning / Scene Description

In [None]:
import json
import torch

from openai import OpenAI
from os import listdir, makedirs, path
from PIL import Image as PImage
from transformers import AutoModel, AutoTokenizer, pipeline

from envars import OPENAI_API_KEY

# URL template for an Arquigrafia image; the literal "IDID" is replaced by the
# image id before the URL is handed to the GPT captioner.
ARQUI_IMAGE_URL = "https://www.arquigrafia.org.br/arquigrafia-images/IDID_view.jpg"

# Local directory holding the input .jpg files.
IMAGES_IN_PATH = "../../imgs/arquigrafia"

# One <image id>.json caption file is written here per processed image.
OUT_PATH = "./metadata/json/captions"
makedirs(OUT_PATH, exist_ok=True)

### Init Models

In [None]:
# Image-to-text captioners, placed on the GPU in float16.
SF_MODEL_NAME = "Salesforce/blip-image-captioning-large"
blip_pipe = pipeline("image-to-text", model=SF_MODEL_NAME, device="cuda", torch_dtype=torch.float16)

VIT_MODEL_NAME = "nlpconnect/vit-gpt2-image-captioning"
vit_pipe = pipeline("image-to-text", model=VIT_MODEL_NAME, device="cuda", torch_dtype=torch.float16)

# Part-of-speech tagger; run_caption_pipeline reads it as a global and keeps
# only the tokens whose tag starts with "NN".
POS_MODEL_NAME = "QCRI/bert-base-multilingual-cased-pos-english"
pos_pipe = pipeline(model=POS_MODEL_NAME, device="cuda")

# English -> Portuguese translation pipeline (inputs are prefixed with
# ">>por<< " where it is used in the caption loop).
ENPT_MODEL_NAME = "Helsinki-NLP/opus-mt-tc-big-en-pt"
ENPT_PIPELINE = pipeline(model=ENPT_MODEL_NAME, device="cuda")

# Client used by run_caption_gpt.
openai_client = OpenAI(api_key=OPENAI_API_KEY)

In [None]:
# Chat-capable vision model, loaded at a pinned repository revision.
CAP_MODEL_NAME = "openbmb/MiniCPM-V-2"
CAP_MODEL_REV = "187851962daa9b63072d40ec802f597b71bff532"

# Pairs of opposing adjectives for run_binaries: an answer matching the first
# word of a pair is scored 0.0, the second 1.0 (see bin2float).
BINARIES = [
  ["horizontal", "vertical"],
  ["translucent", "opaque"],
  ["symmetric", "asymmetric"],
  ["complex", "simple"],
  ["internal", "external"],
  ["open", "closed"],
]

# Prompt messages sent (as `msgs`) with every captioning request.
CAP_COND = [
  {'role': 'user', 'content': "The following image is a picture taken in Brazil."},
  {'role': 'user', 'content': "Give a short, precise, terse and objective description of the image without using superlatives."},
  {'role': 'user', 'content': "Don't mention sports or winter."},
  {'role': 'user', 'content': "Describe the image using only 8 nouns."},
]

# Bundle consumed by run_caption_qa / run_binaries:
#   "model" - the loaded model (bfloat16, on the GPU), exposes `.chat(...)`
#   "pre"   - its tokenizer, passed to `.chat` as `tokenizer`
#   "chat"  - the default captioning prompt
CAP_MODEL = {
  "model": AutoModel.from_pretrained(CAP_MODEL_NAME, revision=CAP_MODEL_REV, trust_remote_code=True, torch_dtype=torch.bfloat16).to("cuda", dtype=torch.bfloat16),
  "pre": AutoTokenizer.from_pretrained(CAP_MODEL_NAME, revision=CAP_MODEL_REV, trust_remote_code=True),
  "chat": CAP_COND
}

### Helper Functions

In [None]:
def run_caption_qa(img, model):
  """Caption `img` with the chat model and return a "Picture of ..." string.

  `model` is a dict with the keys "model" (object exposing `chat`), "pre"
  (tokenizer handed to `chat`) and "chat" (list of prompt messages).

  Only the text before the first "." of the reply is kept, and everything
  from ", possibly" onwards is dropped, so the result never ends in a period.
  """
  reply, _, _ = model["model"].chat(
    image=img,
    msgs=model["chat"],
    max_length=32,
    context=None,
    tokenizer=model["pre"],
    sampling=True,
    temperature=0.1,
  )

  # First sentence only, without its terminating period.
  first_sentence = reply.partition(".")[0]
  # Drop the speculative tail the model sometimes appends.
  description = first_sentence.partition(", possibly")[0]
  return f"Picture of {description}"

In [None]:
def run_caption_pipeline(img, pipe):
  """Caption `img` with an image-to-text pipeline and keep only its nouns.

  The generated caption is lower-cased and tagged by the module-level
  `pos_pipe`; tokens whose tag starts with "NN" are kept. Continuation pieces
  (words starting with "#") are glued onto the previously kept noun.

  Returns "Picture of " followed by the nouns joined with ", ".
  """
  caption = pipe(img, max_new_tokens=200)[0]["generated_text"].lower()

  nouns = []
  for token in pos_pipe(caption):
    if not token["entity"].startswith("NN"):
      continue

    word = token["word"]
    if not word.startswith("#"):
      nouns.append(word)
    elif nouns:
      # Any already-collected noun can take a continuation piece. The previous
      # `len(nouns) > 1` check dropped the continuation of the very first noun
      # (e.g. "sky" + "##scraper" came out as "sky").
      nouns[-1] += word.replace("#", "")
    # A continuation piece with no preceding noun is ignored.

  return "Picture of " + ", ".join(nouns)

In [None]:
def clean_gpt_caption(cap):
  """Normalise one GPT caption: trim it, lower-case it, drop language labels."""
  cleaned = cap.strip().lower()
  for label in ("english: ", "portuguese: "):
    cleaned = cleaned.replace(label, "")
  return cleaned

def run_caption_gpt(img_url, client):
  """Ask the OpenAI chat model for noun-only captions of the image at `img_url`.

  The prompt requests an English and a Portuguese answer separated by a
  marker word. The reply is split on that marker; each part is cleaned with
  `clean_gpt_caption`, the first part gets the "Picture of " prefix and the
  second the "Imagem de " prefix.

  Returns a tuple of the prefixed captions. It has a single item when the
  reply does not contain the marker, and never more than two.
  """
  marker = "SEPARATOR"
  prefixes = ("Picture of ", "Imagem de ")

  prompt = [
    {"type": "text", "text": "What’s in this image? Answer using only nouns. Answer in english and portuguese."},
    {"type": "text", "text": f"Separate english and portuguese descriptions with the word {marker}"},
    {"type": "image_url", "image_url": {"url": img_url}},
  ]

  # Alternative model that was tried here: "gpt-4o-mini".
  reply = client.chat.completions.create(
    model="gpt-4o-2024-08-06",
    messages=[{"role": "user", "content": prompt}],
    max_tokens=200,
  )

  parts = reply.choices[0].message.content.split(marker)
  return tuple(prefix + clean_gpt_caption(part) for prefix, part in zip(prefixes, parts))

In [None]:
def bin2float(b, b0, b1):
  """Map a model answer `b` onto a score between the labels `b0` and `b1`.

  Returns 0.0 when `b` contains `b0`, 1.0 when it contains `b1`, and 0.5 when
  it contains "neither" or nothing recognisable (the latter is also printed).
  Matching is by substring, so an answer such as "open." still counts.
  """
  # When b0 is itself part of b1 ("symmetric" inside "asymmetric"), an answer
  # of b1 also contains b0. Without this guard such an answer was scored as
  # b0 (0.0) instead of b1 (1.0).
  shadowed_by_b1 = b0 != b1 and b0 in b1 and b1 in b

  if b0 in b and not shadowed_by_b1:
    return 0.0
  if b1 in b:
    return 1.0
  if "neither" not in b:
    print("b2f error: ", b)
  return 0.5

def run_binaries(img, model, bins):
  """Score `img` on each adjective pair in `bins` using the chat model.

  `model` is a dict with the keys "model" (object exposing `chat`) and "pre"
  (tokenizer handed to `chat`). For every pair the model is asked which of
  the two words (or neither) fits the pictured architecture; the first word
  of its reply is lower-cased and converted to a number by `bin2float`.

  Returns a dict keyed by "<first>/<second>" with one score per pair.
  """
  bin_results = {}
  for b in bins:
    chat = [{
      "role": "user",
      "content": f"Is the architecture pictured in the image more {b[0]}, {b[1]} or neither? Answer using only the words {b[0]}, {b[1]} or neither"
    }]
    response, _, _ = model["model"].chat(
      image=img,
      msgs=chat,
      max_length=4,
      context=None,
      tokenizer=model["pre"],
      sampling=True,
      temperature=0.005
    )

    words = response.split()
    if len(words) != 1:
      print("wtf", words)

    # An empty reply used to raise IndexError here. Pass "" instead so that
    # bin2float handles it like any other unrecognised answer.
    answer = words[0].lower() if words else ""
    bin_results["/".join(b)] = bin2float(answer, b[0], b[1])

  return bin_results

### Run Captions

In [None]:
%%time

input_files = sorted([f for f in listdir(IMAGES_IN_PATH) if f.endswith("jpg")])

# Caption at most the first 4096 images (in sorted filename order).
for io_file in input_files[:4096]:
  img_id = io_file.replace(".jpg", "")
  input_file_path = path.join(IMAGES_IN_PATH, io_file)
  output_file_path = path.join(OUT_PATH, io_file.replace(".jpg", ".json"))

  # Resume support: images that already have a caption file are skipped.
  if path.isfile(output_file_path):
    continue

  print(IMAGES_IN_PATH, io_file)

  image = PImage.open(input_file_path).convert("RGB")

  image_captions = {}
  image_captions["pt"] = {}

  # English captions from the three local models.
  image_captions["en"] = {
    "cpm": run_caption_qa(image, CAP_MODEL),
    "blip": run_caption_pipeline(image, blip_pipe),
    "vit": run_caption_pipeline(image, vit_pipe),
  }

  # Translate each local caption; ">>por<<" is the target-language token.
  for k, v in image_captions["en"].items():
    to_pt = ">>por<< " + v
    image_captions["pt"][k] = ENPT_PIPELINE(to_pt)[0]["translation_text"]

  try:
    gpt_cap = run_caption_gpt(ARQUI_IMAGE_URL.replace("IDID", img_id), openai_client)
    # Raises ValueError when the reply did not split into exactly two parts.
    image_captions["en"]["gpt"], image_captions["pt"]["gpt"] = gpt_cap
  except Exception as err:
    # Report the failure itself. The previous handler printed `gpt_cap`, which
    # is unbound when the request fails (NameError on the first image) or
    # stale from an earlier image. `Exception` (not a bare except) also keeps
    # Ctrl-C working during the request.
    print(img_id, repr(err))
  else:
    # Only written when the GPT caption succeeded, so failed images have no
    # output file and are retried on the next run.
    with open(output_file_path, "w", encoding="utf-8") as of:
      json.dump(image_captions, of, sort_keys=True, separators=(',',':'), ensure_ascii=False)

### Post-Process: Create output json file

In [None]:
from export_utils import export_objs_caps

# Two per-image JSON directories and one combined JSON file path.
# NOTE(review): which of these are inputs vs. output is inferred from the
# names only; export_objs_caps lives in export_utils and is not shown here.
OBJECTS_PATH = "./metadata/json/objects"
CAPTIONS_PATH = "./metadata/json/captions"
OBJECTS_DB_FILE_PATH = "./metadata/json/objects.json"

export_objs_caps(OBJECTS_PATH, CAPTIONS_PATH, OBJECTS_DB_FILE_PATH)

### Post-Process: Create separate json files

In [None]:
from export_utils import export_all_captions, export_by_keys

# Directory of per-image caption JSON files (same path as OUT_PATH).
CAPTIONS_PATH = "./metadata/json/captions"

In [None]:
# NOTE(review): presumably the key(s) to pull out of each JSON file; the
# semantics are defined by export_utils.export_by_keys — confirm there.
keys = ["captions"]
export_by_keys(CAPTIONS_PATH, keys)

In [None]:
# Runs the caption export over the caption directory (behaviour is defined in
# export_utils, not shown in this notebook).
export_all_captions(CAPTIONS_PATH)

### TEST: EN/PT Translation

In [None]:
# Sample English sentences for a manual check of the translation pipeline:
# three generic ones followed by six architecture-style image descriptions.
PHRASES = [
  "I like to eat rice.",
  "Tom tried to stab me.",
  "He has been to Hawaii several times.",
  "The image features a white house with black trim, windows on the front and side walls.",
  "This image features a modern, open-concept living space with an eye-catching staircase and various furniture pieces.",
  "The image depicts an interior space with a staircase, furniture such as chairs and tables.",
  "The image showcases a modern building with glass walls, concrete stairs leading to it and greenery surrounding the area.",
  "The image shows a view through glass panes, revealing indoor furniture and plants outside.",
  "The image is of a modern building with large windows and columns."
]

In [None]:
from transformers import pipeline

ENPT_MODEL_NAME = "Helsinki-NLP/opus-mt-tc-big-en-pt"
ENPT_PIPELINE = pipeline(model=ENPT_MODEL_NAME, device="cuda")

# Prefix every phrase with the ">>por<<" target-language token, exactly as the
# caption loop does, so this test exercises the same input format that
# production uses.
for p in PHRASES:
  print(ENPT_PIPELINE(">>por<< " + p))

### TEST: Description Translation

In [None]:
import numpy as np

from os import listdir, path
from PIL import Image as PImage

IMAGES_IN_PATH = "../../imgs/arquigrafia"
INPUT_FILES = sorted([f for f in listdir(IMAGES_IN_PATH) if f.endswith("jpg")])

# Pick the image to test. The last assignment wins, so the indexed choice is
# overridden by the hard-coded file name below.
io_file = INPUT_FILES[10]
io_file = "10026.jpg"
input_file_path = path.join(IMAGES_IN_PATH, io_file)

In [None]:
# Caption one image with the chat model, translate the caption to Portuguese
# and show the image next to both strings.
image = PImage.open(input_file_path).convert("RGB")
# `run_caption` does not exist in this notebook; the chat-model captioner that
# takes (image, CAP_MODEL) is `run_caption_qa`.
cap = run_caption_qa(image, CAP_MODEL)

to_pt = ">>por<< " + cap
cap_pt = ENPT_PIPELINE(to_pt)[0]["translation_text"]

display(image)
cap, cap_pt

### TEST: Caption Models

In [None]:
import numpy as np
import torch

from os import listdir, path
from PIL import Image as PImage

from transformers import pipeline

IMAGES_IN_PATH = "../../imgs/arquigrafia"
INPUT_FILES = sorted([f for f in listdir(IMAGES_IN_PATH) if f.endswith("jpg")])

# Pick the image to test. Only the last assignment takes effect; the earlier
# ones are alternatives left in place for quick switching.
io_file = INPUT_FILES[10]
io_file = "10027.jpg"
io_file = "10000.jpg"
input_file_path = path.join(IMAGES_IN_PATH, io_file)
image = PImage.open(input_file_path).convert("RGB")

In [None]:
# Same BLIP / ViT-GPT2 / POS pipelines as in "Init Models", plus a second
# ViT-GPT2 checkpoint (vit2_pipe) that is only compared here.
SF_MODEL_NAME = "Salesforce/blip-image-captioning-large"
blip_pipe = pipeline("image-to-text", model=SF_MODEL_NAME, device="cuda", torch_dtype=torch.float16)

VIT_MODEL_NAME = "nlpconnect/vit-gpt2-image-captioning"
vit_pipe = pipeline("image-to-text", model=VIT_MODEL_NAME, device="cuda", torch_dtype=torch.float16)

VIT2_MODEL_NAME = "ydshieh/vit-gpt2-coco-en"
vit2_pipe = pipeline("image-to-text", model=VIT2_MODEL_NAME, device="cuda", torch_dtype=torch.float16)

POS_MODEL_NAME = "QCRI/bert-base-multilingual-cased-pos-english"
pos_pipe = pipeline(model=POS_MODEL_NAME, device="cuda")

In [None]:
# Show the raw output of the three captioners side by side as one tuple.
(
  blip_pipe(image, max_new_tokens=200),
  vit_pipe(image, max_new_tokens=200),
  vit2_pipe(image, max_new_tokens=200),
)

In [None]:
# Inspect the BLIP caption together with the tagger's raw output for it.
cap = blip_pipe(image, max_new_tokens=200)[0]["generated_text"]
pos = pos_pipe(cap)
cap,pos

In [None]:
# Stand-alone check of the GPT caption request, using the same prompt and
# model as run_caption_gpt and a fixed image URL.
# https://platform.openai.com/docs/guides/vision
# https://github.com/openai/openai-python

from openai import OpenAI
# NOTE(review): the key is imported from `envars` at the top of the notebook
# but from `msecrets` here — confirm which module is current.
from msecrets import OPENAI_API_KEY

client = OpenAI(api_key=OPENAI_API_KEY)

response = client.chat.completions.create(
  #model="gpt-4o-mini",
  model="gpt-4o-2024-08-06",
  messages=[{
      "role": "user",
      "content": [
        {"type": "text", "text": "What’s in this image? Answer using only nouns. Answer in english and portuguese."},
        {"type": "text", "text": "Separate english and portuguese descriptions with the word SEPARATOR"},
        {"type": "image_url",
          "image_url": {"url": "https://www.arquigrafia.org.br/arquigrafia-images/10026_view.jpg",},
        },
      ],
    }],
  max_tokens=200,
)

# Print the whole first choice, then display just its text.
print(response.choices[0])
response.choices[0].message.content

### TEST: Binaries

In [None]:
import numpy as np

from os import listdir, path
from PIL import Image as PImage

IMAGES_IN_PATH = "../../imgs/arquigrafia"
INPUT_FILES = sorted([f for f in listdir(IMAGES_IN_PATH) if f.endswith("jpg")])

# Image used for the binaries test: entry 100 of the sorted file list.
io_file = INPUT_FILES[100]
input_file_path = path.join(IMAGES_IN_PATH, io_file)

In [None]:
%%time

# Score the selected image on every adjective pair in BINARIES, then show the
# image and the resulting {"first/second": score} dict.
image = PImage.open(input_file_path).convert("RGB")
image_binaries = run_binaries(image, CAP_MODEL, BINARIES)

display(image)
image_binaries