## Object Detection / Captioning / Scene Description

In [None]:
import json
import torch

from os import listdir, makedirs, path

from PIL import Image as PImage

from dominant_colors import get_dominant_colors, resize_PIL

# Directory that is scanned for the .jpg images to process.
IMAGES_IN_PATH = "../../imgs/arquigrafia"

# Directory that receives one JSON metadata file per processed image.
# It is created up front so that later writes do not fail on a missing folder.
OUT_PATH = "./metadata/objects"
makedirs(OUT_PATH, exist_ok=True)

In [None]:
# Text query -> minimum detection score required to keep a box for that query.
OBJS = {
  "awning": 0.3,
  "balcony": 0.2,
  "chair": 0.2,
  "chimney": 0.33,
  "door portico": 0.3,
  "door": 0.2,
  "masonry": 0.3,
  "overhang": 0.3,
  "painting": 0.4,
  "person": 0.3,
  "hand rail": 0.3,
  "pedestrian ramp": 0.3,
  "sculpture": 0.4,
  "stairs": 0.2,
  "steps": 0.2,
  "support arch": 0.2,
  "support column": 0.3,
  "table": 0.2,
  "brick": 0.2,
  "concrete wall": 0.2,
  "stone wall": 0.2,
  "window": 0.2,
}

# Two index-aligned lists (labels in alphabetical order, and the threshold of
# each label at the same position), both derived from one sorted pass.
_objs_by_label = sorted(OBJS.items())
OBJS_LABELS = [label for label, _ in _objs_by_label]
OBJS_THOLDS = [thold for _, thold in _objs_by_label]

### Init Models

In [None]:
from transformers import AutoModel, AutoTokenizer, pipeline

# Hugging Face id of the model used for captioning, together with the exact
# repository revision that is requested when loading the model and tokenizer.
CAP_MODEL_NAME = "openbmb/MiniCPM-V-2"
CAP_MODEL_REV = "187851962daa9b63072d40ec802f597b71bff532"

# Prompt messages handed to the captioning model's chat() call (see run_caption).
CAP_COND = [
  {'role': 'user', 'content': "The following image is a picture taken in Brazil."},
  {'role': 'user', 'content': "Give a short description of the image."},
  {'role': 'user', 'content': "Don't mention sports or winter."},
]

# Bundle consumed by run_caption:
#   "model": the captioning model, loaded in bfloat16 and moved to the GPU
#   "pre":   the matching tokenizer
#   "chat":  the prompt messages defined above
# NOTE: trust_remote_code=True allows code shipped in the model repository to
# be used while loading; both loads request the same pinned revision.
CAP_MODEL = {
  "model": AutoModel.from_pretrained(CAP_MODEL_NAME, revision=CAP_MODEL_REV, trust_remote_code=True, torch_dtype=torch.bfloat16).to("cuda", dtype=torch.bfloat16),
  "pre": AutoTokenizer.from_pretrained(CAP_MODEL_NAME, revision=CAP_MODEL_REV, trust_remote_code=True),
  "chat": CAP_COND
}

In [None]:
# English -> Portuguese translation pipeline (runs on the GPU); it is used to
# translate the generated English captions.
ENPT_MODEL_NAME = "Helsinki-NLP/opus-mt-tc-big-en-pt"
ENPT_PIPELINE = pipeline(model=ENPT_MODEL_NAME, device="cuda")

In [None]:
def run_caption(img, model):
  """Generate a short, single-sentence caption for an image.

  Args:
    img: image handed to the captioning model's ``chat`` method.
    model: dict with the keys "model" (object exposing ``chat``), "pre"
      (tokenizer passed to ``chat``) and "chat" (list of prompt messages).

  Returns:
    The first sentence of the model's answer, including its period, cut
    right before a ", possibly ..." clause when one is present. If the
    answer contains no period at all (e.g. generation stopped at the length
    limit), the whole answer is used rather than an empty string.
  """
  caption, _, _ = model["model"].chat(
    image=img,
    msgs=model["chat"],
    max_length=32,
    context=None,
    tokenizer=model["pre"],
    sampling=True,
    temperature=0.1
  )

  # str.find returns -1 when the marker is missing, which used to turn the
  # slices into caption[:0] (empty caption) and caption[:-1] (last character
  # chopped off). str.partition has no such sentinel value.
  first_sentence, period, _ = caption.partition(".")
  caption = first_sentence + period

  # Drop speculative tails such as ", possibly a church".
  return caption.partition(", possibly")[0]

In [None]:
from transformers import Owlv2Processor, Owlv2ForObjectDetection

# Size handed to the detector's post-processing as the target size; the
# resulting pixel boxes are later divided by it to obtain fractions.
OBJ_TARGET_SIZE = torch.Tensor([500, 500])
OBJ_MODEL = "google/owlv2-base-patch16-ensemble"

# Detection model (moved to the GPU) and its matching processor, which is used
# both to prepare the inputs and to post-process the raw outputs.
obj_model = Owlv2ForObjectDetection.from_pretrained(OBJ_MODEL).to("cuda")
obj_processor = Owlv2Processor.from_pretrained(OBJ_MODEL)

In [None]:
def box_px_to_pct(box, img_w, img_h, model_dims):
  """Convert a pixel box into fractions of the original image size.

  Args:
    box: tensor holding the box coordinates; it is viewed as two rows, so it
      is presumably (x0, y0, x1, y1) - callers read it that way.
    img_w: width of the original image.
    img_h: height of the original image.
    model_dims: tensor the two coordinates of each corner are divided by.

  Returns:
    Flat list of floats rounded to 4 decimals, in the same order as ``box``.
  """
  # Each axis is stretched by (longest side / that side).
  # NOTE(review): presumably this undoes the model padding the image to a
  # square - confirm against the processor's behavior.
  longest_side = max(img_w, img_h)
  scale_factor = torch.tensor([longest_side / img_w, longest_side / img_h])

  corners = box.cpu().reshape(2, -1)
  as_fraction = corners / model_dims * scale_factor
  return [round(value, 4) for value in as_fraction.reshape(-1).tolist()]

In [None]:
def run_object_detection(img, obj_labels, obj_tholds):
  """Detect the given labels in an image and keep the best box per label.

  Args:
    img: image to analyse; its ``size`` is read as (width, height).
    obj_labels: list of text queries handed to the detector.
    obj_tholds: minimum score per label, index-aligned with ``obj_labels``.

  Returns:
    Dict mapping each detected label to the box with the highest score for
    that label, expressed as the list of fractions returned by
    box_px_to_pct. Labels without an acceptable box are absent.
  """
  # Named "inputs" so the builtin input() is not shadowed.
  inputs = obj_processor(text=obj_labels, images=img, return_tensors="pt").to("cuda")
  with torch.no_grad():
    obj_out = obj_model(**inputs)

  res = obj_processor.post_process_object_detection(outputs=obj_out, target_sizes=[OBJ_TARGET_SIZE])
  iw, ih = img.size

  detected_objs_boxes = {}
  high_score = {}

  for score, label_idx, box in zip(res[0]["scores"], res[0]["labels"], res[0]["boxes"]):
    idx = label_idx.item()

    # Cheapest test first: the score has to exceed this label's threshold.
    if not score > obj_tholds[idx]:
      continue

    # Convert once and reuse the result for both the size filter and the output.
    box_pct = box_px_to_pct(box, iw, ih, OBJ_TARGET_SIZE)
    box_width = box_pct[2] - box_pct[0]
    box_height = box_pct[3] - box_pct[1]

    # filter if box "too large" or "too small"
    if not (0.05 < box_width < 0.5 and 0.05 < box_height < 0.5):
      continue

    # only keep the box with highest score per object (first one wins on ties)
    label = obj_labels[idx]
    if (label not in detected_objs_boxes) or (score > high_score[label]):
      detected_objs_boxes[label] = box_pct
      high_score[label] = score

  return detected_objs_boxes

### Run Captioning, Translation, Object Detection and Dominant Color

In [None]:
%%time

# Process the images in a stable (alphabetical) order.
input_files = sorted(f for f in listdir(IMAGES_IN_PATH) if f.endswith("jpg"))

# Only the first 100 files are handled per run.
for io_file in input_files[:100]:
  input_file_path = path.join(IMAGES_IN_PATH, io_file)
  output_file_path = path.join(OUT_PATH, io_file.replace(".jpg", ".json"))

  # Images that already have a metadata file are not processed again.
  if path.isfile(output_file_path):
    continue

  print(IMAGES_IN_PATH, io_file)

  image = PImage.open(input_file_path).convert("RGB")
  rgb_by_count, rgb_by_hls = get_dominant_colors(resize_PIL(image))

  # English caption first, then its Portuguese translation.
  caption_en = run_caption(image, CAP_MODEL)
  caption_pt = ENPT_PIPELINE(caption_en)[0]["translation_text"]

  image_data = {
    "caption": {"en": caption_en, "pt": caption_pt},
    "boxes": run_object_detection(image, OBJS_LABELS, OBJS_THOLDS),
    # Only the first entry of each color list is stored, converted to plain
    # ints so that it can be serialized as JSON.
    "dominant_color": {
      "by_count": list(map(int, rgb_by_count[0])),
      "by_hue": list(map(int, rgb_by_hls[0])),
    },
  }

  with open(output_file_path, "w", encoding="utf-8") as of:
    json.dump(image_data, of, sort_keys=True, separators=(',',':'), ensure_ascii=False)

### Post-Process: Create output json file

In [None]:
import json

from os import listdir, path

from dominant_colors import hls_order_from_rgb255

# Directory holding the per-image JSON files (same location as OUT_PATH above).
CAPTIONS_PATH = "./metadata/objects"
# Single combined JSON file produced by this post-processing step.
OBJECTS_DB_FILE_PATH = "./metadata/objects.json"

In [None]:
# image id (int parsed from the file name) -> image info
img_data = {}

# object label -> list of ids of the images in which it was detected
obj_data = {}

# image id -> color order key
color_key = {}

input_files = sorted(f for f in listdir(CAPTIONS_PATH) if f.endswith("json"))

for io_file in input_files:
  input_file_path = path.join(CAPTIONS_PATH, io_file)

  # File names are expected to look like "<integer>.json"; int() raises
  # ValueError for anything else. Named img_id so the builtin id() stays usable.
  img_id = int(io_file.replace(".json", ""))

  with open(input_file_path, "r", encoding="utf8") as f:
    img_data[img_id] = json.load(f)

  # Append in place instead of rebuilding the list for every image.
  for label in img_data[img_id]["boxes"]:
    obj_data.setdefault(label, []).append(img_id)

  color_key[img_id] = hls_order_from_rgb255(img_data[img_id]["dominant_color"]["by_hue"])

# order each object's file list by color order
for label in obj_data:
  obj_data[label].sort(key=color_key.__getitem__)

out_data = {
  "objects": obj_data,
  "images": img_data,
}

In [None]:
# Write the combined database as compact UTF-8 JSON with sorted keys;
# ensure_ascii=False keeps non-ASCII characters unescaped in the file.
with open(OBJECTS_DB_FILE_PATH, "w", encoding="utf8") as f:
  json.dump(out_data, f, separators=(',',':'), sort_keys=True, ensure_ascii=False)

### TEST: boxes from JSON

In [None]:
import json

from os import path
from PIL import Image as PImage, ImageDraw as PImageDraw

OBJECTS_DB_FILE_PATH = "./metadata/objects.json"
IMAGES_PATH = "../../imgs/arquigrafia"

# The database is written as UTF-8 with ensure_ascii=False, so it has to be
# read back as UTF-8 too; the platform default encoding may differ and would
# then fail on (or garble) non-ASCII caption text.
with open(OBJECTS_DB_FILE_PATH, "r", encoding="utf8") as f:
  json_data = json.load(f)

img_data = json_data["images"]
obj_data = json_data["objects"]

# Draw the stored boxes of the first three images as a visual sanity check.
# The loop variable is named img_id so the builtin id() is not shadowed.
for img_id, d in list(img_data.items())[:3]:
  img_path = path.join(IMAGES_PATH, f"{img_id}.jpg")
  img = PImage.open(img_path).convert("RGBA")
  iw, ih = img.size
  draw = PImageDraw.Draw(img)

  # Boxes are stored as fractions of the image size; scale them back to pixels.
  for x0, y0, x1, y1 in d["boxes"].values():
    draw.rectangle(((x0*iw, y0*ih), (x1*iw, y1*ih)), outline=(255, 0, 0))

  print(list(d["boxes"].keys()), "\n", d["caption"])
  display(img)

### TEST: EN -> PT

In [None]:
# Sample English sentences used to spot-check the EN -> PT translation pipeline.
PHRASES = [
  "I like to eat rice.",
  "Tom tried to stab me.",
  "He has been to Hawaii several times.",
  "The image features a white house with black trim, windows on the front and side walls.",
  "This image features a modern, open-concept living space with an eye-catching staircase and various furniture pieces.",
  "The image depicts an interior space with a staircase, furniture such as chairs and tables.",
  "The image showcases a modern building with glass walls, concrete stairs leading to it and greenery surrounding the area.",
  "The image shows a view through glass panes, revealing indoor furniture and plants outside.",
  "The image is of a modern building with large windows and columns."
]

In [None]:
from transformers import pipeline

# Build the English -> Portuguese translation pipeline on the GPU.
ENPT_MODEL_NAME = "Helsinki-NLP/opus-mt-tc-big-en-pt"
ENPT_PIPELINE = pipeline(model=ENPT_MODEL_NAME, device="cuda")

# Print the raw pipeline output for every sample phrase.
for p in PHRASES:
  print(ENPT_PIPELINE(p))

### TEST: Dominant Color

In [None]:
import numpy as np

from os import listdir, path
from PIL import Image as PImage

from dominant_colors import get_dominant_colors, resize_PIL

# Directory with the source images and the sorted list of its .jpg files.
IMAGES_IN_PATH = "../../imgs/arquigrafia"
INPUT_FILES = sorted([f for f in listdir(IMAGES_IN_PATH) if f.endswith("jpg")])

# Use the first image (alphabetically) as the test subject.
# NOTE: raises IndexError when the directory contains no matching file.
io_file = INPUT_FILES[0]
input_file_path = path.join(IMAGES_IN_PATH, io_file)

In [None]:
# Load the test image as RGB, resize it with the project helper and extract
# its dominant colors in the two orderings returned by get_dominant_colors.
image = PImage.open(input_file_path).convert("RGB")
image_s = resize_PIL(image)
rgb_by_count, rgb_by_hls = get_dominant_colors(image_s)

In [None]:
# Round both dimensions of the resized image down to an even number.
iw, ih = [(d // 2) * 2 for d in image_s.size]
image_shape = (ih, iw, 3)
# Pixels per color: the swatch area is divided evenly between the colors.
ppc = int(ih * iw / len(rgb_by_count))

# Repeat every color ppc times and reshape the result into an image-sized array.
# NOTE(review): reshape only succeeds when ppc * len(rgb_by_count) == ih * iw,
# i.e. when the pixel count is divisible by the number of colors, and when
# rgb_by_hls has the same length as rgb_by_count - confirm both.
count_np_image = np.array([ppc * [c] for c in rgb_by_count]).reshape(image_shape)
hls_np_image = np.array([ppc * [c] for c in rgb_by_hls]).reshape(image_shape)

# Show the resized image followed by the two color swatches.
# NOTE(review): PImage.fromarray needs an array dtype it supports; presumably
# the colors are uint8 values - TODO confirm.
display(image_s)
display(PImage.fromarray(count_np_image))
display(PImage.fromarray(hls_np_image))