## Object Detection / Captioning / Scene Description

In [None]:
import json
import torch

from os import listdir, makedirs, path

from PIL import Image as PImage

from dominant_colors import get_dominant_colors, resize_PIL

# Directory holding the source images that the pipeline cell lists and reads.
IMAGES_IN_PATH = "../../imgs/arquigrafia"

# Directory for the per-image JSON metadata; created up front if it is missing.
OUT_PATH = "./metadata/objects"
makedirs(OUT_PATH, exist_ok=True)

In [None]:
# Detection queries, split into groups; each group is sent to the detector in
# its own call. Every entry maps a query label to the minimum score a
# detection of that label needs in order to be kept.
OBJS = [
  # architectural elements, furniture and art
  {
    "minaret": 0.25,
    "tower": 0.6,
    "railing": 0.4,
    "stair railing": 0.41,
    "guard railing": 0.4,
    "table": 0.45,
    "desk": 0.25,
    "chair": 0.24,
    "inclined walkway": 0.32,
    "sculpture": 0.4,
    "painting": 0.4,
    "vertical pillar": 0.35,
    "stairs": 0.4,
    "stoop steps": 0.35,
    "stoop stairs": 0.35,
  },
  # openings, masonry, concrete and glass
  {
    "window": 0.2,
    "room door": 0.25,
    "building door": 0.22,
    "masonry": 0.2,
    "concrete wall": 0.2,
    "exposed concrete": 0.2,
    "concrete structure": 0.2,
    "poured concrete": 0.2,
    "glass window": 0.2,
    "glass door": 0.2,
    "mirror": 0.2,
  },
  # wood and metal
  {
    "wood fence": 0.3,
    "wood railing": 0.35,
    # NOTE(review): "pilar" looks like a misspelling of "pillar"; the threshold
    # may have been tuned with this exact query, so confirm before changing it.
    "wood pilar": 0.3,
    "wood door": 0.21,
    "wood board": 0.21,
    "metal fence": 0.4,
    "metal railing": 0.22,
    "wrought": 0.2,
  },
  # vegetation
  {
    "tree": 0.2,
    "grass": 0.2,
    "shrub": 0.2,
    "bush": 0.2,
    "flower": 0.2,
    "vegetation": 0.2,
    "greenery": 0.2,
  },
]

# Query label -> label written to the output. Labels that are not listed here
# are written unchanged.
LABEL2LABEL = {
  "minaret": "tower",
  "stair railing": "railing",
  "guard railing": "railing",
  "stoop steps": "stairs",
  "stoop stairs": "stairs",
  "desk": "table",
  "room door": "building door",

  "exposed concrete": "concrete wall",
  "concrete structure": "concrete wall",
  "poured concrete": "concrete wall",

  "glass window": "mirror",
  "glass door": "mirror",

  "wood railing": "wood fence",
  "wood pilar": "wood fence",
  "wood door": "wood fence",
  "wood board": "wood fence",

  "metal fence": "wrought",
  "metal railing": "wrought",

  "tree": "greenery",
  "grass": "greenery",
  "shrub": "greenery",
  "bush": "greenery",
  "flower": "greenery",
  "vegetation": "greenery",
}

# Three parallel per-group lists, index-aligned with each other: the sorted
# query labels, the output label for each query, and the threshold for each query.
OBJS_LABELS_IN = [sorted(group) for group in OBJS]
OBJS_LABELS_OUT = [
  [LABEL2LABEL.get(name, name) for name in names]
  for names in OBJS_LABELS_IN
]
OBJS_THOLDS = [
  [group[name] for name in names]
  for group, names in zip(OBJS, OBJS_LABELS_IN)
]

### Init Models

In [None]:
from transformers import AutoModel, AutoTokenizer, pipeline

# Chat model used for captions and for the binary questions, pinned to a
# fixed revision of the model repository.
CAP_MODEL_NAME = "openbmb/MiniCPM-V-2"
CAP_MODEL_REV = "187851962daa9b63072d40ec802f597b71bff532"

# Word pairs for the either/or questions; the first word of a pair is scored
# 0.0 and the second 1.0.
BINARIES = [
  ["horizontal", "vertical"],
  ["translucent", "opaque"],
  ["symmetric", "asymmetric"],
  ["complex", "simple"],
  ["internal", "external"],
  ["open", "closed"],
]

# Conditioning messages sent with every caption request.
CAP_COND = [
  {"role": "user", "content": "The following image is a picture taken in Brazil."},
  {"role": "user", "content": "Give a short, precise, terse and objective description of the image without using superlatives."},
  {"role": "user", "content": "Don't mention sports or winter."},
  {"role": "user", "content": "Describe the image using only 8 nouns."},
]

# Bundle handed to run_caption / run_binaries: the model (bfloat16, on the
# GPU), its tokenizer, and the caption conditioning messages.
CAP_MODEL = {
  "model": AutoModel.from_pretrained(
    CAP_MODEL_NAME,
    revision=CAP_MODEL_REV,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
  ).to("cuda", dtype=torch.bfloat16),
  "pre": AutoTokenizer.from_pretrained(
    CAP_MODEL_NAME,
    revision=CAP_MODEL_REV,
    trust_remote_code=True,
  ),
  "chat": CAP_COND,
}

In [None]:
# English -> Portuguese translation pipeline used for the captions, placed on the GPU.
ENPT_MODEL_NAME = "Helsinki-NLP/opus-mt-tc-big-en-pt"
ENPT_PIPELINE = pipeline(model=ENPT_MODEL_NAME, device="cuda")

In [None]:
def run_caption(img, model):
  """Generate a short English caption for ``img``.

  ``model`` is a dict holding the chat model under "model", its tokenizer
  under "pre" and the conditioning messages under "chat". The reply is reduced
  to the text before its first period and before any ", possibly" clause, and
  returned with the prefix "Picture of ".
  """
  reply, _, _ = model["model"].chat(
    image=img,
    msgs=model["chat"],
    max_length=32,
    context=None,
    tokenizer=model["pre"],
    sampling=True,
    temperature=0.1
  )
  # Keep the first sentence only, without its closing period.
  first_sentence = reply.partition(".")[0]
  # Drop a speculative tail such as ", possibly a museum".
  description = first_sentence.partition(", possibly")[0]
  return "Picture of " + description

In [None]:
def bin2float(b, b0, b1):
  """Map a model answer to a score between two opposite options.

  Args:
    b: answer text; matching is by substring, so stray punctuation around
      the word does not matter.
    b0: option scored as 0.0.
    b1: option scored as 1.0.

  Returns:
    0.0 if the answer names ``b0``, 1.0 if it names ``b1`` and 0.5 otherwise.
    Answers that match neither option nor the word "neither" are printed
    before 0.5 is returned.
  """
  # When the first option is contained in the second ("symmetric" inside
  # "asymmetric"), the longer word must be tested first; otherwise every
  # "asymmetric" answer would be scored as "symmetric".
  if b0 in b1:
    candidates = ((b1, 1.0), (b0, 0.0))
  else:
    candidates = ((b0, 0.0), (b1, 1.0))

  for word, value in candidates:
    if word in b:
      return value

  if "neither" not in b:
    print("b2f error: ", b)
  return 0.5

def run_binaries(img, model, bins):
  """Ask the chat model one either/or question per word pair and score the answers.

  Args:
    img: image passed to the model's ``chat`` call.
    model: dict holding the chat model under "model" and its tokenizer under "pre".
    bins: iterable of two-word pairs, e.g. ``["open", "closed"]``.

  Returns:
    Dict mapping "first/second" to the value ``bin2float`` returns for the
    first word of the model's answer.
  """
  bin_results = {}
  for b in bins:
    chat = [{
      "role": "user",
      "content": f"Is the architecture pictured in the image more {b[0]}, {b[1]} or neither? Answer using only the words {b[0]}, {b[1]} or neither"
    }]
    response, _, _ = model["model"].chat(
      image=img,
      msgs=chat,
      max_length=4,
      context=None,
      tokenizer=model["pre"],
      sampling=True,
      temperature=0.005
    )

    # str.split() with no argument already collapses runs of whitespace.
    response_list = response.split()
    if len(response_list) != 1:
      print("wtf", response_list)
    # An empty reply has no first word. Score it from "" (which matches no
    # option) instead of raising IndexError and aborting the whole run.
    answer = response_list[0].lower() if response_list else ""
    bin_results["/".join(b)] = bin2float(answer, b[0], b[1])

  return bin_results

In [None]:
from transformers import Owlv2Processor, Owlv2ForObjectDetection

# Size handed to the detector's post-processing; box coordinates are later
# divided by it to obtain fractions (see box_px_to_pct).
OBJ_TARGET_SIZE = torch.Tensor([500, 500])
OBJ_MODEL = "google/owlv2-base-patch16-ensemble"

# Detection model (on the GPU) and its matching pre/post-processor.
obj_model = Owlv2ForObjectDetection.from_pretrained(OBJ_MODEL).to("cuda")
obj_processor = Owlv2Processor.from_pretrained(OBJ_MODEL)

In [None]:
def box_px_to_pct(box, img_w, img_h, model_dims):
  """Convert a box from model-space units to fractions of the image size.

  ``box`` is a 4-element tensor read as two (x, y) corners. Each corner is
  divided by ``model_dims`` and then stretched by longest-side / width on x
  and longest-side / height on y.
  NOTE(review): the stretch presumably undoes the detector padding the image
  to a square — confirm against the processor.

  Returns the four values as a flat list of floats rounded to 4 decimals.
  """
  longest_side = max(img_w, img_h)
  stretch = torch.tensor([longest_side / img_w, longest_side / img_h])
  corners = box.cpu().reshape(2, -1)
  fractions = corners / model_dims * stretch
  return [round(value, 4) for value in fractions.reshape(-1).tolist()]

In [None]:
def run_object_detection(img, obj_labels_in, obj_labels_out, obj_tholds):
  """Detect the queried objects in ``img`` and keep the best box per output label.

  Args:
    img: PIL image; its ``size`` is used to express boxes as image fractions.
    obj_labels_in: text queries sent to the detector.
    obj_labels_out: output label for each query, index-aligned with ``obj_labels_in``.
    obj_tholds: minimum score for each query, index-aligned with ``obj_labels_in``.

  Returns:
    Dict mapping each detected output label to the box (as returned by
    ``box_px_to_pct``) of its highest-scoring detection.
  """
  inputs = obj_processor(text=obj_labels_in, images=img, return_tensors="pt").to("cuda")
  with torch.no_grad():
    obj_out = obj_model(**inputs)

  res = obj_processor.post_process_object_detection(outputs=obj_out, target_sizes=[OBJ_TARGET_SIZE])
  iw, ih = img.size

  # only keep the box with highest score per object
  detected_objs_boxes = {}
  high_score = {}

  for s, l, b in zip(res[0]["scores"], res[0]["labels"], res[0]["boxes"]):
    label_idx = l.item()

    # Check the score first so rejected detections never pay for the box conversion.
    passes_thold = s > obj_tholds[label_idx]
    if not passes_thold:
      continue

    box = box_px_to_pct(b, iw, ih, OBJ_TARGET_SIZE)
    box_width = box[2] - box[0]
    box_height = box[3] - box[1]

    # filter if box "too small": both sides must exceed 5% of the image
    if not (box_width > 0.05 and box_height > 0.05):
      continue
    # filter if box "too large": rejected only when both sides reach 80% of the image
    if not (box_width < 0.8 or box_height < 0.8):
      continue

    # Plain float rather than a tensor, matching the threshold-tuning variant of this function.
    score = s.item()
    ol = obj_labels_out[label_idx]
    if (ol not in detected_objs_boxes) or (score > high_score[ol]):
      detected_objs_boxes[ol] = box
      high_score[ol] = score

  return detected_objs_boxes

### Run Caption

In [None]:
%%time

input_files = sorted(f for f in listdir(IMAGES_IN_PATH) if f.endswith("jpg"))

# Only the first 4096 images are considered in one run.
for io_file in input_files[:4096]:
  input_file_path = path.join(IMAGES_IN_PATH, io_file)
  output_file_path = path.join(OUT_PATH, io_file.replace(".jpg", ".json"))

  # Images that already have a JSON file are skipped, so the cell can be re-run.
  if path.isfile(output_file_path):
    continue

  # Progress line for files whose numeric name is a multiple of 50.
  if int(io_file.replace(".jpg", "")) % 50 == 0:
    print(IMAGES_IN_PATH, io_file)

  image = PImage.open(input_file_path).convert("RGB")

  rgb_by_count, rgb_by_hls = get_dominant_colors(resize_PIL(image))

  # Caption in English, then translate it with the ">>por<< " target-language prefix.
  caption_en = run_caption(image, CAP_MODEL)
  to_pt = ">>por<< " + caption_en
  caption_pt = ENPT_PIPELINE(to_pt)[0]["translation_text"]

  image_data = {
    "caption": {"en": caption_en, "pt": caption_pt},
    "binaries": run_binaries(image, CAP_MODEL, BINARIES),
    "boxes": {},
  }

  # One detector call per label group; the per-group results are merged into one dict.
  for labels_in, labels_out, tholds in zip(OBJS_LABELS_IN, OBJS_LABELS_OUT, OBJS_THOLDS):
    image_data["boxes"] |= run_object_detection(image, labels_in, labels_out, tholds)

  # int() turns each channel value into a plain Python int before it is dumped.
  image_data["dominant_color"] = {
    "by_count": [int(v) for v in rgb_by_count[0]],
    "by_hue": [int(v) for v in rgb_by_hls[0]],
    "palette": [[int(v) for v in c] for c in rgb_by_hls[:4]],
  }

  with open(output_file_path, "w", encoding="utf-8") as out_file:
    json.dump(image_data, out_file, sort_keys=True, separators=(',',':'), ensure_ascii=False)

### Post-Process: Create output json file

In [None]:
import json

from os import listdir, path

from dominant_colors import hls_order_from_rgb255

# Directory of per-image JSON files read by the post-processing step.
CAPTIONS_PATH = "./metadata/objects"
# Path of the single combined JSON database written by the post-processing step.
OBJECTS_DB_FILE_PATH = "./metadata/objects.json"

In [None]:
# image id (int parsed from the file name) -> image info loaded from its JSON file
img_data = {}

# object label -> list of ids of the images in which that label was detected
obj_data = {}

# image id -> color order key
color_key = {}

input_files = sorted(f for f in listdir(CAPTIONS_PATH) if f.endswith("json"))

for io_file in input_files:
  input_file_path = path.join(CAPTIONS_PATH, io_file)
  # Named img_id rather than id so the builtin id() is not shadowed for the
  # rest of the notebook session.
  img_id = int(io_file.replace(".json", ""))
  with open(input_file_path, "r", encoding="utf8") as f:
    img_data[img_id] = json.load(f)

  # Append in place instead of rebuilding each list for every image.
  for label in img_data[img_id]["boxes"]:
    obj_data.setdefault(label, []).append(img_id)

  color_key[img_id] = hls_order_from_rgb255(img_data[img_id]["dominant_color"]["by_hue"])

# order each object's file list by color order
for label in obj_data:
  obj_data[label] = sorted(obj_data[label], key=lambda x: color_key[x])

out_data = {
  "objects": obj_data,
  "images": img_data,
}

In [None]:
# Write the combined database as compact, key-sorted JSON; non-ASCII
# characters are written as-is.
with open(OBJECTS_DB_FILE_PATH, "w", encoding="utf8") as db_file:
  json.dump(out_data, db_file, sort_keys=True, ensure_ascii=False, separators=(',',':'))

### TEST: boxes from JSON

In [None]:
import json

from os import path
from PIL import Image as PImage, ImageDraw as PImageDraw, ImageFont as PImageFont

# Font for annotations; the drawing code in this cell does not use it.
MFONT = PImageFont.load_default(20)


OBJECTS_DB_FILE_PATH = "./metadata/objects.json"
IMAGES_PATH = "../../imgs/arquigrafia"

# The database is written as UTF-8 with ensure_ascii=False, so it must be read
# back as UTF-8 explicitly; the platform default encoding can differ and would
# garble or reject the non-ASCII caption text.
with open(OBJECTS_DB_FILE_PATH, "r", encoding="utf-8") as f:
  json_data = json.load(f)
img_data = json_data["images"]
obj_data = json_data["objects"]

# Draw the stored boxes of the first three images as a visual check.
# Named img_id rather than id so the builtin id() is not shadowed.
for img_id, d in list(img_data.items())[:3]:
  img_path = path.join(IMAGES_PATH, f"{img_id}.jpg")
  img = PImage.open(img_path).convert("RGBA")
  iw,ih = img.size
  draw = PImageDraw.Draw(img)
  # Boxes are stored as fractions of the image size; scale them back to pixels.
  for x0, y0, x1, y1 in d["boxes"].values():
    draw.rectangle(((x0*iw, y0*ih), (x1*iw, y1*ih)), outline=(255, 0, 0), width=2)
  print(list(d["boxes"].keys()), "\n", d["caption"])
  display(img)

### TEST: EN -> PT

In [None]:
# English sample sentences for checking the EN -> PT translation pipeline.
PHRASES = [
  "I like to eat rice.",
  "Tom tried to stab me.",
  "He has been to Hawaii several times.",
  "The image features a white house with black trim, windows on the front and side walls.",
  "This image features a modern, open-concept living space with an eye-catching staircase and various furniture pieces.",
  "The image depicts an interior space with a staircase, furniture such as chairs and tables.",
  "The image showcases a modern building with glass walls, concrete stairs leading to it and greenery surrounding the area.",
  "The image shows a view through glass panes, revealing indoor furniture and plants outside.",
  "The image is of a modern building with large windows and columns."
]

In [None]:
from transformers import pipeline

ENPT_MODEL_NAME = "Helsinki-NLP/opus-mt-tc-big-en-pt"
ENPT_PIPELINE = pipeline(model=ENPT_MODEL_NAME, device="cuda")

# Prepend the same ">>por<< " target-language token that the captioning cells
# send, so this check exercises the model the way the pipeline actually uses it.
for p in PHRASES:
  print(ENPT_PIPELINE(">>por<< " + p))

### TEST: Dominant Color

In [None]:
import numpy as np

from os import listdir, path
from PIL import Image as PImage

from dominant_colors import get_dominant_colors, resize_PIL

IMAGES_IN_PATH = "../../imgs/arquigrafia"
# Every file name in the image directory that ends in "jpg", in sorted order.
INPUT_FILES = sorted(name for name in listdir(IMAGES_IN_PATH) if name.endswith("jpg"))

# The first image is the sample for the dominant-color check.
io_file = INPUT_FILES[0]
input_file_path = path.join(IMAGES_IN_PATH, io_file)

In [None]:
# Load the selected image and extract its dominant colors from a resized copy.
image = PImage.open(input_file_path).convert("RGB")
image_s = resize_PIL(image)
rgb_by_count, rgb_by_hls = get_dominant_colors(image_s)

In [None]:
# Round both dimensions down to even numbers and build a (rows, cols, 3) shape.
iw, ih = [(d // 2) * 2 for d in image_s.size]
image_shape = (ih, iw, 3)
# Pixels per color: the canvas area split evenly between the colors in rgb_by_count.
ppc = int(ih * iw / len(rgb_by_count))

# Repeat every color ppc times, then fold the flat run of pixels into the image shape.
# NOTE(review): reshape only succeeds when ppc * number_of_colors == ih * iw, and the
# second line reuses ppc, so it assumes rgb_by_hls has as many colors as rgb_by_count — confirm.
count_np_image = np.array([ppc * [c] for c in rgb_by_count]).reshape(image_shape)
hls_np_image = np.array([ppc * [c] for c in rgb_by_hls]).reshape(image_shape)

# Show the resized image followed by the two color swatches.
display(image_s)
display(PImage.fromarray(count_np_image))
display(PImage.fromarray(hls_np_image))

### Adjust Thresholds

In [None]:
import json
import shutil

from os import path, listdir, makedirs

OBJECTS_DB_FILE_PATH = "./metadata/objects.json"
IMAGES_PATH = "../../imgs"
IMG_IN_DIR = "arquigrafia"

# The database is written as UTF-8 with ensure_ascii=False, so it must be read
# back as UTF-8 explicitly rather than with the platform default encoding.
with open(OBJECTS_DB_FILE_PATH, "r", encoding="utf-8") as f:
  json_data = json.load(f)
img_data = json_data["images"]
obj_data = json_data["objects"]

# Available object labels and how many there are.
print(obj_data.keys(), len(obj_data.keys()))

In [None]:
# Labels whose images are copied into their own "test-<label>" directory.
tocopy = ['inclined walkway']

for o in tocopy:
  matching_ids = obj_data[o]
  print(o, len(matching_ids), matching_ids, "\n")

  # Spaces in the label become dashes in the directory name.
  img_out_dir = f"test-{o.replace(' ', '-')}"
  img_out_dir_path = path.join(IMAGES_PATH, img_out_dir)
  makedirs(img_out_dir_path, exist_ok=True)

  for i in matching_ids:
    shutil.copy2(path.join(IMAGES_PATH, IMG_IN_DIR, f"{i}.jpg"), img_out_dir_path)

In [None]:
from PIL import Image as PImage, ImageDraw as PImageDraw, ImageFont as PImageFont

# Font used for the score and label text drawn on the test images.
MFONT = PImageFont.load_default(20)

TEST_PATH = "../../imgs/test-inclined-walkway"

# Single flat label -> threshold mapping for the label being tuned.
# NOTE: this rebinds OBJS, LABEL2LABEL and the three derived lists, which the
# full pipeline defines as per-group lists; re-run that cell before running
# the pipeline again.
OBJS = {"inclined walkway": 0.41}

LABEL2LABEL = {}

OBJS_LABELS_IN = sorted(OBJS)
OBJS_LABELS_OUT = [LABEL2LABEL.get(name, name) for name in OBJS_LABELS_IN]
OBJS_THOLDS = [OBJS[name] for name in OBJS_LABELS_IN]

In [None]:
def run_object_detection(img, obj_labels_in, obj_labels_out, obj_tholds):
  """Threshold-tuning variant: return every detection that passes the filters.

  Unlike the pipeline version, nothing is reduced to one box per label, so
  all accepted detections can be drawn together with their scores.

  Args:
    img: PIL image; its ``size`` is used to express boxes as image fractions.
    obj_labels_in: text queries sent to the detector.
    obj_labels_out: output label for each query, index-aligned with ``obj_labels_in``.
    obj_tholds: minimum score for each query, index-aligned with ``obj_labels_in``.

  Returns:
    List of dicts with keys "score" (float), "label" (output label) and
    "box" (as returned by ``box_px_to_pct``), in detector output order.
  """
  inputs = obj_processor(text=obj_labels_in, images=img, return_tensors="pt").to("cuda")
  with torch.no_grad():
    obj_out = obj_model(**inputs)

  res = obj_processor.post_process_object_detection(outputs=obj_out, target_sizes=[OBJ_TARGET_SIZE])
  iw, ih = img.size

  detected_objs = []
  for s, l, b in zip(res[0]["scores"], res[0]["labels"], res[0]["boxes"]):
    label_idx = l.item()

    # Check the score first so rejected detections never pay for the box conversion.
    passes_thold = s > obj_tholds[label_idx]
    if not passes_thold:
      continue

    box = box_px_to_pct(b, iw, ih, OBJ_TARGET_SIZE)
    box_width = box[2] - box[0]
    box_height = box[3] - box[1]

    # filter if box "too small": both sides must exceed 5% of the image
    if not (box_width > 0.05 and box_height > 0.05):
      continue
    # filter if box "too large": rejected only when both sides reach 80% of the image
    if not (box_width < 0.8 or box_height < 0.8):
      continue

    detected_objs.append({"score": s.item(), "label": obj_labels_out[label_idx], "box": box})

  return detected_objs

In [None]:
ppath = TEST_PATH
input_files = sorted(f for f in listdir(ppath) if f.endswith("jpg"))

# Run the detector on every test image and draw each returned detection with
# its score and label.
for io_file in input_files:
  input_file_path = path.join(ppath, io_file)

  image = PImage.open(input_file_path).convert("RGB")
  iw, ih = image.size
  print(image.size)

  objs = run_object_detection(image, OBJS_LABELS_IN, OBJS_LABELS_OUT, OBJS_THOLDS)
  print([f'{o["label"]}: {o["score"]}' for o in objs])

  draw = PImageDraw.Draw(image)
  for o in objs:
    x0, y0, x1, y1 = o["box"]
    # Boxes are fractions of the image size; scale them back to pixels.
    left, top = x0*iw, y0*ih
    draw.rectangle(((left, top), (x1*iw, y1*ih)), outline=(255, 0, 0), width=2)
    # Label at the box's top-left corner, score 20 px below it.
    draw.text((left, top + 20), f"{round(o['score'], 3)}", (255, 255, 255), font=MFONT)
    draw.text((left, top - 0), f"{o['label']}", (255, 0, 0), font=MFONT)
  display(image)

### Test Description

In [None]:
import numpy as np

from os import listdir, path
from PIL import Image as PImage

IMAGES_IN_PATH = "../../imgs/arquigrafia"
# Every file name in the image directory that ends in "jpg", in sorted order.
INPUT_FILES = sorted(name for name in listdir(IMAGES_IN_PATH) if name.endswith("jpg"))

# Sample image for the caption check; the fixed file name on the second line
# replaces the index-based pick on the first.
io_file = INPUT_FILES[10]
io_file = "10026.jpg"
input_file_path = path.join(IMAGES_IN_PATH, io_file)

In [None]:
# Caption the sample image, then translate the caption to Portuguese.
image = PImage.open(input_file_path).convert("RGB")
cap = run_caption(image, CAP_MODEL)

to_pt = f">>por<< {cap}"
cap_pt = ENPT_PIPELINE(to_pt)[0]["translation_text"]

display(image)
# Last expression of the cell: the notebook shows both captions.
(cap, cap_pt)

### Test Binaries

In [None]:
import numpy as np

from os import listdir, path
from PIL import Image as PImage

IMAGES_IN_PATH = "../../imgs/arquigrafia"
# Every file name in the image directory that ends in "jpg", in sorted order.
INPUT_FILES = sorted(name for name in listdir(IMAGES_IN_PATH) if name.endswith("jpg"))

# Sample image for the binary-question check.
io_file = INPUT_FILES[100]
input_file_path = path.join(IMAGES_IN_PATH, io_file)

In [None]:
%%time

# Ask every binary question about the sample image and show the resulting scores.
image = PImage.open(input_file_path).convert("RGB")
image_binaries = run_binaries(image, CAP_MODEL, BINARIES)

display(image)
image_binaries