## Object Detection / Captioning / Scene Description

In [None]:
import cv2
import json
import numpy as np
import re
import scipy.fftpack as fftpack
import torch

from datetime import timedelta
from imagehash import ImageHash
from os import listdir, makedirs, path

from PIL import Image as PImage

# JSON video database read by the captioning run (loaded into `video_data`).
VIDEO_DB_PATH = "./metadata/keyframe-500/videos.json"
# Root directory for the per-video caption/object JSON files; created on import.
OUT_PATH = "./metadata/objects-1152"
makedirs(OUT_PATH, exist_ok=True)

# Root directory holding one sub-directory of .mp4 files per matching name.
VIDEO_PATH = "../../vids/0801-1152"
# Only sub-directories whose name starts with two digits (first 0-3) and a dash are processed.
DIR_PATTERN = re.compile("^[0-3][0-9]-")

In [None]:
# Labels queried on frames listed as static, mapped to the minimum detection
# score required for a detection of that label to be kept.
STATIC_OBJS = {
  "cape": 0.2,
  "chair": 0.2,
  "fire extinguisher": 0.3,
  "flag": 0.4,
  "painting": 0.2,
  "person": 0.2,
  "pedestrian ramp": 0.3,
  "sculpture": 0.3,
  "stairs": 0.4,
  "support column": 0.45,
  "table": 0.2,
  "window": 0.2,
}

# Label -> count limit. When a static frame contains more than this many
# detections of the label, only the labels listed here are reported for it.
NOT_STATIC_OBJS = {
  "cape": 2,
  "person": 2
}

# Labels queried on all other frames, with their minimum detection scores.
DYNAMIC_OBJS = {
  "bus": 0.4,
  "cape": 0.2,
  "car": 0.4,
  "person": 0.2,
  "truck": 0.4,
}

# Parallel lists: *_THOLD[i] is the score threshold for *_LABELS[i].
STATIC_LABELS = sorted(STATIC_OBJS)
STATIC_THOLD = list(map(STATIC_OBJS.get, STATIC_LABELS))

DYNAMIC_LABELS = sorted(DYNAMIC_OBJS)
DYNAMIC_THOLD = list(map(DYNAMIC_OBJS.get, DYNAMIC_LABELS))

### Init Models

In [None]:
from transformers import AutoModel, AutoTokenizer

# Hugging Face model id of the captioning model.
CAP_MODEL_NAME = "openbmb/MiniCPM-V-2"

# Chat messages sent with every image; all three are sent with the "user" role.
CAP_COND = [
  {'role': 'user', 'content': "The following image was taken during a protest."},
  {'role': 'user', 'content': "Give a short description of the image."},
  {'role': 'user', 'content': "Don't mention sports or winter."},
]

# Bundle consumed by run_caption: "model" is the network (moved to CUDA in
# bfloat16), "pre" is the tokenizer and "chat" is the message list above.
CAP_MODEL = {
  "model": AutoModel.from_pretrained(CAP_MODEL_NAME, trust_remote_code=True, torch_dtype=torch.bfloat16).to("cuda", dtype=torch.bfloat16),
  "pre": AutoTokenizer.from_pretrained(CAP_MODEL_NAME, trust_remote_code=True),
  "chat": CAP_COND
}

# "post" is an alias of the same tokenizer object stored under "pre".
CAP_MODEL["post"] = CAP_MODEL["pre"]

In [None]:
def run_caption(img, model):
  """Caption `img` with the chat model bundle and trim the answer.

  Args:
    img: image passed unchanged as `image=` to the model's `chat` method.
    model: dict with "model" (object exposing `chat`), "chat" (message list)
      and "pre" (tokenizer handed to `chat`).

  Returns:
    The first sentence of the generated text without its trailing period,
    cut before the first ", possibly" when that phrase occurs.
  """
  caption, _, _ = model["model"].chat(
    image=img,
    msgs=model["chat"],
    max_length=32,
    context=None,
    tokenizer=model["pre"],
    sampling=True,
    temperature=0.1
  )
  # partition() keeps the whole text when the separator is missing. The former
  # find()-based slicing turned a caption without any "." (e.g. one cut off by
  # max_length) into an empty string, because find() returns -1.
  first_sentence = caption.partition(".")[0]
  return first_sentence.partition(", possibly")[0]

In [None]:
from transformers import Owlv2Processor, Owlv2ForObjectDetection

# Size passed as `target_sizes` when post-processing detections.
OBJ_TARGET_SIZE = torch.Tensor([500, 500])
# Hugging Face model id of the OWLv2 object detector.
OBJ_MODEL = "google/owlv2-base-patch16-ensemble"

# Detector weights are moved to CUDA; the processor is used for both
# pre-processing and post-processing.
obj_model = Owlv2ForObjectDetection.from_pretrained(OBJ_MODEL).to("cuda")
obj_processor = Owlv2Processor.from_pretrained(OBJ_MODEL)

In [None]:
def run_object_detection(img, obj_labels, obj_tholds, not_labels=None):
  """Detect the given text labels in `img` and summarise what was found.

  Args:
    img: image handed to the OWLv2 processor.
    obj_labels: list of text labels to query.
    obj_tholds: per-label minimum scores, parallel to `obj_labels`.
    not_labels: optional mapping label -> count limit. If any of these labels
      is detected more often than its limit, only labels present in
      `not_labels` are reported.

  Returns:
    (labels, counts): the sorted list of distinct detected labels and a dict
    mapping each detected label to its number of detections.
  """
  # None instead of a shared mutable `{}` default.
  if not_labels is None:
    not_labels = {}

  model_input = obj_processor(text=obj_labels, images=img, return_tensors="pt").to("cuda")
  with torch.no_grad():
    obj_out = obj_model(**model_input)

  obj_results = obj_processor.post_process_object_detection(outputs=obj_out, target_sizes=[OBJ_TARGET_SIZE])
  scores, labels = obj_results[0]["scores"], obj_results[0]["labels"]

  # keep a detection only when its score beats the threshold of its own label
  all_detect_labels = []
  for score, label in zip(scores, labels):
    label_idx = label.item()
    if score > obj_tholds[label_idx]:
      all_detect_labels.append(obj_labels[label_idx])

  obj_detect_counts = {str(l): int(c) for l, c in zip(*np.unique(all_detect_labels, return_counts=True))}
  obj_detect_labels = sorted(set(all_detect_labels))

  for nl, nc in not_labels.items():
    if obj_detect_counts.get(nl, 0) > nc:
      # if not_objects are present, return their labels and counts
      obj_detect_labels = [l for l in obj_detect_labels if l in not_labels]
      obj_detect_counts = {l: c for l, c in obj_detect_counts.items() if l in not_labels}
      break

  return obj_detect_labels, obj_detect_counts

In [None]:
# Keyword arguments forwarded to phash(); the values equal phash's own defaults.
hash_params = dict(
  hash_size=8,
  highfreq_factor=4
)

def phash(im, hash_size=8, highfreq_factor=4):
  """Compute a DCT-based perceptual hash of an image array.

  from vframe: https://github.com/vframeio/vframe/blob/master/src/vframe/utils/im_utils.py#L37-L48
  Perceptual hash rewritten from https://github.com/JohannesBuchner/imagehash/blob/master/imagehash.py#L197

  The image is resized to a square of `hash_size * highfreq_factor` pixels,
  converted to grey when it has more than one channel, transformed with a 2-D
  DCT, and the top-left `hash_size` x `hash_size` coefficients are compared
  against their median. The resulting boolean grid is wrapped in an ImageHash.
  """
  side = hash_size * highfreq_factor
  im = cv2.resize(im, (side, side), interpolation=cv2.INTER_NEAREST)
  is_multichannel = im.ndim > 2 and im.shape[2] > 1
  if is_multichannel:
    im = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY)
  # DCT along the rows first, then along the columns
  column_dct = fftpack.dct(im, axis=0)
  full_dct = fftpack.dct(column_dct, axis=1)
  low_freq = full_dct[:hash_size, :hash_size]
  return ImageHash(low_freq > np.median(low_freq))

In [None]:
def duplicate_hash(h0, h_list, thold=4):
  """Return True when any hash in `h_list` differs from `h0` by less than `thold`."""
  return any(abs(candidate - h0) < thold for candidate in h_list)

### Open Video Data

In [None]:
# Read the video database; the caption run looks videos up by file name.
with open(VIDEO_DB_PATH, "r") as db_file:
  video_data = json.load(db_file)

### Run Caption

In [None]:
%%time

# Caption and object-detect the representative frames of every video.
# One JSON file is written per video; videos whose JSON already exists are skipped.
input_dirs = sorted([d for d in listdir(VIDEO_PATH) if DIR_PATTERN.search(d) is not None])
mLangs = ["en"]

for io_dir in input_dirs:
  output_dir_path = path.join(OUT_PATH, io_dir)
  makedirs(output_dir_path, exist_ok=True)

  input_dir_path = path.join(VIDEO_PATH, io_dir)
  input_files = sorted([f for f in listdir(input_dir_path) if f.endswith("mp4")])

  # hashes of frames already kept; shared by all videos of this directory
  processed_hashes = []

  for io_file in input_files:
    input_file_path = path.join(input_dir_path, io_file)
    # swap only the extension; str.replace would rewrite every ".mp4" in the name
    output_file_path = path.join(output_dir_path, path.splitext(io_file)[0] + ".json")

    if io_file not in video_data:
      print(io_file, "not in video_data")
      continue

    if path.isfile(output_file_path):
      continue

    print(io_dir, io_file)

    vid = cv2.VideoCapture(input_file_path)
    if not vid.isOpened():
      # do not write an (empty) result file, so the video is retried on the next run
      print(io_file, "could not be opened")
      vid.release()
      continue

    rep_frames_data = []
    try:
      vw = int(vid.get(cv2.CAP_PROP_FRAME_WIDTH))
      vh = int(vid.get(cv2.CAP_PROP_FRAME_HEIGHT))
      frame_count = int(vid.get(cv2.CAP_PROP_FRAME_COUNT))
      fps = round(vid.get(cv2.CAP_PROP_FPS))
      # keep at most one frame per 60 seconds of video
      min_frame_diff = 1 * 60 * fps

      static_frames = video_data[io_file]["static_frames"]

      # negative start so that the very first representative frame is eligible
      last_processed_frame = -min_frame_diff

      vid.set(cv2.CAP_PROP_POS_FRAMES, 0)
      for frameIdx in video_data[io_file]["representative_frames"]:
        if frameIdx - last_processed_frame < min_frame_diff:
          continue

        vid.set(cv2.CAP_PROP_POS_FRAMES, frameIdx)
        ok, frame = vid.read()
        if not ok:
          # a failed decode returns no frame; skip it instead of crashing in cvtColor
          print(io_file, "could not read frame", frameIdx)
          continue
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        frame_hash = phash(frame, **hash_params)
        if duplicate_hash(frame_hash, processed_hashes, thold=12):
          continue

        processed_hashes.append(frame_hash)
        last_processed_frame = frameIdx

        # build the PIL image only for frames that survive the duplicate check
        image = PImage.fromarray(frame)

        frame_data = {}
        frame_data["index"] = frameIdx
        frame_data["caption"] = {}
        for mLang in mLangs:
          frame_data["caption"][mLang] = []

        frame_data["caption"]["en"] = run_caption(image, CAP_MODEL)
        # TODO: translate

        if frameIdx in static_frames:
          objs, counts = run_object_detection(image, STATIC_LABELS, STATIC_THOLD, not_labels=NOT_STATIC_OBJS)
        else:
          objs, counts = run_object_detection(image, DYNAMIC_LABELS, DYNAMIC_THOLD)

        frame_data["objects"] = objs
        frame_data["counts"] = counts
        rep_frames_data.append(frame_data)
    finally:
      # release the capture even when captioning/detection raises
      vid.release()

    print(len(video_data[io_file]["representative_frames"]), "->", len(rep_frames_data))
    with open(output_file_path, "w") as of:
      json.dump(rep_frames_data, of, sort_keys=True, separators=(',',':'))

### Post-Process: add to metadata

In [None]:
import cv2
import json
import re

from os import listdir, makedirs, path

# Video database that gets enriched with caption/object data.
VIDEO_DB_PATH_IN = "./metadata/keyframe-500/videos.json"

# Directory holding the per-video caption JSON files; the merged database is written next to them.
CAPTION_PATH = "./metadata/objects-1152"
VIDEO_DB_PATH_OUT = path.join(CAPTION_PATH, "videos.json")

# Source videos and destination for exported frame images (created on execution).
# NOTE(review): this VIDEO_PATH ends in "0801-500" while the caption run uses "0801-1152" — confirm intended.
VIDEO_PATH = "../../vids/0801-500"
IMAGE_PATH = "../../imgs/0801-500"
makedirs(IMAGE_PATH, exist_ok=True)

# Only sub-directories whose name starts with two digits (first 0-3) and a dash are scanned.
DIR_PATTERN = re.compile("^[0-3][0-9]-")

In [None]:
# open all caption files
# cap_data maps a video file name ("<stem>.mp4") to the parsed content of "<stem>.json"
cap_data = {}

input_dirs = sorted([d for d in listdir(CAPTION_PATH) if DIR_PATTERN.search(d) is not None])

for io_dir in input_dirs:
  input_dir_path = path.join(CAPTION_PATH, io_dir)
  input_files = sorted([f for f in listdir(input_dir_path) if f.endswith(".json")])

  for io_file in input_files:
    input_file_path = path.join(input_dir_path, io_file)
    # swap only the extension; str.replace("json", "mp4") would also rewrite
    # a "json" that happens to appear inside the file stem
    video_key = path.splitext(io_file)[0] + ".mp4"
    with open(input_file_path, "r") as f:
      cap_data[video_key] = json.load(f)

In [None]:
# Load the video database that the caption data is merged into.
with open(VIDEO_DB_PATH_IN, "r") as db_file:
  video_data = json.load(db_file)

In [None]:
# Replace each video's representative frame list with its captioned frames.
# Videos without a caption file are reported and left unchanged.
for video_key, video_entry in video_data.items():
  if video_key in cap_data:
    captioned_frames = cap_data[video_key]
    video_entry["representative_frames"] = captioned_frames
    video_entry["representative_frames_count"] = len(captioned_frames)
    continue
  print(video_key, "has no caption info")

In [None]:
# write out frame images
# Frames without any detected object are dropped from the metadata; the
# remaining ones are exported as JPEGs named after their frame index.

lastc = ""
for vd in video_data.values():
  if vd["camera"] != lastc:
    lastc = vd["camera"]
    print(vd["camera"])
  input_video_path = path.join(VIDEO_PATH, vd["camera"], vd["name"])
  output_dir_path = input_video_path.replace(VIDEO_PATH, IMAGE_PATH).replace(".mp4", "")
  makedirs(output_dir_path, exist_ok=True)
  # Videos without caption info still carry plain frame indices here; those
  # have no object data and are dropped instead of raising a TypeError.
  vd["representative_frames"] = [
    rf for rf in vd["representative_frames"]
    if isinstance(rf, dict) and (len(rf["objects"]) > 0 or len(rf["counts"]) > 0)
  ]
  if len(vd["representative_frames"]) > 0:
    vid = cv2.VideoCapture(input_video_path)
    try:
      vid.set(cv2.CAP_PROP_POS_FRAMES, 0)
      for rf in vd["representative_frames"]:
        frameIdx = rf["index"]
        vid.set(cv2.CAP_PROP_POS_FRAMES, frameIdx)
        ok, frame = vid.read()
        if not ok:
          # imwrite cannot handle a missing frame; report and move on
          print(vd["name"], "could not read frame", frameIdx)
          continue

        # zero-pad to 7 characters; unlike slicing, longer indices are kept in full
        output_image_filename = f"{str(frameIdx).zfill(7)}.jpg"
        output_image_path = path.join(output_dir_path, output_image_filename)
        cv2.imwrite(output_image_path, frame)
    finally:
      # one capture is opened per video, so release it before the next one
      vid.release()

In [None]:
# Persist the merged video database in compact JSON form.
with open(VIDEO_DB_PATH_OUT, "w") as db_file:
  json.dump(video_data, db_file, separators=(',',':'))

### Post-Process: create objects json

In [None]:
import json

from os import path

# Input: the merged video database written next to the caption files.
# Output: objects.json in the same directory.
CAPTION_PATH = "./metadata/objects-1152"
VIDEO_DB_PATH_IN = path.join(CAPTION_PATH, "videos.json")
OBJ_PATH_OUT = path.join(CAPTION_PATH, "objects.json")

with open(VIDEO_DB_PATH_IN, "r") as f:
  video_data = json.load(f)

In [None]:
def get_timestamp(mpos, seek):
  """Map a media position to a timestamp by linear interpolation.

  Args:
    mpos: position to convert.
    seek: non-empty sequence of (timestamp, position) pairs; presumably
      ordered by increasing position — confirm against the producer.

  Returns:
    The timestamp interpolated between the last entry positioned before
    `mpos` and the first entry positioned at or after it. When no such
    interval exists (single entry, `mpos` at or past the last position,
    repeated positions) the timestamp of the lower entry is returned as a float.

  Raises:
    IndexError: if `seek` is empty.
  """
  ts0, pos0 = seek[0]
  ts1, pos1 = seek[-1]
  for ts, pos in seek[1:]:
    if pos >= mpos:
      ts1, pos1 = ts, pos
      break
    ts0, pos0 = ts, pos
  if pos1 == pos0:
    # zero-width interval: nothing to interpolate, and dividing by
    # (pos1 - pos0) would raise ZeroDivisionError
    return float(ts0)
  return ((mpos - pos0) / (pos1 - pos0)) * (ts1 - ts0) + ts0

### Files/Frames/Objects

In [None]:
# Accumulators for the objects.json export:
file_data = []   # "<camera>/<name>" paths; a file's key is its list position
frame_data = []  # one record per exported frame; a frame's key is its list position
obj_data = {}    # object label -> list of {"frame": frame key, "timestamp": ...}

def has_objects(frame_data):
  """Return True when the frame record has a non-empty "objects" or "counts" entry."""
  return bool(frame_data["objects"]) or bool(frame_data["counts"])

# Build the file list, the flat frame list and the per-object frame index,
# exporting one JPEG per kept frame on the way.
# (the dict key gets its own name so the capture below does not overwrite the loop variable)
for video_key, data in video_data.items():
  m_path = f"{data['camera']}/{data['name']}"
  m_file_key = len(file_data)
  file_data.append(m_path)

  # export frame images
  input_video_path = path.join(VIDEO_PATH, data["camera"], data["name"])
  output_dir_path = path.join(IMAGE_PATH, data["camera"])
  makedirs(output_dir_path, exist_ok=True)

  data["representative_frames"] = [f for f in data["representative_frames"] if has_objects(f)]

  vid = cv2.VideoCapture(input_video_path)
  try:
    vid.set(cv2.CAP_PROP_POS_FRAMES, 0)
    for f in data["representative_frames"]:
      m_frame = f["index"]
      # position of the frame inside the video, derived from index and fps
      m_pos = f["index"] / data["fps"]
      m_timestamp = get_timestamp(m_pos, data["seek"])
      m_caption = f["caption"]["en"].replace("The image shows ", "")
      m_counts = f["counts"]
      m_frame_key = len(frame_data)
      frame_data.append({
        "file": m_file_key,
        "frame": m_frame,
        "time": round(m_pos, 5),
        "timestamp": round(m_timestamp, 5),
        "caption": m_caption,
        "counts": m_counts
      })
      # export frame
      vid.set(cv2.CAP_PROP_POS_FRAMES, m_frame)
      ok, frame = vid.read()
      if ok:
        output_image_filename = f"{int(m_timestamp)}.jpg"
        output_image_path = path.join(output_dir_path, output_image_filename)
        cv2.imwrite(output_image_path, frame)
      else:
        # keep the metadata, but imwrite cannot handle a missing frame
        print(m_path, "could not read frame", m_frame)

      for o in f["objects"]:
        obj_data.setdefault(o, []).append({
          "frame": m_frame_key,
          "timestamp": m_timestamp
        })
  finally:
    # a capture is opened per video; release it before moving to the next one
    vid.release()

In [None]:
# Reduce every object's hit list to frame keys ordered by timestamp.
obj_data = {
  label: [hit["frame"] for hit in sorted(hits, key=lambda hit: hit["timestamp"])]
  for label, hits in obj_data.items()
}

In [None]:
# Final export structure: "frames" entries refer to "files" by list position,
# and "objects" entries refer to "frames" by list position.
out_data = {
  "files": file_data,
  "frames": frame_data,
  "objects": obj_data
}

In [None]:
# Write the export as compact JSON with sorted keys.
with open(OBJ_PATH_OUT, "w") as out_file:
  json.dump(out_data, out_file, separators=(',',':'), sort_keys=True)

### Model tests

In [None]:
import torch
from IPython.display import display, Image
from transformers import Owlv2Processor, Owlv2ForObjectDetection

# Experiment: load the OWLv2 detector under the generic names `model` / `processor`.
# Later test cells rebind these same names to other models.
MODEL = "google/owlv2-base-patch16-ensemble"

model = Owlv2ForObjectDetection.from_pretrained(MODEL).to("cuda")
processor = Owlv2Processor.from_pretrained(MODEL)

In [None]:
# Grab frame 80 of the first video in the first matching directory and keep
# it as the PIL test `image` used by the model experiments.
VIDEO_PATH = "../../vids/0801-500"
input_dir = sorted([d for d in listdir(VIDEO_PATH) if DIR_PATTERN.search(d) is not None])[0]
input_dir_path = path.join(VIDEO_PATH, input_dir)
input_files = sorted([f for f in listdir(input_dir_path) if f.endswith("mp4")])
input_file_path = path.join(input_dir_path, input_files[0])

vid = cv2.VideoCapture(input_file_path)
vid.set(cv2.CAP_PROP_POS_FRAMES, 80)
# NOTE(review): the success flag of read() is ignored; a failed read makes cvtColor fail.
_, frame = vid.read()

# OpenCV delivers BGR; convert to RGB before wrapping in a PIL image.
frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
image = PImage.fromarray(frame)

vid.release()
display(image)

In [None]:
# NOTE(review): OBS_FIND and OBS_COUNT are not defined anywhere in the visible
# notebook; this cell raises NameError unless they exist in the kernel — confirm.
# Parallel lists: *_THOLD[i] is the threshold for *_LABELS[i].
FIND_LABELS = sorted(OBS_FIND.keys())
COUNT_LABELS = sorted(OBS_COUNT.keys())

FIND_THOLD = [OBS_FIND[k] for k in FIND_LABELS]
COUNT_THOLD = [OBS_COUNT[k] for k in COUNT_LABELS]

In [None]:
%%time

# Run the detector on the test image and list the distinct labels whose score
# exceeds the per-label threshold.
target_sizes = torch.Tensor([500, 500])
inputs = processor(text=FIND_LABELS, images=image, return_tensors="pt").to("cuda")

with torch.no_grad():
  outputs = model(**inputs)

results = processor.post_process_object_detection(outputs=outputs, target_sizes=[target_sizes])
scores, labels = results[0]["scores"], results[0]["labels"]

# labels are indices into FIND_LABELS / FIND_THOLD
result_labels = [FIND_LABELS[l.item()] for s,l in zip(scores, labels) if s > FIND_THOLD[l.item()]]

set(result_labels)

In [None]:
from transformers import BlipProcessor, BlipForConditionalGeneration

# Experiment: caption the test image with BLIP, conditioned on the prompt "image of".
MODEL = "Salesforce/blip-image-captioning-large"

processor = BlipProcessor.from_pretrained(MODEL)
model = BlipForConditionalGeneration.from_pretrained(MODEL).to("cuda")

# named `inputs` so the builtin input() is not shadowed for the rest of the session
inputs = processor(image, "image of", return_tensors="pt").to("cuda")

out = model.generate(**inputs, max_length=50)
caption = processor.decode(out[0], skip_special_tokens=True)
print(caption)

In [None]:
from transformers import BlipProcessor, BlipForQuestionAnswering

# Experiment: ask the BLIP VQA model a single question about the test image.
MODEL = "Salesforce/blip-vqa-capfilt-large"

processor = BlipProcessor.from_pretrained(MODEL)
model = BlipForQuestionAnswering.from_pretrained(MODEL).to("cuda")

question = "are people in the image?"
inputs = processor(image, question, return_tensors="pt").to("cuda")

out = model.generate(**inputs, max_length=32)
print(processor.decode(out[0], skip_special_tokens=True))

In [None]:
%%time
# Batched VQA: one copy of the test image per question, answered in one generate call.
# NOTE(review): `questions` and `OBSOI` are not defined in the visible notebook — confirm
# where they come from; presumably questions[i] asks about OBSOI[i].
images = [image] * len(questions)
inputs = processor(images, questions, padding=True, return_tensors="pt").to("cuda")

out = model.generate(**inputs, max_length=32)
answers = processor.batch_decode(out, skip_special_tokens=True)
# keep the objects whose answer is exactly "yes"
objs = [o for o,a in zip(OBSOI, answers) if a == "yes"]
print(objs)

In [None]:
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer

# Experiment: caption the test image with the ViT-GPT2 encoder/decoder model.
MODEL = "nlpconnect/vit-gpt2-image-captioning"

model = VisionEncoderDecoderModel.from_pretrained(MODEL).to("cuda")
processor = ViTImageProcessor.from_pretrained(MODEL)
tokenizer = AutoTokenizer.from_pretrained(MODEL)

# named `inputs` so the builtin input() is not shadowed for the rest of the session
inputs = processor(images=image, return_tensors="pt").to("cuda")

out = model.generate(**inputs, max_length=50)
caption = tokenizer.decode(out[0], skip_special_tokens=True)
print(caption)

In [None]:
import torch
from transformers import AutoModel, AutoTokenizer

# Experiment: load MiniCPM-V in bfloat16 on CUDA under the generic names
# `model` / `tokenizer` and switch it to eval mode.
MODEL = "openbmb/MiniCPM-V-2"

model = AutoModel.from_pretrained(MODEL, trust_remote_code=True, torch_dtype=torch.bfloat16)
model = model.to(device='cuda', dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(MODEL, trust_remote_code=True)
# assign to `_` so the notebook does not print the model repr
_ = model.eval()

In [None]:
%%time

# Experiment: caption the test image with the same three user messages the
# production caption run sends.
msgs = [
    {'role': 'user', 'content': "The following image was taken during a protest."},
    {'role': 'user', 'content': "Give a short description of the image."},
    {'role': 'user', 'content': "Don't mention sports or winter."},
]

# `model` / `tokenizer` are the names this notebook binds when loading
# MiniCPM-V; `qa_model` / `qa_tokenizer` are never defined and raised NameError.
caption, _, _ = model.chat(
  image=image,
  msgs=msgs,
  max_length=32,
  context=None,
  tokenizer=tokenizer,
  sampling=True,
  temperature=0.1
)
print(caption)

In [None]:
%%time

# Experiment: ask the chat model one yes/no question per object label and
# keep the labels answered with "yes".
# NOTE(review): OBSOI is not defined in the visible notebook — confirm its source.
answers = []
for o in OBSOI:
  question = f'using only yes or no, are there any {o} in the image?'
  msgs = [{'role': 'user', 'content': question}]

  res, context, _ = model.chat(
    image=image,
    msgs=msgs,
    context=None,
    tokenizer=tokenizer,
    sampling=True,
    temperature=0.1
  )
  # only the part before the first comma counts as the answer
  answer = res.split(',')[0].lower()
  print(answer)
  answers.append(answer)

# pair answers with the list that was iterated above (was the undefined name OSOI)
objs = [o for o,a in zip(OBSOI, answers) if a == "yes"]
print(objs)


### Time Tests

In [None]:
# Timing tests run on the .mp4 files of the first matching directory under VIDEO_PATH.
input_dir = sorted([d for d in listdir(VIDEO_PATH) if DIR_PATTERN.search(d) is not None])[0]
input_dir_path = path.join(VIDEO_PATH, input_dir)
input_files = sorted([f for f in listdir(input_dir_path) if f.endswith("mp4")])

In [None]:
%%time

# Timing variant 1: seek directly to every representative frame and decode it.
for io_file in input_files:
  input_file_path = path.join(input_dir_path, io_file)
  vid = cv2.VideoCapture(input_file_path)
  # entries are frame records carrying an "index" key
  rep_frames = video_data[io_file]["representative_frames"]

  vid.set(cv2.CAP_PROP_POS_FRAMES, 0)
  for frame_data in rep_frames:
    frameIdx = frame_data["index"]
    vid.set(cv2.CAP_PROP_POS_FRAMES, frameIdx)
    # NOTE(review): the success flag of read() is ignored; a failed read makes cvtColor fail.
    _, frame = vid.read()
    frame_grey = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
  vid.release()

In [None]:
%%time

# Timing variant 2: decode every frame sequentially and convert only the
# representative ones.
for io_file in input_files:
  input_file_path = path.join(input_dir_path, io_file)
  vid = cv2.VideoCapture(input_file_path)
  rep_frames = video_data[io_file]["representative_frames"]
  # set membership keeps the per-frame lookup out of the measured time
  rep_frame_idxs = {f["index"] for f in rep_frames}
  # frame count of THIS video; previously a stale global left over from the
  # caption run (the last video processed there) was used
  frame_count = int(vid.get(cv2.CAP_PROP_FRAME_COUNT))

  vid.set(cv2.CAP_PROP_POS_FRAMES, 0)
  for frameIdx in range(frame_count):
    ok, frame = vid.read()
    if not ok:
      # sequential decoding cannot continue past a failed read
      break
    if frameIdx in rep_frame_idxs:
      frame_grey = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
  vid.release()