## Read object file and confirm objects

In [None]:
import json

from IPython.display import clear_output
from os import listdir, makedirs, path
from PIL import Image as PImage, ImageDraw as PImageDraw, ImageFont as PImageFont
from random import sample

from parameters.finetune_0915 import IMAGES_PATH, OBJECTS, OBJECT2LABEL, DB_FILE_PATH

# Default PIL font loaded at size 20; passed to draw.text() when captioning boxes.
MFONT = PImageFont.load_default(20)

# For every entry of OBJECTS, three index-aligned lists:
#   - its keys in sorted order,
#   - the same keys mapped through OBJECT2LABEL (unmapped keys are kept as-is),
#   - the values stored under those keys (presumably thresholds, going by the name).
OBJS_LABELS_IN = [sorted(group.keys()) for group in OBJECTS]
OBJS_LABELS_OUT = [
  [OBJECT2LABEL.get(name, name) for name in names]
  for names in OBJS_LABELS_IN
]
OBJS_THOLDS = [
  [group[name] for name in names]
  for group, names in zip(OBJECTS, OBJS_LABELS_IN)
]

# File holding the image ids accepted / rejected during the manual review.
JSON_FILE = "./metadata/json/pos-neg-ft-0915.json"

In [None]:
# Load the object database and keep its two top-level sections at hand:
# "images" (indexed by image id below) and "objects" (label -> collection of ids).
with open(DB_FILE_PATH, "r") as f:
  json_data = json.load(f)

img_data = json_data["images"]
obj_data = json_data["objects"]

# Number of entries recorded for each label.
print({label: len(entries) for label, entries in obj_data.items()})

In [None]:
# Start with an empty set of confirmed (positive) and rejected (negative)
# image ids for every label found in the database.
positives = {k: set() for k in obj_data.keys()}
negatives = {k: set() for k in obj_data.keys()}

# Resume a previous review session when its results were saved.
if path.isfile(JSON_FILE):
  # Read with the same encoding the file is written with (utf8, non-ASCII kept).
  with open(JSON_FILE, "r", encoding="utf8") as f:
    json_data = json.load(f)
  # Merge into the defaults instead of replacing them, so a label that is in
  # the database but missing from the saved file still has an (empty) entry
  # and the review loop can add to it without a KeyError.
  positives.update({k: set(v) for k, v in json_data["positives"].items()})
  negatives.update({k: set(v) for k, v in json_data["negatives"].items()})

In [None]:
# Flat list of [label, image id] pairs to review, ordered by label and then by
# image id. Ids are compared as zero-padded strings so that "9" sorts before
# "10"; the padding width follows the longest id present, so ids longer than
# five characters are no longer truncated (and mis-ordered) by the sort key.
label_ids = [[k, str(i)] for k, v in obj_data.items() for i in v]
id_width = max((len(img_id) for _, img_id in label_ids), default=0)
label_ids.sort(key=lambda pair: (pair[0], pair[1].zfill(id_width)))

In [None]:
# Interactive review: show each (label, image) pair with its box drawn in red
# and record the answer. "y" marks the image as a positive for the label,
# "n" as a negative, "c" stops the loop; any other answer skips the image.
# Set idx_start to resume from a given position in label_ids.
idx_start = 0
for idx,(obj_label,img_id) in enumerate(label_ids[idx_start:]):
  input_file_path = path.join(IMAGES_PATH, f"{img_id}.jpg")

  # Close the source file as soon as the RGB copy has been made.
  with PImage.open(input_file_path) as src_image:
    image = src_image.convert("RGB")
  iw,ih = image.size

  print(idx_start+idx, ":", img_id)

  draw = PImageDraw.Draw(image)
  # Box coordinates are scaled by the image size before drawing.
  (x0,y0,x1,y1) = img_data[img_id]["boxes"][obj_label]
  top_left = (x0*iw, y0*ih)
  draw.rectangle((top_left, (x1*iw, y1*ih)), outline=(255, 0, 0), width=2)
  draw.text(top_left, f"{obj_label}", (255, 0, 0), font=MFONT)

  display(image)

  # Tolerate stray whitespace and upper-case answers.
  keyp = input("y/n/c").strip().lower()

  if keyp == 'y':
    # Keep the two sets disjoint when an earlier answer is being corrected.
    positives.setdefault(obj_label, set()).add(img_id)
    negatives.setdefault(obj_label, set()).discard(img_id)
  elif keyp == 'n':
    negatives.setdefault(obj_label, set()).add(img_id)
    positives.setdefault(obj_label, set()).discard(img_id)
  elif keyp == 'c':
    break

  clear_output(wait=True)

In [None]:
# JSON-serialisable snapshot of the review results. Ids are sorted so the
# saved file is identical from run to run (set iteration order is not stable).
ft = {
  "positives": {k: sorted(v) for k, v in positives.items()},
  "negatives": {k: sorted(v) for k, v in negatives.items()},
}

In [None]:
# Persist the review results as compact JSON (no spaces, sorted keys,
# non-ASCII characters written as-is).
with open(JSON_FILE, "w", encoding="utf8") as outf:
  serialized = json.dumps(ft, separators=(',',':'), sort_keys=True, ensure_ascii=False)
  outf.write(serialized)

## Create mini-dataset with file, label, and box info

In [None]:
# Turn the in-memory review results into sorted lists of ids per label.
positives = {k: sorted(v) for k, v in positives.items()}
negatives = {k: sorted(v) for k, v in negatives.items()}

# Prefer the saved review results when they exist. They are kept as
# de-duplicated, sorted lists (not sets) because random.sample() no longer
# accepts a set as its population on Python 3.11+.
if path.isfile(JSON_FILE):
  # Read with the same encoding the file is written with.
  with open(JSON_FILE, "r", encoding="utf8") as f:
    json_data = json.load(f)
  positives = {k: sorted(set(v)) for k, v in json_data["positives"].items()}
  negatives = {k: sorted(set(v)) for k, v in json_data["negatives"].items()}

In [None]:
# Displayed as the cell result: (positive count per label, negative count per label).
(
  {label: len(ids) for label, ids in positives.items()},
  {label: len(ids) for label, ids in negatives.items()},
)

In [None]:
# Build the mini-dataset as [filename, box, label] rows. For every label all
# positives are kept, plus a random sample of its negatives no larger than the
# number of positives; negatives are stored under the label "not<label>".
dataset_light = []

for label,pos_ids in positives.items():
  # random.sample() needs a sequence (sets are rejected on Python 3.11+), and a
  # label may have no negatives recorded at all.
  neg_pool = sorted(negatives.get(label, ()))
  neg_ids = sample(neg_pool, min(len(pos_ids), len(neg_pool)))

  for pid in pos_ids:
    dataset_light.append([
      f"{pid}.jpg", img_data[pid]["boxes"][label], label
    ])

  for nid in neg_ids:
    dataset_light.append([
      f"{nid}.jpg", img_data[nid]["boxes"][label], f"not{label}"
    ])

In [None]:
# Save the mini-dataset as compact JSON. The file name matches the one the
# "Prepare HF dataset" section reads (dataset-ft-0915.json); the previous
# "dataset-ft-aww.json" name meant that section never saw the data written here.
with open("./metadata/json/dataset-ft-0915.json", "w", encoding="utf8") as outf:
  json.dump(dataset_light, outf, separators=(',',':'), sort_keys=True, ensure_ascii=False)

## Prepare HF dataset

In [None]:
import datasets
import json
import pandas as pd

from os import path
from PIL import Image as PImage
from random import sample

from dataset_utils.finetune_0915 import FTUtils
from parameters.finetune_0915 import IMAGES_PATH

In [None]:
# Load the mini-dataset rows ([filename, box, label]). The encoding is stated
# explicitly because this notebook writes its JSON files as utf8 with
# non-ASCII characters left unescaped.
with open("./metadata/json/dataset-ft-0915.json", "r", encoding="utf8") as f:
  datasetft = json.load(f)

In [None]:
# Choose the test split: for every label in FTUtils.LABELS collect the ids of
# the images that carry it, then draw one fifth of them (rounded down) at
# random. testids is the union of the draws over all labels.
label2id = {i: set() for i in FTUtils.LABELS}
testids = set()

for filename, _box, label in datasetft:
  if label in label2id:
    # The image id is the file name without its ".jpg" extension.
    # (Named image_id rather than id to avoid shadowing the builtin.)
    image_id = filename.replace(".jpg", "")
    label2id[label].add(image_id)

for label_image_ids in label2id.values():
  num_tests = len(label_image_ids) // 5
  # Sample from a sorted list: sample() needs a sequence, and a fixed order
  # makes the draw repeatable when the random module is seeded.
  testids.update(sample(sorted(label_image_ids), k=num_tests))

len(testids)

In [None]:
# Group the mini-dataset rows into one record per image and split
# ("test" when the image id is in testids, otherwise "train"). Each record
# holds the image metadata and the list of its annotated objects.
data = {
  "train": {},
  "test": {}
}

for boxid,(filename, box, label) in enumerate(datasetft):
  # Rows whose label is unknown to FTUtils are left out.
  if label not in FTUtils.LABEL2ID:
    continue

  filepath = path.join(IMAGES_PATH, filename)
  # Image id = file name without ".jpg" (not named id, which is a builtin).
  image_id = filename.replace(".jpg", "")
  split = "test" if image_id in testids else "train"
  records = data[split]

  if image_id not in records:
    # Only the size is needed; close the file right away instead of leaving
    # one open handle per image behind.
    with PImage.open(filepath) as img:
      width, height = img.size
    records[image_id] = {
      "image_id": int(image_id),
      "image": filepath,
      "image_filename": filename,
      "width": width,
      "height": height,
      "objects": []
    }

  record = records[image_id]
  # Convert the stored box with the image size; area below is computed from
  # elements 2 and 3 of the result (presumably width and height — confirm in FTUtils).
  box_xywh = FTUtils.xyxy_pct_to_xywh(box, record["width"], record["height"])
  record["objects"].append({
    "bbox_id": boxid,
    "area": box_xywh[2] * box_xywh[3],
    "bbox": box_xywh,
    "category": label,
    "super_category": FTUtils.LABEL2SUPERLABEL[label],
    "is_crowd": False
  })

In [None]:
# Turn each split's per-image records into a column-oriented dict
# (column name -> list of values), going through a DataFrame for the pivot.
ds_dict = {
  split_name: pd.DataFrame(data[split_name].values()).to_dict("list")
  for split_name in ("train", "test")
}

In [None]:
# One datasets.Dataset per split, built from the column dicts with the
# features and dataset info supplied by FTUtils.
hf_dataset = datasets.DatasetDict({
  split_name: datasets.Dataset.from_dict(
    columns,
    features=FTUtils.FEATURES,
    info=FTUtils.get_dataset_info(),
    split=split_name,
  )
  for split_name, columns in ds_dict.items()
})

In [None]:
# Upload all splits to the Hugging Face Hub as a public dataset repository.
HF_DATASET = "acervos-digitais/ft-0915"
hf_dataset.push_to_hub(repo_id=HF_DATASET, private=False)