# Analyze Data
- t-SNE/PCA embedding clustering
- Object detection visualization

In [None]:
import json

from os import listdir, path

from embeddings import pca_kmeans, tsne_kmeans
from embeddings import plot_clusters, visualize_pca_clusters, visualize_tsne_clusters

from params.collections import MUSEUMS
from Museum import Museum

### PCA / t-SNE: by museum

In [None]:
# Look up the "brasiliana" entry in MUSEUMS and hand it to Museum.prep_dirs.
# NOTE(review): later cells read Museum.DIRS / Museum.IMGS, presumably
# populated by prep_dirs — confirm in Museum.
museum_info = MUSEUMS["brasiliana"]
Museum.prep_dirs(museum_info)

In [None]:
# Load the "clip" embedding stored in each JSON file of the embeddings
# directory and pair it with the path of the matching image in
# Museum.IMGS["500"]. Each file is expected to be keyed by its own stem (qid).
embeding_files = sorted([f for f in listdir(Museum.DIRS["embeddings"]) if f.endswith(".json")])

raw_embs = []
image_paths = []

for io_file in embeding_files:
  # Strip only the trailing extension, so a ".json" elsewhere in the name
  # is left untouched.
  qid = path.splitext(io_file)[0]

  embeding_path = path.join(Museum.DIRS["embeddings"], io_file)
  with open(embeding_path, "r", encoding="utf8") as f:
    m_embs = json.load(f)

  # raw_embs[i] and image_paths[i] always describe the same item.
  raw_embs.append(m_embs[qid]["clip"])
  image_paths.append(path.join(Museum.IMGS["500"], f"{qid}.jpg"))

In [None]:
# Run pca_kmeans on the "clip" embeddings with 4 clusters and plot the result.
# NOTE(review): the three return values are presumably (projected points,
# cluster labels, cluster centers) — confirm in embeddings.
pca_vals, pca_clusters, pca_centers = pca_kmeans(raw_embs, n_clusters=4)
plot_clusters(pca_clusters, pca_vals, title="clip pca")

In [None]:
# Show the images alongside their PCA clustering (4 clusters).
# NOTE(review): grid_dim=6 presumably means a 6x6 image grid — confirm.
visualize_pca_clusters(raw_embs, image_paths, n_clusters=4, grid_dim=6)

In [None]:
# Run tsne_kmeans with 3 output components and 4 clusters, then plot.
# NOTE(review): return values presumably mirror pca_kmeans
# (points, labels, centers) — confirm in embeddings.
tsne3_vals, tsne3_clusters, tsne3_centers = tsne_kmeans(raw_embs, n_clusters=4, n_components=3)
plot_clusters(tsne3_clusters, tsne3_vals, title="clip tsne 3D")

In [None]:
# Same as the 3D run, but with 2 output components.
# NOTE(review): return values presumably are (points, labels, centers) —
# confirm in embeddings.
tsne2_vals, tsne2_clusters, tsne2_centers = tsne_kmeans(raw_embs, n_clusters=4, n_components=2)
plot_clusters(tsne2_clusters, tsne2_vals, title="clip tsne 2D")

In [None]:
# Show the images alongside their t-SNE clustering (4 clusters).
# NOTE(review): grid_dim=6 presumably means a 6x6 image grid — confirm.
visualize_tsne_clusters(raw_embs, image_paths, n_clusters=4, grid_dim=6)

### t-SNE: aggregate

In [None]:
import os, fnmatch

def find_file(pattern, path):
  """Return the first file under `path` whose full path matches `pattern`.

  The directory tree is traversed with os.walk, and every file's joined
  path (directory + name) is tested against `pattern` with fnmatch.fnmatch.

  Args:
    pattern: fnmatch-style pattern applied to the joined file path.
    path: directory at which the search starts.

  Returns:
    The first matching path in os.walk order, or False when nothing matches.
  """
  candidates = (
    os.path.join(folder, filename)
    for folder, _, filenames in os.walk(path)
    for filename in filenames
  )
  # Lazy evaluation: the walk stops as soon as a match is found.
  return next((c for c in candidates if fnmatch.fnmatch(c, pattern)), False)

In [None]:
# Load the aggregated metadata file (qid -> record with a "clip" embedding)
# and keep only the records for which a ".../500/<qid>.jpg" image exists
# somewhere under IMGS_PATH.
DATA_FILE = "./metadata/json/20250422_full.json"
IMGS_PATH = "../../imgs/arts"

with open(DATA_FILE, "r", encoding="utf8") as f:
  m_data = json.load(f)

# Walk the image tree a single time and index every "500/<stem>.jpg" file by
# its stem, instead of re-walking the whole tree once per qid.
# setdefault keeps the first hit in os.walk order.
imgs_500 = {}
for root, _, files in os.walk(IMGS_PATH):
  if os.path.basename(root) != "500":
    continue
  for name in files:
    stem, ext = os.path.splitext(name)
    if ext == ".jpg":
      imgs_500.setdefault(stem, os.path.join(root, name))

raw_embs = []
image_paths = []

for qid, data in m_data.items():
  img_path = imgs_500.get(qid)
  # Records without an image on disk are skipped, so raw_embs[i] and
  # image_paths[i] always describe the same item.
  if img_path:
    raw_embs.append(data["clip"])
    image_paths.append(img_path)

In [None]:
# Show the images from the aggregated metadata alongside their t-SNE
# clustering (8 clusters).
# NOTE(review): grid_dim=6 presumably means a 6x6 image grid — confirm.
visualize_tsne_clusters(raw_embs, image_paths, n_clusters=8, grid_dim=6)

## Visualize Objects

In [None]:
import json

from os import listdir, makedirs, path

from PIL import Image as PImage, ImageOps as PImageOps, ImageDraw as PImageDraw

from params.collections import MUSEUMS
from Museum import Museum

### Visualize Boxes

In [None]:
# Look up the "brasiliana" entry in MUSEUMS and hand it to Museum.prep_dirs.
# NOTE(review): the cells below read Museum.DIRS / Museum.IMGS, presumably
# populated by prep_dirs — confirm in Museum.
museum_info = MUSEUMS["brasiliana"]
Museum.prep_dirs(museum_info)

In [None]:
# Every ".json" file in the objects directory, in sorted filename order.
obj_files = sorted(
  name for name in listdir(Museum.DIRS["objects"]) if name.endswith(".json")
)

In [None]:
# For every objects file that contains at least one box, draw the boxes in
# red on the matching image from Museum.IMGS["900"] and display it inline.
for fname in obj_files:
  with open(path.join(Museum.DIRS["objects"], fname), "r", encoding="utf8") as inp:
    iboxes = json.load(inp)

  # Nothing to draw for files without boxes.
  if not iboxes:
    continue

  image_file_path = path.join(Museum.IMGS["900"], fname.replace(".json", ".jpg"))
  # convert() returns an independent in-memory copy, so the file handle can
  # be released right away.
  with PImage.open(image_file_path) as src:
    image = PImageOps.exif_transpose(src.convert("RGB"))
  iw, ih = image.size
  draw = PImageDraw.Draw(image)

  for box in iboxes:
    x0, y0, x1, y1 = box["box"]
    # Box coordinates are scaled by the image width/height, i.e. they are
    # treated as fractions of the image size.
    draw.rectangle(((x0*iw, y0*ih), (x1*iw, y1*ih)), outline=(255, 0, 0), width=2)

  display(image)

### Create Mosaic Images

In [None]:
# Look up the "brasiliana" entry in MUSEUMS and hand it to Museum.prep_dirs.
# NOTE(review): the cells below read Museum.DIRS / Museum.IMGS, presumably
# populated by prep_dirs — confirm in Museum.
museum_info = MUSEUMS["brasiliana"]
Museum.prep_dirs(museum_info)

In [None]:
# Output folder for the cropped tiles and the final mosaics; created if it
# does not exist yet.
IMG_DIR_FLORA = path.join(Museum.DIRS["imgs"], "flora-mosaic")
makedirs(IMG_DIR_FLORA, exist_ok=True)

In [None]:
# Every ".json" file in the objects directory, in sorted filename order.
obj_files = sorted(
  name for name in listdir(Museum.DIRS["objects"]) if name.endswith(".json")
)

In [None]:
# Crop every box out of its full-size image, shrink crops taller than
# MAX_HEIGHT down to MAX_HEIGHT (keeping aspect ratio), save each crop as
# "<stem>_<index>.jpg" in IMG_DIR_FLORA and accumulate the total tile width.
MAX_HEIGHT = 100

total_width = 0

for fname in obj_files:
  with open(path.join(Museum.DIRS["objects"], fname), "r", encoding="utf8") as inp:
    iboxes = json.load(inp)

  if not iboxes:
    continue

  stem = fname.replace(".json", "")
  image_file_path = path.join(Museum.IMGS["full"], fname.replace(".json", ".jpg"))
  # convert() returns an independent in-memory copy, so the file handle can
  # be released right away.
  with PImage.open(image_file_path) as src:
    image = PImageOps.exif_transpose(src.convert("RGB"))
  iw, ih = image.size

  for bidx, box in enumerate(iboxes):
    x0, y0, x1, y1 = box["box"]
    # Box coordinates are scaled by the image size, i.e. treated as fractions.
    bimg = image.crop((x0*iw, y0*ih, x1*iw, y1*ih))
    biw, bih = bimg.size

    # A degenerate box gives an empty crop, which can be neither resized
    # nor saved; skip it but keep the index of the following boxes stable.
    if biw < 1 or bih < 1:
      continue

    if bih > MAX_HEIGHT:
      # Clamp to 1px so extremely tall, thin crops do not round down to 0.
      bimg = bimg.resize((max(1, int(biw * MAX_HEIGHT / bih)), MAX_HEIGHT))

    # Zero-pad to at least 3 digits without truncating indices >= 1000
    # (truncation would make tile filenames collide).
    bifname = f"{stem}_{bidx:03d}.jpg"
    bimg.save(path.join(IMG_DIR_FLORA, bifname))

    total_width += bimg.size[0]

print(total_width)

### Create Mosaic

In [None]:
import numpy as np

In [None]:
# Estimate the side length of a mosaic that holds every tile: total tile area
# = total_width * MAX_HEIGHT, laid out either as a square ("1x1") or as a
# rectangle whose sides have ratio F2x1 ("2x1").
# NOTE(review): total_width is hard-coded here, presumably copied from the
# value printed by the tile-extraction cell — confirm after re-running it.
MAX_HEIGHT = 100
total_width = 218903
F2x1 = 2.15

total_pxs = MAX_HEIGHT * total_width
print(total_pxs)

# Square: side * side ~= area.  Rectangle: side * (F2x1 * side) ~= area.
mdim_1x1 = round(total_pxs ** 0.5)
mdim_2x1 = round((total_pxs / F2x1) ** 0.5)

# Sanity check: area actually covered by the rounded dimensions.
print(f"{mdim_1x1} -> {mdim_1x1 * mdim_1x1}")
print(f"{mdim_2x1} -> {F2x1 * mdim_2x1 * mdim_2x1}")

In [None]:
# Canvas sizes for the two mosaic layouts, keyed by the tag used in the
# output filename. Each value is ordered (height, width): the "2x1" canvas
# is 3190 px tall and int(F2x1*3190) px wide.
IMG_SIZES = {
  "1x1": (4680, 4680),
  "2x1": (3190, int(F2x1*3190))
}

In [None]:
# Tile images in the mosaic folder: names that start with "Q" and end in
# ".jpg" (the saved "flora_*.jpg" mosaics in the same folder do not match).
flora_files = sorted(
  name for name in listdir(IMG_DIR_FLORA)
  if name.startswith("Q") and name.endswith(".jpg")
)
print(len(flora_files))

In [None]:
# Assemble one mosaic per entry in IMG_SIZES by pasting the tiles left to
# right in rows of MAX_HEIGHT pixels, then save it as "flora_<tag>.jpg".
# A row is only closed once it is filled beyond MIN_WIDTH_F of the canvas
# width; the final crop to that width trims the ragged right edge.
MIN_WIDTH_F = 0.98

for size_tag, mimg_size in IMG_SIZES.items():
  cx, cy = 0, 0
  # IMG_SIZES entries are (height, width); PIL sizes are (width, height).
  # A new RGB image is black by default, so no pixel array is needed.
  mih, miw = mimg_size
  mimg = PImage.new("RGB", (miw, mih))

  for fname in flora_files:
    # The context manager closes each tile file once it has been pasted.
    with PImage.open(path.join(IMG_DIR_FLORA, fname)) as fimg:
      fiw = fimg.size[0]

      # Start a new row only when the tile overflows AND the current row is
      # already filled past the minimum width; otherwise the tile is pasted
      # anyway and clipped at the right edge.
      if (cx+fiw) > miw and (cx > MIN_WIDTH_F*miw):
        cx = 0
        cy += MAX_HEIGHT
      if cy >= mih:
        print("breaking")
        break

      mimg.paste(fimg, (cx, cy))
      cx += fiw

  # Keep only completed rows (the unfinished last row starts at cy). Clamp to
  # the canvas height so the crop never extends past the image, which would
  # pad the result with a black band.
  mimg = mimg.crop((0, 0, int(MIN_WIDTH_F*miw), min(cy, mih)))
  mimg.save(path.join(IMG_DIR_FLORA, f"flora_{size_tag}.jpg"))

## SigLIP 2 (embedding)

Large patch 16x16:
- https://huggingface.co/google/siglip2-large-patch16-384
- https://huggingface.co/google/siglip2-large-patch16-512

Giant 16x16:
- https://huggingface.co/google/siglip2-giant-opt-patch16-256
- https://huggingface.co/google/siglip2-giant-opt-patch16-384


## OWLv2 (zero-shot detection)
- https://huggingface.co/google/owlv2-base-patch16
- https://huggingface.co/google/owlv2-large-patch14
- https://huggingface.co/google/owlv2-large-patch14-ensemble

#### Results from experiments
- Use larger images and smaller models