# Process Data
- Color Information
- Embedding Information
- Object Detection
- Captioning

In [None]:
from params.collections import MUSEUMS
from Museum import Museum

### Color Palette

In [None]:
# Run Museum.get_colors once per entry in MUSEUMS, logging the entry's key
# first so progress is visible while the cell runs.
for museum_name, museum_info in MUSEUMS.items():
  print("color:", museum_name)
  Museum.get_colors(museum_info)

### Embeddings (CLIP)

In [None]:
# Run Museum.get_embeddings with the "clip" model key once per entry in
# MUSEUMS, logging the entry's key first so progress is visible.
for museum_name, museum_info in MUSEUMS.items():
  print("embeddings:", museum_name)
  Museum.get_embeddings(museum_info, "clip")

### Embeddings (SigLip2)

In [None]:
# Run Museum.get_embeddings with the "siglip2" model key once per entry in
# MUSEUMS, logging the entry's key first so progress is visible.
for museum_name, museum_info in MUSEUMS.items():
  print("embeddings:", museum_name)
  Museum.get_embeddings(museum_info, "siglip2")

### Objects (Owlv2)

In [None]:
# Run Museum.get_objects once per entry in MUSEUMS, logging the entry's key
# first so progress is visible while the cell runs.
for museum_name, museum_info in MUSEUMS.items():
  print("objects:", museum_name)
  Museum.get_objects(museum_info)

### Captions (Llama3.2-vision)

In [None]:
# Run Museum.get_captions once per entry in MUSEUMS, logging the entry's key
# first so progress is visible while the cell runs.
for museum_name, museum_info in MUSEUMS.items():
  print("caption:", museum_name)
  Museum.get_captions(museum_info)

### Combine and Export JSONs

In [None]:
# Run Museum.combine_data once per entry in MUSEUMS, logging the entry's key
# first so progress is visible while the cell runs.
for museum_name, museum_info in MUSEUMS.items():
  print("combine:", museum_name)
  Museum.combine_data(museum_info)

### Combine all JSONs

In [None]:
import json

from params.collections import MUSEUMS
from Museum import Museum

# "XTYPEX" is a placeholder that is replaced by each output type below.
OUTPUT_FILE_TEMPLATE = "./metadata/json/20250515_XTYPEX.json"

# For every output type, merge the per-museum JSON files into one combined
# dictionary and write it to a single file.
for out_type in ["embeddings", "full", "no-embeddings"]:
  output_file_path = OUTPUT_FILE_TEMPLATE.replace("XTYPEX", out_type)
  all_data = {}

  for name, info in MUSEUMS.items():
    # Museum.INFO_PATH is read right after this call; presumably prep_dirs
    # points it at the current collection -- TODO confirm in Museum.
    Museum.prep_dirs(info)

    museum_file_path = Museum.INFO_PATH.replace(".json", f"_{out_type}.json")

    # Read as UTF-8 explicitly: open() otherwise falls back to the platform
    # locale encoding, which is not UTF-8 everywhere.
    # NOTE(review): assumes the per-museum files were written as UTF-8.
    with open(museum_file_path, "r", encoding="utf-8") as ifp:
      museum_data = json.load(ifp)

    print("reading:", name, len(museum_data))

    # Keys that an earlier museum already contributed. They are only
    # reported; the merge below lets the later museum's value win.
    repeat_keys = [k for k in museum_data if k in all_data]
    print("repeat keys:", repeat_keys)

    all_data |= museum_data

  print("writing", len(all_data))

  # ensure_ascii=False writes non-ASCII characters verbatim, so the file
  # encoding must be pinned to UTF-8 instead of the locale default.
  with open(output_file_path, "w", encoding="utf-8") as ofp:
    json.dump(all_data, ofp, separators=(",",":"), sort_keys=True, ensure_ascii=False)

### Export pre-processed data

In [None]:
import json

INPUT_FILE_PATH = "./metadata/json/20250515_no-embeddings.json"
OUTPUT_FILE_PATH = "./metadata/json/20250515_preload.json"

# Read as UTF-8 explicitly: open() otherwise falls back to the platform
# locale encoding, which is not UTF-8 everywhere.
with open(INPUT_FILE_PATH, "r", encoding="utf-8") as ifp:
  data = json.load(ifp)

# Inverted indexes: each maps a museum / category / object label to the list
# of item keys (top-level keys of `data`) that reference it.
preload_data = {
  "categories": {},
  "museums": {},
  "objects": {},
}

for k, v in data.items():
  # setdefault creates the list on first sight of a key, then appends.
  preload_data["museums"].setdefault(v["museum"], []).append(k)

  for cat in v["categories"]:
    preload_data["categories"].setdefault(cat, []).append(k)

  # One append per detected object, so an item key appears once for every
  # object entry carrying that label.
  for obj in v["objects"]:
    preload_data["objects"].setdefault(obj["label"], []).append(k)

# ensure_ascii=False writes non-ASCII characters verbatim, so the file
# encoding must be pinned to UTF-8 instead of the locale default.
with open(OUTPUT_FILE_PATH, "w", encoding="utf-8") as ofp:
  json.dump(preload_data, ofp, separators=(",",":"), sort_keys=True, ensure_ascii=False)

### Export cluster data

In [None]:
import json
import numpy as np

from sklearn.metrics import euclidean_distances
from embeddings import tsne_kmeans

INPUT_FILE_PATH = "./metadata/json/20250515_embeddings.json"
OUTPUT_FILE_PATH = "./metadata/json/20250515_clusters.json"

# Read as UTF-8 explicitly: open() otherwise falls back to the platform
# locale encoding, which is not UTF-8 everywhere.
with open(INPUT_FILE_PATH, "r", encoding="utf-8") as ifp:
  data = json.load(ifp)

# Item keys and their "clip" vectors, kept in the same (file) order.
ids = np.array(list(data.keys()))
embeddings = np.array([v["clip"] for v in data.values()])

# Plain Python strings for use as JSON keys; the same for every cluster count.
id_list = ids.tolist()

# Maps each cluster count n to {item key: {"cluster": ..., "distances": [...]}}.
cluster_data = {}

for n in [2,4,6,8,10,12,14,16]:
  # tsne_kmeans is a project helper; as used here it returns three arrays.
  # NOTE(review): assumes rows of `embs` / entries of `clusters` follow the
  # order of `embeddings` -- confirm in embeddings.tsne_kmeans.
  embs, clusters, centers = tsne_kmeans(embeddings, n_clusters=n)

  # Pairwise distances: one row per item, one column per cluster center.
  cluster_distances = euclidean_distances(embs, centers)

  cluster_data[n] = {
    item_id: {"cluster": c, "distances": [round(d,6) for d in ds]}
    for item_id, c, ds in zip(id_list, clusters.tolist(), cluster_distances.tolist())
  }

# ensure_ascii=False writes non-ASCII characters verbatim, so the file
# encoding must be pinned to UTF-8 instead of the locale default.
# The integer keys of cluster_data are serialized as JSON strings.
with open(OUTPUT_FILE_PATH, "w", encoding="utf-8") as ofp:
  json.dump(cluster_data, ofp, separators=(",",":"), sort_keys=True, ensure_ascii=False)