# Process Data
- Color Information
- Embedding Information
- Object Detection
- Captioning

In [None]:
from params.collections import MUSEUMS
from Museum import Museum

### Color Palette

In [None]:
# Run Museum.get_colors on every entry of MUSEUMS, printing the museum
# name first so progress is visible in the cell output.
for museum_name, museum_info in MUSEUMS.items():
  print("color:", museum_name)
  Museum.get_colors(museum_info)

### Embeddings (CLIP)

In [None]:
# Run Museum.get_embeddings with the "clip" model key on every entry of
# MUSEUMS, printing the museum name before each call.
for museum_name, museum_info in MUSEUMS.items():
  print("embeddings:", museum_name)
  Museum.get_embeddings(museum_info, "clip")

### Embeddings (SigLIP 2)

In [None]:
# Run Museum.get_embeddings with the "siglip2" model key on every entry of
# MUSEUMS, printing the museum name before each call.
for museum_name, museum_info in MUSEUMS.items():
  print("embeddings:", museum_name)
  Museum.get_embeddings(museum_info, "siglip2")

### Objects (OWLv2)

In [None]:
# Run Museum.get_objects on every entry of MUSEUMS, printing the museum
# name before each call.
for museum_name, museum_info in MUSEUMS.items():
  print("objects:", museum_name)
  Museum.get_objects(museum_info)

### Export Object Crop Images

In [None]:
# Run Museum.export_object_crops on every entry of MUSEUMS, printing the
# museum name before each call.
for museum_name, museum_info in MUSEUMS.items():
  print("export crops:", museum_name)
  Museum.export_object_crops(museum_info)

### Captions (Llama3.2-vision)

In [None]:
# Run Museum.get_captions with model "llama3.2-vision:11b" on every entry
# of MUSEUMS, printing the museum name before each call.
for museum_name, museum_info in MUSEUMS.items():
  print("caption:", museum_name)
  Museum.get_captions(museum_info, model="llama3.2-vision:11b")

### Captions (Gemma3)

In [None]:
# Run Museum.get_captions with model "gemma3:4b" on every entry of
# MUSEUMS, printing the museum name before each call.
for museum_name, museum_info in MUSEUMS.items():
  print("caption:", museum_name)
  Museum.get_captions(museum_info, model="gemma3:4b")

### Combine and Export JSONs

In [None]:
# Run Museum.combine_data on every entry of MUSEUMS, printing the museum
# name before each call.
for museum_name, museum_info in MUSEUMS.items():
  print("combine:", museum_name)
  Museum.combine_data(museum_info)

### Combine all museum JSONs

In [None]:
from params.collections import MUSEUMS
from Museum import Museum

# Settings handed to Museum.combine_museums below.
# NOTE(review): OUT_PREFIX looks like a YYYYMMDD date stamp and is repeated
# in later cells — keep the values in sync when changing it.
DATA_DIRS = ["embeddings", "processed"]
OUT_PREFIX = "20250619"
OUT_DIR = "./metadata/json"

# Positional order is: collections, output directory, prefix, data dirs.
Museum.combine_museums(
  MUSEUMS,
  OUT_DIR,
  OUT_PREFIX,
  DATA_DIRS,
)

### Export pre-processed data

In [None]:
import json

OUT_PREFIX = "20250619"
INPUT_FILE_PATH = f"./metadata/json/{OUT_PREFIX}_processed.json"
OUTPUT_FILE_PATH = f"./metadata/json/{OUT_PREFIX}_preload.json"


def build_preload_index(data):
  """Group item ids by museum, category and detected object label.

  Args:
    data: mapping of item id -> record. Each record must provide the keys
      "museum" (hashable), "categories" (iterable of hashable values) and
      "objects" (iterable of dicts that each have a "label" key).

  Returns:
    A dict with the keys "categories", "museums" and "objects"; each maps
    a value found in the records to the list of item ids carrying it, in
    the iteration order of ``data``.

  Raises:
    KeyError: if a record lacks one of the keys listed above.
  """
  preload_data = {
    "categories": {},
    "museums": {},
    "objects": {},
  }

  for k, v in data.items():
    preload_data["museums"].setdefault(v["museum"], []).append(k)

    for cat in v["categories"]:
      preload_data["categories"].setdefault(cat, []).append(k)

    # NOTE: the id is appended once per detected object, so a label that
    # occurs several times in one record repeats that id in its list.
    for obj in v["objects"]:
      preload_data["objects"].setdefault(obj["label"], []).append(k)

  return preload_data


# Notebook cells and scripts execute as "__main__", so the export still runs
# there; the guard only keeps an import of build_preload_index from touching
# the filesystem.
if __name__ == "__main__":
  # JSON text is UTF-8; state it explicitly instead of relying on the
  # platform's locale encoding.
  with open(INPUT_FILE_PATH, "r", encoding="utf-8") as ifp:
    data = json.load(ifp)

  preload_data = build_preload_index(data)

  # ensure_ascii=False emits raw non-ASCII characters, which a non-UTF-8
  # default encoding could fail to encode or would encode incompatibly.
  with open(OUTPUT_FILE_PATH, "w", encoding="utf-8") as ofp:
    json.dump(preload_data, ofp, separators=(",",":"), sort_keys=True, ensure_ascii=False)

## Export cluster information

### Cluster descriptions using Gemma3

In [None]:
from data_utils import Clusterer

IMAGES_PATH = "../../imgs/arts/500"
DATA_PREFIX = "20250619"

# (output file name, top_images) for each export; num_images is 10 for both.
vlm_exports = [
  ("clusters_vlm_10_10.json", 10),
  ("clusters_vlm_50_10.json", 50),
]

# A new Clusterer is constructed for every export, as before.
for out_name, top_n in vlm_exports:
  Clusterer(DATA_PREFIX, IMAGES_PATH).export_clusters(out_name, top_images=top_n, num_images=10)

## Export cluster information

### Cluster descriptions using SigLIP 2

In [None]:
from data_utils import Clusterer

IMAGES_PATH = "../../imgs/arts/500"
DATA_PREFIX = "20250619"

# (output file name, num_images) for each export; all use describe="siglip".
siglip_exports = [
  ("clusters_siglip_32.json", 32),
  ("clusters_siglip_100.json", 100),
  ("clusters_siglip_200.json", 200),
]

# A new Clusterer is constructed for every export, as before.
for out_name, image_count in siglip_exports:
  Clusterer(DATA_PREFIX, IMAGES_PATH).export_clusters(out_name, describe="siglip", num_images=image_count)