# Process Data
- Color Information
- Embedding Information
- Object Detection
- Captioning

In [None]:
from params.collections import MUSEUMS
from Museum import Museum

### Color Palette

In [None]:
# Run Museum.get_colors once per entry in MUSEUMS, printing the entry's key
# first so progress is visible in the cell output.
for museum_name, museum_info in MUSEUMS.items():
    print("color:", museum_name)
    Museum.get_colors(museum_info)

### Embeddings (CLIP)

In [None]:
# Run Museum.get_embeddings with the "clip" model key for every entry in
# MUSEUMS, printing the entry's key before each call.
for museum_name, museum_info in MUSEUMS.items():
    print("embeddings:", museum_name)
    Museum.get_embeddings(museum_info, "clip")

### Embeddings (SigLip2)

In [None]:
# Run Museum.get_embeddings with the "siglip2" model key for every entry in
# MUSEUMS, printing the entry's key before each call.
for museum_name, museum_info in MUSEUMS.items():
    print("embeddings:", museum_name)
    Museum.get_embeddings(museum_info, "siglip2")

### Objects (Owlv2)

In [None]:
# Run Museum.get_objects once per entry in MUSEUMS, printing the entry's key
# before each call.
for museum_name, museum_info in MUSEUMS.items():
    print("objects:", museum_name)
    Museum.get_objects(museum_info)

### Export Object Crop Images

In [None]:
# Run Museum.export_object_crops once per entry in MUSEUMS, printing the
# entry's key before each call.
for museum_name, museum_info in MUSEUMS.items():
    print("export crops:", museum_name)
    Museum.export_object_crops(museum_info)

### Captions (Llama3.2-vision)

In [None]:
# Run Museum.get_captions with the "llama3.2-vision:11b" model for every
# entry in MUSEUMS, printing the entry's key before each call.
for museum_name, museum_info in MUSEUMS.items():
    print("caption:", museum_name)
    Museum.get_captions(museum_info, model="llama3.2-vision:11b")

### Captions (Gemma3)

In [None]:
# Run Museum.get_captions with the "gemma3:4b" model for every entry in
# MUSEUMS, printing the entry's key before each call.
for museum_name, museum_info in MUSEUMS.items():
    print("caption:", museum_name)
    Museum.get_captions(museum_info, model="gemma3:4b")

### Combine and Export JSONs

In [None]:
# Run Museum.combine_data once per entry in MUSEUMS, printing the entry's key
# before each call.
for museum_name, museum_info in MUSEUMS.items():
    print("combine:", museum_name)
    Museum.combine_data(museum_info)

### Combine all museum JSONs

In [None]:
from params.collections import MUSEUMS
from Museum import Museum

# Settings handed to Museum.combine_museums.
# NOTE(review): later cells read f"./metadata/json/{OUT_PREFIX}_processed.json"
# and "..._embeddings.json", so the outputs are presumably named
# "<OUT_DIR>/<OUT_PREFIX>_<data dir>.json" -- confirm against Museum.
OUT_PREFIX = "20250619"
DATA_DIRS = ["embeddings", "processed"]
OUT_DIR = "./metadata/json"

Museum.combine_museums(MUSEUMS, OUT_DIR, OUT_PREFIX, DATA_DIRS)

### Export pre-processed data

In [None]:
import json

# Build a small "preload" index from the combined per-image records: for each
# museum, category and detected-object label, the list of record keys that
# reference it.
OUT_PREFIX = "20250619"
INPUT_FILE_PATH = f"./metadata/json/{OUT_PREFIX}_processed.json"
OUTPUT_FILE_PATH = f"./metadata/json/{OUT_PREFIX}_preload.json"

# NOTE(review): the read keeps the platform default encoding because the
# encoding used by whatever wrote this file is not visible here -- confirm it
# is UTF-8 and pin it if so.
with open(INPUT_FILE_PATH, "r") as ifp:
    data = json.load(ifp)

preload_data = {
    "categories": {},
    "museums": {},
    "objects": {},
}

for record_key, record in data.items():
    # setdefault creates the list the first time a museum/category/label is
    # seen, then the record key is appended to it.
    preload_data["museums"].setdefault(record["museum"], []).append(record_key)

    for category in record["categories"]:
        preload_data["categories"].setdefault(category, []).append(record_key)

    # A record key is appended once per detection, so a record with several
    # detections of the same label appears several times under that label.
    # NOTE(review): confirm consumers expect these repeats.
    for detection in record["objects"]:
        preload_data["objects"].setdefault(detection["label"], []).append(record_key)

# ensure_ascii=False emits non-ASCII characters verbatim, so the file encoding
# must be pinned; otherwise the result depends on the platform locale and may
# not be valid UTF-8 JSON (or may fail to encode at all).
with open(OUTPUT_FILE_PATH, "w", encoding="utf-8") as ofp:
    json.dump(preload_data, ofp, separators=(",", ":"), sort_keys=True, ensure_ascii=False)

## Export cluster information

### Cluster descriptions using Gemma3

In [None]:
import json
import numpy as np

from os import path
from sklearn.metrics.pairwise import euclidean_distances

from clustering import tsne_kmeans
from models.LlamaVision import LlamaVision

# For several cluster counts, cluster the "siglip2" embeddings with
# tsne_kmeans, record each image's cluster and its distance to every cluster
# center, and ask LlamaVision for words common to a sample of images near each
# center, in Portuguese and English.
# NOTE(review): the notebook heading for this cell says "Gemma3", but the
# model is built with LlamaVision() defaults -- confirm which model runs.
OUT_PREFIX = "20250619"
IMAGES_PATH = "../../imgs/arts/500"
EMBEDDING_FILE_PATH = f"./metadata/json/{OUT_PREFIX}_embeddings.json"
OUTPUT_FILE_PATH = f"./metadata/json/{OUT_PREFIX}_clusters.json"

with open(EMBEDDING_FILE_PATH, "r") as ifp:
    data = json.load(ifp)

# ids[i] and embeddings[i] refer to the same record (same dict iteration order).
ids = np.array(list(data.keys()))
embeddings = np.array([v["siglip2"] for v in data.values()])

cluster_data = {}

llama = LlamaVision()

for n in range(2, 17, 2):
    print(n, "clusters...")
    # assumes embs has one row per input embedding, in input order, and that
    # centers has n rows -- TODO confirm against clustering.tsne_kmeans
    embs, clusters, centers = tsne_kmeans(embeddings, n_clusters=n)
    # Row c holds the distance from center c to every point in embs.
    cluster_distances = euclidean_distances(centers, embs)
    # Per center: point indices ordered from nearest to farthest.
    id_idxs_by_distance = cluster_distances.argsort(axis=1)

    descriptions = {"pt": [], "en": []}
    cluster_data[n] = {
        # Transposing gives, per image, its distance to each center.
        "images": {
            image_id: {"cluster": cluster, "distances": [round(d, 6) for d in distances]}
            for image_id, cluster, distances in zip(
                ids.tolist(), clusters.tolist(), cluster_distances.T.tolist()
            )
        },
        "clusters": {"descriptions": descriptions},
    }

    # Every 10th of the 100 nearest points (ranks 0, 10, ..., 90) per center.
    ids_to_describe = ids[id_idxs_by_distance[:, :100:10]]

    for c in range(n):
        img_paths = [path.join(IMAGES_PATH, f"{image_id}.jpg") for image_id in ids_to_describe[c]]
        # One entry per cluster is appended to each language list, so list
        # position c corresponds to cluster c.
        for lang, words_by_cluster in descriptions.items():
            words_by_cluster.append(llama.common(img_paths, lang=lang))

# ensure_ascii=False writes non-ASCII characters (e.g. in the "pt" words)
# verbatim, so pin UTF-8 instead of relying on the platform default encoding.
with open(OUTPUT_FILE_PATH, "w", encoding="utf-8") as ofp:
    json.dump(cluster_data, ofp, separators=(",", ":"), sort_keys=True, ensure_ascii=False)

## Export cluster information

### Cluster descriptions using SigLip2

In [None]:
import json
import numpy as np

from os import path
from sklearn.metrics.pairwise import euclidean_distances

from clustering import tsne_kmeans
from data_utils import get_caption_words

from models.SigLip2 import SigLip2

# For several cluster counts, cluster the "siglip2" embeddings with
# tsne_kmeans, record each image's cluster and its distance to every cluster
# center, and tag each cluster by running SigLip2 zero-shot on the mean
# embedding of the points nearest its center against caption-derived words.
OUT_PREFIX = "20250619"
IMAGES_PATH = "../../imgs/arts/500"
DATA_FILE_PATH = f"./metadata/json/{OUT_PREFIX}_processed.json"
EMBEDDING_FILE_PATH = f"./metadata/json/{OUT_PREFIX}_embeddings.json"
OUTPUT_FILE_PATH = f"./metadata/json/{OUT_PREFIX}_clusters_siglip.json"

with open(EMBEDDING_FILE_PATH, "r") as ifp:
    data = json.load(ifp)

# ids[i] and embeddings[i] refer to the same record (same dict iteration order).
ids = np.array(list(data.keys()))
embeddings = np.array([v["siglip2"] for v in data.values()])

# Candidate vocabulary per language; only the first 500 entries are used below.
# NOTE(review): presumably ordered by relevance/frequency -- confirm in
# data_utils.get_caption_words.
words_en = get_caption_words(DATA_FILE_PATH, lang="en", categories=["people", "fauna", "flora"])
words_pt = get_caption_words(DATA_FILE_PATH, lang="pt", categories=["people", "fauna", "flora"])

cluster_data = {}

siglip = SigLip2()

for n in range(2, 17, 2):
    print(n, "clusters...")
    # assumes embs has one row per input embedding, in input order, and that
    # centers has n rows -- TODO confirm against clustering.tsne_kmeans
    embs, clusters, centers = tsne_kmeans(embeddings, n_clusters=n)
    # Row c holds the distance from center c to every point in embs.
    cluster_distances = euclidean_distances(centers, embs)
    # Per center: point indices ordered from nearest to farthest.
    id_idxs_by_distance = cluster_distances.argsort(axis=1)

    descriptions = {"pt": [], "en": []}
    cluster_data[n] = {
        # Transposing gives, per image, its distance to each center.
        "images": {
            image_id: {"cluster": cluster, "distances": [round(d, 6) for d in distances]}
            for image_id, cluster, distances in zip(
                ids.tolist(), clusters.tolist(), cluster_distances.T.tolist()
            )
        },
        "clusters": {"descriptions": descriptions},
    }

    # Original embeddings of the (up to) 128 points nearest each center.
    # Indexing `embeddings` directly yields the same vectors as looking each
    # id up in `data`, because both were built in the same order.
    embeddings_to_avg = embeddings[id_idxs_by_distance[:, :128]]
    # Average over the neighbour axis: one mean embedding per cluster.
    embeddings_avg = embeddings_to_avg.mean(axis=1)

    for c in range(n):
        img_tags_en = siglip.zero_shot(embeddings_avg[c], words_en[:500])
        img_tags_pt = siglip.zero_shot(embeddings_avg[c], words_pt[:500], prefix="pintura mostrando")
        # Keep the first 10 tags per language; list position c is cluster c.
        descriptions["en"].append(img_tags_en[:10])
        descriptions["pt"].append(img_tags_pt[:10])

# ensure_ascii=False writes non-ASCII characters (e.g. in the "pt" tags)
# verbatim, so pin UTF-8 instead of relying on the platform default encoding.
with open(OUTPUT_FILE_PATH, "w", encoding="utf-8") as ofp:
    json.dump(cluster_data, ofp, separators=(",", ":"), sort_keys=True, ensure_ascii=False)