# Process Data
- Color Information
- Embedding Information
- Object Detection
- Captioning

In [None]:
from Museum import Museum
from params.collections import MUSEUMS

### Color Palette

In [None]:
# Color step: call Museum.get_colors once per MUSEUMS entry, printing the
# entry's key first so progress is visible in the cell output.
for museum_name, museum_info in MUSEUMS.items():
    print("color:", museum_name)
    Museum.get_colors(museum_info)

### Embeddings (CLIP)

In [None]:
# Embedding step using the "clip" model identifier: one
# Museum.get_embeddings call per MUSEUMS entry, with a progress line each.
embedding_model = "clip"
for museum_name, museum_info in MUSEUMS.items():
    print("embeddings:", museum_name)
    Museum.get_embeddings(museum_info, embedding_model)

### Embeddings (SigLIP2)

In [None]:
# Embedding step using the "siglip2" model identifier: one
# Museum.get_embeddings call per MUSEUMS entry, with a progress line each.
embedding_model = "siglip2"
for museum_name, museum_info in MUSEUMS.items():
    print("embeddings:", museum_name)
    Museum.get_embeddings(museum_info, embedding_model)

### Objects (OWLv2)

In [None]:
# Object step: call Museum.get_objects for each MUSEUMS entry, printing
# the entry's key beforehand as a progress marker.
for museum_name, museum_info in MUSEUMS.items():
    print("objects:", museum_name)
    Museum.get_objects(museum_info)

### Export Object Crop Images

In [None]:
# Crop export step: call Museum.export_object_crops for each MUSEUMS
# entry, printing the entry's key beforehand as a progress marker.
for museum_name, museum_info in MUSEUMS.items():
    print("export crops:", museum_name)
    Museum.export_object_crops(museum_info)

### Captions (Llama3.2-vision)

In [None]:
# Caption step with the "llama3.2-vision:11b" model tag: one
# Museum.get_captions call per MUSEUMS entry, with a progress line each.
caption_model = "llama3.2-vision:11b"
for museum_name, museum_info in MUSEUMS.items():
    print("caption:", museum_name)
    Museum.get_captions(museum_info, model=caption_model)

### Captions (Gemma3)

In [None]:
# Caption step with the "gemma3:4b" model tag: one Museum.get_captions
# call per MUSEUMS entry, with a progress line each.
caption_model = "gemma3:4b"
for museum_name, museum_info in MUSEUMS.items():
    print("caption:", museum_name)
    Museum.get_captions(museum_info, model=caption_model)

### Combine and Export JSONs

In [None]:
# Per-museum combine step: call Museum.combine_data for each MUSEUMS
# entry, printing the entry's key beforehand as a progress marker.
for museum_name, museum_info in MUSEUMS.items():
    print("combine:", museum_name)
    Museum.combine_data(museum_info)

## Combine Data

### Combine all museum JSONs (and add tsne embeddings)

In [None]:
from Museum import Museum
from params.collections import MUSEUMS

from utils.data_utils import get_tsne_embeddings

# Output naming/location and the data directories handed to combine_museums.
OUT_PREFIX = "20250705"
OUT_DIR = "./metadata/json"
DATA_DIRS = ["processed"]

# Gather the "embeddings" data across all museums, then derive the t-SNE
# embeddings from it.
embedding_data = Museum.combine_all_data(MUSEUMS, "embeddings")
tsne_embeddings = get_tsne_embeddings(embedding_data)

# Extra data is keyed by "processed", the same name as the single DATA_DIRS
# entry. NOTE(review): presumably combine_museums merges it into that output;
# confirm against Museum.combine_museums.
extra_data = {"processed": tsne_embeddings}

Museum.combine_museums(
    MUSEUMS,
    OUT_DIR,
    OUT_PREFIX,
    DATA_DIRS,
    extra_data,
)

### Add image ratios

In [None]:
import json

from utils.data_utils import get_image_ratios

# Add an image ratio to every record of the combined "processed" JSON and
# rewrite that file in place.
OUT_PREFIX = "20250705"
OUT_DIR = "./metadata/json"
DATA_FILE = f"{OUT_DIR}/{OUT_PREFIX}_processed.json"

# Mapping looked up by the same keys as the records in DATA_FILE.
ratioH_data = get_image_ratios()

with open(DATA_FILE, "r", encoding="utf-8") as ifp:
    all_data = json.load(ifp)

# Set record["image"]["ratio"] wherever a ratio is available; report the
# keys that have none instead of failing.
for k, v in all_data.items():
    if k in ratioH_data:
        v["image"]["ratio"] = ratioH_data[k]
    else:
        print(k, "not in ratioH_data")

# Serialize BEFORE opening the file for writing: mode "w" truncates on open,
# so an encoding error raised by json.dump (e.g. a ratio value that is not
# JSON-serializable) would leave DATA_FILE empty or partial. With dumps()
# the error surfaces while the original file is still intact.
serialized = json.dumps(
    all_data,
    separators=(",", ":"),
    sort_keys=True,
    ensure_ascii=False,
)

with open(DATA_FILE, "w", encoding="utf-8") as ofp:
    ofp.write(serialized)

### ~~Combine all museum JSONs~~

In [None]:
from Museum import Museum
from params.collections import MUSEUMS

# NOTE(review): this cell's heading is struck through, so it is presumably
# superseded; confirm before running.
# Output location, file prefix, and the data directories to combine.
OUT_DIR = "./metadata/json"
OUT_PREFIX = "20250705"
DATA_DIRS = ["processed"]

# Combine the museums' data without supplying any extra data argument.
Museum.combine_museums(
    MUSEUMS,
    OUT_DIR,
    OUT_PREFIX,
    DATA_DIRS,
)

### Export cluster information (with cluster descriptions using Gemma3 and SigLIP2)

In [None]:
from Museum import Museum

from params.collections import MUSEUMS
from utils.data_utils import Clusterer

# Prefix and image directory handed to the Clusterer.
OUT_PREFIX = "20250705"
IMAGES_PATH = "../../imgs/arts/500"

# Gather the "embeddings" data across all museums as the clustering input.
embedding_data = Museum.combine_all_data(MUSEUMS, "embeddings")

# Build the clusterer, then export its clusters under the name "clusters.json".
clusterer = Clusterer(embedding_data, OUT_PREFIX, IMAGES_PATH)
clusterer.export_clusters("clusters.json")