# Process Data
- Color Information
- Embedding Information
- Object Detection
- Captioning

In [None]:
from params.collections import MUSEUMS
from Museum import Museum

### Color Palette

In [None]:
# Color palette step: call Museum.get_colors once per MUSEUMS entry,
# printing the collection name first so progress is visible in the cell output.
for museum_name, museum_info in MUSEUMS.items():
    print("color:", museum_name)
    Museum.get_colors(museum_info)

### Embeddings (CLIP)

In [None]:
# Embedding step, "clip" variant: call Museum.get_embeddings once per
# MUSEUMS entry, printing the collection name before each call.
for museum_name, museum_info in MUSEUMS.items():
    print("embeddings:", museum_name)
    Museum.get_embeddings(museum_info, "clip")

### Embeddings (SigLIP2)

In [None]:
# Embedding step, "siglip2" variant: same loop as the CLIP cell, but the
# second argument passed to Museum.get_embeddings is "siglip2".
for museum_name, museum_info in MUSEUMS.items():
    print("embeddings:", museum_name)
    Museum.get_embeddings(museum_info, "siglip2")

### Objects (OWLv2)

In [None]:
# Object step: call Museum.get_objects once per MUSEUMS entry,
# printing the collection name before each call.
for museum_name, museum_info in MUSEUMS.items():
    print("objects:", museum_name)
    Museum.get_objects(museum_info)

### Export Object Crop Images

In [None]:
# Crop export step: call Museum.export_object_crops once per MUSEUMS entry.
# NOTE(review): presumably relies on the object-detection cell having been
# run first -- confirm against Museum.export_object_crops.
for museum_name, museum_info in MUSEUMS.items():
    print("export crops:", museum_name)
    Museum.export_object_crops(museum_info)

### Captions (Llama3.2-vision)

In [None]:
# Caption step with model="llama3.2-vision:11b": call Museum.get_captions
# once per MUSEUMS entry, printing the collection name before each call.
for museum_name, museum_info in MUSEUMS.items():
    print("caption:", museum_name)
    Museum.get_captions(museum_info, model="llama3.2-vision:11b")

### Captions (Gemma3)

In [None]:
# Caption step with model="gemma3:4b": same loop as the Llama cell,
# differing only in the model string handed to Museum.get_captions.
for museum_name, museum_info in MUSEUMS.items():
    print("caption:", museum_name)
    Museum.get_captions(museum_info, model="gemma3:4b")

### Combine and Export JSONs

In [None]:
# Per-museum combine step: call Museum.combine_data once per MUSEUMS entry,
# printing the collection name before each call.
for museum_name, museum_info in MUSEUMS.items():
    print("combine:", museum_name)
    Museum.combine_data(museum_info)

## Combine Data

### Combine all museum JSONs

In [None]:
from params.collections import MUSEUMS
from Museum import Museum

# Settings handed positionally to Museum.combine_museums below.
# NOTE(review): judging by the names, these are the output directory, an
# output filename prefix, and the data sub-directories to merge -- confirm
# against Museum.combine_museums.
DATA_DIRS = ["embeddings", "processed"]
OUT_PREFIX = "20250619"
OUT_DIR = "./metadata/json"

Museum.combine_museums(
    MUSEUMS,
    OUT_DIR,
    OUT_PREFIX,
    DATA_DIRS,
)

### Export pre-processed data

In [None]:
from utils.data_utils import export_preload_data

# Arguments for export_preload_data, named individually so the call is
# easier to read.
# NOTE(review): the list and the file name are passed through unchanged;
# their exact meaning is defined by export_preload_data -- confirm there.
DATA_PREFIX = "20250619"
PRELOAD_NAMES = ["categories", "museum", "objects"]
PRELOAD_FILE = "preload.json"

export_preload_data(DATA_PREFIX, PRELOAD_NAMES, PRELOAD_FILE)

### Export cluster information (with cluster descriptions using Gemma3)

In [None]:
from utils.data_utils import Clusterer

# Cluster export with describe="gemma3", written to "clusters_vlm_10_10.json".
IMAGES_PATH = "../../imgs/arts/500"
DATA_PREFIX = "20250619"

Clusterer(DATA_PREFIX, IMAGES_PATH).export_clusters(
    "clusters_vlm_10_10.json",
    describe="gemma3",
)

### Export cluster information (with cluster descriptions using SigLIP2)

In [None]:
from utils.data_utils import Clusterer

# Cluster export with describe="siglip", written to "clusters_siglip.json".
IMAGES_PATH = "../../imgs/arts/500"
DATA_PREFIX = "20250619"

Clusterer(DATA_PREFIX, IMAGES_PATH).export_clusters(
    "clusters_siglip.json",
    describe="siglip",
)

### Export cluster information (with cluster descriptions using Gemma3 and SigLIP2)

In [None]:
from utils.data_utils import Clusterer

# Cluster export written to "clusters.json". No `describe` argument is
# passed, so export_clusters uses whatever its default is.
IMAGES_PATH = "../../imgs/arts/500"
DATA_PREFIX = "20250619"

Clusterer(DATA_PREFIX, IMAGES_PATH).export_clusters(
    "clusters.json",
)