In [None]:
import json

from Museum import Museum
from params.collections import MUSEUMS

from utils.classification_utils import impute_year

## Load Data

In [None]:
# Dataset snapshot prefix and the JSON paths derived from it.
PREFIX = "20250705"
DATA_DIR = "./metadata/json"
DATA_FILE = f"{DATA_DIR}/{PREFIX}_processed.json"
CLUSTER_FILE = f"{DATA_DIR}/{PREFIX}_clusters.json"

# "embeddings" data combined over MUSEUMS by Museum.combine_all_data.
embedding_data = Museum.combine_all_data(MUSEUMS, "embeddings")

# Read the whole processed-metadata file, then decode it as JSON.
with open(DATA_FILE, "r", encoding="utf-8") as ifp:
  all_data = json.loads(ifp.read())

## Predict Dataset

In [None]:
# Run impute_year over the loaded metadata together with the embeddings.
imputed_data = impute_year(all_data, embedding_data)

# Cell output: how many imputed records have a first "yearp" entry greater than 2025.
# NOTE(review): "yearp" is presumably the predicted-year field - confirm in impute_year.
sum(1 for record in imputed_data.values() if record["yearp"][0] > 2025)

## Test / Debug

In [None]:
import json
import numpy as np

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

from Museum import Museum
from params.collections import MUSEUMS

from utils.classification_utils import AverageClassify, ClusterClassify, TorchClassify
from utils.classification_utils import KNNClassify, MLPClassify, RFClassify, SVClassify
from utils.classification_utils import GaussianBayesClassify, GaussianProcessClassify, SGDClassify

from utils.classification_utils import YearData

In [None]:
# Configuration for the Test / Debug section (same values as the prediction section).
PREFIX = "20250705"
DATA_DIR = "./metadata/json"
DATA_FILE = f"{DATA_DIR}/{PREFIX}_processed.json"
CLUSTER_FILE = f"{DATA_DIR}/{PREFIX}_clusters.json"

# Combined "embeddings" data for the collections in MUSEUMS.
embedding_data = Museum.combine_all_data(MUSEUMS, "embeddings")

# Slurp the processed-metadata file and parse it as JSON.
with open(DATA_FILE, "r", encoding="utf-8") as ifp:
  all_data = json.loads(ifp.read())

## Training Dataset

In [None]:
# Build the labelled records used by every classifier cell below, then split
# them into train and test sets.

# Year <-> class mapper.
# NOTE(review): arguments are presumably (first year, last year, bin width) - confirm in YearData.
mYD = YearData(1800, 2000, 10)

# NOTE(review): embedding_data and all_data are also loaded by the preceding
# cell; the reloads are kept so this cell still runs when executed on its own.
embedding_data = Museum.combine_all_data(MUSEUMS, "embeddings")

with open(DATA_FILE, "r", encoding="utf-8") as ifp:
  all_data = json.load(ifp)

with open(CLUSTER_FILE, "r", encoding="utf-8") as ifp:
  cluster_data_all = json.load(ifp)
  # Only the "images" entry stored under key "8" is used.
  cluster_data = cluster_data_all["8"]["images"]

# One record per item whose year is below 2030.
# NOTE(review): the cutoff presumably drops placeholder / unknown years - confirm.
# A comprehension is used so no loop variable leaks into the kernel namespace;
# the previous `for id in ...` loop left the builtin `id` rebound to a key.
class_data = [
  {
    "id": str(item["id"]),
    "year": item["year"],
    "cluster": cluster_data[item_id]["cluster"],
    "class": mYD.year2class(item["year"]),
    "embedding": embedding_data[item_id]["siglip2"],
  }
  for item_id, item in all_data.items()
  if item["year"] < 2030
]

# Fixed random_state keeps the 75/25 split reproducible across runs.
class_data_train, class_data_test = train_test_split(
  class_data, test_size=0.25, random_state=101010
)

# Ground-truth class labels, aligned with the order of each split.
classes_train = np.array([x["class"] for x in class_data_train])
classes_test = np.array([x["class"] for x in class_data_test])

## Classification

### Good classifiers

In [None]:
# KNNClassify model built with argument 7 (presumably the neighbour count -
# confirm), fitted on the training split.
mCC = KNNClassify(7)
mCC.fit(class_data_train)

# Predictions, then predict_prob outputs, for train and test (in that order).
preds_train, preds_test = mCC.predict(class_data_train), mCC.predict(class_data_test)
preds_prob_train, preds_prob_test = (
  mCC.predict_prob(class_data_train),
  mCC.predict_prob(class_data_test),
)

# Plain accuracy on each split.
print(f"train: {accuracy_score(classes_train, preds_train)}")
print(f"test: {accuracy_score(classes_test, preds_test)}")

# Disabled diagnostics, kept for debugging:
# AverageClassify.dist_stats(classes_train, preds_prob_train)
# AverageClassify.dist_stats(classes_test, preds_prob_test)
KNNClassify.plot_accuracy_coverage(classes_test, preds_prob_test)
# thold: >0.8 (correct mean)

# Sweep thresholds 0.60 .. 1.00 in steps of 0.05 over the test split.
for t in range(60, 105, 5):
  print(f"({t/100}): {KNNClassify.thold_accuracy(classes_test, preds_prob_test, t/100)}")

In [None]:
# SVClassify model with C=8.0 and n_components=256, fitted on the training split.
mCC = SVClassify(C=8.0, n_components=256)
mCC.fit(class_data_train)

# Predictions, then predict_prob outputs, for train and test (in that order).
preds_train, preds_test = mCC.predict(class_data_train), mCC.predict(class_data_test)
preds_prob_train, preds_prob_test = (
  mCC.predict_prob(class_data_train),
  mCC.predict_prob(class_data_test),
)

# Plain accuracy on each split.
print(f"train: {accuracy_score(classes_train, preds_train)}")
print(f"test: {accuracy_score(classes_test, preds_test)}")

# Disabled diagnostics, kept for debugging:
# AverageClassify.dist_stats(classes_train, preds_prob_train)
# AverageClassify.dist_stats(classes_test, preds_prob_test)
SVClassify.plot_accuracy_coverage(classes_test, preds_prob_test)
# thold: >0.8 (test correct mean)

# Sweep thresholds 0.60 .. 1.00 in steps of 0.05 over the test split.
for t in range(60, 105, 5):
  print(f"({t/100}): {SVClassify.thold_accuracy(classes_test, preds_prob_test, t/100)}")

In [None]:
# TorchClassify model with lr=1e-1 and epochs=288, fitted on the training split.
mCC = TorchClassify(lr=1e-1, epochs=288)
mCC.fit(class_data_train)

# Only the test-split predict_prob output is computed in this cell; the
# remaining predictions and accuracy prints are disabled.
# preds_train = mCC.predict(class_data_train)
# preds_test = mCC.predict(class_data_test)

# preds_prob_train = mCC.predict_prob(class_data_train)
preds_prob_test = mCC.predict_prob(class_data_test)

# print(f"train: {accuracy_score(classes_train, preds_train)}")
# print(f"test: {accuracy_score(classes_test, preds_test)}")

# AverageClassify.dist_stats(classes_train, preds_prob_train)
# AverageClassify.dist_stats(classes_test, preds_prob_test)
# NOTE(review): the MLPClassify helpers are applied to TorchClassify output
# here - presumably shared implementations; confirm in classification_utils.
MLPClassify.plot_accuracy_coverage(classes_test, preds_prob_test)
# thold: >0.999

# Threshold sweep written as a loop, matching the sweeps in the KNN / SVC
# cells; the printed lines are identical to the former hand-unrolled prints.
print()
for t in (0.9, 0.95, 0.99, 0.999, 0.9999):
  print(f"test ({t}): {MLPClassify.thold_accuracy(classes_test, preds_prob_test, t)}")

### Not so good classifiers

In [None]:
# AverageClassify model built with argument 24, fitted on the training split.
mCC = AverageClassify(24)
mCC.fit(class_data_train)

# Predictions, then predict_dist outputs, for train and test (in that order).
preds_train, preds_test = mCC.predict(class_data_train), mCC.predict(class_data_test)
preds_dist_train, preds_dist_test = (
  mCC.predict_dist(class_data_train),
  mCC.predict_dist(class_data_test),
)

# top_k_accuracy with last argument 1 on each split.
print(f"train: {AverageClassify.top_k_accuracy(classes_train, preds_train, 1)}")
print(f"test: {AverageClassify.top_k_accuracy(classes_test, preds_test, 1)}")

# Blank separator line, then the same metric with last argument 2.
print()
print(f"train: {AverageClassify.top_k_accuracy(classes_train, preds_train, 2)}")
print(f"test: {AverageClassify.top_k_accuracy(classes_test, preds_test, 2)}")

# Disabled diagnostics, kept for debugging:
# AverageClassify.dist_stats(classes_train, preds_dist_train)
# AverageClassify.dist_stats(classes_test, preds_dist_test)
AverageClassify.plot_accuracy_coverage(classes_test, preds_dist_test)
# thold: <10

In [None]:
# ClusterClassify model built with argument 20, fitted on the training split.
mCC = ClusterClassify(20)
mCC.fit(class_data_train)

# Predictions for train and test (in that order).
preds_train, preds_test = mCC.predict(class_data_train), mCC.predict(class_data_test)

# Scored with AverageClassify.top_k_accuracy, last argument 1.
print(f"train: {AverageClassify.top_k_accuracy(classes_train, preds_train, 1)}")
print(f"test: {AverageClassify.top_k_accuracy(classes_test, preds_test, 1)}")

# Blank separator line, then the same metric with last argument 2.
print()
print(f"train: {AverageClassify.top_k_accuracy(classes_train, preds_train, 2)}")
print(f"test: {AverageClassify.top_k_accuracy(classes_test, preds_test, 2)}")

In [None]:
# RFClassify model built with argument 16, fitted on the training split.
mCC = RFClassify(16)
mCC.fit(class_data_train)

# Predictions, then predict_prob outputs, for train and test (in that order).
preds_train, preds_test = mCC.predict(class_data_train), mCC.predict(class_data_test)
preds_prob_train, preds_prob_test = (
  mCC.predict_prob(class_data_train),
  mCC.predict_prob(class_data_test),
)

# Plain accuracy on each split.
print(f"train: {accuracy_score(classes_train, preds_train)}")
print(f"test: {accuracy_score(classes_test, preds_test)}")

# Disabled diagnostics, kept for debugging:
# AverageClassify.dist_stats(classes_train, preds_prob_train)
# AverageClassify.dist_stats(classes_test, preds_prob_test)
RFClassify.plot_accuracy_coverage(classes_test, preds_prob_test)
# thold: >0.65 (test correct mean)

In [None]:
# MLPClassify model with default arguments, fitted on the training split.
mCC = MLPClassify()
mCC.fit(class_data_train)

# Predictions for both splits.
preds_train = mCC.predict(class_data_train)
preds_test = mCC.predict(class_data_test)

# predict_prob outputs for both splits.
preds_prob_train = mCC.predict_prob(class_data_train)
preds_prob_test = mCC.predict_prob(class_data_test)

# Plain accuracy on each split.
print(f"train: {accuracy_score(classes_train, preds_train)}")
print(f"test: {accuracy_score(classes_test, preds_test)}")

# Disabled diagnostics, kept for debugging:
# AverageClassify.dist_stats(classes_train, preds_prob_train)
# AverageClassify.dist_stats(classes_test, preds_prob_test)
MLPClassify.plot_accuracy_coverage(classes_test, preds_prob_test)
# thold: >0.95

# Threshold sweep written as a loop, matching the sweeps in the KNN / SVC
# cells; the printed lines are identical to the former hand-unrolled prints.
print()
for t in (0.9, 0.99, 0.999, 0.9999):
  print(f"({t}): {MLPClassify.thold_accuracy(classes_test, preds_prob_test, t)}")

In [None]:
# SGDClassify model with default arguments, fitted on the training split.
mCC = SGDClassify()
mCC.fit(class_data_train)

# Predictions, then predict_prob outputs, for train and test (in that order).
preds_train, preds_test = mCC.predict(class_data_train), mCC.predict(class_data_test)
preds_prob_train, preds_prob_test = (
  mCC.predict_prob(class_data_train),
  mCC.predict_prob(class_data_test),
)

# Plain accuracy on each split.
print(f"train: {accuracy_score(classes_train, preds_train)}")
print(f"test: {accuracy_score(classes_test, preds_test)}")

# Disabled diagnostics, kept for debugging:
# AverageClassify.dist_stats(classes_train, preds_prob_train)
# AverageClassify.dist_stats(classes_test, preds_prob_test)
SGDClassify.plot_accuracy_coverage(classes_test, preds_prob_test)
# thold: >0.5

In [None]:
# GaussianProcessClassify model with n_components=256, fitted on the training split.
mCC = GaussianProcessClassify(n_components=256)
mCC.fit(class_data_train)

# Predictions, then predict_prob outputs, for train and test (in that order).
preds_train, preds_test = mCC.predict(class_data_train), mCC.predict(class_data_test)
preds_prob_train, preds_prob_test = (
  mCC.predict_prob(class_data_train),
  mCC.predict_prob(class_data_test),
)

# Plain accuracy on each split.
print(f"train: {accuracy_score(classes_train, preds_train)}")
print(f"test: {accuracy_score(classes_test, preds_test)}")

# Disabled diagnostics, kept for debugging:
# AverageClassify.dist_stats(classes_train, preds_prob_train)
# AverageClassify.dist_stats(classes_test, preds_prob_test)
GaussianProcessClassify.plot_accuracy_coverage(classes_test, preds_prob_test)
# thold: ?? Features not gaussian

In [None]:
# GaussianBayesClassify model with default arguments, fitted on the training split.
mCC = GaussianBayesClassify()
mCC.fit(class_data_train)

# Predictions, then predict_prob outputs, for train and test (in that order).
preds_train, preds_test = mCC.predict(class_data_train), mCC.predict(class_data_test)
preds_prob_train, preds_prob_test = (
  mCC.predict_prob(class_data_train),
  mCC.predict_prob(class_data_test),
)

# Plain accuracy on each split.
print(f"train: {accuracy_score(classes_train, preds_train)}")
print(f"test: {accuracy_score(classes_test, preds_test)}")

# Disabled diagnostics, kept for debugging:
# AverageClassify.dist_stats(classes_train, preds_prob_train)
# AverageClassify.dist_stats(classes_test, preds_prob_test)
GaussianBayesClassify.plot_accuracy_coverage(classes_test, preds_prob_test)
# thold: ?? Features not gaussian