In [None]:
import json

import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

from Museum import Museum
from params.collections import MUSEUMS

from utils.classification_utils import AverageClassify, ClusterClassify, TorchClassify
from utils.classification_utils import KNNClassify, MLPClassify, RFClassify, SVClassify
from utils.classification_utils import GaussianBayesClassify, GaussianProcessClassify, SGDClassify

from utils.classification_utils import YearData

# Tag shared by the two input files below (looks like a YYYYMMDD date stamp).
PREFIX = "20250705"
# Folder that holds the JSON metadata files.
DATA_DIR = "./metadata/json"

# Both inputs follow the pattern <DATA_DIR>/<PREFIX>_<kind>.json.
DATA_FILE = "{}/{}_processed.json".format(DATA_DIR, PREFIX)
CLUSTER_FILE = "{}/{}_clusters.json".format(DATA_DIR, PREFIX)

## Dataset

In [None]:
# Year -> class mapper. The three arguments are passed straight to YearData;
# presumably (first year, last year, bin width in years) -- confirm in
# utils.classification_utils.
mYD = YearData(1700, 2000, 10)

# Per-item embeddings gathered across every museum listed in MUSEUMS.
embedding_data = Museum.combine_all_data(MUSEUMS, "embeddings")

with open(DATA_FILE, "r", encoding="utf-8") as ifp:
  all_data = json.load(ifp)

with open(CLUSTER_FILE, "r", encoding="utf-8") as ifp:
  cluster_data_all = json.load(ifp)
# Only the "images" entry stored under key "8" of the cluster file is used.
cluster_data = cluster_data_all["8"]["images"]

# Items whose year is MAX_YEAR or later are left out of the dataset
# (presumably placeholder / invalid dates -- confirm against the metadata).
MAX_YEAR = 2030

# One record per kept item. Built with a comprehension so that no loop variable
# leaks into the notebook namespace: the previous `for id in ...` loop rebound
# the builtin `id` for every cell executed afterwards.
class_data = [
  {
    "id": str(item["id"]),
    "year": item["year"],
    "cluster": cluster_data[item_key]["cluster"],
    "class": mYD.year2class(item["year"]),
    "embedding": embedding_data[item_key]["siglip2"],
  }
  for item_key, item in all_data.items()
  if item["year"] < MAX_YEAR
]

# Fixed random_state keeps the 75/25 split reproducible between runs.
class_data_train, class_data_test = train_test_split(class_data, test_size=0.25, random_state=101010)

# Ground-truth class labels, in the same order as the corresponding split.
classes_train = np.array([x["class"] for x in class_data_train])
classes_test = np.array([x["class"] for x in class_data_test])

## Classification

In [None]:
# Baseline model: AverageClassify constructed with 24 (the meaning of the
# argument is defined in utils.classification_utils), fitted on the train split.
mCC = AverageClassify(24)
mCC.fit(class_data_train)

# Hard predictions first, then the predict_dist outputs; train before test.
preds_train, preds_test = [mCC.predict(split) for split in (class_data_train, class_data_test)]
preds_dist_train, preds_dist_test = [mCC.predict_dist(split) for split in (class_data_train, class_data_test)]

# Top-1 then top-2 accuracy; each report is followed by a blank separator line.
for top_k in (1, 2):
  print(f"train: {AverageClassify.top_k_accuracy(classes_train, preds_train, top_k)}")
  print(f"test: {AverageClassify.top_k_accuracy(classes_test, preds_test, top_k)}")
  print()

# Disabled diagnostics, kept for reference together with the author's note:
# AverageClassify.dist_stats(classes_train, preds_dist_train)
# AverageClassify.dist_stats(classes_test, preds_dist_test)
# thold: <10

# Test-split accuracy for several thresholds applied to the predict_dist output.
for thold in (10, 13, 15):
  print(f"test ({thold}): {AverageClassify.thold_accuracy(classes_test, preds_dist_test, thold)}")

In [None]:
# Cluster-based model: ClusterClassify constructed with 20, fitted on the train split.
mCC = ClusterClassify(20)
mCC.fit(class_data_train)

# Predictions for both splits, train before test.
preds_train, preds_test = [mCC.predict(split) for split in (class_data_train, class_data_test)]

# Top-1 then top-2 accuracy, scored with AverageClassify.top_k_accuracy.
for top_k in (1, 2):
  if top_k > 1:
    print()  # blank line between the two reports only
  print(f"train: {AverageClassify.top_k_accuracy(classes_train, preds_train, top_k)}")
  print(f"test: {AverageClassify.top_k_accuracy(classes_test, preds_test, top_k)}")

In [None]:
# KNNClassify constructed with 7 (presumably the neighbour count -- confirm in
# utils.classification_utils), fitted on the train split.
mCC = KNNClassify(7)
mCC.fit(class_data_train)

# Hard predictions first, then the predict_prob outputs; train before test.
preds_train, preds_test = [mCC.predict(split) for split in (class_data_train, class_data_test)]
preds_prob_train, preds_prob_test = [mCC.predict_prob(split) for split in (class_data_train, class_data_test)]

print(f"train: {accuracy_score(classes_train, preds_train)}")
print(f"test: {accuracy_score(classes_test, preds_test)}")

# Disabled diagnostics, kept for reference together with the author's note:
# AverageClassify.dist_stats(classes_train, preds_prob_train)
# AverageClassify.dist_stats(classes_test, preds_prob_test)
# thold: >0.8 (correct mean)

# Test-split accuracy for two thresholds applied to the predict_prob output.
print()
for thold in (0.6, 0.8):
  print(f"test ({thold}): {KNNClassify.thold_accuracy(classes_test, preds_prob_test, thold)}")

In [None]:
# RFClassify constructed with 32 (the meaning of the argument is defined in
# utils.classification_utils), fitted on the train split.
mCC = RFClassify(32)
mCC.fit(class_data_train)

# Hard predictions first, then the predict_prob outputs; train before test.
preds_train, preds_test = [mCC.predict(split) for split in (class_data_train, class_data_test)]
preds_prob_train, preds_prob_test = [mCC.predict_prob(split) for split in (class_data_train, class_data_test)]

print(f"train: {accuracy_score(classes_train, preds_train)}")
print(f"test: {accuracy_score(classes_test, preds_test)}")

# Disabled diagnostics, kept for reference together with the author's note:
# AverageClassify.dist_stats(classes_train, preds_prob_train)
# AverageClassify.dist_stats(classes_test, preds_prob_test)
# thold: >0.65 (test correct mean)

# Test-split accuracy for several thresholds applied to the predict_prob output.
print()
for thold in (0.5, 0.6, 0.7):
  print(f"test ({thold}): {RFClassify.thold_accuracy(classes_test, preds_prob_test, thold)}")

In [None]:
# SVClassify with C=8.0 and n_components=128, fitted on the train split.
mCC = SVClassify(C=8.0, n_components=128)
mCC.fit(class_data_train)

# Hard predictions first, then the predict_prob outputs; train before test.
preds_train, preds_test = [mCC.predict(split) for split in (class_data_train, class_data_test)]
preds_prob_train, preds_prob_test = [mCC.predict_prob(split) for split in (class_data_train, class_data_test)]

print(f"train: {accuracy_score(classes_train, preds_train)}")
print(f"test: {accuracy_score(classes_test, preds_test)}")

# Disabled diagnostics, kept for reference together with the author's note:
# AverageClassify.dist_stats(classes_train, preds_prob_train)
# AverageClassify.dist_stats(classes_test, preds_prob_test)
# thold: >0.8 (test correct mean)

# Test-split accuracy for several thresholds applied to the predict_prob output.
print()
for thold in (0.4, 0.7, 0.8):
  print(f"test ({thold}): {SVClassify.thold_accuracy(classes_test, preds_prob_test, thold)}")

In [None]:
# MLPClassify constructed with 128 (the meaning of the argument is defined in
# utils.classification_utils), fitted on the train split.
mCC = MLPClassify(128)
mCC.fit(class_data_train)

# Hard predictions first, then the predict_prob outputs; train before test.
preds_train, preds_test = [mCC.predict(split) for split in (class_data_train, class_data_test)]
preds_prob_train, preds_prob_test = [mCC.predict_prob(split) for split in (class_data_train, class_data_test)]

print(f"train: {accuracy_score(classes_train, preds_train)}")
print(f"test: {accuracy_score(classes_test, preds_test)}")

# Disabled diagnostics, kept for reference together with the author's note:
# AverageClassify.dist_stats(classes_train, preds_prob_train)
# AverageClassify.dist_stats(classes_test, preds_prob_test)
# thold: >0.95

# Test-split accuracy for several thresholds applied to the predict_prob output.
print()
for thold in (0.95, 0.97, 0.99):
  print(f"test ({thold}): {MLPClassify.thold_accuracy(classes_test, preds_prob_test, thold)}")

In [None]:
# SGDClassify with its default constructor arguments, fitted on the train split.
mCC = SGDClassify()
mCC.fit(class_data_train)

# Hard predictions first, then the predict_prob outputs; train before test.
preds_train, preds_test = [mCC.predict(split) for split in (class_data_train, class_data_test)]
preds_prob_train, preds_prob_test = [mCC.predict_prob(split) for split in (class_data_train, class_data_test)]

print(f"train: {accuracy_score(classes_train, preds_train)}")
print(f"test: {accuracy_score(classes_test, preds_test)}")

# Disabled diagnostics, kept for reference together with the author's note:
# AverageClassify.dist_stats(classes_train, preds_prob_train)
# AverageClassify.dist_stats(classes_test, preds_prob_test)
# thold: >0.5

# Test-split accuracy for several thresholds applied to the predict_prob output.
print()
for thold in (0.4, 0.5, 0.6):
  print(f"test ({thold}): {SGDClassify.thold_accuracy(classes_test, preds_prob_test, thold)}")

In [None]:
# GaussianProcessClassify with n_components=256, fitted on the train split.
mCC = GaussianProcessClassify(n_components=256)
mCC.fit(class_data_train)

# Hard predictions first, then the predict_prob outputs; train before test.
preds_train, preds_test = [mCC.predict(split) for split in (class_data_train, class_data_test)]
preds_prob_train, preds_prob_test = [mCC.predict_prob(split) for split in (class_data_train, class_data_test)]

print(f"train: {accuracy_score(classes_train, preds_train)}")
print(f"test: {accuracy_score(classes_test, preds_test)}")

# Diagnostics are enabled here because no threshold has been picked yet.
# They stay as bare calls: the second one is the cell's last expression, so any
# value it returns is what Jupyter displays for this cell.
AverageClassify.dist_stats(classes_train, preds_prob_train)
AverageClassify.dist_stats(classes_test, preds_prob_test)
# thold: ?? Features not gaussian

In [None]:
# GaussianBayesClassify with its default constructor arguments, fitted on the train split.
mCC = GaussianBayesClassify()
mCC.fit(class_data_train)

# Hard predictions first, then the predict_prob outputs; train before test.
preds_train, preds_test = [mCC.predict(split) for split in (class_data_train, class_data_test)]
preds_prob_train, preds_prob_test = [mCC.predict_prob(split) for split in (class_data_train, class_data_test)]

print(f"train: {accuracy_score(classes_train, preds_train)}")
print(f"test: {accuracy_score(classes_test, preds_test)}")

# Diagnostics are enabled here because no threshold has been picked yet.
# They stay as bare calls: the second one is the cell's last expression, so any
# value it returns is what Jupyter displays for this cell.
AverageClassify.dist_stats(classes_train, preds_prob_train)
AverageClassify.dist_stats(classes_test, preds_prob_test)
# thold: ?? Features not gaussian

In [None]:
# TorchClassify with lr=1e-1 and epochs=192, fitted on the train split.
mCC = TorchClassify(lr=1e-1, epochs=192)
mCC.fit(class_data_train)

# Hard predictions first, then the predict_prob outputs; train before test.
preds_train, preds_test = [mCC.predict(split) for split in (class_data_train, class_data_test)]
preds_prob_train, preds_prob_test = [mCC.predict_prob(split) for split in (class_data_train, class_data_test)]

print(f"train: {accuracy_score(classes_train, preds_train)}")
print(f"test: {accuracy_score(classes_test, preds_test)}")

# Disabled diagnostics, kept for reference together with the author's note:
# AverageClassify.dist_stats(classes_train, preds_prob_train)
# AverageClassify.dist_stats(classes_test, preds_prob_test)
# thold: >13 (test correct mean)

# NOTE(review): thresholds are scored with MLPClassify.thold_accuracy although
# the model is a TorchClassify; the other probabilistic cells call
# thold_accuracy on their own class. Confirm whether this is intentional
# (shared implementation) or a copy-paste leftover.
# Thresholds above 1 suggest predict_prob is not a normalised probability
# here -- TODO confirm.
print()
for thold in (13, 14):
  print(f"test ({thold}): {MLPClassify.thold_accuracy(classes_test, preds_prob_test, thold)}")