In [1]:
# Enable IPython's autoreload extension; mode 2 re-imports all modules before
# each cell executes, so edits to local modules are picked up without a
# kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
import pathlib
from scipy import stats
import numpy as np

import utils

## Constants

In [3]:
# Working directory of the notebook process.
CURRENT_PATH = pathlib.Path.cwd()

# Plot location: a "figures" directory under the working directory.
PLOTS_PATH = CURRENT_PATH.joinpath("figures")

# Root directory of the stored search results (absolute, machine-specific path).
DATA_BASE_PATH = pathlib.Path(
    "/home/birinxhl/automl/automl_conf/hnas_with_string_kernels/results/search"
)

In [4]:
# Fail fast, with an actionable message, when the results directory is missing;
# every later cell reads from it.
if not DATA_BASE_PATH.is_dir():
    raise ValueError(
        f"DATA_BASE_PATH does not point to an existing directory: {DATA_BASE_PATH}"
    )

In [5]:
# Seeds of the search runs whose results are loaded (one run per seed).
SEEDS = (777, 888, 999)

# Models under comparison; `code` is the identifier used in the printed
# results, `name` is the display name.
hNASK = utils.Model(
    experiment_name="final",
    code="gp_string_hierarchical",
    name="hNASK",
)
hWL = utils.Model(
    experiment_name="final",
    code="gp_hierarchical",
    name="hWL",
)

## Result evaluation

In [6]:
# Each entry is a triple: (dataset, num_configs passed to the results reader,
# metric settings). In the metric settings, "val" and "test" are the
# `info_dict` keys used for ranking and reporting, "reverse" is handed to
# `sorted(..., reverse=...)` so that the best config comes first, and "msg" is
# the direction note printed next to the numbers.
datasets = [
    (
        utils.Dataset(code="nb201_cifar10", name="CIFAR-10"),
        100,
        {
            "val": "x-valid_1",
            "test": "ori-test_1",
            "reverse": True,
            "msg": "higher is better",
        },
    ),
    (
        utils.Dataset(code="nb201_cifar100", name="CIFAR-100"),
        100,
        {
            "val": "x-valid_1",
            "test": "x-test_1",
            "reverse": True,
            "msg": "higher is better",
        },
    ),
    (
        utils.Dataset(code="nb201_ImageNet16-120", name="ImageNet16-120"),
        100,
        {
            "val": "x-valid_1",
            "test": "x-test_1",
            "reverse": True,
            "msg": "higher is better",
        },
    ),
    (
        utils.Dataset(code="nb201_cifarTile", name="CIFARTile"),
        100,
        {
            "val": "val_score",
            "test": "test_score",
            "reverse": False,
            "msg": "lower is better",
        },
    ),
    (
        utils.Dataset(code="nb201_addNIST", name="AddNIST"),
        100,
        {
            "val": "val_score",
            "test": "test_score",
            "reverse": False,
            "msg": "lower is better",
        },
    ),
    (
        utils.Dataset(code="act_cifar10", name="Act CIFAR-10"),
        1000,
        {
            "val": "val_error",
            "test": "test_error",
            "reverse": False,
            "msg": "lower is better",
        },
    ),
]

# Models and seeds iterated over by the evaluation below.
models = [hNASK, hWL]
seeds = SEEDS

In [7]:
# Accumulator filled by the loop below:
#   result[dataset code]["info"]                          -> metric direction message
#   result[dataset code][model code]["by_val_score_val"]  -> best validation value per seed
#   result[dataset code][model code]["by_val_score_test"] -> test value of that same config
# The per-seed lists follow the iteration order of `seeds`.
result = {}

for dataset, num_configs, metrics in datasets:
    val_key = metrics["val"]
    test_key = metrics["test"]

    for model in models:
        for seed in seeds:
            print(dataset.code, model.code, seed)
            configs = model.read_search_results(
                data_base_path=DATA_BASE_PATH,
                dataset_code=dataset.code,
                seed=seed,
                num_configs=num_configs,
            )
            # Drop failed evaluations, which are stored as the string "error".
            # The comprehension uses its own names so the outer `result`
            # accumulator is never shadowed.
            configs = {
                config_id: config_result
                for config_id, config_result in configs.items()
                if config_result != "error"
            }
            print(len(configs))

            # Without a single successful config there is no "best" entry to
            # report; say so explicitly instead of failing with an IndexError.
            if not configs:
                raise ValueError(
                    f"No successful configs for dataset={dataset.code!r}, "
                    f"model={model.code!r}, seed={seed!r}"
                )

            scores = list(configs.items())

            # Three configs with the smallest loss; config id breaks ties.
            print(list(
                (config_id, config_result["loss"])
                for config_id, config_result in sorted(
                    scores, key=lambda item: (item[1]["loss"], item[0])
                )[:3]
            ))

            # Rank by the validation metric so that the best config is first;
            # `reverse` is True for metrics where higher is better.
            by_val_score = sorted(
                scores,
                key=lambda item: (item[1]["info_dict"][val_key], item[0]),
                reverse=metrics["reverse"],
            )
            best_config_id, best_config_result = by_val_score[0]
            best_val = best_config_result["info_dict"][val_key]
            best_test = best_config_result["info_dict"][test_key]

            print(metrics["msg"])
            print("val: ", best_config_id, best_val)
            print("test: ", best_config_id, best_test)

            # Model selection uses validation only; the test value of the
            # selected config is what gets compared between models.
            r = result.setdefault(
                dataset.code, {"info": metrics["msg"]}
            ).setdefault(
                model.code, {"by_val_score_test": [], "by_val_score_val": []}
            )
            r["by_val_score_test"].append(best_test)
            r["by_val_score_val"].append(best_val)

            print()


nb201_cifar10 gp_string_hierarchical 777
100
[(73, -2.5078886024908082), (83, -2.5059258917703375), (65, -2.498599854671493)]
higher is better
val:  73 91.85599999267578
test:  73 91.54

nb201_cifar10 gp_string_hierarchical 888
100
[(96, -2.438175630262763), (84, -2.4209181657451513), (80, -2.398215214780013)]
higher is better
val:  96 91.26799997802735
test:  96 90.78

nb201_cifar10 gp_string_hierarchical 999
100
[(73, -2.4423069192085363), (33, -2.4326936644458566), (86, -2.4299639076299218)]
higher is better
val:  73 91.30399998291016
test:  73 90.93

nb201_cifar10 gp_hierarchical 777
100
[(76, -2.520243587029875), (85, -2.5059258929671038), (98, -2.505435816458207)]
higher is better
val:  76 91.95599997802735
test:  76 91.66

nb201_cifar10 gp_hierarchical 888
100
[(89, -2.471713938442125), (83, -2.468875711452436), (98, -2.467931426449716)]
higher is better
val:  89 91.55600000732422
test:  89 91.02

nb201_cifar10 gp_hierarchical 999
100
[(86, -2.389887311474956), (89, -2.387271791

In [8]:
# Legend for the summary printed further down; the trailing empty string
# produces the blank separator line.
print(
    "^ In the lists scores for seeds [777, 888, 999] in that order.",
    "^ Within the seed, we sorted the configs by their validation score. Then for the best config of the seed, we take its test score and use that for comparison between models.",
    "^ Additionally for each seed, we present the best validation scores for the configs tested.",
    "",
    sep="\n",
)

def std_err(vals):
    """Return the mean and the standard error of the mean of ``vals``.

    Both statistics are computed along axis 0, using ``np.mean`` and
    ``scipy.stats.sem`` respectively.

    Args:
        vals: Array-like of numbers.

    Returns:
        A ``(mean, standard_error)`` tuple.
    """
    return np.mean(vals, axis=0), stats.sem(vals, axis=0)


def _rounded_mean_and_sem(values):
    """Round ``values`` to 10 decimals, then return their (mean, SEM), each rounded to 10 decimals."""
    rounded_values = [round(value, 10) for value in values]
    return tuple(round(stat, 10) for stat in std_err(rounded_values))


# Per dataset: first the validation summary of every model, then the test
# summary of every model, each as mean±standard error over the seeds.
for dataset, res in result.items():
    # Shallow copy so that popping "info" leaves `result` untouched.
    res = dict(res)
    print(f"{dataset.upper()} ({res.pop('info')})")
    for model, per_seed in res.items():
        mean, serr = _rounded_mean_and_sem(per_seed["by_val_score_val"])
        print(f"--> {model} (val_scores_by_val): \t{mean}±{serr}")
    for model, per_seed in res.items():
        mean, serr = _rounded_mean_and_sem(per_seed["by_val_score_test"])
        print(f"--> {model} (test_scores_by_val): \t{mean}±{serr}")
    print()

^ In the lists scores for seeds [777, 888, 999] in that order.
^ Within the seed, we sorted the configs by their validation score. Then for the best config of the seed, we take its test score and use that for comparison between models.
^ Additionally for each seed, we present the best validation scores for the configs tested.

NB201_CIFAR10 (higher is better)
--> gp_string_hierarchical (val_scores_by_val): 	91.4759999845±0.1902840024
--> gp_hierarchical (val_scores_by_val): 	91.449333326±0.3276854831
--> gp_string_hierarchical (test_scores_by_val): 	91.0833333333±0.2324029068
--> gp_hierarchical (test_scores_by_val): 	91.0233333333±0.3666212093

NB201_CIFAR100 (higher is better)
--> gp_string_hierarchical (val_scores_by_val): 	72.7133333211±0.6410755067
--> gp_hierarchical (val_scores_by_val): 	72.5599999023±0.373630843
--> gp_string_hierarchical (test_scores_by_val): 	71.8933332682±0.5396707177
--> gp_hierarchical (test_scores_by_val): 	71.7666666707±0.4931981877

NB201_IMAGENET16-120