In [70]:
import os
import time
import numpy as np
from rich.pretty import pprint
from typing import List, Dict, Tuple

from pycomex.functional.experiment import Experiment

In [71]:
# All experiment archives are looked up below "<current working directory>/results".
PATH = os.getcwd()
RESULTS_PATH = os.path.join(PATH, 'results')

# Only experiments whose IDENTIFIER parameter is contained in this list take part in
# the evaluation. This makes it possible to tell different runs of the same
# experiment apart.
IDENTIFIERS: List[str] = [
    'hyperopt_b',
]

# For every base category: the methods whose test results enter the score that is
# used to rank the hyperparameter configurations.
BASE_METHOD_MAP: Dict[str, List[str]] = {
    'gnn': [
        'gatv2',
        'gin',
    ],
    'hdc': [
        'neural_net',
        'random_forest',
    ],
    'fp': [
        'neural_net',
        'random_forest',
    ],
}

# For every base category: the names of the experiment parameters which together
# form one hyperparameter configuration.
BASE_HPARAMS_MAP: Dict[str, List[str]] = {
    'gnn': [
        'BATCH_SIZE',
        'LEARNING_RATE',
    ],
    'fp': [
        'FINGERPRINT_SIZE',
        'FINGERPRINT_RADIUS',
    ],
    'hdc': [
        'EMBEDDING_SIZE',
        'NUM_LAYERS',
    ],
}

# Key of the metric that is read from each method's metrics dict.
METRIC: str = 'f1'
def get_metric(metrics: dict):
    """
    Select the value used for ranking from a dict of metrics.

    :param metrics: A dict that contains an entry for the module-level ``METRIC`` key.

    :returns: The value stored under ``METRIC``, unchanged.

    :raises KeyError: If ``metrics`` has no entry for ``METRIC``.
    """
    # An inverted variant (``1 - metrics[METRIC]``) existed here as a disabled
    # alternative; the plain value is what is currently returned.
    selected = metrics[METRIC]
    return selected

In [72]:
# ~ finding experiments
# The results folder is expected to contain one folder per experiment namespace, each
# of which in turn contains one folder per archived experiment run.

def _list_subfolders(parent_path: str) -> List[str]:
    """
    Return the paths of all directories located directly inside ``parent_path``.

    The entries are sorted by name because ``os.listdir`` returns them in arbitrary
    order. Sorting keeps everything derived from the listing (which experiment is shown
    as the example, how ties between equally good configurations are resolved)
    reproducible between runs and machines.

    :param parent_path: Path of the directory to scan.

    :returns: Sorted list of the full paths of the contained directories.

    :raises FileNotFoundError: If ``parent_path`` does not exist.
    """
    return [
        path
        for name in sorted(os.listdir(parent_path))
        if os.path.isdir(path := os.path.join(parent_path, name))
    ]


print('traversing experiment namespaces...')
experiment_namespace_paths: List[str] = _list_subfolders(RESULTS_PATH)
pprint(experiment_namespace_paths)

print('traversing experiment paths...')
experiment_paths: List[str] = [
    path
    for folder_path in experiment_namespace_paths
    for path in _list_subfolders(folder_path)
]
pprint(experiment_paths, max_length=10)

traversing experiment namespaces...


traversing experiment paths...


In [73]:
import json

# ~ loading experiments
# Now that we have the paths to all the experiment archive folders, we can actually
# load them back into memory. Only the metadata file is read to decide whether an
# archive is relevant; ``Experiment.load`` is called for matching archives only.
experiments: List[Experiment] = []

time_start = time.time()
for path in experiment_paths:

    # Archives without stored data are reported and skipped.
    experiment_data_path = os.path.join(path, 'experiment_data.json')
    if not os.path.exists(experiment_data_path):
        print(f'no experiment data found at {experiment_data_path}')
        continue

    # The same applies to the metadata file - opening it unconditionally would abort
    # the whole loading loop for a single incomplete archive.
    experiment_meta_path = os.path.join(path, 'experiment_meta.json')
    if not os.path.exists(experiment_meta_path):
        print(f'no experiment metadata found at {experiment_meta_path}')
        continue

    with open(experiment_meta_path, 'r') as file:
        metadata: dict = json.load(file)

    # We want to match a specific identifier. Archives that do not define an
    # IDENTIFIER parameter at all are treated like archives with a non-matching one.
    parameters: dict = metadata.get('parameters', {})
    identifier = parameters.get('IDENTIFIER', {}).get('value')
    if identifier not in IDENTIFIERS:
        continue

    experiment = Experiment.load(path)
    experiments.append(experiment)

time_end = time.time()

print(f'loaded {len(experiments)} experiments in {time_end - time_start:.1f} seconds')

no experiment data found at /media/ssd/Programming/graph_hdc/graph_hdc/experiments/fingerprints/results/predict_molecules__gnn__aqsoldb/debug/experiment_data.json
no experiment data found at /media/ssd/Programming/graph_hdc/graph_hdc/experiments/fingerprints/results/predict_molecules__fp__conjugated/hyperopt_b__16_01_2025__08_56__Cv1n/experiment_data.json
no experiment data found at /media/ssd/Programming/graph_hdc/graph_hdc/experiments/fingerprints/results/predict_molecules__fp__ames/debug/experiment_data.json
no experiment data found at /media/ssd/Programming/graph_hdc/graph_hdc/experiments/fingerprints/results/predict_molecules__gnn__qm9_smiles/ex_01_a__10_01_2025__20_18__l2MR/experiment_data.json
no experiment data found at /media/ssd/Programming/graph_hdc/graph_hdc/experiments/fingerprints/results/predict_molecules__gnn__qm9_smiles/ex_01_a__10_01_2025__21_18__xmQY/experiment_data.json
no experiment data found at /media/ssd/Programming/graph_hdc/graph_hdc/experiments/fingerprints/r

In [74]:
# Show the data of the first loaded experiment as an example. When no experiment
# matched the identifiers there is nothing to show, and indexing the empty list would
# raise an IndexError - in that case a notice is printed instead.
print('example experiment data:')
example_experiment = experiments[0] if experiments else None
if example_experiment is None:
    print('no experiments were loaded - nothing to display')
else:
    pprint(example_experiment.data, max_length=12)

example experiment data:


In [75]:
from collections import defaultdict

# Here we first group the experiment instances by the method that was used (aka the model).
# The keys are (base, method) tuples and the values are lists of experiments. Every
# experiment is listed under each method configured for its base in BASE_METHOD_MAP.
method_experiments_map: Dict[Tuple[str, str], List[Experiment]] = defaultdict(list)

for experiment in experiments:

    # The name is expected to consist of three parts joined by '__', the middle one
    # being the base. A trailing '.py' is removed explicitly: ``str.strip('.py')`` would
    # instead remove any of the characters '.', 'p' and 'y' from BOTH ends of the name.
    name: str = experiment.metadata['name']
    if name.endswith('.py'):
        name = name[:-len('.py')]
    _, base, _ = name.split('__')

    # Experiments of a base that is not configured in BASE_METHOD_MAP are not part of
    # the evaluation and are skipped instead of raising a KeyError.
    methods: List[str] = BASE_METHOD_MAP.get(base, [])
    for method in methods:
        method_experiments_map[(base, method)].append(experiment)

pprint(method_experiments_map, max_length=5)

In [76]:
# For every base category, determine the hyperparameter configuration with the highest
# score. The score of one experiment is the mean of get_metric over all configured
# methods; the score of one configuration is the mean over all of its experiments.
for base, methods in BASE_METHOD_MAP.items():

    print(f'base: {base}')
    # The keys under which the per-method test metrics are stored do not depend on the
    # individual experiment, so they are derived once per base.
    metric_keys: List[str] = [f'test_{method}' for method in methods]
    hparam_results_map: Dict[tuple, List[float]] = defaultdict(list)

    for experiment in experiments:

        # The name is expected to consist of three parts joined by '__', the middle one
        # being the base. A trailing '.py' is removed explicitly: ``str.strip('.py')``
        # would instead remove any of the characters '.', 'p' and 'y' from BOTH ends.
        name: str = experiment.metadata['name']
        if name.endswith('.py'):
            name = name[:-len('.py')]
        _, base_, _ = name.split('__')
        if base_ != base:
            continue

        # Experiments without stored metrics cannot contribute a score. Including them
        # anyway would append the mean of an empty list, which is NaN, and thereby turn
        # the mean of the whole hyperparameter configuration into NaN.
        if 'metrics' not in experiment.data:
            continue

        metrics = experiment.data['metrics']
        values: List[float] = [get_metric(metrics[key]) for key in metric_keys]
        if not values:
            continue

        hparams: tuple = tuple(experiment.parameters[hparam] for hparam in BASE_HPARAMS_MAP[base])

        # Here we actually want to include the MEAN of all the different methods values as the
        # indicator to determine how well the hyperparameters performed
        hparam_results_map[hparams].append(np.mean(values))

    # Currently the dict data structure still contains lists as values which represent the
    # different independent repetitions of the experiment. But we need to decide based on a
    # single value which hyperparameters are the best. We can do this by taking the mean of
    # all the different results for each hyperparameter configuration.
    hparam_means_map = {hparam: np.mean(results) for hparam, results in hparam_results_map.items()}
    pprint(hparam_means_map)

    # Now we can select the configuration with the highest mean value. Nothing is
    # reported for a base without any usable experiment.
    if hparam_means_map:
        best_hparam = max(hparam_means_map.items(), key=lambda x: x[1])
        print(f'best hyperparameters: {best_hparam}')

base: gnn


best hyperparameters: ((16, 0.001), 0.9575757782123983)
base: hdc


best hyperparameters: ((8192, 1), 0.8747765737206095)
base: fp


best hyperparameters: ((8192, 1), 0.6943678186105804)
