# Experiment Analysis

This notebook is used to analyze the results of certain experiments which can be found in the pycomex experiment archive located at the ``RESULTS_PATH`` directory.

In [1]:
import os
import time
import json
from collections import defaultdict

import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display, Latex
from rich.pretty import pprint
from pycomex.utils import is_experiment_archive
from pycomex.utils import render_string_table
from pycomex.functional.experiment import Experiment

# This will be the path to the directory in which the notebook is located.
PATH: str = os.getcwd()
# This will have to be the path to the pycomex "results" directory containing the 
# experiment archives of interest.
# HAS TO BE CHANGED, if the analysis is not located in the same directory as the results
# folder.
RESULTS_PATH: str = os.path.join(PATH, 'results')

  import pkg_resources


In [2]:

# Filter which experiments will be loaded based on their name and/or parameters.
def select_experiment(experiment_name: str,
                      experiment_metadata: dict,
                      experiment_parameters: dict
                      ) -> bool:
    return '__PREFIX__' in experiment_parameters and 'ex_02_a' in experiment_parameters['__PREFIX__']


# Assign a unique key to the experiment based on its data / parameters etc.
# Later on the experiments will be sorted based on this key which will be the 
# key of a dictionary and the values will be a list of all the experiments with 
# that same key.
def sort_experiment(experiment: Experiment) -> tuple:
    
    *_, encoding, dataset = experiment.metadata['name'].split('__')
    method = experiment.parameters['MODELS'][0]
    
    if 'FINGERPRINT_TYPE' in experiment.parameters:
        encoding = experiment.parameters['FINGERPRINT_TYPE']
        
    if 'NOTE' in experiment.parameters:
        dataset = experiment.parameters['NOTE']
    
    return (encoding, dataset)

### Experiment Discovery

The following cell will first discover all of the previously executed experiment runs which have been archived in the given results directory. It will then print the number of experiments found.

In [3]:
# This list will contain the paths to the individual expeirment *namespaces* which in 
# turn contain the actual individual experiment archives.
experiment_namespace_paths: list[str] = [
    path
    for file_name in os.listdir(RESULTS_PATH)
    if os.path.isdir(path := os.path.join(RESULTS_PATH, file_name))
]

# Subsequently, this list will contain the paths to the individual experiment archives
# folders.
experiment_paths: list[str] = []
for namespace_path in experiment_namespace_paths:
    for dirpath, dirnames, filenames in os.walk(namespace_path):
        if is_experiment_archive(dirpath):
            experiment_paths.append(dirpath)
            dirnames.clear() # Prevents further recursion into subdirectories
        
print(f'✅ found {len(experiment_paths)} experiment archives in {len(experiment_namespace_paths)} namespaces')
pprint(experiment_paths, max_length=3)

✅ found 1329 experiment archives in 18 namespaces


### Experiment Loading

The following cell will then load the experiment runs into memory selectively based on the previously defined ``select_experiment`` function. Only experiments for which the function returns ``True`` will be loaded into memory. The loaded experiments will actually be restored from the archive by loading the parameters as well as the data back into a ``pycomex.experiment.Experiment`` object.

In [4]:
# This list will be populated with the actual Experiment instances which will 
# be loaded from the experiment archive folders.
experiments: list[Experiment] = []

print(f'Loading experiments from archives...')
print(f'Iterating over {len(experiment_paths)} experiment paths')
time_start: float = time.time()
for experiment_path in experiment_paths:
    
    experiment_identifier: str = os.path.basename(experiment_path)
    
    experiment_data_path = os.path.join(experiment_path, Experiment.DATA_FILE_NAME)
    if not os.path.exists(experiment_data_path):
        print(f'   ⚠️ Skipping experiment "{experiment_identifier}" - no data found')
        continue
    
    experiment_meta_path = os.path.join(experiment_path, Experiment.METADATA_FILE_NAME)
    if not os.path.exists(experiment_meta_path):
        print(f'   ⚠️ Skipping experiment "{experiment_identifier}" - no metadata found')
        continue
    
    with open(experiment_meta_path) as file:
        content = file.read()
        # This will contain the experiment metadata as a dictionary which we can now use to
        # filter for instance.
        experiment_metadata: dict = json.loads(content)
        
    if 'parameters' not in experiment_metadata:
        print(f'   ⚠️ Skipping experiment "{experiment_identifier}" - no parameters found')
        continue
    
    experiment_parameters: dict = {
        param: info['value']
        for param, info in experiment_metadata['parameters'].items()
        if 'value' in info
    }
    
    # Here we apply the filter to determine whether or not the experiment should be included
    # in the analysis.
    condition: bool = select_experiment(
        experiment_name=experiment_metadata['name'],
        experiment_metadata=experiment_metadata,
        experiment_parameters=experiment_parameters
    )
    
    if condition:
        try:
            print(f'   > included experiment "{experiment_identifier}"')
            experiment: Experiment = Experiment.load(experiment_path)
            experiments.append(experiment)
        except Exception as e:
            print(f'   ⚠️ Failed to load experiment "{experiment_identifier}" - Exception: {e}')
    
            
time_end: float = time.time()
duration: float = time_end - time_start
print(f'✅ Loaded {len(experiments)} experiments in {duration:.2f} seconds')

Loading experiments from archives...
Iterating over 1329 experiment paths
   > included experiment "ex_02_a__08_09_2025__03_13__9yKx"


  from .autonotebook import tqdm as notebook_tqdm


   > included experiment "ex_02_a__09_09_2025__10_30__KCs6"
   > included experiment "ex_02_a__05_09_2025__23_40__PqQN"
   > included experiment "ex_02_a__06_09_2025__20_08__xtkg"
   > included experiment "ex_02_a__10_09_2025__07_26__Sqxp"
   > included experiment "ex_02_a__10_09_2025__12_49__fc1J"
   > included experiment "ex_02_a__08_09_2025__14_42__L5yg"
   > included experiment "ex_02_a__10_09_2025__14_26__s1ii"
   > included experiment "ex_02_a__07_09_2025__19_01__GcFT"
   > included experiment "ex_02_a__08_09_2025__18_32__BSN3"
   > included experiment "ex_02_a__08_09_2025__05_21__tM2s"
   > included experiment "ex_02_a__07_09_2025__23_02__srVo"
   > included experiment "ex_02_a__06_09_2025__08_26__pn6v"
   > included experiment "ex_02_a__09_09_2025__04_22__sW9f"
   > included experiment "ex_02_a__09_09_2025__13_32__3HUj"
   > included experiment "ex_02_a__05_09_2025__15_18__0u9C"
   > included experiment "ex_02_a__07_09_2025__01_34__7AwG"
   > included experiment "ex_02_a__07_09

In [5]:
example_experiment: Experiment = experiments[0] if experiments else None
pprint(example_experiment.data, max_length=10)

### Experiment Sorting

The following cell will sort the - currently still unsorted - experiment list by a custom criteria defined in the ``sort_experiments`` function. This is done by putting all the experiments into the same list for which this function returns the same index tuple. The result is the ``key_experiment_map`` dictionary data structure which maps the index tuple to a list of experiments.

In [6]:
# This will be a dictionary mapping the unique key of the experiment to a list of
# experiments which share that key. This will allow us to group experiments based on
# their parameters or other attributes.
key_experiment_map: dict[tuple, list[Experiment]] = defaultdict(list)

for experiment in experiments:
    key: tuple = sort_experiment(experiment)
    key_experiment_map[key].append(experiment)
    
pprint(key_experiment_map, max_length=3)

In [7]:
dataset_experiment_map: dict[str, dict[str, list[Experiment]]] = defaultdict(lambda: defaultdict(list))

for (encoding, dataset), _experiments in key_experiment_map.items():

    dataset_experiment_map[dataset][encoding] = _experiments
    
pprint(dataset_experiment_map, max_length=3)
pprint(list(dataset_experiment_map.keys()))

### Experiment Result Table

The following cell illustrates how to create a table from the aggregated results of many loaded experiments, which is a common use case of the analysis.

In [8]:
from prettytable import PrettyTable

table = PrettyTable()
table.field_names = [
    'Dataset', 
    'Quantity', 
    'GNN', 
    'HDC',
    'Morgan',
    'RDKit',
    'Torsion',
    'AtomPair',
]

dataset_name_map = {
    'aqsoldb_logs': 'AqSolDB',
    'zinc_clogp': 'ZINC250k',
    'zinc_qed': 'ZINC250k',
    'zinc_sas': 'ZINC250k',
    'lipop_logd': 'LIPOP',
    'qm9_gap': 'QM9',
    'qm9_energy': 'QM9',
    'qm9_dipole': 'QM9',
    'qm9_alpha': 'QM9',
    'compas_dipole': 'COMPAS-3X',
    'compas_gap': 'COMPAS-3X',
    'compas_energy': 'COMPAS-3X',
    'tadf_oscillator': 'TADF',
    'tadf_rate': 'TADF',
    'tadf_splitting': 'TADF'
}

dataset_quantity_map = {
    'aqsoldb_logs': 'logS',
    'zinc_clogp': 'ClogP',
    'zinc_qed': 'QED',
    'zinc_sas': 'SAS',
    'lipop_logd': 'LogD',
    'qm9_gap': 'Gap',
    'qm9_energy': r'$U_0$',
    'qm9_dipole': r'$\mu$',
    'qm9_alpha': r'$\alpha$',
    'compas_dipole': '$\mu$',
    'compas_gap': 'Gap',
    'compas_energy': r'$U_0$',
    'tadf_oscillator': 'f',
    'tadf_rate': r'$k_{TADF}$',
    'tadf_splitting': r'$E_{ST}$'
}


# This data structure will hold the (encoding, dataset, method) tuple as keys and the metric
# values as a list value.
method = 'neural_net2'
key_values_map = defaultdict(list)

# Store data for ranking calculation
dataset_results = {}

for dataset, encoding_experiment_map in dataset_experiment_map.items():
    
    row = [
        dataset_name_map.get(dataset, dataset),
        dataset_quantity_map.get(dataset, '---'),
    ]
    
    num_encodings = len(list(encoding_experiment_map.keys()))
    if num_encodings < 6:
        continue
    
    # Store results for this dataset for ranking calculation
    dataset_results[dataset] = {}
    
    # Define the order of encodings to match table columns
    encoding_order = ['gnn', 'hdc', 'morgan', 'rdkit', 'torsion', 'atom']
    
    for encoding in encoding_order:
        if encoding not in encoding_experiment_map:
            row.append('N/A')
            dataset_results[dataset][encoding] = None
            continue
            
        _experiments = encoding_experiment_map[encoding]
        
        if len(_experiments) == 0:
            row.append('0')
            dataset_results[dataset][encoding] = None
            continue
            
        example_experiment = _experiments[0]
        if example_experiment.parameters['DATASET_TYPE'] == 'regression':
            metric = 'mae'
        else:
            metric = 'f1'
        
        if encoding == 'gnn':
            method = 'gatv2'
        else:
            method = 'neural_net2'
                
        values = [exp.data['metrics'][f'test_{method}'][metric] for exp in _experiments if 'metrics' in exp.data]
        if values:
            mean_value = np.mean(values)
            std_value = np.std(values)
            row.append(f'{mean_value:.2f} ± {std_value:.2f}')
            dataset_results[dataset][encoding] = mean_value
        else:
            row.append('N/A')
            dataset_results[dataset][encoding] = None
        
    table.add_row(row)

print(table.get_string())

+-----------+----------+-------------+-------------+--------------+---------------+--------------+--------------+
|  Dataset  | Quantity |     GNN     |     HDC     |    Morgan    |     RDKit     |   Torsion    |   AtomPair   |
+-----------+----------+-------------+-------------+--------------+---------------+--------------+--------------+
|  ZINC250k |   QED    | 0.01 ± 0.00 | 0.04 ± 0.00 | 0.02 ± 0.00  |  0.04 ± 0.00  | 0.03 ± 0.00  | 0.03 ± 0.00  |
|  ZINC250k |  ClogP   | 0.07 ± 0.00 | 0.26 ± 0.00 | 0.17 ± 0.00  |  0.37 ± 0.00  | 0.25 ± 0.00  | 0.22 ± 0.00  |
|  ZINC250k |   SAS    | 0.05 ± 0.00 | 0.23 ± 0.01 | 0.06 ± 0.00  |  0.21 ± 0.00  | 0.14 ± 0.00  | 0.16 ± 0.00  |
|    QM9    |   Gap    | 0.01 ± 0.00 | 0.01 ± 0.00 | 0.01 ± 0.00  |  0.01 ± 0.00  | 0.01 ± 0.00  | 0.01 ± 0.00  |
|    QM9    |  $U_0$   | 1.26 ± 0.03 | 0.77 ± 0.05 | 4.66 ± 0.11  |  8.81 ± 0.06  | 4.83 ± 0.09  | 1.11 ± 0.01  |
|    QM9    |  $\mu$   | 0.50 ± 0.00 | 0.58 ± 0.00 | 0.49 ± 0.00  |  0.57 ± 0.00  | 0.58

In [31]:
# Calculate average ranks (lower values are better)
# Only rank HDC and FP columns (indices 1, 2, 3, 4 in the encoding_order)
ranking_encodings = ['gnn', 'hdc', 'morgan', 'rdkit', 'torsion', 'atom']

# Collect all valid values for each encoding across datasets
encoding_ranks = {encoding: [] for encoding in ranking_encodings}

# For each dataset, rank the methods
for dataset, results in dataset_results.items():
    # Get valid results for ranking encodings only
    valid_results = {enc: val for enc, val in results.items() 
                    if enc in ranking_encodings and val is not None}
    
    if len(valid_results) < 2:  # Need at least 2 methods to rank
        continue
    
    # Sort by value (ascending since lower is better)
    sorted_results = sorted(valid_results.items(), key=lambda x: x[1])
    
    # Assign ranks (1 = best/lowest value)
    for rank, (encoding, _) in enumerate(sorted_results, 1):
        encoding_ranks[encoding].append(rank)

# Calculate average ranks
avg_ranks = {}
for encoding in ranking_encodings:
    if encoding_ranks[encoding]:
        avg_ranks[encoding] = np.mean(encoding_ranks[encoding])
    else:
        avg_ranks[encoding] = None

# Create the average rank row
rank_row = ['Average Rank', '-']  # Dataset, Quantity, GNN columns

# Add average ranks for HDC and FP methods
encoding_order = ['gnn', 'hdc', 'morgan', 'rdkit', 'torsion', 'atom']
for encoding in encoding_order:
    if avg_ranks[encoding] is not None:
        rank_row.append(f'{avg_ranks[encoding]:.2f}')
    else:
        rank_row.append('N/A')

# Add the rank row to the table
table.add_row(rank_row)

print("\\nTable with Average Ranks:")
print(table.get_string())

\nTable with Average Ranks:
+--------------+----------+-------------+-------------+--------------+---------------+--------------+--------------+
|   Dataset    | Quantity |     GNN     |     HDC     |    Morgan    |     RDKit     |   Torsion    |   AtomPair   |
+--------------+----------+-------------+-------------+--------------+---------------+--------------+--------------+
|   ZINC250k   |   QED    | 0.01 ± 0.00 | 0.04 ± 0.00 | 0.02 ± 0.00  |  0.04 ± 0.00  | 0.03 ± 0.00  | 0.03 ± 0.00  |
|   ZINC250k   |  ClogP   | 0.07 ± 0.00 | 0.26 ± 0.00 | 0.17 ± 0.00  |  0.37 ± 0.00  | 0.25 ± 0.00  | 0.22 ± 0.00  |
|   ZINC250k   |   SAS    | 0.05 ± 0.00 | 0.23 ± 0.01 | 0.06 ± 0.00  |  0.21 ± 0.00  | 0.14 ± 0.00  | 0.16 ± 0.00  |
|     QM9      |   Gap    | 0.01 ± 0.00 | 0.01 ± 0.00 | 0.01 ± 0.00  |  0.01 ± 0.00  | 0.01 ± 0.00  | 0.01 ± 0.00  |
|     QM9      |  $U_0$   | 1.26 ± 0.03 | 0.77 ± 0.05 | 4.66 ± 0.11  |  8.81 ± 0.06  | 4.83 ± 0.09  | 1.11 ± 0.01  |
|     QM9      |  $\mu$   | 0.50 ± 0

In [32]:
from pycomex.utils import render_latex_table


def highlight(cell, rows) -> dict:
    
    
    row_index: int = cell["row_index"]
    col_index: int = cell["col_index"]

    values = [
        float(cell['mean'])
        for cell in rows[row_index]
        if 'mean' in cell
    ]
    if not values:
        return cell
    
    best_index_of_row: int = np.argmin(values)
    if int(best_index_of_row) == int(col_index - 2):
        #print(True)
        cell.update({
            'string': f'$\\underset{{\color{{darkgray}} \pm{cell["std"]}}}{{\mathbf{{{cell["mean"]}}}}}$'
        })
        
    #print(values, best_index_of_row + 2, col_index, cell)
    return cell


latex_string: str = render_latex_table(
    table=table,
    transform_func=highlight,
)
print(latex_string)

\begin{tabular}{ llllllll }
% --- Header ---
\toprule
\multicolumn{1}{c}{ Dataset } &
\multicolumn{1}{c}{ Quantity } &
\multicolumn{1}{c}{ GNN } &
\multicolumn{1}{c}{ HDC } &
\multicolumn{1}{c}{ Morgan } &
\multicolumn{1}{c}{ RDKit } &
\multicolumn{1}{c}{ Torsion } &
\multicolumn{1}{c}{ AtomPair } 
\\
\midrule
% --- Content ---
% row 1
ZINC250k    & 
QED    & 
$\underset{\color{darkgray} \pm0.0}{\mathbf{0.01}}$    & 
$\underset{ \color{darkgray} \pm0.00 }{ 0.04 }$    & 
$\underset{ \color{darkgray} \pm0.00 }{ 0.02 }$    & 
$\underset{ \color{darkgray} \pm0.00 }{ 0.04 }$    & 
$\underset{ \color{darkgray} \pm0.00 }{ 0.03 }$    & 
$\underset{ \color{darkgray} \pm0.00 }{ 0.03 }$
\\

% row 2
ZINC250k    & 
ClogP    & 
$\underset{\color{darkgray} \pm0.0}{\mathbf{0.07}}$    & 
$\underset{ \color{darkgray} \pm0.00 }{ 0.26 }$    & 
$\underset{ \color{darkgray} \pm0.00 }{ 0.17 }$    & 
$\underset{ \color{darkgray} \pm0.00 }{ 0.37 }$    & 
$\underset{ \color{darkgray} \pm0.00 }{ 0.25 }$    & 
$\