In [1]:
# Cell 1 — imports & helpers
import os, re, json
from dataclasses import dataclass
from typing import Iterable, List, Optional, Tuple, Dict, Any
from datetime import datetime
import numpy as np
import pandas as pd

def _as_list(x) -> Optional[List[str]]:
    if x is None:
        return None
    if isinstance(x, (list, tuple, set)):
        return list(x)
    return [str(x)]

def _matches_substr(name: str, includes: Optional[Iterable[str]]) -> bool:
    """Case-insensitive substring match. If includes is None/empty → True."""
    if not includes:
        return True
    name_l = name.lower()
    return any(p.lower() in name_l for p in includes)

def _parse_date_like(x) -> Optional[datetime]:
    if x is None:
        return None
    if isinstance(x, datetime):
        return x
    for fmt in ("%Y-%m-%d", "%Y-%m-%d %H:%M", "%Y-%m-%d %H:%M:%S"):
        try:
            return datetime.strptime(str(x), fmt)
        except ValueError:
            pass
    raise ValueError(f"Could not parse date '{x}'. Use 'YYYY-MM-DD' (optional time).")

def _get_dir_mtime(path: str) -> datetime:
    return datetime.fromtimestamp(os.path.getmtime(path))

def _infer_model_name_from_dir(dir_name: str) -> str:
    # e.g., "GL_GATConv_3l_512h_split3" -> "GL_GATConv_3l_512h"
    m = re.match(r"(.+?)(?:_?split\d+)?$", dir_name)
    return m.group(1) if m else dir_name

def _infer_feature_from_names(dir_name: str, json_obj: Dict[str, Any]) -> Optional[str]:
    # Prefer explicit JSON if present
    for k in ("feature_type", "features", "feat"):
        v = json_obj.get(k)
        if isinstance(v, str):
            return v
    # Fallback: infer from folder tokens
    lname = dir_name.lower()
    if "graph_based+" in lname:
        return "graph_based+"
    if "graph_based" in lname:
        return "graph_based"
    return None

def _extract_metric_from_json(obj: Dict[str, Any], metric_keys: Tuple[str, ...]) -> Optional[float]:
    # flat
    for k in metric_keys:
        v = obj.get(k)
        if isinstance(v, (int, float)):
            return float(v)
        if isinstance(v, str):
            try:
                return float(v)
            except ValueError:
                pass
    # nested
    for parent in ("test", "eval", "best", "metrics", "test_metrics", "val_metrics"):
        d = obj.get(parent)
        if isinstance(d, dict):
            for k in metric_keys:
                v = d.get(k)
                if isinstance(v, (int, float)):
                    return float(v)
    return None

@dataclass
class MetricRow:
    """One metric value read from a single run directory's metrics file."""
    experiment: str          # experiment folder name (first level under runs_root)
    dataset: str             # dataset folder name (second level)
    model: str               # model name taken from the JSON, or inferred from the run folder name
    feature: Optional[str]   # feature-set label, or None if it could not be determined
    split_dir: str           # full path of the run directory (not just its name)
    metric: float            # extracted metric value
    mtime: datetime          # modification time of the metrics file (run directory as fallback)


In [2]:
# Cell 2 — core scan & summarize (latest experiments enforced)
def _iter_matching_subdirs(parent: str, includes: Optional[Iterable[str]]) -> Iterable[Tuple[str, str]]:
    """Yield (name, full_path) for sub-directories of `parent`, sorted by name, that pass the substring filter."""
    for entry in sorted(os.listdir(parent)):
        entry_path = os.path.join(parent, entry)
        if os.path.isdir(entry_path) and _matches_substr(entry, includes):
            yield entry, entry_path


def collect_metrics(
    runs_root: str,
    include_experiments: Optional[Iterable[str]] = None,
    include_datasets: Optional[Iterable[str]] = None,
    include_models: Optional[Iterable[str]] = None,
    include_features: Optional[Iterable[str]] = None,
    min_date: Optional[str] = None,  # 'YYYY-MM-DD' (optional time)
    metrics_file_candidates: Tuple[str, ...] = ("results.json", "metrics.json"),
    metric_keys: Tuple[str, ...] = ("f1_macro", "f1-macro", "macro_f1"),
) -> List[MetricRow]:
    """
    Walk 'runs_root/<experiment>/<dataset>/<model_per_split>/' and extract one metric per run.

    Filters are case-insensitive substrings (no regex); None/empty disables a filter.
    Runs are skipped silently when they have no metrics file, an unreadable or
    non-object JSON payload, no usable metric, or fail any filter.

    After filtering, only the latest experiment(s) are kept: for every experiment
    the newest metrics-file mtime is taken, and only experiments whose newest mtime
    equals the global newest are returned (so normally exactly one experiment).

    Args:
        runs_root: directory containing the experiment folders.
        include_experiments / include_datasets: substring filters on folder names.
        include_models: substring filter applied to the run folder name AND to the
            resolved model name.
        include_features: substring filter on the feature label; when set, runs
            whose feature cannot be determined are dropped.
        min_date: keep only runs whose metrics file was modified on/after this date.
        metrics_file_candidates: file names to look for, first existing one wins.
        metric_keys: JSON keys to try, highest priority first.

    Returns:
        List of MetricRow for the latest experiment(s).

    Raises:
        FileNotFoundError: `runs_root` is not a directory.
        ValueError: `min_date` cannot be parsed.
        RuntimeError: no run survived the filters.
    """
    include_experiments = _as_list(include_experiments)
    include_datasets    = _as_list(include_datasets)
    include_models      = _as_list(include_models)
    include_features    = _as_list(include_features)
    threshold_dt        = _parse_date_like(min_date)

    if not os.path.isdir(runs_root):
        raise FileNotFoundError(f"runs_root not found: {runs_root}")

    rows: List[MetricRow] = []

    for exp, exp_path in _iter_matching_subdirs(runs_root, include_experiments):
        for ds, ds_path in _iter_matching_subdirs(exp_path, include_datasets):
            # Folder-name prefilter on model tokens avoids opening JSON files needlessly.
            # NOTE(review): together with the model_name check below, a run is kept only
            # if BOTH its folder name and its resolved model name match include_models.
            for split_dir, split_path in _iter_matching_subdirs(ds_path, include_models):
                # pick the first metrics file that exists
                metrics_json_path = next(
                    (os.path.join(split_path, cand) for cand in metrics_file_candidates
                     if os.path.isfile(os.path.join(split_path, cand))),
                    None
                )
                if metrics_json_path is None:
                    continue

                try:
                    mtime = datetime.fromtimestamp(os.path.getmtime(metrics_json_path))
                except (OSError, OverflowError, ValueError):
                    # fall back to the run directory's own timestamp
                    mtime = _get_dir_mtime(split_path)
                if threshold_dt and mtime < threshold_dt:
                    continue

                try:
                    with open(metrics_json_path, "r", encoding="utf-8") as f:
                        obj = json.load(f)
                except (OSError, ValueError, RecursionError):
                    # unreadable / malformed JSON (JSONDecodeError and
                    # UnicodeDecodeError are ValueError subclasses): skip the run
                    continue
                # Valid JSON whose top level is a list/number/string carries no keyed metrics.
                if not isinstance(obj, dict):
                    continue

                metric_val = _extract_metric_from_json(obj, metric_keys)
                if metric_val is None:
                    continue

                # Resolve the model name: JSON wins, but only if it is a non-empty string
                # (a nested config dict under "model" must not be used as a name).
                model_name = None
                for key in ("model_name", "model"):
                    candidate = obj.get(key)
                    if isinstance(candidate, str) and candidate:
                        model_name = candidate
                        break
                if model_name is None:
                    model_name = _infer_model_name_from_dir(split_dir)
                if not _matches_substr(model_name, include_models):
                    continue

                feature = _infer_feature_from_names(split_dir, obj)
                if include_features:
                    if feature is None:
                        continue
                    if not _matches_substr(feature, include_features):
                        continue

                rows.append(MetricRow(
                    experiment=exp,
                    dataset=ds,
                    model=model_name,
                    feature=feature,
                    split_dir=split_path,
                    metric=float(metric_val),
                    mtime=mtime,
                ))

    if not rows:
        raise RuntimeError("No metrics found after applying filters.")

    # --- keep only the latest experiment(s) overall (ties allowed) ---
    exp_to_latest: Dict[str, datetime] = {}
    for r in rows:
        seen = exp_to_latest.get(r.experiment)
        if seen is None or r.mtime > seen:
            exp_to_latest[r.experiment] = r.mtime
    # global newest timestamp across experiments; keep experiments that reach it
    newest_ts = max(exp_to_latest.values())
    newest_exps = {e for e, t in exp_to_latest.items() if t == newest_ts}
    return [r for r in rows if r.experiment in newest_exps]

def summarize_metrics(
    rows: List[MetricRow],
    expected_splits: Optional[int] = 10,
    strict_expected_splits: bool = True,
) -> pd.DataFrame:
    """
    Collapse per-run rows into one line per (Experiment, Dataset, Feature, Model).

    Output columns: the four group keys plus N (row count), Mean and Std of the
    metric; a NaN Std is reported as 0.0. The result is sorted by Dataset
    ascending, then Mean descending.

    Raises:
        ValueError: `rows` is empty, or — when `expected_splits` is not None and
            `strict_expected_splits` is true — some group has N != expected_splits.
    """
    if not rows:
        raise ValueError("Empty rows.")

    records = []
    for r in rows:
        records.append({
            "Experiment": r.experiment,
            "Dataset": r.dataset,
            "Feature": r.feature or "",
            "Model": r.model,
            "SplitPath": r.split_dir,
            "Metric": r.metric,
            "MTime": r.mtime,
        })
    frame = pd.DataFrame(records)

    group_keys = ["Experiment", "Dataset", "Feature", "Model"]
    per_group = frame.groupby(group_keys, dropna=False)["Metric"]
    out = per_group.agg(N="count", Mean="mean", Std="std").reset_index()
    out["Std"] = out["Std"].fillna(0.0)

    enforce_count = expected_splits is not None and strict_expected_splits
    if enforce_count:
        mismatched = out[out["N"] != int(expected_splits)]
        if not mismatched.empty:
            lines = "\n".join(
                f"- {row.Experiment} / {row.Dataset} / {row.Feature} / {row.Model}: N={row.N}"
                for row in mismatched.itertuples(index=False)
            )
            raise ValueError(
                f"Expected {expected_splits} splits per (Experiment, Dataset, Feature, Model). Mismatches:\n{lines}"
            )

    # best model first within each dataset
    ordered = out.sort_values(["Dataset", "Mean"], ascending=[True, False])
    return ordered.reset_index(drop=True)

def summarize_runs(
    runs_root: str,
    include_experiments=None,
    include_datasets=None,
    include_models=None,
    include_features=None,
    min_date: Optional[str] = None,
    metrics_file_candidates: Tuple[str, ...] = ("results.json", "metrics.json"),
    metric_keys: Tuple[str, ...] = ("f1_macro", "f1-macro", "macro_f1"),
    expected_splits: Optional[int] = 10,
    strict_expected_splits: bool = True,
) -> pd.DataFrame:
    """One-call convenience: scan `runs_root` with `collect_metrics`, then
    aggregate the rows with `summarize_metrics`.

    All arguments are forwarded unchanged to those two functions.
    """
    scan_options = dict(
        runs_root=runs_root,
        include_experiments=include_experiments,
        include_datasets=include_datasets,
        include_models=include_models,
        include_features=include_features,
        min_date=min_date,
        metrics_file_candidates=metrics_file_candidates,
        metric_keys=metric_keys,
    )
    collected = collect_metrics(**scan_options)
    summary = summarize_metrics(
        collected,
        expected_splits=expected_splits,
        strict_expected_splits=strict_expected_splits,
    )
    return summary


In [4]:
# Cell 3 — configure & run (edit these)
# NOTE(review): absolute, machine-specific path; adjust when running elsewhere.
runs_root = "/home/jovyan/shmelev/GENLINK/downstream_tasks/runs"

# Filters are case-insensitive substrings of folder/model/feature names. None means "no filter".
include_experiments = ["real_data_real_masks"]   # experiment folder names must contain one of these
# 'veritices' (sic) is the spelling used by the dataset folder on disk; do not correct it here.
include_datasets    = ["CR_real_masks_more_labeled_veritices_agreed"]           # or None for all
include_models      = ["GL_TAGConv_3l_512h_w_k3_gnorm"]  # or None for all
include_features    = ["graph_based+"]       # or ["graph_based", "graph_based+"] or None

# Date threshold: keep only runs whose metrics file was modified on/after this date (optional)
min_date = "2025-09-20"  # or None

# Expected number of splits per (Experiment, Dataset, Feature, Model)
expected_splits = 10
strict_expected_splits = True  # True: summarize_metrics raises ValueError if any group has N != expected_splits

df = summarize_runs(
    runs_root=runs_root,
    include_experiments=include_experiments,
    include_datasets=include_datasets,
    include_models=include_models,
    include_features=include_features,
    min_date=min_date,
    expected_splits=expected_splits,
    strict_expected_splits=strict_expected_splits,
)
df  # last expression of the cell, so Jupyter renders the frame


Unnamed: 0,Experiment,Dataset,Feature,Model,N,Mean,Std
0,real_data_real_masks,CR_real_masks_more_labeled_veritices_agreed,graph_based+,GL_TAGConv_3l_512h_w_k3_gnorm,10,0.713525,0.012252


In [10]:
# Configure & run for the 'real_data_no_mask' experiment (edit these)
# NOTE(review): absolute, machine-specific path; adjust when running elsewhere.
runs_root = "/home/jovyan/shmelev/GENLINK/downstream_tasks/runs"

# Filters are case-insensitive substrings of folder/model/feature names. None means "no filter".
include_experiments = ['real_data_no_mask']   # experiment folder names must contain one of these
include_datasets    = ["Western-Europe", 'Volga', 'Scandinavia', 'NC_graph_rel_eng', 'CR']           # or None for all
include_models      = None  # None = no model filter
include_features    = ["graph_based+"]       # or ["graph_based", "graph_based+"] or None

# Date threshold: keep only runs whose metrics file was modified on/after this date (optional)
min_date = "2025-09-20"  # or None

# Expected number of splits per (Experiment, Dataset, Feature, Model)
expected_splits = 10
strict_expected_splits = True  # True: summarize_metrics raises ValueError if any group has N != expected_splits

df = summarize_runs(
    runs_root=runs_root,
    include_experiments=include_experiments,
    include_datasets=include_datasets,
    include_models=include_models,
    include_features=include_features,
    min_date=min_date,
    expected_splits=expected_splits,
    strict_expected_splits=strict_expected_splits,
)
df  # last expression of the cell, so Jupyter renders the frame


Unnamed: 0,Experiment,Dataset,Feature,Model,N,Mean,Std
0,real_data_no_mask,CR,graph_based+,GL_SAGEConv_3l_512h_de,10,0.610937,0.018063
1,real_data_no_mask,CR,graph_based+,GL_SAGEConv_3l_128h_de,10,0.596161,0.018909
2,real_data_no_mask,CR,graph_based+,GL_SAGEConv_9l_512h_de,10,0.589665,0.02229
3,real_data_no_mask,CR,graph_based+,GL_SAGEConv_9l_128h_de,10,0.520383,0.028754
4,real_data_no_mask,NC_graph_rel_eng,graph_based+,GL_SAGEConv_3l_512h_de,10,0.887433,0.065912
5,real_data_no_mask,NC_graph_rel_eng,graph_based+,GL_SAGEConv_9l_512h_de,10,0.873915,0.049113
6,real_data_no_mask,NC_graph_rel_eng,graph_based+,GL_SAGEConv_3l_128h_de,10,0.862811,0.064416
7,real_data_no_mask,NC_graph_rel_eng,graph_based+,GL_SAGEConv_9l_128h_de,10,0.809657,0.056397
8,real_data_no_mask,Scandinavia,graph_based+,GL_SAGEConv_3l_512h_de,10,0.972509,0.011182
9,real_data_no_mask,Scandinavia,graph_based+,GL_SAGEConv_9l_512h_de,10,0.970584,0.013308


In [11]:
# Order datasets by their top Mean; within each dataset keep best-to-worst.
# Written as a single chain: the temporary `_best_of_ds` column lives only on an
# intermediate copy, so `df` is never left carrying it if this cell fails midway.
df = (
    df.assign(_best_of_ds=lambda frame: frame.groupby("Dataset")["Mean"].transform("max"))
      .sort_values(["_best_of_ds", "Dataset", "Mean"],
                   ascending=[False, True, False],
                   kind="mergesort")  # stable: preserves ties consistently
      .drop(columns="_best_of_ds")
      .reset_index(drop=True)
)
df

Unnamed: 0,Experiment,Dataset,Feature,Model,N,Mean,Std
0,real_data_no_mask,Volga,graph_based+,GL_SAGEConv_3l_128h_de,10,0.987093,0.023514
1,real_data_no_mask,Volga,graph_based+,GL_SAGEConv_3l_512h_de,10,0.984425,0.012918
2,real_data_no_mask,Volga,graph_based+,GL_SAGEConv_9l_512h_de,10,0.967313,0.028956
3,real_data_no_mask,Volga,graph_based+,GL_SAGEConv_9l_128h_de,10,0.910623,0.033099
4,real_data_no_mask,Scandinavia,graph_based+,GL_SAGEConv_3l_512h_de,10,0.972509,0.011182
5,real_data_no_mask,Scandinavia,graph_based+,GL_SAGEConv_9l_512h_de,10,0.970584,0.013308
6,real_data_no_mask,Scandinavia,graph_based+,GL_SAGEConv_3l_128h_de,10,0.969975,0.013755
7,real_data_no_mask,Scandinavia,graph_based+,GL_SAGEConv_9l_128h_de,10,0.966399,0.009589
8,real_data_no_mask,Western-Europe,graph_based+,GL_SAGEConv_3l_128h_de,10,0.946696,0.011057
9,real_data_no_mask,Western-Europe,graph_based+,GL_SAGEConv_3l_512h_de,10,0.944223,0.010138


In [12]:
# Configure & run for the 'real_data_mask_100' experiment (edit these)
# NOTE(review): absolute, machine-specific path; adjust when running elsewhere.
runs_root = "/home/jovyan/shmelev/GENLINK/downstream_tasks/runs"

# Filters are case-insensitive substrings of folder/model/feature names. None means "no filter".
include_experiments = ['real_data_mask_100']   # experiment folder names must contain one of these
include_datasets    = ['CR']                   # or None for all
include_models      = None                     # None = no model filter
include_features    = ["graph_based+"]         # or ["graph_based", "graph_based+"] or None

# Date threshold: keep only runs whose metrics file was modified on/after this date (optional)
min_date = "2025-09-20"  # or None

# Expected number of splits per (Experiment, Dataset, Feature, Model)
expected_splits = 10
strict_expected_splits = True  # True: summarize_metrics raises ValueError if any group has N != expected_splits

# Summarize, then order datasets by their top Mean; within each dataset keep best-to-worst.
# One chain: the temporary `_best_of_ds` column never ends up on `df`.
df = (
    summarize_runs(
        runs_root=runs_root,
        include_experiments=include_experiments,
        include_datasets=include_datasets,
        include_models=include_models,
        include_features=include_features,
        min_date=min_date,
        expected_splits=expected_splits,
        strict_expected_splits=strict_expected_splits,
    )
    .assign(_best_of_ds=lambda frame: frame.groupby("Dataset")["Mean"].transform("max"))
    .sort_values(["_best_of_ds", "Dataset", "Mean"],
                 ascending=[False, True, False],
                 kind="mergesort")  # stable: preserves ties consistently
    .drop(columns="_best_of_ds")
    .reset_index(drop=True)
)
df  # last expression of the cell, so Jupyter renders the frame


Unnamed: 0,Experiment,Dataset,Feature,Model,N,Mean,Std
0,real_data_mask_100,CR,graph_based+,GL_TAGConv_3l_512h_w_k3_gnorm,10,0.552789,0.0
1,real_data_mask_100,CR,graph_based+,GL_TAGConv_3l_512h_w_k3,10,0.543185,0.0
2,real_data_mask_100,CR,graph_based+,GL_GINConv_3l_512h,10,0.541865,0.011769
3,real_data_mask_100,CR,graph_based+,GL_GINConv_3l_128h,10,0.52617,0.014034
4,real_data_mask_100,CR,graph_based+,GL_SAGEConv_9l_512h_de,10,0.51142,0.004501
5,real_data_mask_100,CR,graph_based+,GL_SAGEConv_3l_512h,10,0.508147,0.0
6,real_data_mask_100,CR,graph_based+,GL_SAGEConv_9l_512h,10,0.505302,0.0
7,real_data_mask_100,CR,graph_based+,GL_SAGEConv_3l_512h_de,10,0.499708,0.004578
8,real_data_mask_100,CR,graph_based+,GL_GATConv_3l_512h,10,0.496585,0.004252
9,real_data_mask_100,CR,graph_based+,GL_TAGConv_3l_128h_w_k3,10,0.486298,0.0


In [13]:
# Configure & run for the 'real_data_mask_75' experiment (edit these)
# NOTE(review): absolute, machine-specific path; adjust when running elsewhere.
runs_root = "/home/jovyan/shmelev/GENLINK/downstream_tasks/runs"

# Filters are case-insensitive substrings of folder/model/feature names. None means "no filter".
include_experiments = ['real_data_mask_75']    # experiment folder names must contain one of these
include_datasets    = ['CR']                   # or None for all
include_models      = None                     # None = no model filter
include_features    = ["graph_based+"]         # or ["graph_based", "graph_based+"] or None

# Date threshold: keep only runs whose metrics file was modified on/after this date (optional)
min_date = "2025-09-20"  # or None

# Expected number of splits per (Experiment, Dataset, Feature, Model)
expected_splits = 10
strict_expected_splits = True  # True: summarize_metrics raises ValueError if any group has N != expected_splits

# Summarize, then order datasets by their top Mean; within each dataset keep best-to-worst.
# One chain: the temporary `_best_of_ds` column never ends up on `df`.
df = (
    summarize_runs(
        runs_root=runs_root,
        include_experiments=include_experiments,
        include_datasets=include_datasets,
        include_models=include_models,
        include_features=include_features,
        min_date=min_date,
        expected_splits=expected_splits,
        strict_expected_splits=strict_expected_splits,
    )
    .assign(_best_of_ds=lambda frame: frame.groupby("Dataset")["Mean"].transform("max"))
    .sort_values(["_best_of_ds", "Dataset", "Mean"],
                 ascending=[False, True, False],
                 kind="mergesort")  # stable: preserves ties consistently
    .drop(columns="_best_of_ds")
    .reset_index(drop=True)
)
df  # last expression of the cell, so Jupyter renders the frame


Unnamed: 0,Experiment,Dataset,Feature,Model,N,Mean,Std
0,real_data_mask_75,CR,graph_based+,GL_TAGConv_3l_512h_w_k3_gnorm,10,0.530633,0.012975
1,real_data_mask_75,CR,graph_based+,GL_GINConv_3l_512h,10,0.530036,0.012289
2,real_data_mask_75,CR,graph_based+,GL_TAGConv_3l_512h_w_k3,10,0.52072,0.006738
3,real_data_mask_75,CR,graph_based+,GL_GINConv_3l_128h,10,0.519649,0.013341
4,real_data_mask_75,CR,graph_based+,GL_SAGEConv_3l_512h,10,0.495889,0.012655
5,real_data_mask_75,CR,graph_based+,GL_SAGEConv_3l_512h_de,10,0.495259,0.007408
6,real_data_mask_75,CR,graph_based+,GL_SAGEConv_9l_512h,10,0.488033,0.011034
7,real_data_mask_75,CR,graph_based+,GL_SAGEConv_9l_512h_de,10,0.487269,0.009297
8,real_data_mask_75,CR,graph_based+,GL_GATConv_3l_512h,10,0.483406,0.012531
9,real_data_mask_75,CR,graph_based+,GL_SAGEConv_3l_128h,10,0.474731,0.00663


In [14]:
# Configure & run for the 'real_data_mask_50' experiment (edit these)
# NOTE(review): absolute, machine-specific path; adjust when running elsewhere.
runs_root = "/home/jovyan/shmelev/GENLINK/downstream_tasks/runs"

# Filters are case-insensitive substrings of folder/model/feature names. None means "no filter".
include_experiments = ['real_data_mask_50']    # experiment folder names must contain one of these
include_datasets    = ['CR']                   # or None for all
include_models      = None                     # None = no model filter
include_features    = ["graph_based+"]         # or ["graph_based", "graph_based+"] or None

# Date threshold: keep only runs whose metrics file was modified on/after this date (optional)
min_date = "2025-09-20"  # or None

# Expected number of splits per (Experiment, Dataset, Feature, Model)
expected_splits = 10
strict_expected_splits = True  # True: summarize_metrics raises ValueError if any group has N != expected_splits

# Summarize, then order datasets by their top Mean; within each dataset keep best-to-worst.
# One chain: the temporary `_best_of_ds` column never ends up on `df`.
df = (
    summarize_runs(
        runs_root=runs_root,
        include_experiments=include_experiments,
        include_datasets=include_datasets,
        include_models=include_models,
        include_features=include_features,
        min_date=min_date,
        expected_splits=expected_splits,
        strict_expected_splits=strict_expected_splits,
    )
    .assign(_best_of_ds=lambda frame: frame.groupby("Dataset")["Mean"].transform("max"))
    .sort_values(["_best_of_ds", "Dataset", "Mean"],
                 ascending=[False, True, False],
                 kind="mergesort")  # stable: preserves ties consistently
    .drop(columns="_best_of_ds")
    .reset_index(drop=True)
)
df  # last expression of the cell, so Jupyter renders the frame


Unnamed: 0,Experiment,Dataset,Feature,Model,N,Mean,Std
0,real_data_mask_50,CR,graph_based+,GL_GINConv_3l_512h,10,0.507442,0.007653
1,real_data_mask_50,CR,graph_based+,GL_TAGConv_3l_512h_w_k3,10,0.504588,0.012771
2,real_data_mask_50,CR,graph_based+,GL_TAGConv_3l_512h_w_k3_gnorm,10,0.501404,0.017622
3,real_data_mask_50,CR,graph_based+,GL_GINConv_3l_128h,10,0.498459,0.02148
4,real_data_mask_50,CR,graph_based+,GL_SAGEConv_9l_512h,10,0.482474,0.008558
5,real_data_mask_50,CR,graph_based+,GL_SAGEConv_3l_512h,10,0.479759,0.01083
6,real_data_mask_50,CR,graph_based+,GL_SAGEConv_3l_512h_de,10,0.477524,0.009714
7,real_data_mask_50,CR,graph_based+,GL_SAGEConv_9l_512h_de,10,0.477059,0.010295
8,real_data_mask_50,CR,graph_based+,GL_SAGEConv_3l_128h,10,0.467928,0.007742
9,real_data_mask_50,CR,graph_based+,GL_SAGEConv_3l_128h_de,10,0.466459,0.008457


In [15]:
# Configure & run for the 'real_data_mask_25' experiment (edit these)
# NOTE(review): absolute, machine-specific path; adjust when running elsewhere.
runs_root = "/home/jovyan/shmelev/GENLINK/downstream_tasks/runs"

# Filters are case-insensitive substrings of folder/model/feature names. None means "no filter".
include_experiments = ['real_data_mask_25']    # experiment folder names must contain one of these
include_datasets    = ['CR']                   # or None for all
include_models      = None                     # None = no model filter
include_features    = ["graph_based+"]         # or ["graph_based", "graph_based+"] or None

# Date threshold: keep only runs whose metrics file was modified on/after this date (optional)
min_date = "2025-09-20"  # or None

# Expected number of splits per (Experiment, Dataset, Feature, Model)
expected_splits = 10
strict_expected_splits = True  # True: summarize_metrics raises ValueError if any group has N != expected_splits

# Summarize, then order datasets by their top Mean; within each dataset keep best-to-worst.
# One chain: the temporary `_best_of_ds` column never ends up on `df`.
df = (
    summarize_runs(
        runs_root=runs_root,
        include_experiments=include_experiments,
        include_datasets=include_datasets,
        include_models=include_models,
        include_features=include_features,
        min_date=min_date,
        expected_splits=expected_splits,
        strict_expected_splits=strict_expected_splits,
    )
    .assign(_best_of_ds=lambda frame: frame.groupby("Dataset")["Mean"].transform("max"))
    .sort_values(["_best_of_ds", "Dataset", "Mean"],
                 ascending=[False, True, False],
                 kind="mergesort")  # stable: preserves ties consistently
    .drop(columns="_best_of_ds")
    .reset_index(drop=True)
)
df  # last expression of the cell, so Jupyter renders the frame


Unnamed: 0,Experiment,Dataset,Feature,Model,N,Mean,Std
0,real_data_mask_25,CR,graph_based+,GL_GINConv_3l_512h,10,0.479879,0.014709
1,real_data_mask_25,CR,graph_based+,GL_TAGConv_3l_512h_w_k3_gnorm,10,0.471472,0.012779
2,real_data_mask_25,CR,graph_based+,GL_TAGConv_3l_512h_w_k3,10,0.470564,0.012608
3,real_data_mask_25,CR,graph_based+,GL_GINConv_3l_128h,10,0.46425,0.02126
4,real_data_mask_25,CR,graph_based+,GL_SAGEConv_3l_512h,10,0.463623,0.01174
5,real_data_mask_25,CR,graph_based+,GL_SAGEConv_3l_512h_de,10,0.460409,0.010973
6,real_data_mask_25,CR,graph_based+,GL_SAGEConv_9l_512h,10,0.451494,0.013049
7,real_data_mask_25,CR,graph_based+,GL_SAGEConv_3l_128h,10,0.448032,0.010266
8,real_data_mask_25,CR,graph_based+,GL_SAGEConv_3l_128h_de,10,0.447689,0.010267
9,real_data_mask_25,CR,graph_based+,GL_TAGConv_3l_128h_w_k3,10,0.444241,0.010216


In [16]:
# Configure & run for the 'real_data_mask_0' experiment (edit these)
# NOTE(review): absolute, machine-specific path; adjust when running elsewhere.
runs_root = "/home/jovyan/shmelev/GENLINK/downstream_tasks/runs"

# Filters are case-insensitive substrings of folder/model/feature names. None means "no filter".
include_experiments = ['real_data_mask_0']     # experiment folder names must contain one of these
include_datasets    = ['CR']                   # or None for all
include_models      = None                     # None = no model filter
include_features    = ["graph_based+"]         # or ["graph_based", "graph_based+"] or None

# Date threshold: keep only runs whose metrics file was modified on/after this date (optional)
min_date = "2025-09-20"  # or None

# Expected number of splits per (Experiment, Dataset, Feature, Model)
expected_splits = 10
strict_expected_splits = True  # True: summarize_metrics raises ValueError if any group has N != expected_splits

# Summarize, then order datasets by their top Mean; within each dataset keep best-to-worst.
# One chain: the temporary `_best_of_ds` column never ends up on `df`.
df = (
    summarize_runs(
        runs_root=runs_root,
        include_experiments=include_experiments,
        include_datasets=include_datasets,
        include_models=include_models,
        include_features=include_features,
        min_date=min_date,
        expected_splits=expected_splits,
        strict_expected_splits=strict_expected_splits,
    )
    .assign(_best_of_ds=lambda frame: frame.groupby("Dataset")["Mean"].transform("max"))
    .sort_values(["_best_of_ds", "Dataset", "Mean"],
                 ascending=[False, True, False],
                 kind="mergesort")  # stable: preserves ties consistently
    .drop(columns="_best_of_ds")
    .reset_index(drop=True)
)
df  # last expression of the cell, so Jupyter renders the frame


Unnamed: 0,Experiment,Dataset,Feature,Model,N,Mean,Std
0,real_data_mask_0,CR,graph_based+,GL_MLP_3l_512h,10,0.442043,0.0
1,real_data_mask_0,CR,graph_based+,GL_SAGEConv_3l_512h_de,10,0.439441,0.0
2,real_data_mask_0,CR,graph_based+,GL_SAGEConv_3l_512h,10,0.434541,0.0
3,real_data_mask_0,CR,graph_based+,GL_TAGConv_3l_512h_w_k3_gnorm,10,0.413659,0.0
4,real_data_mask_0,CR,graph_based+,GL_SAGEConv_9l_512h,10,0.411341,0.0
5,real_data_mask_0,CR,graph_based+,GL_TAGConv_3l_512h_w_k3,10,0.406897,0.0
6,real_data_mask_0,CR,graph_based+,GL_SAGEConv_3l_128h_de,10,0.405692,0.0
7,real_data_mask_0,CR,graph_based+,GL_GINConv_3l_512h,10,0.40499,0.006435
8,real_data_mask_0,CR,graph_based+,GL_SAGEConv_3l_128h,10,0.404571,0.0
9,real_data_mask_0,CR,graph_based+,GL_GATConv_3l_512h,10,0.400482,0.002824
