# BNLearn Results

In [17]:
import warnings
warnings.filterwarnings('ignore')

from pathlib import Path
import sys
import numpy as np
import pandas as pd
from IPython.display import display

# Show long result tables in full instead of truncating them.
pd.set_option('display.max_rows', 2000)

# The repository root is taken to be the parent of the current working
# directory, so this only resolves correctly when the kernel's cwd is a direct
# child of the repo root.
REPO_ROOT = Path.cwd().resolve().parent
# Make the repo root and its `utils` folder importable. These inserts must run
# before the `plotting` import below.
sys.path.insert(0, str(REPO_ROOT))
sys.path.insert(0, str(REPO_ROOT / 'utils'))

# Star import: the plotting helpers and colour constants used further down
# (e.g. main_blue, double_bar_chart_plotly, plot_runtime) are not defined in
# this notebook and presumably come from here -- verify against utils/plotting.
from plotting import *

# Forwarded to the plotting helpers as `save_figs=`.
save_figs = False
# Not read by any cell visible here (the plot calls pass debug=False literally).
debug = False

RESULTS_DIR = REPO_ROOT / 'results'

# Result-set versions scanned by the run-counting cell
VERSIONS = ['bnlearn_big_fgs_nt', 'bnlearn_big_rnd_mpc','bnlearn_child_base', 'bnlearn_child_base2', 'bnlearn_50rep_mpc',
            'bnlearn_50rep_abapc']
# CSV with ABAPC and ABAPC-LLM single-run results to aggregate
CSV_ABAPC = RESULTS_DIR / 'ABAPC-LLM/merged_bnlearn-desc.csv'
# CSV with LLM-BFS results (only its rows per dataset are counted in this notebook)
CSV_BFS = RESULTS_DIR / 'causal-bfs-bnlearn-results.csv'

# Display labels and styles (aligned with causenet notebook)
# names_dict: internal method key -> display label. Key order is significant
# because inverse_names is derived from it.
names_dict = {
    'random': 'Random',
    'pc': 'PC',
    'pc_max': 'Max-PC',
    'fgs': 'FGS',
    'spc': 'Shapley-PC',
    'mpc': 'MPC',
    'cpc': 'CPC',
    'abapc_orig': 'ABAPC (Orig)',
    'abapc': 'ABAPC (Ours)',
    'abapc_llm': 'ABAPC-LLM',
    'abapc_llm_d': 'ABAPC-LLM-d',
    'cam': 'CAM',
    'nt': 'NOTEARS-MLP',
    'mcsl': 'MCSL-MLP',
    'ges': 'GES',
    'aspcr': 'ASPCR',
    'llm_bfs': 'LLM-BFS',
}

# symbols_dict: method key -> marker symbol name, handed to plot_runtime.
# The names look like Plotly marker symbols -- confirm against utils/plotting.
# NOTE(review): 'random' and 'nt' both map to 'x'.
symbols_dict = {
    'random': 'x',
    'pc': 'circle',
    'pc_max': 'circle-open',
    'fgs': 'circle-open-dot',
    'spc': 'hexagon2-dot',
    'mpc': 'diamond-dot',
    'cpc': 'square-dot',
    'abapc_orig': 'triangle-down-dot',
    'abapc': 'triangle-up-dot',
    'abapc_llm': 'triangle-down-open',
    'abapc_llm_d': 'triangle-left-open',
    'cam': 'star',
    'nt': 'x',
    'mcsl': 'pentagon-dot',
    'ges': 'circle-cross',
    'aspcr': 'diamond-open',
    'llm_bfs': 'star-diamond-dot',
}

# colors_dict: method key -> colour. The main_*/sec_* names are not defined in
# this notebook; presumably they come from `from plotting import *`.
# NOTE(review): some keys share a colour ('cpc'/'abapc_orig', 'abapc'/'mcsl',
# 'pc_max'/'nt', 'abapc_llm'/'aspcr'); such pairs cannot be told apart by
# colour if they are ever plotted together.
colors_dict = {
    'random': '#7f7f7f',
    'pc': main_blue,
    'pc_max': sec_blue,
    'fgs': sec_orange,
    'spc': sec_green,
    'mpc': main_green,
    'cpc': '#c678dd',
    'abapc_orig': '#c678dd',
    'abapc': '#bcbd22',
    'abapc_llm': sec_purple,
    'abapc_llm_d': main_purple,
    'cam': main_orange,
    'nt': sec_blue,
    'mcsl': '#bcbd22',
    'ges': '#17becf',
    'aspcr': sec_purple,
    'llm_bfs': "#e377c2", 
}


# For labeling datasets (|V|, |E|). Use constants as in original bnlearn notebook.
# Keys are lower-case dataset names; values are node counts (dags_nodes_map)
# and arc counts (dags_arcs_map). The arc count is also the divisor used for
# the normalised p_* metrics.
dags_nodes_map = {'asia':8, 'cancer':5, 'earthquake':5, 'sachs':11, 'survey':6, 'alarm':37, 'child':20, 'insurance':27, 'hailfinder':56, 'hepar2':70}
dags_arcs_map = {'asia':8, 'cancer':4, 'earthquake':4, 'sachs':17, 'survey':6, 'alarm':46, 'child':25, 'insurance':52, 'hailfinder':66, 'hepar2':123}

# Display labels in the order methods should appear in plots; the plotting
# cells keep only those actually present in the loaded results.
default_methods = ['Random', 'FGS', 'NOTEARS-MLP', 'MPC',  'ABAPC (Ours)', 'LLM-BFS', 'ABAPC-LLM', 'ABAPC-LLM-d']
# Reverse lookup: display label -> method key.
inverse_names = {v: k for k, v in names_dict.items()}


In [18]:
# Optional: per-run configuration with method selection (mirrors causenet notebook)
# MODEL_ALIASES is applied to the 'model' column by load_results (old -> new label).
MODEL_ALIASES = {
    # 'ABAPC (Ours)': 'ABAPC (Opt)'
}
EXTRA_MODELS = {}

# Metrics schema used when loading CSV aggregates: the two identifier columns
# followed by a <metric>_mean / <metric>_std pair for every metric.
COLUMNS_MAP = {
    'dag': ['dataset', 'model'] + [
        f'{metric}_{stat}'
        for metric in ('elapsed', 'nnz', 'fdr', 'tpr', 'fpr',
                       'precision', 'recall', 'F1', 'shd', 'sid')
        for stat in ('mean', 'std')
    ],
}

# Mapping from desired metric names to CSV column names: runtime lives in
# 'time', every other metric in a 'dag_'-prefixed column.
CSV_METRIC_MAP = {
    'dag': {
        'elapsed': 'time',
        **{
            metric: f'dag_{metric}'
            for metric in ('nnz', 'fdr', 'tpr', 'fpr',
                           'precision', 'recall', 'F1', 'shd', 'sid')
        },
    },
}

def load_npy(version: str, kind: str):
    """Read the aggregated results array stored for ``version``.

    Loads ``stored_results_<version>.npy`` from ``RESULTS_DIR`` into a
    DataFrame with ``dataset``, ``model`` and one ``*_mean``/``*_std`` column
    pair per metric. ``kind`` is accepted but not used by this definition.

    Returns an empty frame with the same columns when the file is missing.
    """
    metric_names = ('elapsed', 'nnz', 'fdr', 'tpr', 'fpr',
                    'precision', 'recall', 'F1', 'shd', 'SID')
    cols = ['dataset', 'model']
    for metric in metric_names:
        cols += [f'{metric}_mean', f'{metric}_std']

    path = RESULTS_DIR / f'stored_results_{version}.npy'
    if not path.exists():
        return pd.DataFrame(columns=cols)

    # NOTE(review): allow_pickle=True unpickles arbitrary objects; only point
    # this at result files you produced yourself.
    frame = pd.DataFrame(np.load(path, allow_pickle=True), columns=cols)
    for key in ('dataset', 'model'):
        frame[key] = frame[key].astype(str)
    return frame

def resolve_csv_path(spec, base_dir):
    """Return the CSV location described by a run spec.

    ``spec['csv_path']`` wins when present; otherwise ``spec['csv_name']`` is
    used, defaulting to ``'agg.csv'``. Relative entries are resolved against
    ``base_dir``; absolute entries are returned unchanged.
    """
    entry = spec.get('csv_path')
    if entry is None:
        entry = spec.get('csv_name', 'agg.csv')
    candidate = Path(entry)
    return candidate if candidate.is_absolute() else base_dir / candidate

def load_csv_results(kind, spec, columns, base_dir, csv_path=None):
    """Aggregate a CSV of single-run results into mean/std per dataset and model.

    Parameters
    ----------
    kind : key into ``CSV_METRIC_MAP`` selecting the metric -> CSV column map.
    spec : run spec dict. Optional keys read here: ``csv_dataset_column``,
        ``csv_model_column``, ``csv_model_constant``, ``csv_model_mapping``
        and ``csv_metric_map`` (overrides for the metric map).
    columns : output columns, in order; any that cannot be computed are NaN.
    base_dir : directory used to resolve the CSV when ``csv_path`` is not given.
    csv_path : explicit CSV location (optional).

    Raises
    ------
    FileNotFoundError : the CSV does not exist.
    ValueError : no model column can be determined, or a mapped metric
        column is absent from the CSV.
    """
    csv_path = csv_path or resolve_csv_path(spec, base_dir)
    if not csv_path.exists():
        raise FileNotFoundError(f'Missing CSV results file: {csv_path}')
    raw = pd.read_csv(csv_path)

    # --- normalise the dataset / model identifier columns ---
    dataset_col = spec.get('csv_dataset_column', 'dataset')
    model_constant = spec.get('csv_model_constant')
    model_col = spec.get('csv_model_column', 'model')
    raw = raw.rename(columns={dataset_col: 'dataset'})
    if model_constant is not None:
        # Every row belongs to one fixed model.
        raw['model'] = str(model_constant)
    elif model_col in raw.columns:
        raw = raw.rename(columns={model_col: 'model'})
    elif 'impl' in raw.columns:
        # Neither a model column nor a constant: fall back to 'impl'.
        raw = raw.rename(columns={'impl': 'model'})
    else:
        raise ValueError(f'CSV lacks a model column and no csv_model_constant provided: {csv_path}')

    model_mapping = spec.get('csv_model_mapping')
    if model_mapping:
        raw['model'] = raw['model'].replace(model_mapping)
    raw['dataset'] = raw['dataset'].astype(str)
    raw['model'] = raw['model'].astype(str)

    # --- metric name -> CSV column, with per-spec overrides ---
    metrics_map = CSV_METRIC_MAP[kind].copy()
    overrides = spec.get('csv_metric_map')
    if overrides:
        metrics_map.update(overrides)
    missing = [src for src in metrics_map.values() if src not in raw.columns]
    if missing:
        raise ValueError(f'Missing columns in CSV results ({csv_path}): {missing}')

    # Invert the map so CSV column names can be renamed to metric names.
    rename_map = {src: name for name, src in metrics_map.items()}
    metric_names = list(rename_map.values())
    selected = raw[['dataset', 'model'] + list(rename_map.keys())].rename(columns=rename_map)

    agg_spec = {name: ['mean', 'std'] for name in metric_names}
    grouped = selected.groupby(['dataset', 'model'], as_index=False).agg(agg_spec)

    # The aggregation yields (metric, stat) column tuples; flatten them to
    # 'metric_stat', leaving the key columns (empty stat) as plain names.
    grouped.columns = [
        (f'{col[0]}_{col[1]}' if col[1] != '' else col[0]) if isinstance(col, tuple) else col
        for col in grouped.columns
    ]

    # Guarantee every aggregate and every requested column exists.
    expected = [f'{name}_{stat}' for name in metric_names for stat in ('mean', 'std')]
    for col in [*expected, *columns]:
        if col not in grouped.columns:
            grouped[col] = np.nan
    return grouped[columns]

def load_results(kind, run_specs=None):
    """Load and stack the aggregated results of every run spec.

    ``kind`` selects the schema in ``COLUMNS_MAP``. ``run_specs`` defaults to
    ``RUN_CONFIGS`` (an empty list also falls back to it). Each spec may set
    ``results_dir``, ``include_methods``, ``label``, ``format`` (``'csv'`` or
    anything else for ``.npy``) and ``version``. Specs whose results cannot be
    located are skipped with a printed warning.

    Returns one DataFrame with an added ``run_label`` column; when nothing
    could be loaded, an empty frame with the schema columns plus ``run_label``.
    """
    columns = COLUMNS_MAP[kind]
    frames = []
    for spec in (run_specs or RUN_CONFIGS):
        base_dir = Path(spec.get('results_dir', RESULTS_DIR))
        wanted_models = spec.get('include_methods')
        label = spec.get('label', spec.get('version', 'run'))
        is_csv = str(spec.get('format', 'npy')).lower() == 'csv'

        if is_csv:
            csv_path = resolve_csv_path(spec, base_dir)
            if not csv_path.exists():
                print(f"[WARN] Missing results file: {csv_path}")
                continue
            df = load_csv_results(kind, spec, columns, base_dir, csv_path=csv_path)
        else:
            version = spec.get('version')
            if not version:
                print('[WARN] Missing version in run spec; skipping')
                continue
            try:
                df = load_npy(version, kind)
            except FileNotFoundError:
                print(f"[WARN] Missing results file for version: {version}")
                continue

        # Harmonise column names ('sid' -> 'SID' wherever it occurs) and models.
        df = df.rename(columns={col: col.replace('sid', 'SID') for col in df.columns})
        if 'model' in df:
            df['model'] = df['model'].replace(MODEL_ALIASES)
        if wanted_models:
            df = df[df['model'].isin(wanted_models)].reset_index(drop=True)
        df['run_label'] = label
        frames.append(df)

    if frames:
        return pd.concat(frames, ignore_index=True)
    return pd.DataFrame(columns=columns + ['run_label'])

# Example RUN_CONFIGS; adjust to your runs
# One spec per stored result set: (version, methods taken from it, run label).
# NOTE(review): the two 'bnlearn_child_base*' entries reuse the 'rnd_mpc'
# label although they also contribute FGS / NOTEARS-MLP rows -- confirm.
RUN_CONFIGS = [
    {
        'results_dir': RESULTS_DIR,
        'version': version,
        'include_methods': list(selected),
        'label': label,
    }
    for version, selected, label in (
        ('bnlearn_big_rnd_mpc', ('Random',), 'rnd_mpc'),
        ('bnlearn_big_fgs_nt', ('FGS', 'NOTEARS-MLP'), 'fgs_nt'),
        ('bnlearn_child_base', ('Random', 'FGS', 'NOTEARS-MLP'), 'rnd_mpc'),
        ('bnlearn_child_base2', ('Random', 'FGS', 'NOTEARS-MLP'), 'rnd_mpc'),
        ('bnlearn_50rep_mpc', ('MPC',), 'mpc'),
        ('bnlearn_50rep_abapc', ('ABAPC (Ours)',), 'abapc'),
    )
]


In [19]:
datasets_to_plot = {'cancer', 'earthquake', 'asia', 'survey', 'sachs', 'child'}

# Merge aggregates using RUN_CONFIGS (method-selectable)
dag_all = load_results('dag')

# Keep only the datasets of interest (case-insensitive match)
dag_all = dag_all.loc[dag_all['dataset'].str.lower().isin(datasets_to_plot)].copy()

# Add node/edge counts for label/normalisation; make sure the fallback
# columns exist so the fillna() calls below always have something to read.
for count_col in ('num_edges', 'num_nodes'):
    if count_col not in dag_all.columns:
        dag_all[count_col] = np.nan

# Bare lower-case dataset name (any '<br>...' label suffix stripped) is the
# lookup key; known constants win, stored counts fill the gaps.
base_names = dag_all['dataset'].astype(str).str.replace('<br>.*','', regex=True).str.lower()
dag_all['n_edges'] = base_names.map(dags_arcs_map).fillna(dag_all['num_edges']).astype(float)
dag_all['n_nodes'] = base_names.map(dags_nodes_map).fillna(dag_all['num_nodes']).astype(float)

# Nice dataset label: 'NAME<br> |V|=<nodes>, |E|=<edges>' (unknown counts show as 0)
ds_upper = base_names.str.upper()
nodes_txt = dag_all['n_nodes'].fillna(0).astype(int).astype(str)
edges_txt = dag_all['n_edges'].fillna(0).astype(int).astype(str)
dag_all['dataset'] = ds_upper + '<br> |V|=' + nodes_txt + ', |E|=' + edges_txt

# Compute normalised metrics
def add_normals(df, sid_cols):
    """Add edge-normalised ``p_*`` columns to ``df`` in place and return it.

    For ``shd`` and every name in ``sid_cols``, when ``<name>_mean`` exists
    the columns ``p_<name>_mean`` and ``p_<name>_std`` are set to the mean
    and std divided by ``n_edges``. Without an ``n_edges`` column nothing is
    added.
    """
    if 'n_edges' not in df:
        return df
    edges = df['n_edges'].astype(float)
    for metric in ['shd', *sid_cols]:
        mean_col, std_col = f'{metric}_mean', f'{metric}_std'
        if mean_col not in df:
            continue
        df[f'p_{metric}_mean'] = df[mean_col].astype(float) / edges
        df[f'p_{metric}_std'] = df[std_col].astype(float) / edges
    return df

# Edge-normalised SHD / SID columns (p_shd_*, p_SID_*) for the DAG results
dag_all = add_normals(dag_all, sid_cols=['SID'])

# Alphabetical list of the models that survived loading and filtering
if len(dag_all) > 0:
    available_models_dag = sorted(dag_all['model'].unique())
else:
    available_models_dag = []
print('DAG models:', available_models_dag)


DAG models: ['ABAPC (Ours)', 'FGS', 'MPC', 'NOTEARS-MLP', 'Random']


In [20]:
# Count runs across VERSIONS using progress CSVs if present, else parse logs.
# Also count ABAPC/LLM-BFS runs from CSVs, splitting ABAPC by impl.

from pathlib import Path
import pandas as pd
import numpy as np
import re

# Uses existing: RESULTS_DIR (Path), VERSIONS (list[str]), names_dict, CSV_ABAPC, CSV_BFS
# Threshold for the incomplete-run report at the end of this cell; while it is
# None that report is skipped entirely.
expected_n_runs = None  # set to int to flag incomplete, e.g., 50

def count_progress_dir(progress_dir: Path):
    """Count completed runs from the progress CSVs in ``progress_dir``.

    Only files named ``<dataset>__<method>_<dag|cpdag>.csv`` are considered.
    The number of runs in a file is its line count minus one (the first line
    is treated as a header); an unreadable file counts as 0.

    Returns a DataFrame with columns ``dataset``, ``method_key``, ``model``,
    ``dag_runs``, ``cpdag_runs`` and ``runs``, sorted by dataset and model.
    ``runs`` is the smaller of the two counts when both are non-zero,
    otherwise the larger. The frame is empty, but still carries these
    columns, when the directory is missing or holds no matching file.
    """
    out_cols = ['dataset', 'method_key', 'model', 'dag_runs', 'cpdag_runs', 'runs']
    if not progress_dir.exists():
        return pd.DataFrame(columns=out_cols)

    rx = re.compile(r'^(?P<dataset>.+?)__+(?P<method>.+?)_(?P<kind>dag|cpdag)\.csv$')
    counts = {'dag': {}, 'cpdag': {}}
    for p in progress_dir.glob('*.csv'):
        m = rx.match(p.name)
        if not m:
            continue
        try:
            # Context manager so the handle is closed promptly.
            with open(p, 'r', encoding='utf-8', errors='ignore') as fh:
                n_lines = sum(1 for _ in fh)
        except OSError:
            # Best effort: an unreadable file contributes no runs.
            n_lines = 0
        key = (m.group('dataset'), m.group('method'))
        counts[m.group('kind')][key] = max(0, n_lines - 1)

    rows = []
    for dataset, method in sorted(set(counts['dag']) | set(counts['cpdag'])):
        dag_n = counts['dag'].get((dataset, method), 0)
        cpdag_n = counts['cpdag'].get((dataset, method), 0)
        runs = min(dag_n, cpdag_n) if (dag_n and cpdag_n) else max(dag_n, cpdag_n)
        rows.append({'dataset': dataset, 'method_key': method,
                     'model': names_dict.get(method, method),
                     'dag_runs': dag_n, 'cpdag_runs': cpdag_n, 'runs': runs})

    # Passing the columns explicitly keeps the schema when `rows` is empty;
    # without it sort_values() raised KeyError for a directory that exists
    # but contains no matching progress file.
    return (pd.DataFrame(rows, columns=out_cols)
              .sort_values(['dataset', 'model'])
              .reset_index(drop=True))

def count_from_log(log_path: Path):
    """Count runs per dataset/model by scanning a results log.

    Counts lines like: {'dataset': 'child', 'model': 'FGS', 'elapsed': ... , ...}
    The log is expected to hold two such lines per run (DAG and CPDAG), so
    runs = floor(count / 2).

    Returns a DataFrame with columns ``dataset``, ``method_key``, ``model``
    and ``runs``. ``method_key`` is the ``names_dict`` key whose label equals
    the logged model name (the logged name itself when there is none). The
    frame is empty, but keeps these columns, when the log is missing or has
    no matching line.
    """
    out_cols = ['dataset', 'method_key', 'model', 'runs']
    if not log_path.exists():
        return pd.DataFrame(columns=out_cols)

    pat = re.compile(r"""\{[^}]*'dataset'\s*:\s*'(?P<dataset>[^']+)'.*?'model'\s*:\s*'(?P<model>[^']+)'.*?'elapsed'""")
    counts = {}
    with open(log_path, 'r', encoding='utf-8', errors='ignore') as f:
        for line in f:
            m = pat.search(line)
            if not m:
                continue
            key = (m.group('dataset'), m.group('model'))
            counts[key] = counts.get(key, 0) + 1

    if not counts:
        # Keep the schema: a column-less frame breaks the later sort/pivot.
        return pd.DataFrame(columns=out_cols)

    label_to_key = {label: key for key, label in names_dict.items()}
    rows = [
        {'dataset': dataset,
         'method_key': label_to_key.get(model_pretty, model_pretty),
         'model': model_pretty,
         'runs': line_count // 2}  # two metric logs per run
        for (dataset, model_pretty), line_count in counts.items()
    ]
    return pd.DataFrame(rows, columns=out_cols)

def count_csv_abapc_split(csv_path: Path):
    """Count rows per dataset in the ABAPC results CSV, split by implementation.

    With an ``impl`` column, 'org' rows are counted as method 'abapc' and
    'new' rows as 'abapc_llm'; any other value is counted as 'abapc'.
    Without one, every row is counted as 'abapc'. The dataset column is the
    first of dataset/Dataset/data/graph/name present in the CSV.

    Returns a DataFrame with columns ``dataset``, ``method_key``, ``model``
    and ``runs``; it is empty when the file or a dataset column is missing.
    """
    out_cols = ['dataset', 'method_key', 'model', 'runs']
    if not csv_path.exists():
        return pd.DataFrame(columns=out_cols)

    df = pd.read_csv(csv_path)
    # Guess dataset column
    dcol = next((c for c in ['dataset', 'Dataset', 'data', 'graph', 'name'] if c in df.columns), None)
    if dcol is None:
        return pd.DataFrame(columns=out_cols)

    if 'impl' not in df.columns:
        # fallback: count all as abapc
        tally = df.groupby(df[dcol].astype(str)).size().rename('runs').reset_index()
        tally.columns = ['dataset', 'runs']
        tally['method_key'] = 'abapc'
        tally['model'] = names_dict.get('abapc', 'abapc')
        return tally[out_cols]

    df['_method_key'] = df['impl'].map({'org': 'abapc', 'new': 'abapc_llm'}).fillna('abapc')
    df['_model'] = df['_method_key'].map(lambda k: names_dict.get(k, k))
    group_keys = [df[dcol].astype(str), '_method_key', '_model']
    tally = df.groupby(group_keys).size().rename('runs').reset_index()
    tally.columns = out_cols
    return tally

def count_csv_llm_bfs(csv_path: Path):
    """Count rows per dataset in the LLM-BFS results CSV.

    The dataset column is the first of dataset/Dataset/data/graph/name found
    in the CSV; every row is attributed to method key 'llm_bfs'.

    Returns a DataFrame with columns ``dataset``, ``method_key``, ``model``
    and ``runs``; it is empty when the file or a dataset column is missing.
    """
    out_cols = ['dataset', 'method_key', 'model', 'runs']
    if not csv_path.exists():
        return pd.DataFrame(columns=out_cols)

    df = pd.read_csv(csv_path)
    dcol = next((c for c in ['dataset', 'Dataset', 'data', 'graph', 'name'] if c in df.columns), None)
    if dcol is None:
        return pd.DataFrame(columns=out_cols)

    tally = df.groupby(df[dcol].astype(str)).size().rename('runs').reset_index()
    tally.columns = ['dataset', 'runs']
    tally['method_key'] = 'llm_bfs'
    tally['model'] = names_dict.get('llm_bfs', 'llm_bfs')
    return tally[out_cols]

# Schema shared by every run-count table in this cell.
count_cols = ['dataset', 'method_key', 'model', 'runs']


def _runs_pivot(counts):
    """Return a dataset x model table of run counts (0 where absent).

    Duplicate (dataset, model) rows collapse to their maximum, so overlapping
    sources (e.g. a results CSV and a version's progress files both reporting
    the same model) no longer make the reshape fail the way DataFrame.pivot
    does on duplicates.
    """
    if counts.empty:
        return pd.DataFrame()
    counts = counts.assign(runs=counts['runs'].astype(int))
    return (counts.groupby(['dataset', 'model'])['runs'].max()
                  .unstack('model').fillna(0).astype(int))


# Per-version counting
counts_per_version = {}
for v in VERSIONS:
    dfp = count_progress_dir(RESULTS_DIR / 'progress' / v)
    # fallback to log parsing when no progress CSVs were found
    dfv = count_from_log(RESULTS_DIR / f'log_{v}.log') if dfp.empty else dfp
    # reindex() both selects the shared columns and gives a column-less empty
    # frame the expected schema, so the sort further down cannot hit a KeyError.
    counts_per_version[v] = dfv.reindex(columns=count_cols)

# Show per-version pivots
for v, dfv in counts_per_version.items():
    print(f'Version: {v}')
    if dfv.empty:
        # display(pd.DataFrame({'note':[f'No progress CSVs or log entries found in {RESULTS_DIR} for {v}']}))
        continue
    piv = _runs_pivot(dfv)
    # display(piv)

# Combine across versions (take max per dataset/method)
combined = (pd.concat(list(counts_per_version.values()), ignore_index=True)
            if counts_per_version else
            pd.DataFrame(columns=count_cols))
if not combined.empty:
    combined = (combined.groupby(['dataset', 'method_key', 'model'], as_index=False)['runs']
                .max())

# Add CSVs with split ABAPC implementations + LLM-BFS
csv_counts = pd.concat(
    [count_csv_abapc_split(CSV_ABAPC), count_csv_llm_bfs(CSV_BFS)],
    ignore_index=True,
)

all_counts = combined
if not csv_counts.empty:
    all_counts = pd.concat([all_counts, csv_counts], ignore_index=True)

all_counts = (all_counts.reindex(columns=count_cols)
              .sort_values(['dataset', 'model'])
              .reset_index(drop=True))
# display(all_counts)

# Pivot view
pivot_all = _runs_pivot(all_counts)
# display(pivot_all)

# Optional incomplete flagging
if expected_n_runs is not None and not all_counts.empty:
    incomplete = all_counts[all_counts['runs'] < int(expected_n_runs)].copy()
    if not incomplete.empty:
        print(f'Incomplete (runs < {expected_n_runs}):')
        # display(incomplete.sort_values(['runs','dataset','model']))
    else:
        print('All dataset/model pairs reached expected_n_runs.')


Version: bnlearn_big_fgs_nt
Version: bnlearn_big_rnd_mpc
Version: bnlearn_child_base
Version: bnlearn_child_base2
Version: bnlearn_50rep_mpc
Version: bnlearn_50rep_abapc


## Structure Reconstruction Evaluation

In [21]:
# Plots by dataset (DAG)
if not len(dag_all):
    print('No DAG results found.')
else:
    # Methods in default_methods order, restricted to those actually loaded.
    # `methods` is reused by the CPDAG cell to keep the same ordering.
    dag_models = set(dag_all['model'].unique())
    methods = [m for m in default_methods if m in dag_models]
    print('DAG plotting methods:', methods)

    # NOTE(review): the file names below do not match what is plotted -- the
    # SHD/F1 chart is saved as '..._SHD_SID.html' and the SID chart as
    # '..._F1.html'. Left unchanged in case other material refers to them.

    # Normalised SHD next to F1
    double_bar_chart_plotly(
        dag_all, ['p_shd','F1'], names_dict, colors_dict, methods,
        save_figs=save_figs,
        font_size=23,
        output_name=str(REPO_ROOT / 'results/figs/Fig.bn_dag_SHD_SID.html'),
        debug=False,
        range_y1=[0,2.6],
        range_y2=[0,5.6],
        rect_exp=0.01,
    )
    # Normalised SID
    bar_chart_plotly(
        dag_all, 'p_SID', names_dict, colors_dict, methods,
        save_figs=save_figs,
        font_size=23,
        output_name=str(REPO_ROOT / 'results/figs/Fig.bn_dag_F1.html'),
        debug=False,
    )
    # Precision next to recall
    double_bar_chart_plotly(
        dag_all, ['precision','recall'], names_dict, colors_dict, methods,
        save_figs=save_figs,
        font_size=23,
        output_name=str(REPO_ROOT / 'results/figs/Fig.bn_dag_prec_rec.html'),
        debug=False,
    )


DAG plotting methods: ['Random', 'FGS', 'NOTEARS-MLP', 'MPC', 'ABAPC (Ours)']


## CPDAG Evaluation

In [22]:
# Load and plot CPDAG metrics (SID best/worst) to mirror experiments.ipynb

# Extend schemas to support CPDAG files: same layout as the DAG schema except
# that SID is reported as a SID_low / SID_high pair.
COLUMNS_MAP['cpdag'] = ['dataset', 'model'] + [
    f'{metric}_{stat}'
    for metric in ('elapsed', 'nnz', 'fdr', 'tpr', 'fpr',
                   'precision', 'recall', 'F1', 'shd', 'SID_low', 'SID_high')
    for stat in ('mean', 'std')
]

# Optional CSV metric mapping for CPDAG if ever needed (not used here)
CSV_METRIC_MAP['cpdag'] = {
    'elapsed': 'time',
    **{
        metric: f'cpdag_{metric}'
        for metric in ('nnz', 'fdr', 'tpr', 'fpr',
                       'precision', 'recall', 'F1', 'shd')
    },
    # best/worst SID if present in CSV aggregates
    'SID_low': 'cpdag_sid_low',
    'SID_high': 'cpdag_sid_high',
}

# Redefine loader to read cpdag npy files when requested
def load_npy(version: str, kind: str):
    """Read the aggregated DAG or CPDAG results array stored for ``version``.

    ``kind == 'cpdag'`` reads ``stored_results_<version>_cpdag.npy``, whose
    last metrics are ``SID_low`` and ``SID_high``; any other ``kind`` reads
    ``stored_results_<version>.npy`` with a single ``SID`` metric. Each
    metric contributes a ``*_mean`` and a ``*_std`` column.

    Raises FileNotFoundError when the file does not exist.
    """
    is_cpdag = kind == 'cpdag'
    file_suffix = '_cpdag' if is_cpdag else ''
    sid_metrics = ['SID_low', 'SID_high'] if is_cpdag else ['SID']
    metric_names = ['elapsed', 'nnz', 'fdr', 'tpr', 'fpr',
                    'precision', 'recall', 'F1', 'shd'] + sid_metrics
    cols = ['dataset', 'model']
    for metric in metric_names:
        cols += [f'{metric}_mean', f'{metric}_std']

    path = RESULTS_DIR / f'stored_results_{version}{file_suffix}.npy'
    if not path.exists():
        raise FileNotFoundError(f'Missing results file: {path}')

    # NOTE(review): allow_pickle=True unpickles arbitrary objects; only point
    # this at result files you produced yourself.
    frame = pd.DataFrame(np.load(path, allow_pickle=True), columns=cols)
    for key in ('dataset', 'model'):
        frame[key] = frame[key].astype(str)
    return frame

# Load combined CPDAG aggregates across RUN_CONFIGS
cpdag_all = load_results('cpdag')

# Restrict to the same datasets as DAG plots
if 'dataset' in cpdag_all.columns and len(cpdag_all):
    base_names = cpdag_all['dataset'].astype(str).str.replace('<br>.*','', regex=True).str.lower()
    cpdag_all = cpdag_all[base_names.isin(datasets_to_plot)].copy()

# Add node/edge counts and pretty labels
if len(cpdag_all):
    # Recomputed because the filter above may have dropped rows.
    base_names = cpdag_all['dataset'].astype(str).str.replace('<br>.*','', regex=True).str.lower()
    # Known constants win; stored num_edges / num_nodes (if any) fill the gaps.
    cpdag_all['n_edges'] = base_names.map(dags_arcs_map).fillna(cpdag_all.get('num_edges', np.nan)).astype(float)
    cpdag_all['n_nodes'] = base_names.map(dags_nodes_map).fillna(cpdag_all.get('num_nodes', np.nan)).astype(float)
    ds_upper = base_names.str.upper()
    cpdag_all['dataset'] = ds_upper + '<br> |V|=' + cpdag_all['n_nodes'].fillna(0).astype(int).astype(str) + ', |E|=' + cpdag_all['n_edges'].fillna(0).astype(int).astype(str)

    # Normalise SHD and SID best/worst by number of edges. This goes through
    # the same add_normals helper as the DAG results instead of a hand-copied
    # version of its loop, so the two normalisations cannot drift apart.
    cpdag_all = add_normals(cpdag_all, sid_cols=['SID_low', 'SID_high'])

    # Ensure non-zero tiny values for visibility: an exact 0 is replaced by
    # 0.03 in the plotted means (this alters the stored value, not just the bar).
    for col in ('p_SID_low_mean', 'p_SID_high_mean'):
        if col in cpdag_all:
            cpdag_all[col] = cpdag_all[col].replace(0, 0.03)



# Plot CPDAG NSID best vs worst
if len(cpdag_all):
    # Force the same ordering of methods as in DAG plots. `methods` is only
    # assigned by the DAG plotting cell when DAG results exist, so fall back
    # to the default ordering rather than failing with a NameError.
    try:
        method_order = methods
    except NameError:
        method_order = default_methods
    cpdag_models = set(cpdag_all['model'].unique())
    available_models_cpdag = [m for m in method_order if m in cpdag_models]
    print('CPDAG models:', available_models_cpdag)

    # Normalised SID, best case next to worst case
    double_bar_chart_plotly(
        cpdag_all, ['p_SID_low','p_SID_high'], names_dict, colors_dict, available_models_cpdag,
        save_figs=save_figs, font_size=23, output_name=str(REPO_ROOT / 'results/figs/Fig.bn_cpdag_SID_best_worst.html'),
        debug=False, range_y1=[0,6], range_y2=[0,6], rect_exp=0.01
    )
    # Normalised SHD next to F1. This chart previously reused the SID chart's
    # output_name, so with save_figs=True the two figures competed for one
    # file; it now has a file name of its own.
    double_bar_chart_plotly(
        cpdag_all, ['p_shd','F1'], names_dict, colors_dict, available_models_cpdag,
        save_figs=save_figs, font_size=23, output_name=str(REPO_ROOT / 'results/figs/Fig.bn_cpdag_SHD_F1.html'),
        debug=False, range_y1=[0,6], range_y2=[0,6], rect_exp=0.01
    )
else:
    print('No CPDAG results found.')


CPDAG models: ['Random', 'FGS', 'NOTEARS-MLP', 'MPC', 'ABAPC (Ours)']


In [23]:
# Add ABAPC (Orig) runtime series from older stored results, if available
def load_abapc_orig_runtime():
    """Return the ABAPC rows of the first older result file that has any.

    Tries ``stored_results_bnlearn_dag_v5_2000.npy`` and then
    ``stored_results_bnlearn_dag_v5.npy`` under ``RESULTS_DIR``. Rows whose
    model name contains 'ABAPC' (case-insensitive) are kept, relabelled
    'ABAPC (Orig)' and given ``n_nodes`` / ``n_edges`` from the dataset maps.

    Returns an empty frame with the same columns when neither file exists or
    neither contains such rows.
    """
    metric_names = ('elapsed', 'nnz', 'fdr', 'tpr', 'fpr',
                    'precision', 'recall', 'F1', 'shd', 'SID')
    cols = ['dataset', 'model']
    for metric in metric_names:
        cols += [f'{metric}_mean', f'{metric}_std']

    for version in ('bnlearn_dag_v5_2000', 'bnlearn_dag_v5'):
        path = RESULTS_DIR / f'stored_results_{version}.npy'
        if not path.exists():
            continue
        # NOTE(review): allow_pickle=True unpickles arbitrary objects; only
        # point this at result files you produced yourself.
        df = pd.DataFrame(np.load(path, allow_pickle=True), columns=cols)
        df['dataset'] = df['dataset'].astype(str)
        # Select ABAPC rows (previous implementation)
        is_abapc = df['model'].astype(str).str.contains('ABAPC', case=False, na=False)
        sub = df[is_abapc].copy()
        if len(sub) == 0:
            continue
        # Map nodes/edges and set pretty label
        key = sub['dataset'].str.lower()
        sub['n_nodes'] = key.map(dags_nodes_map).astype(float)
        sub['n_edges'] = key.map(dags_arcs_map).astype(float)
        sub['model'] = 'ABAPC (Orig)'
        return sub

    return pd.DataFrame(columns=cols + ['n_nodes', 'n_edges'])

abapc_orig_df = load_abapc_orig_runtime()
if len(abapc_orig_df):
    # Make the cell safe to re-run: drop any 'ABAPC (Orig)' rows appended by a
    # previous execution first, otherwise every run stacks another copy of the
    # same rows onto dag_all and the runtime plot shows duplicated points.
    dag_all = dag_all[dag_all['model'] != 'ABAPC (Orig)'].copy()
    # Align columns and append: each frame gets NaN for columns only the other
    # one has, then the older rows are appended in dag_all's column order.
    needed_cols = sorted(set(dag_all.columns) | set(abapc_orig_df.columns))
    for c in needed_cols:
        if c not in dag_all.columns:
            dag_all[c] = np.nan
        if c not in abapc_orig_df.columns:
            abapc_orig_df[c] = np.nan
    dag_all = pd.concat([dag_all, abapc_orig_df[dag_all.columns]], ignore_index=True)
    print('Added ABAPC (Orig) runtime rows:', len(abapc_orig_df))
else:
    print('ABAPC (Orig) stored results not found; skipping.')

# Label overrides passed to plot_runtime: identical to names_dict except that
# the 'abapc' entry reads 'ABAPC (Opt)'. Not to be confused with the
# upper-case MODEL_ALIASES used by load_results.
model_aliases = {**names_dict, 'abapc': 'ABAPC (Opt)'}

# Runtime figure with n_nodes as the only x variable.
# NOTE(review): output_name is relative to the working directory, unlike the
# REPO_ROOT-based paths used for the other figures.
plot_runtime(
    dag_all, ['n_nodes'], "",
    names_dict, symbols_dict, colors_dict,
    ['fgs', 'nt', 'mpc', 'abapc_orig', 'abapc'],
    share_y=False,
    save_figs=save_figs,
    model_aliases=model_aliases,
    output_name="../results/figs/Fig.2_runtime.html",
    debug=False,
    font_size=20,
    plot_height=370,
    plot_width=800,
)

Added ABAPC (Orig) runtime rows: 4
