# Notebook for analyzing the results obtained from smaller networks

In [None]:
import json
import math
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from bnsl.metrics import compute_shd

Store the records as a dataframe

In [None]:
# Project root, taken as two directory levels above the current working
# directory; every path below therefore depends on where the kernel was started.
root = Path.cwd().parents[1]

# Every JSON result file below data/results/small (searched recursively).
# rglob yields entries in filesystem order, so sort the paths to keep the row
# order of the dataframe built from them reproducible between runs.
files = sorted((root / "data" / "results" / "small").rglob("*.json"))

In [None]:
def _record_from_result(result):
    """Flatten one parsed result JSON object into a single dataframe row.

    Required keys raise KeyError when absent; the bound names, the k/l
    parameters and the parent map are optional and become None when missing.
    """
    bounds = result["bounds"]
    params = result["params"]
    return {
        "algorithm": result["algorithm"],
        # Last "/"-separated component of the network path, cut at its first dot.
        "network": result["network"].split("/")[-1].split(".")[0],
        "num_samples": result["num_samples"],
        "score": result["score"],
        "theoretical_upper_bound": bounds.get("theoretical_upper_bound"),
        "naive_upper_bound": bounds.get("naive_upper_bound"),
        "runtime": result["seconds_elapsed"],
        "k": params.get("k"),
        "l": params.get("l"),
        "num_vars": result["num_variables"],
        "seed": result["seed"],
        "parent_map": result.get("parent_map"),
    }


records = []
for f in files:
    # Read as UTF-8 explicitly instead of relying on the platform default encoding.
    with open(f, encoding="utf-8") as fp:
        records.append(_record_from_result(json.load(fp)))

# One row per result file.
df = pd.DataFrame(records)

In [None]:
# Preview the first five parsed records.
df.head(n=5)

Create a table containing only the approximation-algorithm results, extended with the true score and parent map from the matching DP runs

In [None]:
# Keep only the approximation-algorithm runs. Take an explicit copy: later cells
# add columns to approx_df, and assigning into a filtered slice of df triggers
# pandas' SettingWithCopyWarning.
approx_df = df[df["algorithm"] == "approximation_algorithm"].copy()

In [None]:
# Attach the true score found by silander_myllymaki to each approximation result.
# DP must have been run on the same network, num_samples and seed as the
# approximation result; a missing DP run is reported explicitly below.
key_cols = ["network", "num_samples", "seed"]

# One DP row per key. Keeping the first occurrence mirrors "take the first match".
dp_scores = (
    df.loc[df["algorithm"] == "silander_myllymaki", key_cols + ["score"]]
    .drop_duplicates(subset=key_cols)
    .rename(columns={"score": "dp_score"})
)

# A single left join replaces one full scan of df per approximation row. The
# right-hand keys are unique, so the result has one row per approximation row,
# in the same order as approx_df.
matched = approx_df[key_cols].merge(
    dp_scores,
    on=key_cols,
    how="left",
    validate="many_to_one",
    indicator=True,
)

unmatched = matched.loc[matched["_merge"] == "left_only", key_cols].drop_duplicates()
if not unmatched.empty:
    raise LookupError(
        "No silander_myllymaki result for these (network, num_samples, seed):\n"
        + unmatched.to_string(index=False)
    )

# merge() returns a fresh RangeIndex, so assign by position rather than by label.
approx_df["dp_score"] = matched["dp_score"].to_numpy()

In [None]:
# Attach the parent map found by silander_myllymaki to each approximation result
# (matched on network, num_samples and seed). The column is read further down,
# where the survey parent maps are printed next to the approximation's own.
#
# Nothing is dropped from approx_df here: chaining .drop(columns=[...]) onto the
# looked-up Series would be a no-op, because Series.drop ignores `columns`.
pm_key_cols = ["network", "num_samples", "seed"]

# One DP row per key. Keeping the first occurrence mirrors "take the first match".
dp_parent_maps = (
    df.loc[df["algorithm"] == "silander_myllymaki", pm_key_cols + ["parent_map"]]
    .drop_duplicates(subset=pm_key_cols)
    .rename(columns={"parent_map": "dp_parent_map"})
)

# A single left join replaces one full scan of df per approximation row. The
# right-hand keys are unique, so the result has one row per approximation row,
# in the same order as approx_df.
pm_matched = approx_df[pm_key_cols].merge(
    dp_parent_maps,
    on=pm_key_cols,
    how="left",
    validate="many_to_one",
    indicator=True,
)

pm_unmatched = pm_matched.loc[
    pm_matched["_merge"] == "left_only", pm_key_cols
].drop_duplicates()
if not pm_unmatched.empty:
    raise LookupError(
        "No silander_myllymaki result for these (network, num_samples, seed):\n"
        + pm_unmatched.to_string(index=False)
    )

# merge() returns a fresh RangeIndex, so assign by position rather than by label.
approx_df["dp_parent_map"] = pm_matched["dp_parent_map"].to_numpy()

In [None]:
# Preview the first five approximation rows, including the DP columns added above.
approx_df.head(n=5)

Compute the SHD between each approximation result's parent map and the network's `.bif` file

In [None]:
def _shd_for_row(row):
    """Call compute_shd with the row's network .bif file and the row's parent map."""
    bif_path = root / "networks" / "small" / f"{row['network']}.bif"
    return compute_shd(bif_path, row["parent_map"])


# Evaluated row by row; the result is stored in a new SHD column.
approx_df["SHD"] = approx_df.apply(_shd_for_row, axis=1)

In [None]:
# Show the first 20 rows, ordered by network and then by sample size.
approx_df.sort_values(by=["network", "num_samples"]).head(n=20)

Make k and l ints

In [None]:
# Cast both approximation parameters to int, one column at a time.
for col in ("k", "l"):
    approx_df[col] = approx_df[col].astype(int)

Plots comparing scores to upper bound

In [None]:
# Aggregate over seeds: mean of each score-like column per (n, network, k, l).
df_agg = (
    approx_df.groupby(["num_samples", "network", "k", "l"], as_index=False)
      .agg(
          score=("score", "mean"),
          dp_score=("dp_score", "mean"),
          theoretical_upper_bound=("theoretical_upper_bound", "mean"),
          naive_upper_bound=("naive_upper_bound", "mean"),
      )
)

# savefig does not create missing folders, so make sure the target exists.
plots_dir = root / "experiments" / "plots"
plots_dir.mkdir(parents=True, exist_ok=True)

# (column in df_agg, legend label), in left-to-right bar order within a group.
bar_series = [
    ("score", "Approx score"),
    ("dp_score", "DP (optimal)"),
    ("theoretical_upper_bound", "Theoretical upper bound"),
    ("naive_upper_bound", "Naive upper bound"),
]
width = 0.18

# One figure per (network, sample size); one bar group per (k, l) configuration.
for (network, n), sub in df_agg.groupby(["network", "num_samples"]):

    sub = sub.sort_values(["k", "l"])
    x = np.arange(len(sub))

    fig, ax = plt.subplots(figsize=(7, 4))

    # Four bars centred on each tick: offsets -1.5, -0.5, +0.5, +1.5 bar widths.
    for i, (column, label) in enumerate(bar_series):
        ax.bar(x + (i - 1.5) * width, sub[column], width, label=label)

    # Tick labels show l over k as a LaTeX fraction.
    ratio_labels = [fr"$\frac{{{l}}}{{{k}}}$" for k, l in zip(sub["k"], sub["l"])]

    ax.set_xticks(x)
    ax.set_xticklabels(ratio_labels)
    ax.set_xlabel("Approximation ratio $\\frac{l}{k}$")
    ax.set_ylabel("Score")
    ax.set_title(f"Scores per approximation ratio\nNetwork={network}, n={n}")

    # Upper-right legend
    ax.legend(fontsize=7, loc="upper right")

    fig.tight_layout()

    outfile = plots_dir / f"plot_scores_{network}_n{n}.png"
    fig.savefig(outfile, dpi=200, bbox_inches="tight")
    # Close explicitly so figures do not accumulate across loop iterations.
    plt.close(fig)


Plot per sample size, with subplots per network

In [None]:
# One figure per sample size, with one subplot per network (three per row).
# `math` comes from the import cell at the top of the notebook.

# savefig does not create missing folders, so make sure the target exists.
plots_dir = root / "experiments" / "plots"
plots_dir.mkdir(parents=True, exist_ok=True)

ncols = 3
width = 0.18

# (column in df_agg, legend label), in left-to-right bar order within a group.
bar_series = [
    ("score", "Approx score"),
    ("dp_score", "DP (optimal)"),
    ("theoretical_upper_bound", "Theoretical upper bound"),
    ("naive_upper_bound", "Naive upper bound"),
]
legend_labels = [label for _, label in bar_series]

for n, df_n in df_agg.groupby("num_samples"):
    networks = sorted(df_n["network"].unique())
    num_nets = len(networks)
    nrows = math.ceil(num_nets / ncols)

    # squeeze=False always returns a 2-D array of axes, which is then flattened.
    fig, axes = plt.subplots(
        nrows, ncols,
        figsize=(5 * ncols, 3.5 * nrows),
        sharey=True,
        squeeze=False,
    )
    axes = axes.ravel()

    # Bar containers of the first subplot, reused for the figure-level legend.
    first_handles = None

    for ax_idx, (ax, net) in enumerate(zip(axes, networks)):
        sub = df_n[df_n["network"] == net].sort_values(["k", "l"])
        x = np.arange(len(sub))

        # Four bars centred on each tick: offsets -1.5, -0.5, +0.5, +1.5 bar widths.
        handles = [
            ax.bar(x + (i - 1.5) * width, sub[column], width)
            for i, (column, _) in enumerate(bar_series)
        ]
        if first_handles is None:
            first_handles = handles

        # x-labels as l/k
        ratio_labels = [fr"$\frac{{{l}}}{{{k}}}$" for k, l in zip(sub["k"], sub["l"])]
        ax.set_xticks(x)
        ax.set_xticklabels(ratio_labels)

        ax.set_title(net)

        # Label Y-axis only for the leftmost column
        if ax_idx % ncols == 0:
            ax.set_ylabel("Score")

    # Hide unused subplot(s) in the last row
    for ax in axes[num_nets:]:
        ax.set_visible(False)

    # Common xlabel
    fig.text(0.5, 0.04, r"Approximation ratio $\frac{l}{k}$", ha="center")

    # Global legend
    fig.legend(
        first_handles,
        legend_labels,
        loc="upper right",
        fontsize=8,
        bbox_to_anchor=(0.98, 0.98),
    )

    fig.suptitle(f"Scores per approximation ratio (n = {n})", y=0.99)
    # Leave room for the shared x-label at the bottom and the title at the top.
    fig.tight_layout(rect=[0.02, 0.06, 0.96, 0.95])

    outfile = plots_dir / f"plot_scores_n{n}_subplots.png"
    fig.savefig(outfile, dpi=200, bbox_inches="tight")
    plt.close(fig)


Plot comparing SHD

In [None]:
# Aggregate over seeds (include SHD).
# This rebinds df_agg, so it keeps every column the earlier score plots read
# (theoretical_upper_bound, naive_upper_bound); those cells can then still be
# re-run after this one. `upper_bound` is kept as an alias of the theoretical bound.
df_agg = (
    approx_df.groupby(["num_samples", "network", "k", "l"], as_index=False)
             .agg(
                 score=("score", "mean"),
                 dp_score=("dp_score", "mean"),
                 theoretical_upper_bound=("theoretical_upper_bound", "mean"),
                 naive_upper_bound=("naive_upper_bound", "mean"),
                 SHD=("SHD", "mean"),
             )
             .assign(upper_bound=lambda agg: agg["theoretical_upper_bound"])
)

# savefig does not create missing folders, so make sure the target exists.
plots_dir = root / "experiments" / "plots"
plots_dir.mkdir(parents=True, exist_ok=True)

# One plot per sample size, bars = (k,l) configs, x = network, y = SHD
for n, sub in df_agg.groupby("num_samples"):
    # Rows: networks (sorted); columns: (k, l) pairs; values: mean SHD.
    pivot = sub.pivot(index="network", columns=["k", "l"], values="SHD").sort_index()

    networks = pivot.index.to_list()
    configs = list(pivot.columns)

    x = np.arange(len(networks))
    num_cfgs = len(configs)
    # All bars of one network together span 0.8 of the distance between ticks.
    width = 0.8 / num_cfgs

    fig, ax = plt.subplots(figsize=(8, 4))  # slightly wider for the legend

    for i, (k, l) in enumerate(configs):
        # Centre the group of bars on the tick of each network.
        offsets = x + (i - (num_cfgs - 1) / 2) * width
        shd_vals = pivot[(k, l)].values

        ax.bar(
            offsets,
            shd_vals,
            width,
            label=fr"$\frac{{{int(l)}}}{{{int(k)}}}$",
        )

    ax.set_xticks(x)
    ax.set_xticklabels(networks, rotation=30, ha="right")
    ax.set_ylabel("Mean SHD (over seeds)")
    ax.set_title(f"SHD per network for different approximation ratios (n={n})")

    ax.legend(
        fontsize=7,
        title=r"Approx. ratio $\frac{l}{k}$",
        loc="center left",
        bbox_to_anchor=(1.02, 0.5),  # outside right
        borderaxespad=0,
    )

    fig.tight_layout()

    outfile = plots_dir / f"plot_shd_n{n}.png"
    fig.savefig(outfile, dpi=200, bbox_inches="tight")
    plt.close(fig)

    print(f"Saved: {outfile}")


Take a closer look at why higher approximation ratios yield a higher SHD on the survey network

In [None]:
# Look at the entries where the network is survey and the sample size is 10000.
df_filtered = approx_df[
    (approx_df["network"] == "survey") &
    (approx_df["num_samples"] == 10000)
]

# Print k, l and both parent maps (approximation and DP) for every matching run.
# A bare `df_filtered.head()` placed before the loop would never be rendered,
# because only the last expression of a cell is displayed, so none is used here.
inspect_cols = ["k", "l", "parent_map", "dp_parent_map"]
for k, l, pm, pm_dp in df_filtered[inspect_cols].itertuples(index=False, name=None):
    print(f"k={k}, l={l}, parent_map={pm}, dp_parent_map={pm_dp}")
    

In [None]:
# Score as a function of the approximation ratio l/k, one figure per network,
# for a single sample size. numpy and matplotlib come from the import cell at
# the top of the notebook.

# Filter to a single sample size
TARGET_N = 100
df_target = df_agg[df_agg["num_samples"] == TARGET_N].copy()

# Add approximation ratio
df_target["ratio"] = df_target["l"] / df_target["k"]

# savefig does not create missing folders, so make sure the target exists.
plots_dir = root / "experiments" / "plots"
plots_dir.mkdir(parents=True, exist_ok=True)

# Plot one figure per network
for net, sub in df_target.groupby("network"):
    fig, ax = plt.subplots(figsize=(6, 4))

    # Sort by ratio so the line is drawn from left to right
    sub = sub.sort_values("ratio")

    x = sub["ratio"].to_numpy()
    y = sub["score"].to_numpy()

    # Approximation score curve
    ax.plot(
        x,
        y,
        marker="o",
        linestyle="-",
        label=f"Approx score (n={TARGET_N})"
    )

    # DP score as a dotted horizontal line, taken from the first row of the group
    dp_val = sub["dp_score"].iloc[0]
    ax.axhline(
        dp_val,
        linestyle=":",
        linewidth=1,
        alpha=0.6,
        color="black",
        label="DP score (optimal)"
    )

    # One tick per distinct (k, l) pair, placed at l / k and labelled as a fraction.
    kl_list = sorted(
        tuple(pair) for pair in sub[["k", "l"]].drop_duplicates().to_numpy()
    )

    xticks = [l / k for (k, l) in kl_list]
    xticklabels = [fr"$\frac{{{int(l)}}}{{{int(k)}}}$" for (k, l) in kl_list]

    ax.set_xticks(xticks)
    ax.set_xticklabels(xticklabels)

    ax.set_xlabel(r"Approximation ratio $\frac{l}{k}$")
    ax.set_ylabel("Score")
    ax.set_title(f"Score vs. approximation ratio ({net}, n={TARGET_N})")
    ax.legend(fontsize=7)
    fig.tight_layout()

    outfile = plots_dir / f"plot_score_vs_ratio_{net}_n{TARGET_N}.png"
    fig.savefig(outfile, dpi=200, bbox_inches="tight")
    plt.close(fig)

    print(f"Saved: {outfile}")

Write pandas table to latex

In [None]:
# Columns of the exported LaTeX tables, in left-to-right order:
# first the run configuration and runtime, then the quality measures.
_run_columns = ["num_samples", "seed", "l", "k", "runtime"]
_quality_columns = ["score", "dp_score", "theoretical_upper_bound", "SHD"]
column_order = _run_columns + _quality_columns


In [None]:
# Sort globally before grouping, so every per-network table is ordered by
# seed, then sample size, then k.
approx_df = approx_df.sort_values(["seed", "num_samples", "k"])

# Print one LaTeX table per network.
for net, df_net in approx_df.groupby("network"):
    print(f"% --- {net} ---")

    # Selecting column_order both orders the columns and leaves out all others.
    df_print = df_net[column_order]

    # dynamically create column format string (every column right-aligned)
    colfmt = "r" * len(df_print.columns)

    # generate the tabular; escape=True is needed because the headers contain
    # underscores (e.g. num_samples), which do not compile unescaped in LaTeX text.
    tabular = df_print.to_latex(
        index=False,
        float_format="%.3f",
        column_format=colfmt,
        escape=True,
    )

    # The caption is typeset as text, so underscores in the network name are
    # escaped too; the \label key is left as is.
    caption_net = str(net).replace("_", r"\_")

    # wrap in table + resizebox
    print(
f"""\\begin{{table}}[H]
\\centering
\\scriptsize
\\caption{{Approximation algorithm vs DP on the {caption_net} network.}}
\\label{{tab:approx_vs_dp_{net}}}
\\resizebox{{\\textwidth}}{{!}}{{%
{tabular}
}}
\\end{{table}}

"""
    )
