# Notebook for analyzing the results obtained from smaller networks

In [1]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from bnsl.metrics import compute_shd

INFO:rpy2.situation:cffi mode is CFFI_MODE.ANY
INFO:rpy2.situation:R home found: /usr/lib/R
INFO:rpy2.situation:R library path: /usr/lib/R/lib:/usr/lib/x86_64-linux-gnu:/usr/lib/jvm/default-java/lib/server
INFO:rpy2.situation:LD_LIBRARY_PATH: /usr/lib/R/lib:/usr/lib/x86_64-linux-gnu:/usr/lib/jvm/default-java/lib/server
INFO:rpy2.rinterface_lib.embedded:Default options to initialize R: rpy2, --quiet, --no-save
INFO:rpy2.rinterface:Environment variable "PWD" redefined by R and overriding existing variable. Current: "/mnt/c/Users/Aurora Ingebrigtsen/AppData/Local/Programs/Microsoft VS Code", R: "/home/aurora/source/repos/bn-structure-learning/experiments/notebooks"
INFO:rpy2.rinterface:R is already initialized. No need to initialize.


Store the records as a dataframe

In [2]:
root = Path.cwd().parents[1]    # two levels up
files = list((root / "data" / "results" / "small").rglob("*.json"))

In [3]:
records = []
for f in files:
    with open(f) as fp:
        r = json.load(fp)
        record = {
            "algorithm": r["algorithm"],
            "network": r["network"].split("/")[-1].split(".")[0],
            "num_samples": r["num_samples"],
            "score": r["score"],
            "theoretical_upper_bound": r["bounds"].get("theoretical_upper_bound"),
            "naive_upper_bound": r["bounds"].get("naive_upper_bound"),
            "runtime": r["seconds_elapsed"],
            "k": r["params"].get("k"),
            "l": r["params"].get("l"),
            "num_vars": r["num_variables"],
            "seed": r["seed"],
            "parent_map": r.get("parent_map"),
        }
        records.append(record)

df = pd.DataFrame(records)

In [4]:
df.head()

Unnamed: 0,algorithm,network,num_samples,score,theoretical_upper_bound,naive_upper_bound,runtime,k,l,num_vars,seed,parent_map
0,approximation_algorithm,sachs,1000,-7629.01,-3373.455,-6831.66,0.008,2.0,1.0,11,43,"{'Akt': ['Erk', 'PKA'], 'Erk': ['Mek', 'PKA'],..."
1,silander_myllymaki,survey,100,-423.143,,-420.051,0.0,,,6,42,"{'A': [], 'E': ['A'], 'O': [], 'R': [], 'S': [..."
2,approximation_algorithm,cancer,10000,-20940.605,-10835.479,-20744.596,0.001,2.0,1.0,5,43,"{'Cancer': ['Dyspnoea'], 'Dyspnoea': [], 'Poll..."
3,approximation_algorithm,cancer,100,-219.777,-184.72,-219.777,0.001,4.0,3.0,5,42,"{'Cancer': [], 'Dyspnoea': [], 'Pollution': []..."
4,silander_myllymaki,asia,10000,-22193.508,,-16725.365,0.004,,,8,44,"{'asia': [], 'bronc': ['smoke'], 'dysp': ['bro..."


 Create a table with only approximation algorithm, and true fields from dp

In [5]:
approx_df = df[df["algorithm"] == "approximation_algorithm"]

In [6]:
# apply the true score from silander myllymaki to the corresponding approximation results (important that dp has been run on the same network, num_samples, and seed as 
# the approximation result)
approx_df["dp_score"] = approx_df.apply(
    lambda row: df[
        (df["algorithm"] == "silander_myllymaki") &
        (df["network"] == row["network"]) &
        (df["num_samples"] == row["num_samples"]) &
        (df["seed"] == row["seed"])
    ]["score"].values[0], axis=1
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  approx_df["dp_score"] = approx_df.apply(


In [7]:
# apply the true pm from silander myllymaki to the corresponding approximation results
# TODO: check if this is actually used anywhere
approx_df["dp_parent_map"] = approx_df.apply(
    lambda row: df[
        (df["algorithm"] == "silander_myllymaki") &
        (df["network"] == row["network"]) &
        (df["num_samples"] == row["num_samples"]) &
        (df["seed"] == row["seed"])
    ]["parent_map"].values[0], axis=1
).drop(columns=["algorithm"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  approx_df["dp_parent_map"] = approx_df.apply(


In [8]:
approx_df.head()

Unnamed: 0,algorithm,network,num_samples,score,theoretical_upper_bound,naive_upper_bound,runtime,k,l,num_vars,seed,parent_map,dp_score,dp_parent_map
0,approximation_algorithm,sachs,1000,-7629.01,-3373.455,-6831.66,0.008,2.0,1.0,11,43,"{'Akt': ['Erk', 'PKA'], 'Erk': ['Mek', 'PKA'],...",-7629.01,"{'Akt': ['Erk', 'PKA'], 'Erk': [], 'Jnk': ['PK..."
2,approximation_algorithm,cancer,10000,-20940.605,-10835.479,-20744.596,0.001,2.0,1.0,5,43,"{'Cancer': ['Dyspnoea'], 'Dyspnoea': [], 'Poll...",-20936.911,"{'Cancer': ['Pollution', 'Smoker'], 'Dyspnoea'..."
3,approximation_algorithm,cancer,100,-219.777,-184.72,-219.777,0.001,4.0,3.0,5,42,"{'Cancer': [], 'Dyspnoea': [], 'Pollution': []...",-219.777,"{'Cancer': [], 'Dyspnoea': [], 'Pollution': []..."
5,approximation_algorithm,earthquake,1000,-415.992,-68.773,-382.674,0.001,3.0,2.0,5,42,"{'Alarm': ['Burglary'], 'Burglary': [], 'Earth...",-415.992,"{'Alarm': ['Earthquake'], 'Burglary': ['Alarm'..."
6,approximation_algorithm,sachs,1000,-7629.01,-6210.492,-6831.66,0.064,4.0,3.0,11,43,"{'Akt': ['Erk', 'PKA'], 'Erk': ['Mek', 'PKA'],...",-7629.01,"{'Akt': ['Erk', 'PKA'], 'Erk': [], 'Jnk': ['PK..."


Compute SHD from the DP network

In [9]:
# approx_df["SHD"] = approx_df.apply(
#     lambda row: compute_shd(
#         root / "networks" /  "small" / f"{row['network']}.bif",
#        row["parent_map"],
#     ), axis=1
# )

In [10]:
approx_df.sort_values(["network", "num_samples"]).head(20)

Unnamed: 0,algorithm,network,num_samples,score,theoretical_upper_bound,naive_upper_bound,runtime,k,l,num_vars,seed,parent_map,dp_score,dp_parent_map
17,approximation_algorithm,asia,100,-251.306,-170.938,-205.545,0.016,5.0,4.0,8,42,"{'asia': [], 'bronc': ['smoke'], 'dysp': ['bro...",-251.306,"{'asia': [], 'bronc': ['smoke'], 'dysp': ['bro..."
30,approximation_algorithm,asia,100,-256.712,59.354,-205.545,0.001,2.0,1.0,8,44,"{'asia': [], 'bronc': ['smoke'], 'dysp': ['bro...",-251.306,"{'asia': [], 'bronc': ['smoke'], 'dysp': ['bro..."
85,approximation_algorithm,asia,100,-252.923,-92.995,-205.545,0.006,3.0,2.0,8,44,"{'asia': [], 'bronc': ['dysp'], 'dysp': [], 'e...",-251.306,"{'asia': [], 'bronc': ['smoke'], 'dysp': ['bro..."
87,approximation_algorithm,asia,100,-252.923,-92.995,-205.545,0.005,3.0,2.0,8,43,"{'asia': [], 'bronc': ['dysp'], 'dysp': [], 'e...",-251.306,"{'asia': [], 'bronc': ['smoke'], 'dysp': ['bro..."
119,approximation_algorithm,asia,100,-252.923,-146.304,-205.545,0.012,4.0,3.0,8,43,"{'asia': [], 'bronc': ['smoke'], 'dysp': ['bro...",-251.306,"{'asia': [], 'bronc': ['smoke'], 'dysp': ['bro..."
137,approximation_algorithm,asia,100,-251.306,-170.938,-205.545,0.014,5.0,4.0,8,44,"{'asia': [], 'bronc': ['smoke'], 'dysp': ['bro...",-251.306,"{'asia': [], 'bronc': ['smoke'], 'dysp': ['bro..."
144,approximation_algorithm,asia,100,-252.923,-146.304,-205.545,0.012,4.0,3.0,8,44,"{'asia': [], 'bronc': ['smoke'], 'dysp': ['bro...",-251.306,"{'asia': [], 'bronc': ['smoke'], 'dysp': ['bro..."
146,approximation_algorithm,asia,100,-252.923,-92.995,-205.545,0.004,3.0,2.0,8,42,"{'asia': [], 'bronc': ['dysp'], 'dysp': [], 'e...",-251.306,"{'asia': [], 'bronc': ['smoke'], 'dysp': ['bro..."
151,approximation_algorithm,asia,100,-252.923,-146.304,-205.545,0.007,4.0,3.0,8,42,"{'asia': [], 'bronc': ['smoke'], 'dysp': ['bro...",-251.306,"{'asia': [], 'bronc': ['smoke'], 'dysp': ['bro..."
165,approximation_algorithm,asia,100,-256.712,59.354,-205.545,0.002,2.0,1.0,8,42,"{'asia': [], 'bronc': ['smoke'], 'dysp': ['bro...",-251.306,"{'asia': [], 'bronc': ['smoke'], 'dysp': ['bro..."


Make k and l ints

In [11]:
approx_df["k"] = approx_df["k"].astype(int)
approx_df["l"] = approx_df["l"].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  approx_df["k"] = approx_df["k"].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  approx_df["l"] = approx_df["l"].astype(int)


Plots comparing scores to upper bound

In [None]:
# Aggregate over seeds
df_agg = (
    approx_df.groupby(["num_samples", "network", "k", "l"], as_index=False)
      .agg(
          score=("score", "mean"),
          dp_score=("dp_score", "mean"),
          theoretical_upper_bound=("theoretical_upper_bound", "mean"),
          naive_upper_bound=("naive_upper_bound", "mean"),
      )
)

for (network, n), sub in df_agg.groupby(["network", "num_samples"]):

    sub = sub.sort_values(["k", "l"])

    x = np.arange(len(sub))
    width = 0.18

    plt.figure(figsize=(7, 4))

    plt.bar(x - 1.5*width, sub["score"], width, label="Approx score")
    plt.bar(x - 0.5*width, sub["dp_score"], width, label="DP (optimal)")
    plt.bar(x + 0.5*width, sub["theoretical_upper_bound"], width, label="Theoretical upper bound")
    plt.bar(x + 1.5*width, sub["naive_upper_bound"], width, label="Naive upper bound")

    # LaTeX-style ratio l/k
    ratio_labels = [fr"$\frac{{{l}}}{{{k}}}$" for k, l in zip(sub["k"], sub["l"])]

    plt.xticks(x, ratio_labels)
    plt.xlabel("Approximation ratio $\\frac{l}{k}$")
    plt.ylabel("Score")
    plt.title(f"Scores per approximation ratio\nNetwork={network}, n={n}")

    # Upper-right legend
    plt.legend(fontsize=7, loc="upper right")

    plt.tight_layout()

    outfile = root / f"experiments/plots/plot_scores_{network}_n{n}.png"
    plt.savefig(outfile, dpi=200, bbox_inches="tight")
    plt.close()


NameError: name 'df_agg' is not defined

Plot per sample size, with subplots per network

In [None]:
import math
for n, df_n in df_agg.groupby("num_samples"):
    networks = sorted(df_n["network"].unique())
    num_nets = len(networks)

    # Use 3 columns, so layout is 2 rows Ã— 3 cols (last cell empty)
    ncols = 3
    nrows = math.ceil(num_nets / ncols)

    fig, axes = plt.subplots(
        nrows, ncols,
        figsize=(5 * ncols, 3.5 * nrows), 
        sharey=True
    )
    axes = np.array(axes).reshape(-1)  # flatten to 1D list

    legend_labels = ["Approx score", "DP (optimal)", "Theoretical upper bound", "Naive upper bound"]
    first_handles = None

    for ax_idx, (ax, net) in enumerate(zip(axes, networks)):
        sub = df_n[df_n["network"] == net].sort_values(["k", "l"])

        x = np.arange(len(sub))
        width = 0.18

        # Bars
        b1 = ax.bar(x - 1.5 * width, sub["score"], width)
        b2 = ax.bar(x - 0.5 * width, sub["dp_score"], width)
        b3 = ax.bar(x + 0.5 * width, sub["theoretical_upper_bound"], width)
        b4 = ax.bar(x + 1.5 * width, sub["naive_upper_bound"], width)

        if first_handles is None:
            first_handles = [b1, b2, b3, b4]

        # x-labels as l/k
        ratio_labels = [fr"$\frac{{{l}}}{{{k}}}$" for k, l in zip(sub["k"], sub["l"])]
        ax.set_xticks(x)
        ax.set_xticklabels(ratio_labels)

        ax.set_title(net)

        # Label Y-axis only for the leftmost column
        if ax_idx % ncols == 0:
            ax.set_ylabel("Score")

    # Hide unused subplot(s)
    for ax in axes[num_nets:]:
        ax.set_visible(False)

    # Common xlabel
    fig.text(0.5, 0.04, r"Approximation ratio $\frac{l}{k}$", ha="center")

    # Global legend
    fig.legend(
        first_handles,
        legend_labels,
        loc="upper right",
        fontsize=8,
        bbox_to_anchor=(0.98, 0.98),
    )

    fig.suptitle(f"Scores per approximation ratio (n = {n})", y=0.99)
    fig.tight_layout(rect=[0.02, 0.06, 0.96, 0.95])

    outfile = root / f"experiments/plots/plot_scores_n{n}_subplots.png"
    fig.savefig(outfile, dpi=200, bbox_inches="tight")
    plt.close(fig)


Plot comparing SHD

In [None]:
# Aggregate over seeds (include SHD)
df_agg = (
    approx_df.groupby(["num_samples", "network", "k", "l"], as_index=False)
             .agg(
                 score=("score", "mean"),
                 dp_score=("dp_score", "mean"),
                 upper_bound=("theoretical_upper_bound", "mean"),
                 SHD=("SHD", "mean"),
             )
)

# One plot per sample size, bars = (k,l) configs, x = network, y = SHD
for n, sub in df_agg.groupby("num_samples"):
    pivot = sub.pivot(index="network", columns=["k", "l"], values="SHD")

    pivot = pivot.sort_index()
    networks = pivot.index.to_list()
    configs = list(pivot.columns)

    x = np.arange(len(networks))
    num_cfgs = len(configs)
    width = 0.8 / num_cfgs

    plt.figure(figsize=(8, 4))  # slightly wider for the legend

    for i, (k, l) in enumerate(configs):
        offsets = x + (i - (num_cfgs - 1) / 2) * width
        shd_vals = pivot[(k, l)].values

        plt.bar(
            offsets,
            shd_vals,
            width,
            label=fr"$\frac{{{int(l)}}}{{{int(k)}}}$",
        )

    plt.xticks(x, networks, rotation=30, ha="right")
    plt.ylabel("Mean SHD (over seeds)")
    plt.title(f"SHD per network for different approximation ratios (n={n})")

    # --- Clean legend placement ---
    plt.legend(
        fontsize=7,
        title=r"Approx. ratio $\frac{l}{k}$",
        loc="center left",
        bbox_to_anchor=(1.02, 0.5),  # outside right
        borderaxespad=0,
    )

    plt.tight_layout()

    outfile = root / f"experiments/plots/plot_shd_n{n}.png"
    plt.savefig(outfile, dpi=200, bbox_inches="tight")
    plt.close()

    print(f"Saved: {outfile}")


In [None]:
# look at the entries where the network is survey and sample size is 10000
df_filtered = approx_df[
    (approx_df["network"] == "survey") &
    (approx_df["num_samples"] == 10000)
]
df_filtered.head()
# Look only at k, l and parent map, print all parent maps
for idx, row in df_filtered.iterrows():
    k = row["k"]
    l = row["l"]
    pm = row["parent_map"]
    print(f"k={k}, l={l}, parent_map={pm}")
    

In [None]:
compute_shd(
    root / "networks" /  "small" / "survey.bif",
    {'A': ['E'], 'E': ['O', 'R'], 'O': ['T'], 'R': ['T'], 'S': ['E'], 'T': []}
)

Write pandas table to latex

In [None]:
column_order = [
    "num_samples",
    "seed",
    "l",
    "k",
    "runtime",
    "score",
    "dp_score",
    "theoretical_upper_bound",
    "SHD",
]


In [None]:
# Sort globally before grouping
approx_df = approx_df.sort_values(["seed", "num_samples", "k"])

for net, df_net in approx_df.groupby("network"):
    print(f"% --- {net} ---")

    df_print = df_net.drop(columns=["network", "parent_map", "dp_parent_map", "num_vars"])

    df_print = df_print[column_order]

    # dynamically create column format string
    colfmt = "r" * len(df_print.columns)

    # generate the tabular
    tabular = df_print.to_latex(
        index=False,
        float_format="%.3f",
        column_format=colfmt,
        escape=False,
    )

    # wrap in table + resizebox
    print(
f"""\\begin{{table}}[H]
\\centering
\\scriptsize
\\caption{{Approximation algorithm vs DP on the {net} network.}}
\\label{{tab:approx_vs_dp_{net}}}
\\resizebox{{\\textwidth}}{{!}}{{%
{tabular}
}}
\\end{{table}}

"""
    )


Generate network visualizations

In [None]:
from pgmpy.models import DiscreteBayesianNetwork

def draw_bn(pm, filename="bn.png"):
    # Convert pm into edge list
    edges = [(parent, child) for child, parents in pm.items() for parent in parents]
    bn = DiscreteBayesianNetwork(edges)

    # Convert model into pygraphviz object
    model_graphviz = bn.to_graphviz()

    # Plot the model.
    model_graphviz.draw(f"plots/{filename}", prog="dot")

In [None]:
asia = network_dfs["asia"]

In [None]:
for row_idx, row in asia.iterrows():
    k = row["k"]
    l = row["l"]
    seed = row["seed"]
    num_samples = row["num_samples"]
    filename = f"asia_k{k}_l{l}_seed{seed}_samples{num_samples}.png"
    draw_bn(row["parent_map"], filename=filename)