## LLM Priors Assessments

### Define functions to generate descriptions and priors for synthetic datasets

In [None]:
from pathlib import Path

import pandas as pd
import pyagrum as gum
from tqdm.asyncio import tqdm

from priors.llm import (
    GraphDescriptionBase,
    extract,
    parse_graph_description,
    parse_priors,
)
from priors.prompt import prepare_graph_description, prepare_priors

async def generate_graph_description(
    causal_graph: gum.BayesNet,
    model: str = "gemini-2.5-flash",
) -> GraphDescriptionBase:
    """Ask an LLM to describe ``causal_graph`` and parse its answer.

    The prompt comes from ``prepare_graph_description``; it is sent through
    ``extract`` using ``model`` and the text of the first returned choice is
    forwarded to ``parse_graph_description`` (``parse_method="llm"``) together
    with the graph's variable names as ``valid_vars``.

    Args:
        causal_graph: Network whose structure is turned into the prompt.
        model: Name of the model handed to ``extract``.

    Returns:
        The parsed graph description.

    Raises:
        AssertionError: If the first choice carries no message content.
    """
    prompt = prepare_graph_description(causal_graph)
    response = await extract(prompt, None, model=model)
    raw_description = response.choices[0].message.content
    assert raw_description is not None, "Failed to obtain graph description"

    parsed = await parse_graph_description(
        raw_description,
        parse_method="llm",
        valid_vars=causal_graph.names(),
    )
    return parsed


async def enrich_graph(
    causal_graph: gum.BayesNet,
    model: str = "gemini-2.5-flash",
    save_dir: str | Path | None = None,
):
    """Annotate ``causal_graph`` in place with LLM-generated descriptions.

    Every variable listed in the generated description gets its description
    set, and the generated title is stored in the network's ``"name"``
    property. When ``save_dir`` is given, the directory is created if needed
    and the network is written there as ``<identifier>.bifxml``.

    Args:
        causal_graph: Network to annotate (mutated).
        model: Model name forwarded to ``generate_graph_description``.
        save_dir: Optional output directory for the BIFXML file.

    Returns:
        The graph description produced by ``generate_graph_description``.
    """
    graph_description = await generate_graph_description(causal_graph, model=model)

    for var_name, var_text in graph_description.variable_descriptions.items():
        causal_graph.variableFromName(var_name).setDescription(var_text)
    # pyagrum ignores the name property when loading from BIFXML
    causal_graph.setProperty("name", graph_description.title)

    # Nothing to persist: hand back the description straight away.
    if save_dir is None:
        return graph_description

    out_dir = Path(save_dir) if isinstance(save_dir, str) else save_dir
    out_dir.mkdir(parents=True, exist_ok=True)
    out_file = out_dir / f"{graph_description.identifier}.bifxml"
    causal_graph.saveBIFXML(str(out_file))
    return graph_description


async def generate_priors(
    bn: gum.BayesNet,
    variable_descriptions: dict[str, str] | None = None,
    prior_model: str = "gemini-2.5-flash",
    parse_model: str = "gemini-2.5-flash-lite",
) -> dict:
    """Query an LLM for priors on ``bn`` and score them against its arcs.

    The prompt is built by ``prepare_priors``, answered through ``extract``
    with ``prior_model``, and the raw answer is parsed by ``parse_priors``
    with ``parse_model``. Each parsed pair is treated as an unordered pair of
    variable names that the LLM wants to forbid.

    Args:
        bn: Ground-truth network.
        variable_descriptions: Optional descriptions passed to the prompt.
        prior_model: Model name used to obtain the raw priors.
        parse_model: Model name used to parse the raw answer.

    Returns:
        A dict with ``"priors"`` (the parsed pairs), ``"TP"`` (forbidden pairs
        that are not arcs of ``bn``) and ``"FP"`` (forbidden pairs that are
        arcs of ``bn``, in either direction).

    Raises:
        AssertionError: If the first choice carries no message content.
    """
    prompt = prepare_priors(bn, descriptions=variable_descriptions)
    response = await extract(prompt, None, model=prior_model)
    raw_priors = response.choices[0].message.content
    assert raw_priors is not None, "Failed to obtain priors"

    priors = await parse_priors(
        prior_response=raw_priors, model=parse_model, valid_vars=bn.names()
    )

    # Compare as unordered name pairs, so arc direction plays no role.
    forbidden_pairs = {frozenset(pair) for pair in priors}
    actual_pairs = set()
    for tail_id, head_id in bn.arcs():
        tail_name = bn.variable(tail_id).name()
        head_name = bn.variable(head_id).name()
        actual_pairs.add(frozenset((tail_name, head_name)))

    # "Positive" means "classified as forbidden": forbidding a real arc is a
    # false positive, forbidding a non-arc is a true positive.
    false_positive = len(forbidden_pairs & actual_pairs)
    true_positive = len(forbidden_pairs - actual_pairs)

    return {"priors": priors, "TP": true_positive, "FP": false_positive}


async def evaluate_priors(
    bif_paths: list[str | Path],
    prior_model: str,
    parse_model: str | None = None,
    exclude_descriptions: bool = False,
    beta: float = 0.2,
) -> pd.DataFrame:
    """Generate LLM priors for several networks concurrently and score them.

    Each network is loaded from its file, ``generate_priors`` is scheduled for
    it, and all requests are awaited together via ``tqdm.gather``. The scores
    treat "pair forbidden by the LLM" as the positive class over all unordered
    variable pairs of a network.

    Args:
        bif_paths: Paths of the network files to load.
        prior_model: Model name used to obtain the priors.
        parse_model: Model name used to parse them; defaults to
            ``prior_model`` when ``None``.
        exclude_descriptions: When true, variable descriptions stored in the
            network are not passed to the prompt.
        beta: Weight of recall relative to precision in the F-beta score.

    Returns:
        One row per network with the loaded ``bn``, its metadata, the output
        of ``generate_priors`` and the columns ``TN``, ``FN``, ``Precision``,
        ``Recall`` and ``F-beta``. ``Precision`` (and therefore ``F-beta``) is
        NaN when no pair was forbidden; ``F-beta`` is 0 when precision and
        recall are both 0.
    """
    if parse_model is None:
        parse_model = prior_model

    records = []
    tasks = []
    for bif_path in bif_paths:
        bif_path = Path(bif_path)
        bn = gum.loadBN(str(bif_path))
        variable_descriptions = {
            name: bn.variable(name).description() for name in bn.names()
        }
        # Fall back to "no descriptions" when asked to, or when the file
        # carries only empty descriptions.
        if exclude_descriptions or not any(variable_descriptions.values()):
            variable_descriptions = None
        records.append(
            {
                "bn": bn,
                "title": bn.propertyWithDefault("name", "no_name"),
                "filename": bif_path.stem,
                "num_nodes": bn.size(),
                "num_edges": len(bn.arcs()),
                "variable_descriptions": variable_descriptions,
            }
        )
        tasks.append(
            generate_priors(
                bn=bn,
                variable_descriptions=variable_descriptions,
                prior_model=prior_model,
                parse_model=parse_model,
            )
        )

    # Results come back in task order, so they line up with ``records``.
    outcomes = await tqdm.gather(*tasks)
    for record, outcome in zip(records, outcomes):
        record.update(outcome)
    df = pd.DataFrame(records)

    # Confusion matrix over all unordered node pairs, n * (n - 1) / 2 in total.
    total_pairs = df["num_nodes"] * (df["num_nodes"] - 1) // 2
    df["TN"] = df["num_edges"] - df["FP"]
    df["FN"] = total_pairs - df["num_edges"] - df["TP"]
    df["Precision"] = df["TP"] / (df["TP"] + df["FP"])
    df["Recall"] = df["TP"] / (df["TP"] + df["FN"])

    denominator = beta**2 * df["Precision"] + df["Recall"]
    df["F-beta"] = (1 + beta**2) * df["Precision"] * df["Recall"] / denominator
    # Precision == Recall == 0 gives 0/0 = NaN above. By the usual convention
    # the score is 0 there; leaving NaN would silently drop the worst runs
    # from any mean computed later. An undefined precision still yields NaN.
    df.loc[denominator == 0, "F-beta"] = 0.0

    return df

### Define the experiment runner and helper functions to evaluate the precision of LLM-removed edges as causal priors

In [None]:
async def run_experiments(
    dataset_paths,
    n_runs=5,
    exclude_descriptions=True,
    prior_model="gemini-2.5-flash",
    parse_model=None,
):
    """
    Repeat ``evaluate_priors`` ``n_runs`` times and stack the results.

    Runs are executed one after another; each run's rows are tagged with a
    zero-based ``run_id`` column before all runs are concatenated into a
    single DataFrame with a fresh index.

    Usage:
        results_no_desc = await run_experiments(bnlearn_small_datasets, n_runs=5, exclude_descriptions=True)
        results_with_desc = await run_experiments(bnlearn_small_datasets, n_runs=5, exclude_descriptions=False)
    """
    # The same arguments are used for every repetition.
    shared_kwargs = {
        "bif_paths": dataset_paths,
        "prior_model": prior_model,
        "parse_model": parse_model,
        "exclude_descriptions": exclude_descriptions,
    }

    per_run_frames = []
    for run_id in range(n_runs):
        print(f"Run {run_id + 1}/{n_runs}")
        run_df = await evaluate_priors(**shared_kwargs)
        run_df["run_id"] = run_id
        per_run_frames.append(run_df)

    combined = pd.concat(per_run_frames, ignore_index=True)
    return combined


def get_summary(df, metrics=("Precision", "Recall", "F-beta")):
    """
    Get summary statistics of each metric for each dataset.

    Rows of ``df`` are grouped by their ``"filename"`` value (in order of
    first appearance). For every requested metric the NaN entries are
    discarded, and mean, standard deviation, minimum, maximum and the number
    of remaining runs are reported. A dataset/metric combination without any
    non-NaN value is omitted.

    Args:
        df: Results with a ``"filename"`` column and one column per metric.
        metrics: Names of the metric columns to summarise.

    Returns:
        A DataFrame with the columns ``Dataset``, ``Metric``, ``Mean``,
        ``Std``, ``Min``, ``Max`` and ``Runs`` (present even when no row
        qualifies).

    Usage:
        summary = get_summary(results_no_desc)
    """
    columns = ["Dataset", "Metric", "Mean", "Std", "Min", "Max", "Runs"]
    summary_rows = []

    # sort=False keeps datasets in order of first appearance.
    for dataset, dataset_df in df.groupby("filename", sort=False):
        for metric in metrics:
            values = dataset_df[metric].dropna()
            if values.empty:
                continue
            summary_rows.append(
                {
                    "Dataset": dataset,
                    "Metric": metric,
                    "Mean": values.mean(),
                    "Std": values.std(),
                    "Min": values.min(),
                    "Max": values.max(),
                    "Runs": len(values),
                }
            )

    # Passing the columns explicitly keeps the schema stable for empty results.
    return pd.DataFrame(summary_rows, columns=columns)


def show_report(df, metrics=("Precision", "Recall", "F-beta")):
    """
    Build a side-by-side report per dataset and description setting.

    Rows of ``df`` are grouped by ``("filename", "with_desc")``. The result
    has two-level columns: a ``"meta"`` block (``num_nodes`` and
    ``num_edges`` taken from the first row of each group, plus ``repeats``,
    the group size) followed by ``mean`` and ``std`` of every metric. Groups
    are ordered by ``num_nodes``, then ``filename``, then ``with_desc``.

    Args:
        df: Results with the columns ``filename``, ``with_desc``,
            ``num_nodes``, ``num_edges`` and one column per metric.
        metrics: Names of the metric columns to aggregate.

    Returns:
        The report as a DataFrame indexed by ``(filename, with_desc)``.

    Usage:
        report = show_report(results_with_flag)
    """
    # GroupBy column selection expects a list, whatever iterable was passed.
    metrics = list(metrics)
    group_keys = ["filename", "with_desc"]
    groups = df.groupby(group_keys)

    meta = groups.agg(
        {
            "num_nodes": "first",
            "num_edges": "first",
        }
    )
    meta["repeats"] = groups.size()
    # Lift the metadata under a "meta" header so it lines up with the
    # (metric, statistic) columns produced below.
    meta.columns = pd.MultiIndex.from_product([["meta"], meta.columns])

    # Row order: smallest networks first, then by name and description flag.
    ordered_index = (
        groups["num_nodes"]
        .first()
        .reset_index()
        .sort_values(["num_nodes", "filename", "with_desc"])
        .set_index(group_keys)
        .index
    )
    stats = groups[metrics].agg(["mean", "std"])
    summary = pd.concat([meta, stats], axis=1)
    return summary.loc[ordered_index]

### Experiment Configurations

In [None]:
# Number of repetitions per dataset and setting (passed as ``n_runs``).
repeats = 10
# Model name passed as ``prior_model`` to obtain the raw priors.
prior_model = "gemini-2.5-flash"
# Model name passed as ``parse_model`` to parse the raw answer.
parse_model = "gemini-2.5-flash-lite"

### Run experiments on bnlearn datasets with/without variable descriptions

In [None]:
# Every BIFXML network found directly in the bnlearn dataset folder.
bnlearn_small_datasets = list(Path("priors/datasets/bnlearn/").glob("*.bifxml"))

# Baseline setting: variable descriptions are withheld from the prompt.
bnlearn_results_no_desc = await run_experiments(
    bnlearn_small_datasets,
    n_runs=repeats,
    exclude_descriptions=True,
    prior_model=prior_model,
    parse_model=parse_model,
)
# Save complete results for the record
# (the "bn" column holds the loaded network objects, so it is left out of the JSON)
bnlearn_results_no_desc.drop(columns="bn").to_json("results/2025/llm/bnlearn_no_desc.json", orient="records", indent=4)
get_summary(bnlearn_results_no_desc)

Unnamed: 0,Dataset,Metric,Mean,Std,Min,Max,Runs
0,survey,Precision,0.641212,0.129287,0.545455,1.0,10
1,survey,Recall,0.744444,0.291865,0.333333,1.0,10
2,survey,F-beta,0.636324,0.105949,0.549296,0.928571,10
3,cancer,Precision,1.0,0.0,1.0,1.0,10
4,cancer,Recall,0.533333,0.172133,0.333333,0.833333,10
5,cancer,F-beta,0.961965,0.021304,0.928571,0.992366,10
6,sachs,Precision,0.791352,0.030685,0.738095,0.846154,10
7,sachs,Recall,0.710526,0.213069,0.289474,0.947368,10
8,sachs,F-beta,0.782865,0.025311,0.740809,0.809969,10
9,asia,Precision,0.959048,0.065991,0.857143,1.0,10


In [None]:
# Same datasets, but descriptions stored in the networks are passed to the
# prompt (networks without any description still run without them).
bnlearn_results_with_desc = await run_experiments(
    bnlearn_small_datasets,
    n_runs=repeats,
    exclude_descriptions=False,
    prior_model=prior_model,
    parse_model=parse_model,
)
# Save complete results for the record
# (the "bn" column holds the loaded network objects, so it is left out of the JSON)
bnlearn_results_with_desc.drop(columns="bn").to_json("results/2025/llm/bnlearn_with_desc.json", orient="records", indent=4)
get_summary(bnlearn_results_with_desc)

Unnamed: 0,Dataset,Metric,Mean,Std,Min,Max,Runs
0,survey,Precision,0.913333,0.114342,0.75,1.0,10
1,survey,Recall,0.3,0.139074,0.111111,0.555556,10
2,survey,F-beta,0.817419,0.082249,0.715596,0.928571,10
3,cancer,Precision,1.0,0.0,1.0,1.0,10
4,cancer,Recall,0.583333,0.211549,0.166667,0.833333,10
5,cancer,F-beta,0.961176,0.04513,0.83871,0.992366,10
6,sachs,Precision,0.753366,0.034493,0.734694,0.818182,10
7,sachs,Recall,0.942105,0.016644,0.894737,0.947368,10
8,sachs,F-beta,0.759171,0.033694,0.741093,0.822496,10
9,asia,Precision,0.893423,0.050324,0.833333,1.0,10


In [None]:
# Uncomment to reload the saved results instead of re-running the experiments:
# bnlearn_results_no_desc = pd.read_json("results/2025/llm/bnlearn_no_desc.json")
# bnlearn_results_with_desc = pd.read_json("results/2025/llm/bnlearn_with_desc.json")
# Tag each result set with its setting; show_report groups on this column.
bnlearn_results_no_desc["with_desc"] = False
bnlearn_results_with_desc["with_desc"] = True

show_report(pd.concat([bnlearn_results_no_desc, bnlearn_results_with_desc], ignore_index=True))

Unnamed: 0_level_0,Unnamed: 1_level_0,meta,meta,meta,Precision,Precision,Recall,Recall,F-beta,F-beta
Unnamed: 0_level_1,Unnamed: 1_level_1,num_nodes,num_edges,repeats,mean,std,mean,std,mean,std
filename,with_desc,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
cancer,False,5,4,10,1.0,0.0,0.533333,0.172133,0.961965,0.021304
cancer,True,5,4,10,1.0,0.0,0.583333,0.211549,0.961176,0.04513
earthquake,False,5,4,10,1.0,0.0,0.833333,0.0,0.992366,0.0
earthquake,True,5,4,10,1.0,0.0,0.833333,0.0,0.992366,0.0
survey,False,6,6,10,0.641212,0.129287,0.744444,0.291865,0.636324,0.105949
survey,True,6,6,10,0.913333,0.114342,0.3,0.139074,0.817419,0.082249
asia,False,8,8,10,0.959048,0.065991,0.82,0.129529,0.952485,0.069737
asia,True,8,8,10,0.893423,0.050324,0.7,0.113039,0.883523,0.053928
sachs,False,11,17,10,0.791352,0.030685,0.710526,0.213069,0.782865,0.025311
sachs,True,11,17,10,0.753366,0.034493,0.942105,0.016644,0.759171,0.033694


Save the best prior record for each dataset for combining with CausalABA.

Best is defined as the highest precision; ties are broken by the highest F-beta (which, at equal precision, is equivalent to the highest recall).

In [None]:
# Pool both settings. The "bn" column (loaded network objects) is dropped when
# present; it is absent when the results were reloaded from JSON.
bnlearn_all_results = pd.concat(
    [bnlearn_results_no_desc, bnlearn_results_with_desc], ignore_index=True
).drop(columns="bn", errors="ignore")
# Rank the runs of each dataset: highest Precision first, ties broken by the
# highest F-beta (NaN scores sort last), then keep the top row per dataset.
# ``groupby(...).first()`` is deliberately avoided: it returns the first
# *non-null value of every column*, so a winning run whose
# ``variable_descriptions`` is None would get descriptions spliced in from a
# different run. ``drop_duplicates`` keeps each winning row intact.
bnlearn_best_results = (
    bnlearn_all_results.sort_values(
        by=["filename", "Precision", "F-beta"], ascending=[True, False, False]
    )
    .drop_duplicates(subset="filename", keep="first")
    .reset_index(drop=True)
)
bnlearn_best_results.to_json(
    "priors/datasets/bnlearn/prior_df.json", orient="records", indent=4
)
bnlearn_best_results

Unnamed: 0,filename,title,num_nodes,num_edges,variable_descriptions,priors,TP,FP,TN,FN,Precision,Recall,F-beta,run_id
0,asia,asia,8,8,{'smoke': 'smoking: whether or not the patient...,"{(bronc, tub), (xray, bronc), (smoke, asia), (...",18,0,8,2,1.0,0.9,0.995745,0
1,cancer,cancer,5,4,{'Cancer': 'Cancer: Indicates whether the pers...,"{(Xray, Dyspnoea), (Dyspnoea, Smoker), (Pollut...",5,0,4,1,1.0,0.833333,0.992366,2
2,earthquake,earthquake,5,4,{'Earthquake': 'The occurrence of seismic acti...,"{(Earthquake, JohnCalls), (Burglary, JohnCalls...",5,0,4,1,1.0,0.833333,0.992366,0
3,sachs,sachs,11,17,{'PKA': 'PKA (Protein Kinase A): cAMP-dependen...,"{(Jnk, Akt), (PKA, P38), (Akt, P38), (Plcg, P3...",11,2,15,27,0.846154,0.289474,0.787879,1
4,survey,survey,6,6,"{'A': 'Age: the age, recorded as young (young)...","{(A, S), (A, R), (A, O)}",3,0,6,6,1.0,0.333333,0.928571,3


### Run experiments on randomly generated datasets with/without variable descriptions

In [None]:
# Every BIFXML network found directly in the synthetic dataset folder.
synthetic_datasets = list(Path("priors/datasets/random_graphs/heuristic_by_semantic").glob("*.bifxml"))

# Baseline setting: variable descriptions are withheld from the prompt.
synthetic_results_no_desc = await run_experiments(
    synthetic_datasets,
    n_runs=repeats,
    exclude_descriptions=True,
    prior_model=prior_model,
    parse_model=parse_model,
)
# Save complete results for the record
# (the "bn" column holds the loaded network objects, so it is left out of the JSON)
synthetic_results_no_desc.drop(columns="bn").to_json("results/2025/llm/synthetic_no_desc.json", orient="records", indent=4)
get_summary(synthetic_results_no_desc)

Run 1/10



[A


[A
[A
[A
[A
[A
[A
100%|██████████| 7/7 [02:35<00:00, 22.28s/it]


Run 2/10



[A
[A
[A
[A
[A
[A
[A
100%|██████████| 7/7 [02:54<00:00, 24.96s/it]


Run 3/10



[A
[A
[A
[A
[A
[A
[A
100%|██████████| 7/7 [04:51<00:00, 41.69s/it]


Run 4/10



[A
[A
[A
[A
[A
[A
[A
100%|██████████| 7/7 [02:56<00:00, 25.23s/it]


Run 5/10



[A
[A
[A
[A
[A
[A
[A
100%|██████████| 7/7 [01:52<00:00, 16.11s/it]


Run 6/10



[A
[A
[A
[A
[A
[A
[A
100%|██████████| 7/7 [01:46<00:00, 15.16s/it]


Run 7/10



[A
[A
[A
[A
[A
[A
[A
100%|██████████| 7/7 [04:42<00:00, 40.39s/it]


Run 8/10



[A
[A
[A
[A
[A
[A
[A
100%|██████████| 7/7 [01:39<00:00, 14.23s/it]


Run 9/10



[A
[A
[A
[A
[A
[A
[A
100%|██████████| 7/7 [02:44<00:00, 23.51s/it]


Run 10/10



[A
[A
[A
[A
[A
[A
[A
100%|██████████| 7/7 [02:23<00:00, 20.49s/it]


Unnamed: 0,Dataset,Metric,Mean,Std,Min,Max,Runs
0,factors_influencing_well_being_and_resources,Precision,0.947457,0.070291,0.833333,1.0,10
1,factors_influencing_well_being_and_resources,Recall,0.232143,0.111993,0.107143,0.428571,10
2,factors_influencing_well_being_and_resources,F-beta,0.819527,0.034739,0.757282,0.876404,10
3,pharmacological_and_cardiac_health_dynamics,Precision,0.938095,0.112351,0.666667,1.0,10
4,pharmacological_and_cardiac_health_dynamics,Recall,0.522222,0.181821,0.222222,0.777778,10
5,pharmacological_and_cardiac_health_dynamics,F-beta,0.903653,0.112139,0.619048,0.98913,10
6,interacting_health_conditions_and_outcomes,Precision,0.951453,0.046112,0.870968,1.0,10
7,interacting_health_conditions_and_outcomes,Recall,0.333333,0.211338,0.022222,0.6,10
8,interacting_health_conditions_and_outcomes,F-beta,0.794582,0.182864,0.371429,0.907383,10
9,clinical_variables_in_cardiovascular_and_gastr...,Precision,0.95,0.105409,0.75,1.0,10


In [None]:
# Same datasets, but descriptions stored in the networks are passed to the
# prompt (networks without any description still run without them).
synthetic_results_with_desc = await run_experiments(
    synthetic_datasets,
    n_runs=repeats,
    exclude_descriptions=False,
    prior_model=prior_model,
    parse_model=parse_model,
)
# Save complete results for the record
# (the "bn" column holds the loaded network objects, so it is left out of the JSON)
synthetic_results_with_desc.drop(columns="bn").to_json("results/2025/llm/synthetic_with_desc.json", orient="records", indent=4)
get_summary(synthetic_results_with_desc)

Run 1/10




[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

100%|██████████| 7/7 [02:45<00:00, 23.68s/it]


Run 2/10




[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

100%|██████████| 7/7 [02:36<00:00, 22.34s/it]


Run 3/10




[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

100%|██████████| 7/7 [01:52<00:00, 16.01s/it]


Run 4/10




[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

100%|██████████| 7/7 [02:14<00:00, 19.21s/it]


Run 5/10




[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

100%|██████████| 7/7 [01:46<00:00, 15.20s/it]


Run 6/10




[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

100%|██████████| 7/7 [02:09<00:00, 18.46s/it]


Run 7/10




[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

100%|██████████| 7/7 [02:34<00:00, 22.14s/it]


Run 8/10




[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

100%|██████████| 7/7 [01:56<00:00, 16.65s/it]


Run 9/10




[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

100%|██████████| 7/7 [03:59<00:00, 34.27s/it]


Run 10/10




[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

100%|██████████| 7/7 [04:41<00:00, 40.17s/it]


Unnamed: 0,Dataset,Metric,Mean,Std,Min,Max,Runs
0,factors_influencing_well_being_and_resources,Precision,0.836667,0.314446,0.0,1.0,10
1,factors_influencing_well_being_and_resources,Recall,0.182143,0.137241,0.0,0.464286,10
2,factors_influencing_well_being_and_resources,F-beta,0.76715,0.161786,0.490566,0.924901,9
3,pharmacological_and_cardiac_health_dynamics,Precision,0.983333,0.052705,0.833333,1.0,10
4,pharmacological_and_cardiac_health_dynamics,Recall,0.466667,0.126144,0.222222,0.666667,10
5,pharmacological_and_cardiac_health_dynamics,F-beta,0.936548,0.050264,0.81761,0.981132,10
6,interacting_health_conditions_and_outcomes,Precision,0.975942,0.044172,0.869565,1.0,10
7,interacting_health_conditions_and_outcomes,Recall,0.244444,0.141712,0.066667,0.488889,10
8,interacting_health_conditions_and_outcomes,F-beta,0.833948,0.081489,0.65,0.922581,10
9,clinical_variables_in_cardiovascular_and_gastr...,Precision,1.0,0.0,1.0,1.0,10


In [None]:
# Uncomment to reload the saved results instead of re-running the experiments:
# synthetic_results_no_desc = pd.read_json("results/2025/llm/synthetic_no_desc.json")
# synthetic_results_with_desc = pd.read_json("results/2025/llm/synthetic_with_desc.json")
# Tag each result set with its setting; show_report groups on this column.
synthetic_results_no_desc["with_desc"] = False
synthetic_results_with_desc["with_desc"] = True

show_report(pd.concat([synthetic_results_no_desc, synthetic_results_with_desc], ignore_index=True))

Unnamed: 0_level_0,Unnamed: 1_level_0,meta,meta,meta,Precision,Precision,Recall,Recall,F-beta,F-beta
Unnamed: 0_level_1,Unnamed: 1_level_1,num_nodes,num_edges,repeats,mean,std,mean,std,mean,std
filename,with_desc,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
clinical_variables_in_cardiovascular_and_gastrointestinal_health,False,5,4,10,0.95,0.105409,0.316667,0.122977,0.863068,0.078601
clinical_variables_in_cardiovascular_and_gastrointestinal_health,True,5,4,10,1.0,0.0,0.316667,0.165738,0.901322,0.056589
pharmacological_and_cardiac_health_dynamics,False,6,6,10,0.938095,0.112351,0.522222,0.181821,0.903653,0.112139
pharmacological_and_cardiac_health_dynamics,True,6,6,10,0.983333,0.052705,0.466667,0.126144,0.936548,0.050264
clinical_pathways_in_cardiac_and_abdominal_health,False,7,7,10,0.964413,0.046881,0.435714,0.302015,0.856224,0.106886
clinical_pathways_in_cardiac_and_abdominal_health,True,7,7,10,0.992308,0.024325,0.45,0.254328,0.908354,0.098033
medical_conditions_and_outcomes,False,8,7,10,0.930032,0.095822,0.271429,0.15881,0.812083,0.091294
medical_conditions_and_outcomes,True,8,7,10,0.970909,0.046906,0.404762,0.051434,0.919662,0.036732
factors_influencing_well_being_and_resources,False,9,8,10,0.947457,0.070291,0.232143,0.111993,0.819527,0.034739
factors_influencing_well_being_and_resources,True,9,8,10,0.836667,0.314446,0.182143,0.137241,0.76715,0.161786


Save the best prior record for each dataset for combining with CausalABA.

Best is defined as the highest precision; ties are broken by the highest F-beta (which, at equal precision, is equivalent to the highest recall).

In [None]:
# Pool both settings. The "bn" column (loaded network objects) is dropped when
# present; it is absent when the results were reloaded from JSON.
synthetic_all_results = pd.concat(
    [synthetic_results_no_desc, synthetic_results_with_desc], ignore_index=True
).drop(columns="bn", errors="ignore")
# Rank the runs of each dataset: highest Precision first, ties broken by the
# highest F-beta (NaN scores sort last), then keep the top row per dataset.
# ``groupby(...).first()`` is deliberately avoided: it returns the first
# *non-null value of every column*, so a winning run whose
# ``variable_descriptions`` is None would get descriptions spliced in from a
# different run. ``drop_duplicates`` keeps each winning row intact.
synthetic_best_results = (
    synthetic_all_results.sort_values(
        by=["filename", "Precision", "F-beta"], ascending=[True, False, False]
    )
    .drop_duplicates(subset="filename", keep="first")
    .reset_index(drop=True)
)
synthetic_best_results.to_json(
    "priors/datasets/random_graphs/prior_df.json", orient="records", indent=4
)
synthetic_best_results

Unnamed: 0,filename,title,num_nodes,num_edges,variable_descriptions,priors,TP,FP,TN,FN,Precision,Recall,F-beta,run_id
0,clinical_pathways_in_cardiac_and_abdominal_health,clinical_pathways_in_cardiac_and_abdominal_health,7,7,{'shivering': 'An involuntary muscular contrac...,"{(shivering, ascites), (shivering, hospital_ad...",10,0,7,4,1.0,0.714286,0.984848,4
1,clinical_variables_in_cardiovascular_and_gastr...,clinical_variables_in_cardiovascular_and_gastr...,5,4,{'rectal_varices': 'Abnormally enlarged and to...,"{(heart_failure, lower_gastrointestinal_bleedi...",4,0,4,2,1.0,0.666667,0.981132,9
2,factors_influencing_well_being_and_resources,factors_influencing_well_being_and_resources,9,8,"{'property_damage': 'The impairment, destructi...","{(early_death, separation), (heart_failure, di...",9,0,8,19,1.0,0.321429,0.924901,7
3,interacting_health_conditions_and_outcomes,interacting_health_conditions_and_outcomes,11,10,{'kidney_failure': 'A medical condition where ...,"{(heart_trouble, osteoporosis_associated_fract...",14,0,10,31,1.0,0.311111,0.921519,1
4,interconnected_health_and_socio_economic_outcomes,interconnected_health_and_socio_economic_outcomes,10,11,{'early_death': 'The occurrence of death at an...,"{(stagnant_economy, hospitalization), (clostri...",12,0,11,22,1.0,0.352941,0.934132,3
5,medical_conditions_and_outcomes,medical_conditions_and_outcomes,8,7,{'early_death': 'The occurrence of mortality a...,"{(coronary_heart_disease, imbalanced_ph), (lam...",10,0,7,11,1.0,0.47619,0.95941,9
6,pharmacological_and_cardiac_health_dynamics,pharmacological_and_cardiac_health_dynamics,6,6,{'palpitations': 'The subjective experience of...,"{(palpitations, appetite_suppression), (heart_...",7,0,6,2,1.0,0.777778,0.98913,1


### Average of the assessment results across multiple datasets

In [None]:
# Reload the four saved result sets, tagging each with its dataset family and
# with whether variable descriptions were given to the LLM.
bnlearn_results_with_desc = pd.read_json(
    "results/2025/llm/bnlearn_with_desc.json"
).assign(dataset_type="bnlearn", description_used=True)

bnlearn_results_no_desc = pd.read_json(
    "results/2025/llm/bnlearn_no_desc.json"
).assign(dataset_type="bnlearn", description_used=False)

synthetic_results_with_desc = pd.read_json(
    "results/2025/llm/synthetic_with_desc.json"
).assign(dataset_type="synthetic", description_used=True)

synthetic_results_no_desc = pd.read_json(
    "results/2025/llm/synthetic_no_desc.json"
).assign(dataset_type="synthetic", description_used=False)

# Average the metrics over all runs and datasets of each
# (dataset family, description setting) combination.
summary_df = (
    pd.concat(
        [
            bnlearn_results_with_desc,
            bnlearn_results_no_desc,
            synthetic_results_with_desc,
            synthetic_results_no_desc,
        ],
        ignore_index=True,
    )
    .groupby(["dataset_type", "description_used"], as_index=False)[
        ["Precision", "Recall", "F-beta"]
    ]
    .mean()
)
summary_df.to_csv("results/2025/llm/prior_assessment.csv", index=False)
summary_df

Unnamed: 0,dataset_type,description_used,Precision,Recall,F-beta
0,bnlearn,False,0.878322,0.728327,0.865201
1,bnlearn,True,0.912024,0.671754,0.882731
2,synthetic,False,0.940081,0.382316,0.845814
3,synthetic,True,0.951584,0.35882,0.875679
