In [None]:
import pickle
import pandas as pd

from experiment_xml import (
    pydantic_to_xml_instructions,
    load_single_experiment,
    run_xml_experiment,
    load_experiment_summary,
    HistoricalEventXML,
    ArticleResponse1XML,
    ArticleResponse1nointXML,
    ArticleResponse2XML,
    ArticleResponse3XML,
    ArticleResponse4XML,
    ListofStrXML,
    ListofHistoricalEventXML,
    ArticleResponse2XMLalt,
    ArticleResponse4XMLalt,
)

In [99]:
from scipy import stats


def results_to_table(results_list):

    df_results = {}
    for name, ss_results in results_list.items():
        df_results[name] = pd.DataFrame.from_dict(
            {
                tuple(mname.split("_", maxsplit=1)): {
                    tname: ss_results[mname][tname]["valid"] * 100
                    for tname in ss_results[mname].keys()
                }
                for mname in ss_results.keys()
            },
            orient="index",
        )
    return pd.concat(df_results).reorder_levels([1, 2, 0], axis=0).sort_index(axis=0)


def compare_m_experiments(
    results_list, p_value=0.05, alternative="greater", print_only_passed=False
):
    """For each model compare the two experiments statistically.
    By default a one-sided test is used.
    Bonferroni correction is applied.
    """
    # Get all model names and select the subset shared by all experiments
    all_models = [set(inner_dict.keys()) for inner_dict in results_list.values()]
    model_list = set.intersection(*all_models)

    # Bonferroni correction
    n_tests = len(model_list)
    alpha = p_value / n_tests

    hypothesis_tests = {}
    for model in model_list:

        contingency_table = {}
        for name, ss_results in results_list.items():
            num_true = 0
            num_total = 0

            for tname in ss_results[model].keys():
                num_data = len(ss_results[model][tname]["outputs"])
                num_true += ss_results[model][tname]["valid"] * num_data
                num_total += num_data

            contingency_table[name] = {
                "Passed": num_true,
                "Failed": num_total - num_true,
            }

        ct = pd.DataFrame.from_dict(contingency_table, orient="index")

        # Ensure ordering to match hypotheses
        # Columns are experiments
        # Rows are outcomes
        # Column marginals are constant
        ct_n = ct[["Passed", "Failed"]].to_numpy().T
        sf = stats.fisher_exact(ct_n, alternative=alternative)
        sb = stats.barnard_exact(ct_n, alternative=alternative)

        hypothesis_tests[model] = {
            "Fisher exact": sf.pvalue,
            "Barnard exact": sf.pvalue,
            "Outcome": sf.pvalue < alpha,
        }

        if not print_only_passed or (sb.pvalue < alpha):
            print(f"\n{model}")
            print(f"Fisher exact p-value = {sf.pvalue}")
            print(f"Barnard exact p-value = {sb.pvalue}")
        if sb.pvalue < alpha:
            print(f"Hypothesis test passed: {sf.pvalue:.3g} < {alpha:.3g}")

    return pd.DataFrame.from_dict(hypothesis_tests, orient="index")

Load results


In [89]:
# Temperature 0/0.8 experiment
# Includes Anthropic models
results_temp = {}
experiment_date = "25-02-25"
experiment_num = "4"
load_experiment_summary(experiment_num, experiment_date, results_temp)

Loaded structure_support_by_model_t0
Loaded structure_support_by_model_t08


In [90]:
htest_temp = compare_m_experiments(results_temp, print_only_passed=True)

In [91]:
# Testing list prompting
results_list = {}
experiment_date = "24-02-25"
experiment_num = "3"
load_experiment_summary(experiment_num, experiment_date, results_list)

Loaded structure_support_by_model_sys
Loaded structure_support_by_model_alt


In [92]:
htest_list = compare_m_experiments(results_list, print_only_passed=True)

In [None]:
# Testing system and user prompting & output parsers
results_sys_user = {}
experiment_date = "20-02-25"
experiment_num = "5"
load_single_experiment(experiment_num, experiment_date, "sys", results_sys_user)
# load_single_experiment(experiment_num, experiment_date, "sp", results_sys_user)
load_single_experiment(experiment_num, experiment_date, "user", results_sys_user)

Loaded structure_support_by_model_sys
Loaded structure_support_by_model_sp
Loaded structure_support_by_model_user


In [95]:
htest_sys_user = compare_m_experiments(results_sys_user, print_only_passed=True)

In [96]:
htest_sys_user

Unnamed: 0,Fisher exact,Barnard exact,Outcome
Ollama_phi4,1.0,1.0,False
Ollama_deepseekr1,1.0,1.0,False
fireworks_llama31,0.978526,0.978526,False
fireworks_llama33,1.0,1.0,False
Ollama_nemotron,0.999621,0.999621,False
Ollama_llama32,0.974642,0.974642,False
fireworks_llama32,0.999937,0.999937,False
Ollama_phi3,0.999997,0.999997,False


In [None]:
# Testing system and user prompting & output parsers
results_sys_other = {}
experiment_date = "21-02-25"
experiment_num = "5"
load_single_experiment(experiment_num, experiment_date, "sys", results_sys_other)
load_single_experiment(
    experiment_num, experiment_date, "sys_w_reminder", results_sys_other
)
load_single_experiment(experiment_num, experiment_date, "parser", results_sys_other)
load_single_experiment(experiment_num, experiment_date, "user", results_sys_other)

Loaded structure_support_by_model_sys
Loaded structure_support_by_model_sys_w_reminder
Loaded structure_support_by_model_parser
Loaded structure_support_by_model_user


In [None]:
htest_sys_parser = compare_m_experiments(
    {
        k: results_sys_other[k]
        for k in [
            "structure_support_by_model_sys",
            "structure_support_by_model_parser",
        ]
    },
    print_only_passed=True,
)


Ollama_phi3
Fisher exact p-value = 8.963397555467613e-06
Barnard exact p-value = 1.4651317703804546e-05
Hypothesis test passed: 8.96e-06 < 0.0167

Ollama_deepseekr1
Fisher exact p-value = 2.9619019726680747e-20
Barnard exact p-value = 7.573150711923766e-20
Hypothesis test passed: 2.96e-20 < 0.0167

Ollama_llama32
Fisher exact p-value = 3.4484332628819433e-13
Barnard exact p-value = 5.960884930204636e-13
Hypothesis test passed: 3.45e-13 < 0.0167


Collate similar experiments


In [159]:
combined_experiment_list = [
    results_sys_user["structure_support_by_model_sys"],
    results_sys_user["structure_support_by_model_user"],
    results_temp["structure_support_by_model_t0"],
    results_temp["structure_support_by_model_t08"],
    results_list["structure_support_by_model_alt"],
    results_list["structure_support_by_model_sys"],
    results_sys_other["structure_support_by_model_sys"],
    results_sys_other["structure_support_by_model_sys_w_reminder"],
    results_sys_other["structure_support_by_model_user"],
]

# Create single dicitonary over models
combined_results = {}
for ex in combined_experiment_list:
    for model, results_by_struct in ex.items():
        mdict = combined_results.get(model, {})
        for sname, results in results_by_struct.items():
            sdict = mdict.get(sname, {})
            sdict["outputs"] = sdict.get("outputs", [])
            sdict["outputs"].extend(results["outputs"])
            mdict[sname] = sdict
        combined_results[model] = mdict

for model, results_by_struct in combined_results.items():
    for sname, results in results_by_struct.items():
        results["valid"] = pd.Series(
            [o["error_type"] == "ok" for o in results["outputs"]]
        ).mean()
        results["num"] = len(results["outputs"])

In [167]:
df_results = pd.DataFrame.from_dict(
    {
        tuple(mname.split("_", maxsplit=1)): {
            tname: combined_results[mname][tname]["valid"] * 100
            for tname in combined_results[mname].keys()
        }
        for mname in combined_results.keys()
    },
    orient="index",
)
df_count = pd.DataFrame.from_dict(
    {
        tuple(mname.split("_", maxsplit=1)): {
            tname: combined_results[mname][tname]["num"]
            for tname in combined_results[mname].keys()
        }
        for mname in combined_results.keys()
    },
    orient="index",
)
df_results.round(1)

Unnamed: 0,Unnamed: 1,ArticleResponse1XML,ArticleResponse1nointXML,ArticleResponse2XML,ArticleResponse3XML,ArticleResponse4XML,ArticleResponse2XMLalt,ArticleResponse4XMLalt
Ollama,llama32,39.4,97.9,91.2,68.3,69.4,95.0,100.0
Ollama,nemotron,21.7,35.0,15.6,17.8,25.0,0.0,0.0
Ollama,phi3,45.0,53.6,28.1,16.7,15.0,10.0,0.0
Ollama,phi4,96.7,100.0,100.0,98.3,98.1,100.0,100.0
Ollama,deepseekr1,73.3,77.9,76.9,70.6,63.7,75.0,60.0
fireworks,llama31,70.0,100.0,99.4,98.3,99.4,100.0,95.0
fireworks,llama32,37.8,95.7,92.5,91.1,90.6,100.0,100.0
fireworks,llama33,96.7,100.0,100.0,98.9,96.9,100.0,100.0
fireworks,qwen25,100.0,100.0,100.0,100.0,100.0,100.0,100.0
fireworks,deepseekr1_70b,0.0,0.0,0.0,0.0,0.0,,


In [161]:
df_count

Unnamed: 0,Unnamed: 1,ArticleResponse1XML,ArticleResponse1nointXML,ArticleResponse2XML,ArticleResponse3XML,ArticleResponse4XML,ArticleResponse2XMLalt,ArticleResponse4XMLalt
Ollama,llama32,180,140,160,180,160,20.0,20.0
Ollama,nemotron,180,140,160,180,160,20.0,20.0
Ollama,phi3,180,140,160,180,160,20.0,20.0
Ollama,phi4,180,140,160,180,160,20.0,20.0
Ollama,deepseekr1,180,140,160,180,160,20.0,20.0
fireworks,llama31,180,140,160,180,160,20.0,20.0
fireworks,llama32,180,140,160,180,160,20.0,20.0
fireworks,llama33,180,140,160,180,160,20.0,20.0
fireworks,qwen25,80,40,60,80,60,20.0,20.0
fireworks,deepseekr1_70b,40,40,40,40,40,,
