In [1]:
import pickle
import pandas as pd
import tabulate
from pydantic_structure_definitions import *
from experiment_xml import (
    load_single_experiment,
    load_experiment_summary,
)
from hypothesis_testing import wilson_score_ci, results_to_table, compare_m_experiments

In [2]:
%load_ext autoreload
%autoreload 1

### Experiment 4

Hypothesis: A higher sampling temperature (0.8 instead of 0) worsens conformance to the XML schema


In [3]:
# Experiment 4: temperature 0 vs temperature 0.8.
# This run includes Anthropic models.
experiment_num, experiment_date = "4", "25-02-25"
# load_experiment_summary fills this dict in place; the keys
# "structure_support_by_model_t0" and "structure_support_by_model_t08"
# are read from it further down.
results_temp = dict()
load_experiment_summary(experiment_num, experiment_date, results_temp)

Loaded structure_support_by_model_t0
Loaded structure_support_by_model_t08


In [4]:
# Per-model hypothesis test between the two temperature settings, using the
# helper's default correction and alternative; only passing models are printed.
htest_temp = compare_m_experiments(
    results_temp,
    print_only_passed=True,
)

### Experiment 3

Hypothesis: Encapsulating lists improves conformance to the XML schema


In [5]:
# Experiment 3: list prompting.
experiment_num, experiment_date = "3", "24-02-25"
# load_experiment_summary fills this dict in place; the keys
# "structure_support_by_model_sys" and "structure_support_by_model_alt"
# are read from it further down.
results_list = dict()
load_experiment_summary(experiment_num, experiment_date, results_list)

Loaded structure_support_by_model_sys
Loaded structure_support_by_model_alt


In [6]:
# Per-model hypothesis test between the two list-prompting variants, using the
# helper's default correction and alternative; only passing models are printed.
htest_list = compare_m_experiments(
    results_list,
    print_only_passed=True,
)

### Experiment X

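Hypothesis: XML schema conformance differs between system and user prompts (tested two-sided, without Bonferroni correction)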


In [25]:
# System-prompt vs user-prompt runs. Only the "sys" and "user" variants are
# loaded in this cell; no output-parser run is loaded here.
experiment_num, experiment_date = "5", "20-02-25"
results_sys_user = dict()
# The load order is kept (sys, then user) so the result dict is filled in the
# same order as before.
for prompt_variant in ("sys", "user"):
    load_single_experiment(
        experiment_num, experiment_date, prompt_variant, results_sys_user
    )

Loaded structure_support_by_model_sys
Loaded structure_support_by_model_user


In [29]:
# Tabulate the sys and user variants side by side. Each variant is labelled
# with the text after the last underscore of its key ("sys" / "user").
results_to_table(
    {
        key.rsplit("_", maxsplit=1)[1]: results_sys_user[key]
        for key in (
            "structure_support_by_model_sys",
            "structure_support_by_model_user",
        )
    }
)

Unnamed: 0,Unnamed: 1,Unnamed: 2,ArticleResponse1XML,ArticleResponse1nointXML,ArticleResponse2XML,ArticleResponse3XML,ArticleResponse4XML
Ollama,deepseekr1,sys,70.0,65.0,60.0,75.0,40.0
Ollama,deepseekr1,user,75.0,85.0,95.0,65.0,75.0
Ollama,llama32,sys,35.0,100.0,80.0,50.0,70.0
Ollama,llama32,user,75.0,95.0,90.0,65.0,55.0
Ollama,nemotron,sys,50.0,65.0,0.0,5.0,15.0
Ollama,nemotron,user,25.0,35.0,40.0,55.0,40.0
Ollama,phi3,sys,30.0,35.0,10.0,0.0,5.0
Ollama,phi3,user,80.0,60.0,30.0,30.0,15.0
Ollama,phi4,sys,95.0,100.0,100.0,100.0,100.0
Ollama,phi4,user,100.0,100.0,100.0,95.0,90.0


In [28]:
# Two-sided sys-vs-user comparison with no Bonferroni correction (the recorded
# output uses the 0.05 threshold). Every model is printed, not only the
# passing ones.
htest_sys_user = compare_m_experiments(
    results_sys_user,
    alternative="two-sided",
    bonferroni=None,
    print_only_passed=False,
)


Ollama_nemotron
Fisher exact p-value = 0.09772525001821662
Barnard exact p-value = 0.07683768756297862

fireworks_llama33
Fisher exact p-value = 1.0
Barnard exact p-value = 0.5291008508783053

Ollama_llama32
Fisher exact p-value = 0.20996101859975763
Barnard exact p-value = 0.21067105973235728

fireworks_llama31
Fisher exact p-value = 1.0
Barnard exact p-value = 0.8524084514704552

Ollama_phi3
Fisher exact p-value = 4.482729548552896e-05
Barnard exact p-value = 2.659816375333478e-05
Hypothesis test passed: 2.66e-05 < 0.05

fireworks_llama32
Fisher exact p-value = 0.034734476465598206
Barnard exact p-value = 0.024304364173323535
Hypothesis test passed: 0.0243 < 0.05

Ollama_phi4
Fisher exact p-value = 0.621202458995485
Barnard exact p-value = 0.37408896938185093

Ollama_deepseekr1
Fisher exact p-value = 0.012750364922189563
Barnard exact p-value = 0.00872382492028616
Hypothesis test passed: 0.00872 < 0.05


### Experiment 2

Hypothesis 1: XML schema conformance differs between system and user prompts

Hypothesis 2: XML schema conformance differs between system and system + reminder prompting

Hypothesis 3: Output parsers give worse XML schema conformance than prompting


In [14]:
# Runs for the three hypotheses of this section: system prompt, system prompt
# with reminder, output parser, and user prompt.
# NOTE(review): the section is titled "Experiment 2" but experiment_num is "5"
# (the same number as the 20-02-25 run above) — confirm this is intended.
experiment_num, experiment_date = "5", "21-02-25"
results_sys_other = dict()
# The load order is kept so the result dict is filled in the same order as
# before.
for prompt_variant in ("sys", "sys_w_reminder", "parser", "user"):
    load_single_experiment(
        experiment_num, experiment_date, prompt_variant, results_sys_other
    )

Loaded structure_support_by_model_sys
Loaded structure_support_by_model_sys_w_reminder
Loaded structure_support_by_model_parser
Loaded structure_support_by_model_user


In [15]:
# Tabulate the reminder, sys and user variants side by side. Each variant is
# labelled with the text after the last underscore of its key, so
# "sys_w_reminder" appears as "reminder".
results_to_table(
    {
        key.rsplit("_", maxsplit=1)[1]: results_sys_other[key]
        for key in (
            "structure_support_by_model_sys_w_reminder",
            "structure_support_by_model_sys",
            "structure_support_by_model_user",
        )
    }
)

Unnamed: 0,Unnamed: 1,Unnamed: 2,ArticleResponse1XML,ArticleResponse1nointXML,ArticleResponse2XML,ArticleResponse3XML,ArticleResponse4XML
Ollama,deepseekr1,reminder,75.0,85.0,85.0,75.0,45.0
Ollama,deepseekr1,sys,80.0,80.0,65.0,55.0,75.0
Ollama,deepseekr1,user,90.0,95.0,70.0,85.0,80.0
Ollama,llama32,reminder,30.0,95.0,90.0,60.0,60.0
Ollama,llama32,sys,15.0,100.0,90.0,45.0,65.0
Ollama,llama32,user,55.0,100.0,100.0,90.0,80.0
Ollama,nemotron,reminder,50.0,75.0,75.0,45.0,70.0
Ollama,nemotron,sys,15.0,25.0,0.0,5.0,10.0
Ollama,nemotron,user,25.0,20.0,5.0,40.0,45.0
Ollama,phi3,reminder,55.0,55.0,30.0,10.0,30.0


In [24]:
# Hypothesis 2: system prompt vs system prompt + reminder (two-sided).
# bonferroni=3 presumably reflects the three hypotheses tested in this section;
# the recorded output uses the 0.0167 threshold.
# The result is stored under its own name so that it is not overwritten by the
# later sys-vs-parser comparison, which assigns `htest_sys_parser`.
htest_sys_vs_reminder = compare_m_experiments(
    {
        key: results_sys_other[key]
        for key in (
            "structure_support_by_model_sys",
            "structure_support_by_model_sys_w_reminder",
        )
    },
    bonferroni=3,
    print_only_passed=True,
    alternative="two-sided",
)


Ollama_nemotron
Fisher exact p-value = 1.0363568816780582e-14
Barnard exact p-value = 8.21839232294461e-15
Hypothesis test passed: 8.22e-15 < 0.0167


In [13]:
# Hypothesis 1: system prompt vs user prompt (two-sided).
# bonferroni=3 presumably reflects the three hypotheses tested in this section;
# the recorded output uses the 0.0167 threshold.
# The result is stored under its own name so that it is not overwritten by the
# later sys-vs-parser comparison (`htest_sys_parser`) and does not replace the
# 20-02-25 result held in `htest_sys_user`.
htest_sys_vs_user = compare_m_experiments(
    {
        key: results_sys_other[key]
        for key in (
            "structure_support_by_model_sys",
            "structure_support_by_model_user",
        )
    },
    bonferroni=3,
    print_only_passed=True,
    alternative="two-sided",
)


Ollama_nemotron
Fisher exact p-value = 0.00633514342263136
Barnard exact p-value = 0.004019419937637749
Hypothesis test passed: 0.00402 < 0.0167

Ollama_llama32
Fisher exact p-value = 0.000626897240215083
Barnard exact p-value = 0.00040106000704492043
Hypothesis test passed: 0.000401 < 0.0167


In [20]:
# Hypothesis 3: output parsers are worse than prompting (sys vs parser).
# bonferroni=3 is passed explicitly so this test uses the same corrected
# threshold as the other two hypotheses of this section (the recorded output
# reports 0.0167) instead of depending on the helper's default.
# `alternative` is left at the helper's default.
# NOTE(review): confirm that default is the one-sided alternative this
# hypothesis calls for.
htest_sys_parser = compare_m_experiments(
    {
        key: results_sys_other[key]
        for key in (
            "structure_support_by_model_sys",
            "structure_support_by_model_parser",
        )
    },
    bonferroni=3,
    print_only_passed=False,
)


Ollama_phi3
Fisher exact p-value = 8.963397555467613e-06
Barnard exact p-value = 1.4651317703804546e-05
Hypothesis test passed: 1.47e-05 < 0.0167

Ollama_deepseekr1
Fisher exact p-value = 2.9619019726680747e-20
Barnard exact p-value = 7.573150711923766e-20
Hypothesis test passed: 7.57e-20 < 0.0167

Ollama_llama32
Fisher exact p-value = 3.4484332628819433e-13
Barnard exact p-value = 5.960884930204636e-13
Hypothesis test passed: 5.96e-13 < 0.0167


Collate similar experiments


In [None]:
# Result sets to pool. Each entry is indexed by model, then by structure name,
# and holds an "outputs" list. The parser run is not part of the pool.
combined_experiment_list = [
    results_sys_user["structure_support_by_model_sys"],
    results_sys_user["structure_support_by_model_user"],
    results_temp["structure_support_by_model_t0"],
    results_temp["structure_support_by_model_t08"],
    results_list["structure_support_by_model_alt"],
    results_list["structure_support_by_model_sys"],
    results_sys_other["structure_support_by_model_sys"],
    results_sys_other["structure_support_by_model_sys_w_reminder"],
    results_sys_other["structure_support_by_model_user"],
]

# Pool the outputs: combined_results[model][structure]["outputs"] is the
# concatenation of that model/structure's outputs across all listed experiments.
# Fresh lists are created here, so the per-experiment lists are not modified.
combined_results = {}
for ex in combined_experiment_list:
    for model, results_by_struct in ex.items():
        per_model = combined_results.setdefault(model, {})
        for sname, results in results_by_struct.items():
            pooled = per_model.setdefault(sname, {})
            pooled.setdefault("outputs", []).extend(results["outputs"])

# Summarise each pooled entry: "valid" is the fraction of outputs whose
# error_type is "ok", "num" is the number of pooled outputs.
for results_by_struct in combined_results.values():
    for results in results_by_struct.values():
        outcomes = [o["error_type"] == "ok" for o in results["outputs"]]
        results["valid"] = pd.Series(outcomes).mean()
        results["num"] = len(outcomes)

Calculate the validity rate and its Wilson score confidence interval over all pooled experiments


In [None]:
# Validity rate in percent. Rows are indexed by the model key split at its
# first underscore, with one column per structure name.
df_results = pd.DataFrame.from_dict(
    {
        tuple(mname.split("_", maxsplit=1)): {
            tname: stats["valid"] * 100 for tname, stats in by_struct.items()
        }
        for mname, by_struct in combined_results.items()
    },
    orient="index",
)
# Number of pooled outputs behind each rate, in the same layout.
df_count = pd.DataFrame.from_dict(
    {
        tuple(mname.split("_", maxsplit=1)): {
            tname: stats["num"] for tname, stats in by_struct.items()
        }
        for mname, by_struct in combined_results.items()
    },
    orient="index",
)


def format_ci(result):
    """Render one pooled result as ``"<rate>% ±<margin>"``.

    ``result`` must provide ``"valid"`` and ``"num"``. Their product is passed
    to ``wilson_score_ci`` together with ``"num"``. The three values that call
    returns are used as the point estimate and the lower and upper interval
    bounds. The margin shown is the larger of the two distances from the
    estimate to a bound, so an asymmetric interval is reported by its wider
    side.

    Returns the estimate and the margin, both scaled by 100 and formatted
    with one decimal place.
    """
    n_outputs = result["num"]
    estimate, lower, upper = wilson_score_ci(result["valid"] * n_outputs, n_outputs)
    margin = max(estimate - lower, upper - estimate)
    return f"{estimate*100:.1f}% ±{margin*100:.1f}"


# Formatted "<rate>% ±<margin>" strings, laid out like df_results.
df_ci = pd.DataFrame.from_dict(
    {
        tuple(mname.split("_", maxsplit=1)): {
            tname: format_ci(stats) for tname, stats in by_struct.items()
        }
        for mname, by_struct in combined_results.items()
    },
    orient="index",
)
# Display the validity rates as percentages with one decimal place.
df_results.map("{:.1f}%".format)

Unnamed: 0,Unnamed: 1,ArticleResponse1XML,ArticleResponse1nointXML,ArticleResponse2XML,ArticleResponse3XML,ArticleResponse4XML,ArticleResponse2XMLalt,ArticleResponse4XMLalt
Ollama,llama32,39.4%,97.9%,91.2%,68.3%,69.4%,95.0%,100.0%
Ollama,nemotron,21.7%,35.0%,15.6%,17.8%,25.0%,0.0%,0.0%
Ollama,phi3,45.0%,53.6%,28.1%,16.7%,15.0%,10.0%,0.0%
Ollama,phi4,96.7%,100.0%,100.0%,98.3%,98.1%,100.0%,100.0%
Ollama,deepseekr1,73.3%,77.9%,76.9%,70.6%,63.7%,75.0%,60.0%
fireworks,llama31,70.0%,100.0%,99.4%,98.3%,99.4%,100.0%,95.0%
fireworks,llama32,37.8%,95.7%,92.5%,91.1%,90.6%,100.0%,100.0%
fireworks,llama33,96.7%,100.0%,100.0%,98.9%,96.9%,100.0%,100.0%
fireworks,qwen25,100.0%,100.0%,100.0%,100.0%,100.0%,100.0%,100.0%
fireworks,deepseekr1_70b,0.0%,0.0%,0.0%,0.0%,0.0%,nan%,nan%


In [None]:
# Print the validity-rate table in tabulate's "pipe" format. The two index
# levels are turned into ordinary columns first.
print(
    tabulate.tabulate(
        df_results.map("{:.1f}%".format).reset_index(),
        showindex=False,
        tablefmt="pipe",
        headers="keys",
    )
)

| level_0   | level_1        | ArticleResponse1XML   | ArticleResponse1nointXML   | ArticleResponse2XML   | ArticleResponse3XML   | ArticleResponse4XML   | ArticleResponse2XMLalt   | ArticleResponse4XMLalt   |
|:----------|:---------------|:----------------------|:---------------------------|:----------------------|:----------------------|:----------------------|:-------------------------|:-------------------------|
| Ollama    | llama32        | 39.4%                 | 97.9%                      | 91.2%                 | 68.3%                 | 69.4%                 | 95.0%                    | 100.0%                   |
| Ollama    | nemotron       | 21.7%                 | 35.0%                      | 15.6%                 | 17.8%                 | 25.0%                 | 0.0%                     | 0.0%                     |
| Ollama    | phi3           | 45.0%                 | 53.6%                      | 28.1%                 | 16.7%                 | 15.0%                 | 10.0

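Confidence intervals per model and structure, shown as the observed validity rate ± the larger of the two distances from that rate to the bounds of the Wilson score interval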


In [None]:
# Display the table of formatted confidence-interval strings.
df_ci

Unnamed: 0,Unnamed: 1,ArticleResponse1XML,ArticleResponse1nointXML,ArticleResponse2XML,ArticleResponse3XML,ArticleResponse4XML,ArticleResponse2XMLalt,ArticleResponse4XMLalt
Ollama,llama32,39.4% ±7.3,97.9% ±4.0,91.2% ±5.4,68.3% ±7.1,69.4% ±7.5,95.0% ±18.6,100.0% ±16.1
Ollama,nemotron,21.7% ±6.6,35.0% ±8.2,15.6% ±6.4,17.8% ±6.2,25.0% ±7.2,0.0% ±16.1,0.0% ±16.1
Ollama,phi3,45.0% ±7.3,53.6% ±8.2,28.1% ±7.4,16.7% ±6.1,15.0% ±6.3,10.0% ±20.1,0.0% ±16.1
Ollama,phi4,96.7% ±3.7,100.0% ±2.7,100.0% ±2.3,98.3% ±3.1,98.1% ±3.5,100.0% ±16.1,100.0% ±16.1
Ollama,deepseekr1,73.3% ±6.9,77.9% ±7.6,76.9% ±7.1,70.6% ±7.0,63.7% ±7.7,75.0% ±21.9,60.0% ±21.3
fireworks,llama31,70.0% ±7.1,100.0% ±2.7,99.4% ±2.8,98.3% ±3.1,99.4% ±2.8,100.0% ±16.1,95.0% ±18.6
fireworks,llama32,37.8% ±7.3,95.7% ±4.7,92.5% ±5.2,91.1% ±5.1,90.6% ±5.5,100.0% ±16.1,100.0% ±16.1
fireworks,llama33,96.7% ±3.7,100.0% ±2.7,100.0% ±2.3,98.9% ±2.8,96.9% ±4.0,100.0% ±16.1,100.0% ±16.1
fireworks,qwen25,100.0% ±4.6,100.0% ±8.8,100.0% ±6.0,100.0% ±4.6,100.0% ±6.0,100.0% ±16.1,100.0% ±16.1
fireworks,deepseekr1_70b,0.0% ±8.8,0.0% ±8.8,0.0% ±8.8,0.0% ±8.8,0.0% ±8.8,,


In [None]:
# Print the confidence-interval table in tabulate's "pipe" format. The two
# index levels are turned into ordinary columns first.
print(
    tabulate.tabulate(
        df_ci.reset_index(),
        showindex=False,
        tablefmt="pipe",
        headers="keys",
    )
)

| level_0   | level_1        | ArticleResponse1XML   | ArticleResponse1nointXML   | ArticleResponse2XML   | ArticleResponse3XML   | ArticleResponse4XML   | ArticleResponse2XMLalt   | ArticleResponse4XMLalt   |
|:----------|:---------------|:----------------------|:---------------------------|:----------------------|:----------------------|:----------------------|:-------------------------|:-------------------------|
| Ollama    | llama32        | 39.4% ±7.3            | 97.9% ±4.0                 | 91.2% ±5.4            | 68.3% ±7.1            | 69.4% ±7.5            | 95.0% ±18.6              | 100.0% ±16.1             |
| Ollama    | nemotron       | 21.7% ±6.6            | 35.0% ±8.2                 | 15.6% ±6.4            | 17.8% ±6.2            | 25.0% ±7.2            | 0.0% ±16.1               | 0.0% ±16.1               |
| Ollama    | phi3           | 45.0% ±7.3            | 53.6% ±8.2                 | 28.1% ±7.4            | 16.7% ±6.1            | 15.0% ±6.3            | 10.0