In [1]:
import pickle
import pandas as pd
import tabulate
from pydantic_structure_definitions import *
from experiment_xml import (
    load_single_experiment,
    load_experiment_summary,
    analyse_xml_experiment,
)
from analyse_experiment import (
    wilson_score_ci,
    results_to_table,
    compare_m_experiments,
    format_ci_pm,
    results_to_table,
)

In [2]:
%load_ext autoreload
%autoreload 1

### Experiment 6

Hypothesis: Conformance to the XML schema depends on where the format is specified (system prompt, user prompt, or output parser), with temperature fixed at 0


In [3]:
# Temperature 0 - sys, user, parsers experiment
# Includes Anthropic models
# load_experiment_summary populates the dict passed as its third argument
# (one entry per "Loaded ..." line it prints) and returns the experiment
# metadata, which is read below via metadata6.get("hypotheses").
results_temp_sup = {}
experiment_date = "28-02-25"
experiment_num = "6"
metadata6 = load_experiment_summary(experiment_num, experiment_date, results_temp_sup)

Loaded structure_support_by_model_sys
Loaded structure_support_by_model_user
Loaded structure_support_by_model_parsers


In [4]:
# Show the hypotheses recorded in the experiment metadata, one per line.
# Fall back to an empty list so that metadata without a "hypotheses" entry
# prints a blank line instead of raising TypeError from str.join(None).
print("\n".join(metadata6.get("hypotheses") or []))

1. Do small models including Phi3 and Nemotron improve their output by providing more explicit instructions on the format at the end of the prompt?
2. Do output parsers perform worse for all models?
3. What is the performance of the models using user parsers?


In [5]:
# Summary table: one row per provider/model, one column per experiment arm
# (sys, user, parsers). format_ci_pm renders each cell as "rate ±margin".
temp_sup_table_1 = results_to_table(
    results_temp_sup, combine_levels=True, cell_format=format_ci_pm
)
temp_sup_table_1

Unnamed: 0,Unnamed: 1,sys,user,parsers
Anthropic,Haiku_3,95% ±6,95% ±6,52% ±11
Anthropic,Haiku_35,99% ±4,100% ±4,50% ±11
Anthropic,Sonnet_35,100% ±4,100% ±4,71% ±11
Ollama,deepseekr1,76% ±9,81% ±9,9% ±8
Ollama,llama32,84% ±8,87% ±8,5% ±7
Ollama,nemotron,8% ±7,50% ±10,0% ±5
Ollama,phi3,47% ±10,61% ±10,5% ±7
Ollama,phi4,99% ±4,99% ±4,61% ±11
fireworks,llama31,95% ±6,95% ±6,42% ±11
fireworks,llama32,87% ±8,90% ±7,0% ±5


Reanalyse using different search modes


In [6]:
# Re-run analyse_xml_experiment on every experiment-6 arm, this time against
# schema classes generated with search_mode="unordered".
classes = generate_xml_classes(search_mode="unordered")

# Keep only the ArticleResponse* schemas; no format instructions are attached.
structured_formats_xml = [
    {"pydantic": schema_cls, "format_instructions": None}
    for schema_name, schema_cls in classes.items()
    if schema_name.startswith("ArticleResponse")
]

# Same keys as results_temp_sup, each mapped to its re-analysed result.
results_temp_sup_relaxed = {
    arm_name: analyse_xml_experiment(
        arm_results, structured_formats_xml, verbose=False
    )
    for arm_name, arm_results in results_temp_sup.items()
}

In [7]:
# Side-by-side view of the parsers arm only: the original analysis ("strict")
# next to the search_mode="unordered" re-analysis. markdown=True makes
# results_to_table return a markdown string, hence the print().
temp_sup_table_2 = results_to_table(
    {
        "strict": results_temp_sup["structure_support_by_model_parsers"],
        "unordered": results_temp_sup_relaxed["structure_support_by_model_parsers"],
    },
    combine_levels=True,
    cell_format=format_ci_pm,
    markdown=True,
)
print(temp_sup_table_2)

| level_0   | level_1    | strict   | unordered   |
|:----------|:-----------|:---------|:------------|
| Anthropic | Haiku_3    | 52% ±11  | 54% ±11     |
| Anthropic | Haiku_35   | 50% ±11  | 75% ±10     |
| Anthropic | Sonnet_35  | 71% ±11  | 71% ±11     |
| Ollama    | deepseekr1 | 9% ±8    | 9% ±8       |
| Ollama    | llama32    | 5% ±7    | 20% ±10     |
| Ollama    | nemotron   | 0% ±5    | 0% ±5       |
| Ollama    | phi3       | 5% ±7    | 6% ±8       |
| Ollama    | phi4       | 61% ±11  | 69% ±11     |
| fireworks | llama31    | 42% ±11  | 57% ±11     |
| fireworks | llama32    | 0% ±5    | 0% ±5       |
| fireworks | llama33    | 31% ±11  | 31% ±11     |
| fireworks | qwen25     | 75% ±10  | 75% ±10     |


In [8]:
# Per-schema breakdown (combine_levels=False) of the parsers arm: first the
# original analysis, then the unordered re-analysis compared against it.
# NOTE(review): compare_to appears to mark cells that differ from the
# reference table (bold in the markdown output) — confirm in results_to_table.
# NOTE(review): this rebinds temp_sup_table_1 and temp_sup_table_2, which
# earlier cells bound to different tables; distinct names would make
# out-of-order re-runs safer.
temp_sup_table_1 = results_to_table(
    results_temp_sup,
    combine_levels=False,
    cell_format=format_ci_pm,
    subset=["structure_support_by_model_parsers"],
)
temp_sup_table_2 = results_to_table(
    results_temp_sup_relaxed,
    combine_levels=False,
    cell_format=format_ci_pm,
    subset=["structure_support_by_model_parsers"],
    compare_to=temp_sup_table_1,
    markdown=True,
)
print(temp_sup_table_2)

| level_0   | level_1    | ArticleResponse1nointXML   | ArticleResponse2XML   | ArticleResponse3XML   | ArticleResponse4XML   |
|:----------|:-----------|:---------------------------|:----------------------|:----------------------|:----------------------|
| Anthropic | Haiku_3    | **30% ±22**                | 100% ±16              | 85% ±21               | 0% ±16                |
| Anthropic | Haiku_35   | **100% ±16**               | 100% ±16              | 100% ±16              | 0% ±16                |
| Anthropic | Sonnet_35  | 100% ±16                   | 100% ±16              | 85% ±21               | 0% ±16                |
| Ollama    | deepseekr1 | 10% ±20                    | 25% ±22               | 0% ±16                | 0% ±16                |
| Ollama    | llama32    | **40% ±21**                | **40% ±21**           | 0% ±16                | 0% ±16                |
| Ollama    | nemotron   | 0% ±16                     | 0% ±16                | 0% ±16                | 

### Experiment 4

Hypothesis: Higher temperature worsens the conformance to XML schema


In [9]:
# Temperature 0/0.8 experiment
# Includes Anthropic models
# Fills results_temp with the two arms reported by the loader
# (structure_support_by_model_t0 and structure_support_by_model_t08).
results_temp = {}
experiment_date = "25-02-25"
experiment_num = "4"
metadata4 = load_experiment_summary(experiment_num, experiment_date, results_temp)

Loaded structure_support_by_model_t0
Loaded structure_support_by_model_t08


In [10]:
# Per-model exact tests (Fisher and Barnard p-values plus a boolean Outcome)
# between the t0 and t08 arms, using compare_m_experiments' default settings.
compare_m_experiments(results_temp)

Unnamed: 0,Unnamed: 1,Fisher exact,Barnard exact,Outcome
fireworks,deepseekr1_70b,1.0,1.0,False
fireworks,llama32,0.126409,0.122265,False
Ollama,phi3,0.00651,0.004496,False
Anthropic,Haiku_3,0.383672,0.296312,False
Ollama,nemotron,0.283963,0.264551,False
fireworks,llama33,0.689399,1.0,False
fireworks,llama31,0.616328,1.0,False
Anthropic,Haiku_35,0.751256,1.0,False
Ollama,deepseekr1,0.435205,0.409623,False
Ollama,phi4,1.0,1.0,False


In [11]:
# Summary table for the temperature experiment: one column per arm (t0, t08),
# cells formatted as "rate ±margin". Kept as the reference for the relaxed
# re-analysis further down.
temp_table_1 = results_to_table(
    results_temp, combine_levels=True, cell_format=format_ci_pm
)
temp_table_1

Unnamed: 0,Unnamed: 1,t0,t08
Anthropic,Haiku_3,95% ±6,93% ±7
Anthropic,Haiku_35,99% ±4,99% ±4
Anthropic,Sonnet_35,100% ±4,100% ±4
Ollama,deepseekr1,76% ±9,74% ±9
Ollama,llama32,84% ±8,77% ±9
Ollama,nemotron,8% ±7,5% ±6
Ollama,phi3,47% ±10,29% ±10
Ollama,phi4,99% ±4,100% ±4
fireworks,deepseekr1_70b,0% ±4,0% ±4
fireworks,llama31,94% ±6,94% ±6


Reanalyse using different search modes


In [12]:
# Re-run analyse_xml_experiment on both temperature arms against schema
# classes generated with search_mode="unordered".
classes = generate_xml_classes(search_mode="unordered")

# Keep only the ArticleResponse* schemas; no format instructions are attached.
structured_formats_xml = [
    {"pydantic": schema_cls, "format_instructions": None}
    for schema_name, schema_cls in classes.items()
    if schema_name.startswith("ArticleResponse")
]

# Same keys as results_temp, each mapped to its re-analysed result.
results_temp_relaxed = {
    arm_name: analyse_xml_experiment(
        arm_results, structured_formats_xml, verbose=False
    )
    for arm_name, arm_results in results_temp.items()
}

In [13]:
# Same summary after the unordered re-analysis, with the original (strict)
# table passed as compare_to.
# NOTE(review): compare_to presumably marks cells that changed relative to
# the reference table — confirm in results_to_table.
temp_table_2 = results_to_table(
    results_temp_relaxed,
    combine_levels=True,
    cell_format=format_ci_pm,
    compare_to=temp_table_1,
)
temp_table_2

Unnamed: 0,Unnamed: 1,t0,t08
Anthropic,Haiku_3,95% ±6,93% ±7
Anthropic,Haiku_35,99% ±4,99% ±4
Anthropic,Sonnet_35,100% ±4,100% ±4
Ollama,deepseekr1,76% ±9,74% ±9
Ollama,llama32,84% ±8,78% ±9
Ollama,nemotron,8% ±7,5% ±6
Ollama,phi3,47% ±10,30% ±10
Ollama,phi4,99% ±4,100% ±4
fireworks,deepseekr1_70b,0% ±4,0% ±4
fireworks,llama31,94% ±6,94% ±6


### Experiment 3

Hypothesis: Encapsulating lists improves the conformance to XML schema


In [14]:
# Testing list prompting
# Fills results_list with the two arms reported by the loader
# (structure_support_by_model_sys and structure_support_by_model_alt).
results_list = {}
experiment_date = "24-02-25"
experiment_num = "3"
metadata3 = load_experiment_summary(experiment_num, experiment_date, results_list)

Loaded structure_support_by_model_sys
Loaded structure_support_by_model_alt


In [15]:
# Summary table for the list-prompting experiment: one column per arm
# (sys, alt), cells formatted as "rate ±margin". Kept as the reference for
# the relaxed re-analysis further down.
list_table_1 = results_to_table(
    results_list, combine_levels=True, cell_format=format_ci_pm
)
list_table_1

Unnamed: 0,Unnamed: 1,sys,alt
Anthropic,Haiku_3,94% ±8,95% ±7
Anthropic,Haiku_35,100% ±5,100% ±5
Anthropic,Sonnet_35,100% ±5,100% ±5
Ollama,deepseekr1,65% ±11,61% ±11
Ollama,llama32,61% ±11,74% ±11
Ollama,nemotron,5% ±7,1% ±5
Ollama,phi3,18% ±10,12% ±9
Ollama,phi4,99% ±5,100% ±5
fireworks,llama31,90% ±9,85% ±9
fireworks,llama32,79% ±10,80% ±10


Reanalyse using different search modes


In [16]:
# Re-run analyse_xml_experiment on both list-prompting arms against schema
# classes generated with search_mode="unordered".
classes = generate_xml_classes(search_mode="unordered")

# Keep only the ArticleResponse* schemas; no format instructions are attached.
structured_formats_xml = [
    {"pydantic": schema_cls, "format_instructions": None}
    for schema_name, schema_cls in classes.items()
    if schema_name.startswith("ArticleResponse")
]

# Same keys as results_list, each mapped to its re-analysed result.
results_list_relaxed = {
    arm_name: analyse_xml_experiment(
        arm_results, structured_formats_xml, verbose=False
    )
    for arm_name, arm_results in results_list.items()
}

In [17]:
# Summary of the unordered re-analysis for the list-prompting experiment,
# with the original (strict) table passed as compare_to.
results_to_table(
    results_list_relaxed,
    combine_levels=True,
    cell_format=format_ci_pm,
    compare_to=list_table_1,
)

Unnamed: 0,Unnamed: 1,sys,alt
Anthropic,Haiku_3,94% ±8,95% ±7
Anthropic,Haiku_35,100% ±5,100% ±5
Anthropic,Sonnet_35,100% ±5,100% ±5
Ollama,deepseekr1,65% ±11,61% ±11
Ollama,llama32,65% ±11,78% ±10
Ollama,nemotron,5% ±7,1% ±5
Ollama,phi3,19% ±10,12% ±9
Ollama,phi4,99% ±5,100% ±5
fireworks,llama31,90% ±9,85% ±9
fireworks,llama32,79% ±10,80% ±10


In [18]:
# Per-model exact tests (Fisher and Barnard) between the sys and alt arms of
# the original (strict) analysis, using compare_m_experiments' defaults.
compare_m_experiments(results_list)

Unnamed: 0,Unnamed: 1,Fisher exact,Barnard exact,Outcome
fireworks,llama32,0.651807,1.0,False
Ollama,phi3,0.253618,0.264448,False
Anthropic,Haiku_3,0.752466,1.0,False
Ollama,nemotron,0.183532,0.121823,False
fireworks,llama33,0.248428,0.105082,False
fireworks,llama31,0.237051,0.264448,False
Anthropic,Haiku_35,1.0,1.0,False
Ollama,deepseekr1,0.37164,0.332301,False
Ollama,phi4,1.0,1.0,False
fireworks,qwen25,1.0,1.0,False


### Experiment X

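Hypothesis: XML schema conformance differs between system and user prompts (TODO: confirm — inferred from the `sys` and `user` runs loaded below)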


In [19]:
# Testing system vs. user prompting (only the "sys" and "user" runs are
# loaded here; no output-parser run is included).
# Each load_single_experiment call adds one entry to results_sys_user.
results_sys_user = {}
experiment_date = "20-02-25"
experiment_num = "5"
load_single_experiment(experiment_num, experiment_date, "sys", results_sys_user)
load_single_experiment(experiment_num, experiment_date, "user", results_sys_user)

Loaded structure_support_by_model_sys
Loaded structure_support_by_model_user


In [20]:
# Summary table for sys vs. user. No cell_format is passed, so the cells show
# plain percentages without a confidence margin.
results_to_table(results_sys_user, combine_levels=True)

Unnamed: 0,Unnamed: 1,sys,user
Ollama,deepseekr1,62%,79%
Ollama,llama32,67%,76%
Ollama,nemotron,27%,39%
Ollama,phi3,16%,43%
Ollama,phi4,99%,97%
fireworks,llama31,93%,94%
fireworks,llama32,73%,86%
fireworks,llama33,99%,100%


In [21]:
# Two-sided per-model exact tests between the sys and user arms, with rows
# sorted by index for readability.
# NOTE(review): bonferroni=None presumably disables the multiple-comparison
# correction — confirm against compare_m_experiments.
compare_m_experiments(
    results_sys_user, bonferroni=None, alternative="two-sided"
).sort_index(axis=0)

Unnamed: 0,Unnamed: 1,Fisher exact,Barnard exact,Outcome
Ollama,deepseekr1,0.01275,0.008724,True
Ollama,llama32,0.209961,0.210671,False
Ollama,nemotron,0.097725,0.076838,False
Ollama,phi3,4.5e-05,2.7e-05,True
Ollama,phi4,0.621202,0.374089,False
fireworks,llama31,1.0,0.852408,False
fireworks,llama32,0.034734,0.024304,True
fireworks,llama33,1.0,0.529101,False


### Experiment 2

Hypothesis 1: XML schema conformance differs between system and user prompts

Hypothesis 2: XML schema conformance differs between system and system + reminder prompting

Hypothesis 3: Output parsers are worse than prompting for XML

Note: No Qwen2.5 or Anthropic models in this set


In [22]:
# Testing system and user prompting & output parsers
# Loads four runs into results_sys_other: sys, sys_w_reminder, parser, user.
results_sys_other = {}
experiment_date = "21-02-25"
experiment_num = "5"
load_single_experiment(experiment_num, experiment_date, "sys", results_sys_other)
load_single_experiment(
    experiment_num, experiment_date, "sys_w_reminder", results_sys_other
)
load_single_experiment(experiment_num, experiment_date, "parser", results_sys_other)
load_single_experiment(experiment_num, experiment_date, "user", results_sys_other)

Loaded structure_support_by_model_sys
Loaded structure_support_by_model_sys_w_reminder
Loaded structure_support_by_model_parser
Loaded structure_support_by_model_user


In [23]:
# Summary table restricted (via subset) to the three prompting arms; the
# parser run is left out of this table. Kept as the reference for the relaxed
# re-analysis further down.
sys_other_table_1 = results_to_table(
    results_sys_other,
    combine_levels=True,
    cell_format=format_ci_pm,
    subset=[
        "structure_support_by_model_sys",
        "structure_support_by_model_sys_w_reminder",
        "structure_support_by_model_user",
    ],
)
sys_other_table_1

Unnamed: 0,Unnamed: 1,sys,sys_w_reminder,user
Ollama,deepseekr1,71% ±10,73% ±9,84% ±8
Ollama,llama32,63% ±10,67% ±10,85% ±8
Ollama,nemotron,11% ±8,63% ±10,27% ±9
Ollama,phi3,25% ±9,36% ±10,37% ±10
Ollama,phi4,97% ±5,98% ±5,99% ±4
fireworks,llama31,95% ±6,94% ±6,96% ±6
fireworks,llama32,82% ±9,83% ±9,81% ±9
fireworks,llama33,97% ±5,99% ±4,98% ±5


Reanalyse using different search modes


In [24]:
# Re-run analyse_xml_experiment on every run in results_sys_other against
# schema classes generated with search_mode="unordered".
classes = generate_xml_classes(search_mode="unordered")

# Keep only the ArticleResponse* schemas; no format instructions are attached.
structured_formats_xml = [
    {"pydantic": schema_cls, "format_instructions": None}
    for schema_name, schema_cls in classes.items()
    if schema_name.startswith("ArticleResponse")
]

# Same keys as results_sys_other, each mapped to its re-analysed result.
results_sys_other_relaxed = {
    arm_name: analyse_xml_experiment(
        arm_results, structured_formats_xml, verbose=False
    )
    for arm_name, arm_results in results_sys_other.items()
}

In [25]:
# Summary of the unordered re-analysis for the same three prompting arms,
# with the original (strict) table passed as compare_to.
results_to_table(
    results_sys_other_relaxed,
    combine_levels=True,
    cell_format=format_ci_pm,
    subset=[
        "structure_support_by_model_sys",
        "structure_support_by_model_sys_w_reminder",
        "structure_support_by_model_user",
    ],
    compare_to=sys_other_table_1,
)

Unnamed: 0,Unnamed: 1,sys,sys_w_reminder,user
Ollama,deepseekr1,72% ±9,73% ±9,84% ±8
Ollama,llama32,63% ±10,76% ±9,85% ±8
Ollama,nemotron,11% ±8,63% ±10,27% ±9
Ollama,phi3,26% ±9,37% ±10,39% ±10
Ollama,phi4,97% ±5,98% ±5,99% ±4
fireworks,llama31,95% ±6,94% ±6,96% ±6
fireworks,llama32,82% ±9,84% ±8,81% ±9
fireworks,llama33,97% ±5,99% ±4,98% ±5


In [26]:
# Hypothesis 1 of this experiment: system prompt vs. user prompt, two-sided.
# NOTE(review): bonferroni=3 presumably corrects for the three hypotheses
# listed for this experiment — confirm against compare_m_experiments.
compare_m_experiments(
    results_sys_other,
    bonferroni=3,
    alternative="two-sided",
    subset=[
        "structure_support_by_model_sys",
        "structure_support_by_model_user",
    ],
)

Unnamed: 0,Unnamed: 1,Fisher exact,Barnard exact,Outcome
fireworks,llama32,1.0,0.91543,False
Ollama,phi3,0.092198,0.076838,False
Ollama,nemotron,0.006335,0.004019,True
fireworks,llama33,1.0,0.752431,False
fireworks,llama31,1.0,0.819245,False
Ollama,deepseekr1,0.041416,0.028925,False
Ollama,phi4,0.621202,0.374089,False
Ollama,llama32,0.000627,0.000401,True


In [27]:
# Hypothesis 2 of this experiment: system prompt vs. system prompt with a
# reminder, two-sided, with the same bonferroni=3 setting as the sys/user test.
compare_m_experiments(
    results_sys_other,
    bonferroni=3,
    alternative="two-sided",
    subset=[
        "structure_support_by_model_sys",
        "structure_support_by_model_sys_w_reminder",
    ],
)

Unnamed: 0,Unnamed: 1,Fisher exact,Barnard exact,Outcome
fireworks,llama32,1.0,0.9130627,False
Ollama,phi3,0.1242213,0.103639,False
Ollama,nemotron,1.036357e-14,8.218392e-15,True
fireworks,llama33,0.6212025,0.374089,False
fireworks,llama31,1.0,0.8381458,False
Ollama,deepseekr1,0.8749655,0.8192451,False
Ollama,phi4,1.0,0.7524307,False
Ollama,llama32,0.6566579,0.5926245,False


## Combined results


#### Hypothesis 1: Do different models have different performance in XML output?


Collate similar experiments


In [28]:
# Pool prompt-based runs from several experiments under a single "models"
# key so results can be broken down per model and per schema.
# NOTE(review): the results_sys_user runs are commented out here but are
# included in the Hypothesis 3 collation — confirm the exclusion is intended.
# NOTE(review): results_sys_other contributes its sys_w_reminder run rather
# than its plain sys run — confirm this is intended.
combined_experiment_sys_user = {
    "models": [
        # results_sys_user["structure_support_by_model_sys"],
        results_temp["structure_support_by_model_t0"],
        results_temp["structure_support_by_model_t08"],
        results_temp_sup["structure_support_by_model_sys"],
        results_list["structure_support_by_model_sys"],
        results_sys_other["structure_support_by_model_sys_w_reminder"],
        # results_sys_user["structure_support_by_model_user"],
        results_temp_sup["structure_support_by_model_user"],
        results_sys_other["structure_support_by_model_user"],
    ],
}

In [29]:
# Re-run analyse_xml_experiment on each pooled run against schema classes
# generated with search_mode="unordered".
classes = generate_xml_classes(search_mode="unordered")

# Keep only the ArticleResponse* schemas; no format instructions are attached.
structured_formats_xml = [
    {"pydantic": schema_cls, "format_instructions": None}
    for schema_name, schema_cls in classes.items()
    if schema_name.startswith("ArticleResponse")
]

# Mirrors combined_experiment_sys_user: one "models" list, in the same order.
combined_experiment_sys_user_relaxed = {
    "models": [
        analyse_xml_experiment(run_results, structured_formats_xml, verbose=False)
        for run_results in combined_experiment_sys_user["models"]
    ]
}

In [30]:
# Per-schema breakdown (combine_levels=False) of the pooled runs: the
# original analysis first, then the unordered re-analysis with the original
# table passed as compare_to. Only the relaxed table is displayed.
res_table_1 = results_to_table(
    combined_experiment_sys_user,
    combine_levels=False,
    cell_format=format_ci_pm,
    markdown=False,
)
res_table_2 = results_to_table(
    combined_experiment_sys_user_relaxed,
    combine_levels=False,
    cell_format=format_ci_pm,
    markdown=False,
    compare_to=res_table_1,
)
res_table_2

Unnamed: 0,Unnamed: 1,ArticleResponse1XML,ArticleResponse1nointXML,ArticleResponse2XML,ArticleResponse3XML,ArticleResponse4XML
Anthropic,Haiku_3,73% ±9,100% ±5,100% ±4,100% ±4,100% ±4
Anthropic,Haiku_35,98% ±5,100% ±5,100% ±4,99% ±4,100% ±4
Anthropic,Sonnet_35,100% ±4,100% ±5,100% ±4,100% ±4,100% ±4
Ollama,deepseekr1,76% ±8,78% ±8,78% ±8,79% ±7,69% ±8
Ollama,llama32,44% ±8,97% ±5,96% ±5,89% ±6,84% ±7
Ollama,nemotron,19% ±7,29% ±9,18% ±7,26% ±8,30% ±8
Ollama,phi3,49% ±8,62% ±9,44% ±8,29% ±8,21% ±8
Ollama,phi4,96% ±5,100% ±3,100% ±3,99% ±3,99% ±3
fireworks,deepseekr1_70b,0% ±9,0% ±9,0% ±9,0% ±9,0% ±9
fireworks,llama31,74% ±8,100% ±3,100% ±3,99% ±3,99% ±4


#### Hypothesis 2: Do different temperatures cause different compliance to XML?


Collate similar experiments

- Experiment 4: t=0 & t=0.8
- Experiment 6: t=0
- Experiment 2: t=0.8


In [31]:
# Pool runs by temperature. results_temp (experiment 4) supplies both
# settings; results_temp_sup (experiment 6) ran at t=0 and results_sys_other
# (experiment 2) at t=0.8, as listed in the markdown above.
combined_experiments_temp = {
    "t=0": [
        results_temp["structure_support_by_model_t0"],
        results_temp_sup["structure_support_by_model_sys"],
        results_temp_sup["structure_support_by_model_user"],
    ],
    "t=0.8": [
        results_temp["structure_support_by_model_t08"],
        results_sys_other["structure_support_by_model_sys"],
        results_sys_other["structure_support_by_model_user"],
    ],
}

In [32]:
# Re-run analyse_xml_experiment on every pooled run, per temperature setting,
# against schema classes generated with search_mode="unordered".
classes = generate_xml_classes(search_mode="unordered")

# Keep only the ArticleResponse* schemas; no format instructions are attached.
structured_formats_xml = [
    {"pydantic": schema_cls, "format_instructions": None}
    for schema_name, schema_cls in classes.items()
    if schema_name.startswith("ArticleResponse")
]

# Mirrors combined_experiments_temp: same keys, same run order in each list.
combined_experiment_temp_relaxed = {
    setting: [
        analyse_xml_experiment(run_results, structured_formats_xml, verbose=False)
        for run_results in setting_runs
    ]
    for setting, setting_runs in combined_experiments_temp.items()
}

In [33]:
# Pooled temperature comparison: the original analysis first, then the
# unordered re-analysis with the original table passed as compare_to.
# Only the relaxed table is displayed.
res_table_temp_1 = results_to_table(
    combined_experiments_temp,
    combine_levels=True,
    cell_format=format_ci_pm,
)
res_table_temp_2 = results_to_table(
    combined_experiment_temp_relaxed,
    combine_levels=True,
    cell_format=format_ci_pm,
    compare_to=res_table_temp_1,
)
res_table_temp_2

Unnamed: 0,Unnamed: 1,t=0,t=0.8
Anthropic,Haiku_3,95% ±3,93% ±7
Anthropic,Haiku_35,99% ±2,99% ±4
Anthropic,Sonnet_35,100% ±1,100% ±4
Ollama,deepseekr1,78% ±5,77% ±5
Ollama,llama32,87% ±4,75% ±5
Ollama,nemotron,22% ±5,14% ±4
Ollama,phi3,52% ±6,32% ±5
Ollama,phi4,99% ±2,99% ±2
fireworks,deepseekr1_70b,0% ±4,0% ±4
fireworks,llama31,95% ±3,95% ±3


In [34]:
# Markdown rendering of the pooled temperature table. This uses the original
# (strict) analysis in combined_experiments_temp, not the relaxed re-analysis.
res = results_to_table(
    combined_experiments_temp,
    combine_levels=True,
    cell_format=format_ci_pm,
    markdown=True,
)
print(res)

| level_0   | level_1        | t=0     | t=0.8   |
|:----------|:---------------|:--------|:--------|
| Anthropic | Haiku_3        | 95% ±3  | 93% ±7  |
| Anthropic | Haiku_35       | 99% ±2  | 99% ±4  |
| Anthropic | Sonnet_35      | 100% ±1 | 100% ±4 |
| Ollama    | deepseekr1     | 78% ±5  | 76% ±5  |
| Ollama    | llama32        | 85% ±4  | 75% ±5  |
| Ollama    | nemotron       | 22% ±5  | 14% ±4  |
| Ollama    | phi3           | 52% ±6  | 30% ±5  |
| Ollama    | phi4           | 99% ±2  | 99% ±2  |
| fireworks | deepseekr1_70b | 0% ±4   | 0% ±4   |
| fireworks | llama31        | 95% ±3  | 95% ±3  |
| fireworks | llama32        | 88% ±4  | 81% ±5  |
| fireworks | llama33        | 99% ±2  | 98% ±2  |
| fireworks | qwen25         | 100% ±1 | 100% ±4 |


In [35]:
# Two-sided per-model exact tests between the pooled t=0 and t=0.8 runs,
# with rows sorted by index for readability.
# NOTE(review): bonferroni=True presumably applies a correction sized
# automatically by compare_m_experiments — confirm how the factor is chosen.
compare_m_experiments(
    combined_experiments_temp,
    bonferroni=True,
    alternative="two-sided",
).sort_index(axis=0)

Unnamed: 0,Unnamed: 1,Fisher exact,Barnard exact,Outcome
Anthropic,Haiku_3,0.4517915,0.5831062,False
Anthropic,Haiku_35,1.0,0.7511386,False
Anthropic,Sonnet_35,1.0,1.0,False
Ollama,deepseekr1,0.7711052,0.7242056,False
Ollama,llama32,0.002974332,0.002244,True
Ollama,nemotron,0.0195538,0.01594044,False
Ollama,phi3,1.508677e-07,1.051415e-07,True
Ollama,phi4,1.0,0.734337,False
fireworks,deepseekr1_70b,1.0,1.0,False
fireworks,llama31,1.0,0.9104654,False


#### Hypothesis 3: Do models conform better when the format is given in the system prompt or in the user prompt?


Collate similar experiments


In [36]:
# Pool every run by where the format was specified: system prompt ("sys")
# vs. user prompt ("user").
# NOTE(review): this rebinds combined_experiment_sys_user, which an earlier
# cell bound to a {"models": [...]} dict; re-running earlier cells after this
# one will pick up this value instead.
combined_experiment_sys_user = {
    "sys": [
        results_sys_user["structure_support_by_model_sys"],
        results_temp["structure_support_by_model_t0"],
        results_temp["structure_support_by_model_t08"],
        results_temp_sup["structure_support_by_model_sys"],
        results_list["structure_support_by_model_sys"],
        results_sys_other["structure_support_by_model_sys"],
    ],
    "user": [
        results_sys_user["structure_support_by_model_user"],
        results_temp_sup["structure_support_by_model_user"],
        results_sys_other["structure_support_by_model_user"],
    ],
}

In [37]:
# Markdown rendering of the pooled sys vs. user table (original analysis).
res = results_to_table(
    combined_experiment_sys_user,
    combine_levels=True,
    cell_format=format_ci_pm,
    markdown=True,
)
print(res)

| level_0   | level_1        | sys     | user    |
|:----------|:---------------|:--------|:--------|
| Anthropic | Haiku_3        | 94% ±3  | 95% ±6  |
| Anthropic | Haiku_35       | 99% ±2  | 100% ±4 |
| Anthropic | Sonnet_35      | 100% ±1 | 100% ±4 |
| Ollama    | deepseekr1     | 71% ±4  | 81% ±5  |
| Ollama    | llama32        | 73% ±4  | 83% ±5  |
| Ollama    | nemotron       | 11% ±3  | 39% ±6  |
| Ollama    | phi3           | 31% ±4  | 47% ±6  |
| Ollama    | phi4           | 99% ±1  | 98% ±2  |
| fireworks | deepseekr1_70b | 0% ±2   | nan     |
| fireworks | llama31        | 94% ±2  | 95% ±3  |
| fireworks | llama32        | 81% ±3  | 86% ±4  |
| fireworks | llama33        | 99% ±1  | 99% ±2  |
| fireworks | qwen25         | 100% ±1 | 100% ±4 |


In [38]:
# Two-sided per-model exact tests between the pooled sys and user runs,
# with rows sorted by index for readability.
# NOTE(review): bonferroni=True presumably applies a correction sized
# automatically by compare_m_experiments — confirm how the factor is chosen.
compare_m_experiments(
    combined_experiment_sys_user,
    bonferroni=True,
    alternative="two-sided",
).sort_index(axis=0)

Unnamed: 0,Unnamed: 1,Fisher exact,Barnard exact,Outcome
Anthropic,Haiku_3,1.0,0.7769068,False
Anthropic,Haiku_35,1.0,0.473764,False
Anthropic,Sonnet_35,1.0,1.0,False
Ollama,deepseekr1,0.0008052879,0.0007529682,True
Ollama,llama32,0.001459952,0.001585081,True
Ollama,nemotron,4.235695e-21,1.684465e-21,True
Ollama,phi3,2.913387e-06,2.214949e-06,True
Ollama,phi4,0.5551327,0.5868235,False
fireworks,llama31,0.4539338,0.4173579,False
fireworks,llama32,0.131039,0.1114722,False


#### Hypothesis 4: Do LangChain output parsers do as well as format examples for complex formats?


Collate similar experiments


In [39]:
# Pool system-prompt runs ("sys") against output-parser runs ("parser").
# NOTE(review): the variable is still called combined_experiment_sys_user
# although it now holds sys vs. parser data, and it overwrites the value an
# earlier cell bound to the same name.
combined_experiment_sys_user = {
    "sys": [
        results_temp["structure_support_by_model_t0"],
        results_temp["structure_support_by_model_t08"],
        results_temp_sup["structure_support_by_model_sys"],
        results_list["structure_support_by_model_sys"],
        results_sys_other["structure_support_by_model_sys"],
    ],
    "parser": [
        results_sys_other["structure_support_by_model_parser"],
        results_temp_sup["structure_support_by_model_parsers"],
    ],
}

In [40]:
# Markdown rendering of the pooled sys vs. parser table (original analysis).
res = results_to_table(
    combined_experiment_sys_user,
    combine_levels=True,
    cell_format=format_ci_pm,
    markdown=True,
)
print(res)

| level_0   | level_1        | sys     | parser   |
|:----------|:---------------|:--------|:---------|
| Anthropic | Haiku_3        | 94% ±3  | 52% ±11  |
| Anthropic | Haiku_35       | 99% ±2  | 50% ±11  |
| Anthropic | Sonnet_35      | 100% ±1 | 71% ±11  |
| Ollama    | deepseekr1     | 73% ±4  | 8% ±5    |
| Ollama    | llama32        | 74% ±4  | 8% ±5    |
| Ollama    | nemotron       | 8% ±3   | 0% ±5    |
| Ollama    | phi3           | 34% ±4  | 4% ±4    |
| Ollama    | phi4           | 99% ±1  | 61% ±11  |
| fireworks | deepseekr1_70b | 0% ±2   | nan      |
| fireworks | llama31        | 94% ±3  | 42% ±11  |
| fireworks | llama32        | 83% ±4  | 0% ±5    |
| fireworks | llama33        | 99% ±2  | 31% ±11  |
| fireworks | qwen25         | 100% ±1 | 75% ±10  |
