In [62]:
import pickle
import pandas as pd
import tabulate
from pydantic_structure_definitions import *
from experiment_xml import (
    load_single_experiment,
    load_experiment_summary,
    analyse_xml_experiment,
)
from analyse_experiment import (
    wilson_score_ci,
    results_to_table,
    compare_m_experiments,
    format_ci_pm,
    format_diag,
    results_to_table,
)

In [4]:
%load_ext autoreload
%autoreload 1

In [5]:
# Switch to the folder with the experiment outputs. The path is relative, so
# re-running this cell without restarting the kernel will likely fail.
%cd "Experiment Outputs"

/home/andrew/Code/mastering-structured-output/Experiment Outputs


### Experiment 6

Hypothesis: Supplying the format via the system prompt, the user prompt, or an output parser changes the conformance to XML schema (all runs at temperature 0)


In [6]:
# Temperature 0 - sys, user, parsers experiment
# Includes Anthropic models
# load_experiment_summary fills `results_temp_sup` in place (one entry per
# "Loaded ..." line printed below) and returns the experiment metadata.
results_temp_sup = {}
experiment_date = "28-02-25"
experiment_num = "6"
metadata6 = load_experiment_summary(experiment_num, experiment_date, results_temp_sup)

Loaded structure_support_by_model_sys
Loaded structure_support_by_model_user
Loaded structure_support_by_model_parsers


In [7]:
# Show the hypotheses recorded in the experiment metadata, one per line.
print(*metadata6.get("hypotheses"), sep="\n")

1. Do small models including Phi3 and Nemotron improve their output by providing more explicit instructions on the format at the end of the prompt?
2. Do output parsers perform worse for all models?
3. What is the performance of the models using user parsers?


In [8]:
# Conformance per model with one column per loaded run; cells are rendered by
# format_ci_pm (the "95% ±6" style shown in the output).
temp_sup_table_1 = results_to_table(
    results_temp_sup, combine_levels=True, cell_format=format_ci_pm
)
temp_sup_table_1

Unnamed: 0,Unnamed: 1,sys,user,parsers
Anthropic,Haiku_3,95% ±6,95% ±6,52% ±11
Anthropic,Haiku_35,99% ±4,100% ±4,50% ±11
Anthropic,Sonnet_35,100% ±4,100% ±4,71% ±11
Ollama,deepseekr1,76% ±9,81% ±9,9% ±8
Ollama,llama32,84% ±8,87% ±8,5% ±7
Ollama,nemotron,8% ±7,50% ±10,0% ±5
Ollama,phi3,47% ±10,61% ±10,5% ±7
Ollama,phi4,99% ±4,99% ±4,61% ±11
fireworks,llama31,95% ±6,95% ±6,42% ±11
fireworks,llama32,87% ±8,90% ±7,0% ±5


Reanalyse using different search modes


In [9]:
# Regenerate the schema classes with search_mode="unordered" and re-run the
# analysis of every loaded run against the ArticleResponse* schemas.
classes = generate_xml_classes(search_mode="unordered")
structured_formats_xml = [
    {"pydantic": model_cls, "format_instructions": None}
    for cls_name, model_cls in classes.items()
    if cls_name.startswith("ArticleResponse")
]
results_temp_sup_relaxed = {
    run_name: analyse_xml_experiment(run_results, structured_formats_xml, verbose=False)
    for run_name, run_results in results_temp_sup.items()
}

In [10]:
# Parser run only: original analysis ("strict") next to the
# search_mode="unordered" reanalysis, rendered as a markdown table.
temp_sup_table_2 = results_to_table(
    {
        "strict": results_temp_sup["structure_support_by_model_parsers"],
        "unordered": results_temp_sup_relaxed["structure_support_by_model_parsers"],
    },
    combine_levels=True,
    cell_format=format_ci_pm,
    markdown=True,
)
print(temp_sup_table_2)

| level_0   | level_1    | strict   | unordered   |
|:----------|:-----------|:---------|:------------|
| Anthropic | Haiku_3    | 52% ±11  | 54% ±11     |
| Anthropic | Haiku_35   | 50% ±11  | 75% ±10     |
| Anthropic | Sonnet_35  | 71% ±11  | 71% ±11     |
| Ollama    | deepseekr1 | 9% ±8    | 9% ±8       |
| Ollama    | llama32    | 5% ±7    | 20% ±10     |
| Ollama    | nemotron   | 0% ±5    | 0% ±5       |
| Ollama    | phi3       | 5% ±7    | 6% ±8       |
| Ollama    | phi4       | 61% ±11  | 69% ±11     |
| fireworks | llama31    | 42% ±11  | 57% ±11     |
| fireworks | llama32    | 0% ±5    | 0% ±5       |
| fireworks | llama33    | 31% ±11  | 31% ±11     |
| fireworks | qwen25     | 75% ±10  | 75% ±10     |


In [11]:
# Per-schema breakdown (combine_levels=False) of the parser run only.
# NOTE(review): this rebinds temp_sup_table_1 / temp_sup_table_2, which earlier
# cells assigned to different tables, so their meaning depends on run order.
temp_sup_table_1 = results_to_table(
    results_temp_sup,
    combine_levels=False,
    cell_format=format_ci_pm,
    subset=["structure_support_by_model_parsers"],
)
# compare_to appears to bold the cells that differ from the reference table
# (see the ** markers in the output) - TODO confirm in results_to_table.
temp_sup_table_2 = results_to_table(
    results_temp_sup_relaxed,
    combine_levels=False,
    cell_format=format_ci_pm,
    subset=["structure_support_by_model_parsers"],
    compare_to=temp_sup_table_1,
    markdown=True,
)
print(temp_sup_table_2)

| level_0   | level_1    | ArticleResponse1nointXML   | ArticleResponse2XML   | ArticleResponse3XML   | ArticleResponse4XML   |
|:----------|:-----------|:---------------------------|:----------------------|:----------------------|:----------------------|
| Anthropic | Haiku_3    | **30% ±22**                | 100% ±16              | 85% ±21               | 0% ±16                |
| Anthropic | Haiku_35   | **100% ±16**               | 100% ±16              | 100% ±16              | 0% ±16                |
| Anthropic | Sonnet_35  | 100% ±16                   | 100% ±16              | 85% ±21               | 0% ±16                |
| Ollama    | deepseekr1 | 10% ±20                    | 25% ±22               | 0% ±16                | 0% ±16                |
| Ollama    | llama32    | **40% ±21**                | **40% ±21**           | 0% ±16                | 0% ±16                |
| Ollama    | nemotron   | 0% ±16                     | 0% ±16                | 0% ±16                | 

### Experiment 4

Hypothesis: Higher temperature worsens the conformance to XML schema


In [12]:
# Temperature 0/0.8 experiment
# Includes Anthropic models
# `results_temp` is filled in place with the t0 and t08 runs (see the
# "Loaded ..." lines below); the return value is the experiment metadata.
results_temp = {}
experiment_date = "25-02-25"
experiment_num = "4"
metadata4 = load_experiment_summary(experiment_num, experiment_date, results_temp)

Loaded structure_support_by_model_t0
Loaded structure_support_by_model_t08


In [13]:
# Per-model Fisher/Barnard tests between the two temperature runs, using the
# function's default correction and alternative settings.
compare_m_experiments(results_temp)

Unnamed: 0,Unnamed: 1,Fisher exact,Barnard exact,Outcome
Ollama,phi3,0.00651,0.004496,False
fireworks,llama31,0.616328,1.0,False
Anthropic,Haiku_3,0.383672,0.296312,False
Ollama,nemotron,0.283963,0.264551,False
Anthropic,Sonnet_35,1.0,1.0,False
fireworks,llama32,0.126409,0.122265,False
fireworks,llama33,0.689399,1.0,False
fireworks,deepseekr1_70b,1.0,1.0,False
Anthropic,Haiku_35,0.751256,1.0,False
Ollama,llama32,0.142089,0.122582,False


In [14]:
# Conformance per model at each temperature under the original analysis; kept
# in temp_table_1 as the reference for the reanalysis comparison below.
temp_table_1 = results_to_table(
    results_temp, combine_levels=True, cell_format=format_ci_pm
)
temp_table_1

Unnamed: 0,Unnamed: 1,t0,t08
Anthropic,Haiku_3,95% ±6,93% ±7
Anthropic,Haiku_35,99% ±4,99% ±4
Anthropic,Sonnet_35,100% ±4,100% ±4
Ollama,deepseekr1,76% ±9,74% ±9
Ollama,llama32,84% ±8,77% ±9
Ollama,nemotron,8% ±7,5% ±6
Ollama,phi3,47% ±10,29% ±10
Ollama,phi4,99% ±4,100% ±4
fireworks,deepseekr1_70b,0% ±4,0% ±4
fireworks,llama31,94% ±6,94% ±6


Reanalyse using different search modes


In [15]:
# Regenerate the schema classes with search_mode="unordered" and re-run the
# analysis of both temperature runs against the ArticleResponse* schemas.
classes = generate_xml_classes(search_mode="unordered")
structured_formats_xml = [
    {"pydantic": model_cls, "format_instructions": None}
    for cls_name, model_cls in classes.items()
    if cls_name.startswith("ArticleResponse")
]
results_temp_relaxed = {
    run_name: analyse_xml_experiment(run_results, structured_formats_xml, verbose=False)
    for run_name, run_results in results_temp.items()
}

In [16]:
# Same table for the "unordered" reanalysis, passed the original table as
# compare_to so the two analyses can be compared.
temp_table_2 = results_to_table(
    results_temp_relaxed,
    combine_levels=True,
    cell_format=format_ci_pm,
    compare_to=temp_table_1,
)
temp_table_2

Unnamed: 0,Unnamed: 1,t0,t08
Anthropic,Haiku_3,95% ±6,93% ±7
Anthropic,Haiku_35,99% ±4,99% ±4
Anthropic,Sonnet_35,100% ±4,100% ±4
Ollama,deepseekr1,76% ±9,74% ±9
Ollama,llama32,84% ±8,78% ±9
Ollama,nemotron,8% ±7,5% ±6
Ollama,phi3,47% ±10,30% ±10
Ollama,phi4,99% ±4,100% ±4
fireworks,deepseekr1_70b,0% ±4,0% ±4
fireworks,llama31,94% ±6,94% ±6


### Experiment 3

Hypothesis: Encapsulating lists improves the conformance to XML schema


In [17]:
# Testing list prompting
# `results_list` is filled in place with the sys and alt runs (see the
# "Loaded ..." lines below); the return value is the experiment metadata.
results_list = {}
experiment_date = "24-02-25"
experiment_num = "3"
metadata3 = load_experiment_summary(experiment_num, experiment_date, results_list)

Loaded structure_support_by_model_sys
Loaded structure_support_by_model_alt


In [18]:
# Conformance per model for the sys and alt runs under the original analysis;
# kept in list_table_1 as the reference for the reanalysis comparison below.
list_table_1 = results_to_table(
    results_list, combine_levels=True, cell_format=format_ci_pm
)
list_table_1

Unnamed: 0,Unnamed: 1,sys,alt
Anthropic,Haiku_3,94% ±8,95% ±7
Anthropic,Haiku_35,100% ±5,100% ±5
Anthropic,Sonnet_35,100% ±5,100% ±5
Ollama,deepseekr1,65% ±11,61% ±11
Ollama,llama32,61% ±11,74% ±11
Ollama,nemotron,5% ±7,1% ±5
Ollama,phi3,18% ±10,12% ±9
Ollama,phi4,99% ±5,100% ±5
fireworks,llama31,90% ±9,85% ±9
fireworks,llama32,79% ±10,80% ±10


Reanalyse using different search modes


In [19]:
# Regenerate the schema classes with search_mode="unordered" and re-run the
# analysis of both list-prompting runs against the ArticleResponse* schemas.
classes = generate_xml_classes(search_mode="unordered")
structured_formats_xml = [
    {"pydantic": model_cls, "format_instructions": None}
    for cls_name, model_cls in classes.items()
    if cls_name.startswith("ArticleResponse")
]
results_list_relaxed = {
    run_name: analyse_xml_experiment(run_results, structured_formats_xml, verbose=False)
    for run_name, run_results in results_list.items()
}

In [20]:
# Table for the "unordered" reanalysis of the list-prompting runs, with the
# original table passed as compare_to.
results_to_table(
    results_list_relaxed,
    combine_levels=True,
    cell_format=format_ci_pm,
    compare_to=list_table_1,
)

Unnamed: 0,Unnamed: 1,sys,alt
Anthropic,Haiku_3,94% ±8,95% ±7
Anthropic,Haiku_35,100% ±5,100% ±5
Anthropic,Sonnet_35,100% ±5,100% ±5
Ollama,deepseekr1,65% ±11,61% ±11
Ollama,llama32,65% ±11,78% ±10
Ollama,nemotron,5% ±7,1% ±5
Ollama,phi3,19% ±10,12% ±9
Ollama,phi4,99% ±5,100% ±5
fireworks,llama31,90% ±9,85% ±9
fireworks,llama32,79% ±10,80% ±10


In [21]:
# Per-model Fisher/Barnard tests between the sys and alt runs, using the
# function's default correction and alternative settings.
compare_m_experiments(results_list)

Unnamed: 0,Unnamed: 1,Fisher exact,Barnard exact,Outcome
Ollama,phi3,0.253618,0.264448,False
fireworks,llama31,0.237051,0.264448,False
Anthropic,Haiku_3,0.752466,1.0,False
Ollama,nemotron,0.183532,0.121823,False
Anthropic,Sonnet_35,1.0,1.0,False
fireworks,llama32,0.651807,1.0,False
fireworks,llama33,0.248428,0.105082,False
Anthropic,Haiku_35,1.0,1.0,False
Ollama,llama32,0.968583,1.0,False
Ollama,phi4,1.0,1.0,False


### Experiment X

Hypothesis: XML schema conformance differs between system and user prompts


In [22]:
# Testing system and user prompting (only the "sys" and "user" runs are loaded)
# NOTE(review): experiment_num "5" is also used for the 21-02-25 runs loaded
# further down; only the date differs - confirm this is intended.
results_sys_user = {}
experiment_date = "20-02-25"
experiment_num = "5"
load_single_experiment(experiment_num, experiment_date, "sys", results_sys_user)
load_single_experiment(experiment_num, experiment_date, "user", results_sys_user)

Loaded structure_support_by_model_sys
Loaded structure_support_by_model_user


In [23]:
# No cell_format is passed, so cells show plain percentages (see output).
results_to_table(results_sys_user, combine_levels=True)

Unnamed: 0,Unnamed: 1,sys,user
Ollama,deepseekr1,62%,79%
Ollama,llama32,67%,76%
Ollama,nemotron,27%,39%
Ollama,phi3,16%,43%
Ollama,phi4,99%,97%
fireworks,llama31,93%,94%
fireworks,llama32,73%,86%
fireworks,llama33,99%,100%


In [24]:
# Two-sided sys vs user tests; bonferroni=None presumably disables the
# multiple-comparison correction - TODO confirm. Rows are sorted by index.
compare_m_experiments(
    results_sys_user, bonferroni=None, alternative="two-sided"
).sort_index(axis=0)

Unnamed: 0,Unnamed: 1,Fisher exact,Barnard exact,Outcome
Ollama,deepseekr1,0.01275,0.008724,True
Ollama,llama32,0.209961,0.210671,False
Ollama,nemotron,0.097725,0.076838,False
Ollama,phi3,4.5e-05,2.7e-05,True
Ollama,phi4,0.621202,0.374089,False
fireworks,llama31,1.0,0.852408,False
fireworks,llama32,0.034734,0.024304,True
fireworks,llama33,1.0,0.529101,False


### Experiment 2

Hypothesis 1: XML schema conformance differs between system and user prompts

Hypothesis 2: XML schema conformance differs between system and system + reminder prompting

Hypothesis 3: Output parsers are worse than prompting for XML

Note: No Qwen2.5 or Anthropic models in this set


In [25]:
# Testing system and user prompting & output parsers
# Each call adds one run to `results_sys_other` (see the "Loaded ..." lines
# below); note the parser run is stored under a key ending in "_parser".
results_sys_other = {}
experiment_date = "21-02-25"
experiment_num = "5"
load_single_experiment(experiment_num, experiment_date, "sys", results_sys_other)
load_single_experiment(
    experiment_num, experiment_date, "sys_w_reminder", results_sys_other
)
load_single_experiment(experiment_num, experiment_date, "parser", results_sys_other)
load_single_experiment(experiment_num, experiment_date, "user", results_sys_other)

Loaded structure_support_by_model_sys
Loaded structure_support_by_model_sys_w_reminder
Loaded structure_support_by_model_parser
Loaded structure_support_by_model_user


In [26]:
# Prompt-based variants only: the parser run is left out through `subset`.
sys_other_table_1 = results_to_table(
    results_sys_other,
    combine_levels=True,
    cell_format=format_ci_pm,
    subset=[
        "structure_support_by_model_sys",
        "structure_support_by_model_sys_w_reminder",
        "structure_support_by_model_user",
    ],
)
sys_other_table_1

Unnamed: 0,Unnamed: 1,sys,sys_w_reminder,user
Ollama,deepseekr1,71% ±10,73% ±9,84% ±8
Ollama,llama32,63% ±10,67% ±10,85% ±8
Ollama,nemotron,11% ±8,63% ±10,27% ±9
Ollama,phi3,25% ±9,36% ±10,37% ±10
Ollama,phi4,97% ±5,98% ±5,99% ±4
fireworks,llama31,95% ±6,94% ±6,96% ±6
fireworks,llama32,82% ±9,83% ±9,81% ±9
fireworks,llama33,97% ±5,99% ±4,98% ±5


Reanalyse using different search modes


In [27]:
# Regenerate the schema classes with search_mode="unordered" and re-run the
# analysis of every run in results_sys_other against the ArticleResponse* schemas.
classes = generate_xml_classes(search_mode="unordered")
structured_formats_xml = [
    {"pydantic": model_cls, "format_instructions": None}
    for cls_name, model_cls in classes.items()
    if cls_name.startswith("ArticleResponse")
]
results_sys_other_relaxed = {
    run_name: analyse_xml_experiment(run_results, structured_formats_xml, verbose=False)
    for run_name, run_results in results_sys_other.items()
}

In [28]:
# "Unordered" reanalysis of the prompt-based variants, with the original table
# passed as compare_to.
results_to_table(
    results_sys_other_relaxed,
    combine_levels=True,
    cell_format=format_ci_pm,
    subset=[
        "structure_support_by_model_sys",
        "structure_support_by_model_sys_w_reminder",
        "structure_support_by_model_user",
    ],
    compare_to=sys_other_table_1,
)

Unnamed: 0,Unnamed: 1,sys,sys_w_reminder,user
Ollama,deepseekr1,72% ±9,73% ±9,84% ±8
Ollama,llama32,63% ±10,76% ±9,85% ±8
Ollama,nemotron,11% ±8,63% ±10,27% ±9
Ollama,phi3,26% ±9,37% ±10,39% ±10
Ollama,phi4,97% ±5,98% ±5,99% ±4
fireworks,llama31,95% ±6,94% ±6,96% ±6
fireworks,llama32,82% ±9,84% ±8,81% ±9
fireworks,llama33,97% ±5,99% ±4,98% ±5


In [29]:
# Two-sided test of sys vs user prompting.
# bonferroni=3 presumably corrects for the three hypotheses listed for this
# experiment - TODO confirm against compare_m_experiments.
compare_m_experiments(
    results_sys_other,
    bonferroni=3,
    alternative="two-sided",
    subset=[
        "structure_support_by_model_sys",
        "structure_support_by_model_user",
    ],
)

Unnamed: 0,Unnamed: 1,Fisher exact,Barnard exact,Outcome
Ollama,phi3,0.092198,0.076838,False
fireworks,llama31,1.0,0.819245,False
Ollama,nemotron,0.006335,0.004019,True
fireworks,llama32,1.0,0.91543,False
fireworks,llama33,1.0,0.752431,False
Ollama,llama32,0.000627,0.000401,True
Ollama,phi4,0.621202,0.374089,False
Ollama,deepseekr1,0.041416,0.028925,False


In [30]:
# Two-sided test of sys vs sys + reminder prompting.
# bonferroni=3 presumably corrects for the three hypotheses listed for this
# experiment - TODO confirm against compare_m_experiments.
compare_m_experiments(
    results_sys_other,
    bonferroni=3,
    alternative="two-sided",
    subset=[
        "structure_support_by_model_sys",
        "structure_support_by_model_sys_w_reminder",
    ],
)

Unnamed: 0,Unnamed: 1,Fisher exact,Barnard exact,Outcome
Ollama,phi3,0.1242213,0.103639,False
fireworks,llama31,1.0,0.8381458,False
Ollama,nemotron,1.036357e-14,8.218392e-15,True
fireworks,llama32,1.0,0.9130627,False
fireworks,llama33,0.6212025,0.374089,False
Ollama,llama32,0.6566579,0.5926245,False
Ollama,phi4,1.0,0.7524307,False
Ollama,deepseekr1,0.8749655,0.8192451,False


## Combined results


#### Hypothesis 1: Do different models have different performance in XML output?


Collate similar experiments


In [None]:
# Pool the user-prompt runs from three experiments into one condition.
# NOTE(review): only "user" is collated here, yet the tables printed below also
# show a "sys" column - those outputs appear to come from a different (stale)
# definition of this dict. TODO: confirm the intended contents.
combined_experiment_sys_user = {
    "user": [
        results_sys_user["structure_support_by_model_user"],
        results_temp_sup["structure_support_by_model_user"],
        results_sys_other["structure_support_by_model_user"],
    ],
}

In [95]:
# Regenerate the schema classes with search_mode="unordered" and re-run the
# analysis of each pooled run, keeping the condition -> list-of-runs layout.
classes = generate_xml_classes(search_mode="unordered")
structured_formats_xml = [
    {"pydantic": model_cls, "format_instructions": None}
    for cls_name, model_cls in classes.items()
    if cls_name.startswith("ArticleResponse")
]
combined_experiment_sys_user_relaxed = {
    cond_name: [
        analyse_xml_experiment(run_results, structured_formats_xml, verbose=False)
        for run_results in cond_runs
    ]
    for cond_name, cond_runs in combined_experiment_sys_user.items()
}

In [104]:
# Original-analysis table (plain DataFrame) used as the reference ...
res_table_1 = results_to_table(
    combined_experiment_sys_user,
    combine_levels=True,
    cell_format=format_ci_pm,
    markdown=False,
)
# ... for the "unordered" reanalysis table, which is rendered as markdown.
res_table_2 = results_to_table(
    combined_experiment_sys_user_relaxed,
    combine_levels=True,
    cell_format=format_ci_pm,
    markdown=True,
    compare_to=res_table_1,
)
print(res_table_2)

| level_0   | level_1        | user       | sys        |
|:----------|:---------------|:-----------|:-----------|
| Anthropic | Haiku_3        | 95% ±6     | 94% ±3     |
| Anthropic | Haiku_35       | 100% ±4    | 99% ±2     |
| Anthropic | Sonnet_35      | 100% ±4    | 100% ±1    |
| Ollama    | deepseekr1     | 81% ±5     | 71% ±4     |
| Ollama    | llama32        | **86% ±4** | **77% ±4** |
| Ollama    | nemotron       | 39% ±6     | 11% ±3     |
| Ollama    | phi3           | **48% ±6** | 32% ±4     |
| Ollama    | phi4           | 98% ±2     | 99% ±1     |
| fireworks | deepseekr1_70b | **nan**    | 0% ±2      |
| fireworks | llama31        | 95% ±3     | 93% ±3     |
| fireworks | llama32        | 86% ±4     | 81% ±4     |
| fireworks | llama33        | 99% ±2     | 99% ±1     |
| fireworks | qwen25         | 100% ±4    | 100% ±1    |


In [105]:
# Plain-text dump of the original-analysis table for comparison with the
# reanalysis table printed by the previous cell.
print(res_table_1)

                             user      sys
Anthropic Haiku_3          95% ±6   94% ±3
          Haiku_35        100% ±4   99% ±2
          Sonnet_35       100% ±4  100% ±1
Ollama    deepseekr1       81% ±5   71% ±4
          llama32          83% ±5   75% ±4
          nemotron         39% ±6   11% ±3
          phi3             47% ±6   32% ±4
          phi4             98% ±2   99% ±1
fireworks deepseekr1_70b      NaN    0% ±2
          llama31          95% ±3   93% ±3
          llama32          86% ±4   81% ±4
          llama33          99% ±2   99% ±1
          qwen25          100% ±4  100% ±1


#### Hypothesis 2: Do different temperatures cause different compliance to XML?


Collate similar experiments

- Experiment 4: t=0 & t=0.8
- Experiment 6: t=0
- Experiment 2: t=0.8


In [35]:
# Pool runs by sampling temperature: each list holds the runs treated as one
# condition (t=0 from experiments 4 and 6, t=0.8 from experiments 4 and 2, as
# listed in the markdown above).
combined_experiments_temp = {
    "t=0": [
        results_temp["structure_support_by_model_t0"],
        results_temp_sup["structure_support_by_model_sys"],
        results_temp_sup["structure_support_by_model_user"],
    ],
    "t=0.8": [
        results_temp["structure_support_by_model_t08"],
        results_sys_other["structure_support_by_model_sys"],
        results_sys_other["structure_support_by_model_user"],
    ],
}

In [36]:
# Regenerate the schema classes with search_mode="unordered" and re-run the
# analysis of each pooled run, keeping the condition -> list-of-runs layout.
classes = generate_xml_classes(search_mode="unordered")
structured_formats_xml = [
    {"pydantic": model_cls, "format_instructions": None}
    for cls_name, model_cls in classes.items()
    if cls_name.startswith("ArticleResponse")
]
combined_experiment_temp_relaxed = {
    cond_name: [
        analyse_xml_experiment(run_results, structured_formats_xml, verbose=False)
        for run_results in cond_runs
    ]
    for cond_name, cond_runs in combined_experiments_temp.items()
}

In [37]:
# Original-analysis table for the pooled temperature conditions ...
res_table_temp_1 = results_to_table(
    combined_experiments_temp,
    combine_levels=True,
    cell_format=format_ci_pm,
)
# ... and the "unordered" reanalysis table, compared against it.
res_table_temp_2 = results_to_table(
    combined_experiment_temp_relaxed,
    combine_levels=True,
    cell_format=format_ci_pm,
    compare_to=res_table_temp_1,
)
res_table_temp_2

Unnamed: 0,Unnamed: 1,t=0,t=0.8
Anthropic,Haiku_3,95% ±3,93% ±7
Anthropic,Haiku_35,99% ±2,99% ±4
Anthropic,Sonnet_35,100% ±1,100% ±4
Ollama,deepseekr1,78% ±5,77% ±5
Ollama,llama32,87% ±4,75% ±5
Ollama,nemotron,22% ±5,14% ±4
Ollama,phi3,52% ±6,32% ±5
Ollama,phi4,99% ±2,99% ±2
fireworks,deepseekr1_70b,0% ±4,0% ±4
fireworks,llama31,95% ±3,95% ±3


In [65]:
# Markdown rendering of the original-analysis temperature table.
res = results_to_table(
    combined_experiments_temp,
    combine_levels=True,
    cell_format=format_ci_pm,
    markdown=True,
)
print(res)

| level_0   | level_1        | t=0     | t=0.8   |
|:----------|:---------------|:--------|:--------|
| Anthropic | Haiku_3        | 95% ±3  | 93% ±7  |
| Anthropic | Haiku_35       | 99% ±2  | 99% ±4  |
| Anthropic | Sonnet_35      | 100% ±1 | 100% ±4 |
| Ollama    | deepseekr1     | 78% ±5  | 76% ±5  |
| Ollama    | llama32        | 85% ±4  | 75% ±5  |
| Ollama    | nemotron       | 22% ±5  | 14% ±4  |
| Ollama    | phi3           | 52% ±6  | 30% ±5  |
| Ollama    | phi4           | 99% ±2  | 99% ±2  |
| fireworks | deepseekr1_70b | 0% ±4   | 0% ±4   |
| fireworks | llama31        | 95% ±3  | 95% ±3  |
| fireworks | llama32        | 88% ±4  | 81% ±5  |
| fireworks | llama33        | 99% ±2  | 98% ±2  |
| fireworks | qwen25         | 100% ±1 | 100% ±4 |


In [39]:
# Two-sided per-model tests between the pooled t=0 and t=0.8 conditions, with
# the Bonferroni correction switched on; rows are sorted by index.
compare_m_experiments(
    combined_experiments_temp,
    bonferroni=True,
    alternative="two-sided",
).sort_index(axis=0)

Unnamed: 0,Unnamed: 1,Fisher exact,Barnard exact,Outcome
Anthropic,Haiku_3,0.4517915,0.5831062,False
Anthropic,Haiku_35,1.0,0.7511386,False
Anthropic,Sonnet_35,1.0,1.0,False
Ollama,deepseekr1,0.7711052,0.7242056,False
Ollama,llama32,0.002974332,0.002244,True
Ollama,nemotron,0.0195538,0.01594044,False
Ollama,phi3,1.508677e-07,1.051415e-07,True
Ollama,phi4,1.0,0.734337,False
fireworks,deepseekr1_70b,1.0,1.0,False
fireworks,llama31,1.0,0.9104654,False


#### Hypothesis 3: Do models conform better when the format is given in the system prompt or in the user prompt?


Collate similar experiments


In [40]:
# Pool the sys and user runs from three experiments into two conditions.
# NOTE(review): this rebinds combined_experiment_sys_user, which an earlier
# cell defines with only the "user" entry.
combined_experiment_sys_user = {
    "sys": [
        results_sys_user["structure_support_by_model_sys"],
        results_temp_sup["structure_support_by_model_sys"],
        results_sys_other["structure_support_by_model_sys"],
    ],
    "user": [
        results_sys_user["structure_support_by_model_user"],
        results_temp_sup["structure_support_by_model_user"],
        results_sys_other["structure_support_by_model_user"],
    ],
}

In [41]:
# Markdown table of the pooled sys vs user conditions (original analysis).
res = results_to_table(
    combined_experiment_sys_user,
    combine_levels=True,
    cell_format=format_ci_pm,
    markdown=True,
)
print(res)

| level_0   | level_1    | sys     | user    |
|:----------|:-----------|:--------|:--------|
| Anthropic | Haiku_3    | 95% ±6  | 95% ±6  |
| Anthropic | Haiku_35   | 99% ±4  | 100% ±4 |
| Anthropic | Sonnet_35  | 100% ±4 | 100% ±4 |
| Ollama    | deepseekr1 | 70% ±5  | 81% ±5  |
| Ollama    | llama32    | 71% ±5  | 83% ±5  |
| Ollama    | nemotron   | 15% ±5  | 39% ±6  |
| Ollama    | phi3       | 29% ±5  | 47% ±6  |
| Ollama    | phi4       | 98% ±2  | 98% ±2  |
| fireworks | llama31    | 94% ±3  | 95% ±3  |
| fireworks | llama32    | 81% ±5  | 86% ±4  |
| fireworks | llama33    | 99% ±2  | 99% ±2  |
| fireworks | qwen25     | 100% ±4 | 100% ±4 |


In [42]:
# Two-sided per-model tests between the pooled sys and user conditions, with
# the Bonferroni correction switched on; rows are sorted by index.
compare_m_experiments(
    combined_experiment_sys_user,
    bonferroni=True,
    alternative="two-sided",
).sort_index(axis=0)

Unnamed: 0,Unnamed: 1,Fisher exact,Barnard exact,Outcome
Anthropic,Haiku_3,1.0,1.0,False
Anthropic,Haiku_35,1.0,0.5291009,False
Anthropic,Sonnet_35,1.0,1.0,False
Ollama,deepseekr1,0.00120498,0.0009282922,True
Ollama,llama32,0.001316218,0.0009928911,True
Ollama,nemotron,1.346911e-10,1.147929e-10,True
Ollama,phi3,1.166575e-05,8.196285e-06,True
Ollama,phi4,1.0,1.0,False
fireworks,llama31,0.8561652,0.7516218,False
fireworks,llama32,0.1263373,0.127787,False


#### Hypothesis 4: Do LangChain output parsers do as well as format examples for complex formats?


Collate similar experiments


In [68]:
# Pool the system-prompt runs and the output-parser runs from two experiments.
# The parser key differs between the two result sets ("..._parser" vs
# "..._parsers"), matching the names printed when each was loaded.
combined_experiment_sys_parser = {
    "sys": [
        results_sys_other["structure_support_by_model_sys"],
        results_temp_sup["structure_support_by_model_sys"],
    ],
    "parser": [
        results_sys_other["structure_support_by_model_parser"],
        results_temp_sup["structure_support_by_model_parsers"],
    ],
}

In [69]:
# Markdown table of the pooled sys vs parser conditions (original analysis).
res = results_to_table(
    combined_experiment_sys_parser,
    combine_levels=True,
    cell_format=format_ci_pm,
    markdown=True,
)
print(res)

| level_0   | level_1    | sys     | parser   |
|:----------|:-----------|:--------|:---------|
| Anthropic | Haiku_3    | 95% ±6  | 52% ±11  |
| Anthropic | Haiku_35   | 99% ±4  | 50% ±11  |
| Anthropic | Sonnet_35  | 100% ±4 | 71% ±11  |
| Ollama    | deepseekr1 | 74% ±7  | 8% ±5    |
| Ollama    | llama32    | 74% ±7  | 8% ±5    |
| Ollama    | nemotron   | 10% ±5  | 0% ±5    |
| Ollama    | phi3       | 36% ±7  | 4% ±4    |
| Ollama    | phi4       | 98% ±3  | 61% ±11  |
| fireworks | llama31    | 95% ±4  | 42% ±11  |
| fireworks | llama32    | 84% ±6  | 0% ±5    |
| fireworks | llama33    | 98% ±3  | 31% ±11  |
| fireworks | qwen25     | 100% ±4 | 75% ±10  |


In [72]:
# Regenerate the schema classes with search_mode="unordered" and re-run the
# analysis of each pooled run, keeping the condition -> list-of-runs layout.
classes = generate_xml_classes(search_mode="unordered")
structured_formats_xml = [
    {"pydantic": model_cls, "format_instructions": None}
    for cls_name, model_cls in classes.items()
    if cls_name.startswith("ArticleResponse")
]
combined_experiment_sys_parser_relaxed = {
    cond_name: [
        analyse_xml_experiment(run_results, structured_formats_xml, verbose=False)
        for run_results in cond_runs
    ]
    for cond_name, cond_runs in combined_experiment_sys_parser.items()
}

# Reference table from the original analysis.
res_table_sys_parser_1 = results_to_table(
    combined_experiment_sys_parser, combine_levels=True, cell_format=format_ci_pm
)
# Reanalysis table, compared against the reference and rendered as markdown.
res_table_sys_parser_2 = results_to_table(
    combined_experiment_sys_parser_relaxed,
    compare_to=res_table_sys_parser_1,
    combine_levels=True,
    cell_format=format_ci_pm,
    markdown=True,
)
print(res_table_sys_parser_2)

| level_0   | level_1    | sys        | parser      |
|:----------|:-----------|:-----------|:------------|
| Anthropic | Haiku_3    | 95% ±6     | **54% ±11** |
| Anthropic | Haiku_35   | 99% ±4     | **75% ±10** |
| Anthropic | Sonnet_35  | 100% ±4    | 71% ±11     |
| Ollama    | deepseekr1 | **74% ±6** | 8% ±5       |
| Ollama    | llama32    | 74% ±7     | **18% ±7**  |
| Ollama    | nemotron   | 10% ±5     | 0% ±5       |
| Ollama    | phi3       | 36% ±7     | 4% ±4       |
| Ollama    | phi4       | 98% ±3     | **69% ±11** |
| fireworks | llama31    | 95% ±4     | **57% ±11** |
| fireworks | llama32    | 84% ±6     | 0% ±5       |
| fireworks | llama33    | 98% ±3     | 31% ±11     |
| fireworks | qwen25     | 100% ±4    | 75% ±10     |


## Error analysis


In [45]:
# Ordered (needle, label) rules matched against the lower-cased error message.
# The first hit wins, so the order is significant.
_PARSE_ERROR_RULES = (
    ("opening and ending tag mismatch", "XML tag mismatch"),
    ("extracterror", "Missing main tags"),
    ("input should be a valid integer", "Validation error (int)"),
    ("premature end of data in tag", "Premature end"),
    ("field required", "Missing field"),
    ("expected '>'", "Tag malformed"),
    ("extra content at the end of the document", "Tag malformed"),
)
# Fallback rules, tried afterwards and matched case-sensitively on the raw message.
_PARSE_ERROR_FALLBACK_RULES = (
    ("BadGatewayError", "Connection error"),
    ("XMLSyntaxError", "Other syntax error"),
)


def _classify_parse_error(error):
    """Map a raw error message to a short category label (or itself if unknown)."""
    lowered = error.lower()
    for needle, label in _PARSE_ERROR_RULES:
        if needle in lowered:
            return label
    for needle, label in _PARSE_ERROR_FALLBACK_RULES:
        if needle in error:
            return label
    return error


def analyse_errors_from_results(ss_results, method="code", combined=False):
    """Count the errors recorded in one experiment run, per model.

    Args:
        ss_results: mapping ``model name -> test name -> result``, where each
            result has an ``"outputs"`` list of dicts carrying ``"error_type"``
            and ``"error_message"`` entries.
        method: ``"code"`` counts the ``error_type`` values as they are;
            ``"parse"`` buckets each non-``None`` ``error_message`` into a
            category by substring matching (unmatched messages are kept
            verbatim as their own category).
        combined: if true, counts are summed over all tests of a model and the
            columns are the categories; otherwise the columns are
            ``(test name, category)`` pairs.

    Returns:
        A DataFrame with one row per model and one column per category (or
        per ``(test, category)`` pair); combinations that never occurred are NaN.

    Raises:
        NameError: if ``method`` is neither ``"code"`` nor ``"parse"``. This is
            checked up front, so it is raised even when ``ss_results`` is empty.
    """
    if method not in ("code", "parse"):
        raise NameError(f"Method {method} not supported")

    error_counts = {}
    for mname, model_results in ss_results.items():
        model_counts = error_counts[mname] = {}
        for tname, test_result in model_results.items():
            outputs = test_result["outputs"]

            if method == "code":
                # value_counts yields (category, count) pairs, most frequent first
                tallies = pd.Series(
                    [output["error_type"] for output in outputs]
                ).value_counts().items()
            else:
                # One (category, 1) pair per output that carries an error message
                tallies = (
                    (_classify_parse_error(output["error_message"]), 1)
                    for output in outputs
                    if output["error_message"] is not None
                )

            for category, count in tallies:
                key = category if combined else (tname, category)
                model_counts[key] = model_counts.get(key, 0) + count

    return pd.DataFrame.from_dict(error_counts, orient="index")

Let's look at the errors in the sys/user/parsers experiment to determine what's going on for the worst models


In [50]:
# Parse-error categories per model for the user-prompt run, summed over all
# tests; missing combinations become 0 and columns are sorted by label.
analyse_errors_from_results(
    results_temp_sup["structure_support_by_model_user"], method="parse", combined=True
).fillna(0).sort_index(axis=1)

Unnamed: 0,Connection error,Missing field,Missing main tags,Other syntax error,Tag malformed,Validation error (int),XML tag mismatch
Ollama_llama32,0.0,8.0,0.0,0.0,0.0,5.0,0.0
Ollama_nemotron,0.0,1.0,44.0,0.0,0.0,5.0,0.0
Ollama_phi4,0.0,0.0,0.0,0.0,0.0,1.0,0.0
Ollama_deepseekr1,0.0,7.0,3.0,5.0,0.0,2.0,2.0
fireworks_llama31,1.0,0.0,0.0,0.0,0.0,4.0,0.0
fireworks_llama32,0.0,0.0,1.0,0.0,0.0,9.0,0.0
Anthropic_Haiku_3,0.0,0.0,0.0,0.0,0.0,5.0,0.0
Ollama_phi3,0.0,5.0,4.0,3.0,4.0,0.0,23.0
fireworks_llama33,1.0,0.0,0.0,0.0,0.0,0.0,1.0


DeepSeek performance is poor, and for the system prompt most errors are due to missing main tags -- typically indicating the output is not XML. Confirmed.

For the User prompt the XML is just badly generated with missing and corrupted tags.


In [None]:
# Print the error message and raw model content of every non-"ok" output that
# Ollama_deepseekr1 produced for ArticleResponse4XML in the user-prompt run.
for output in (
    results_temp_sup["structure_support_by_model_user"]
    ["Ollama_deepseekr1"]["ArticleResponse4XML"]["outputs"]
):
    if output["error_type"] == "ok":
        continue
    print("-" * 80)
    print(output["error_message"], "\n")
    if output["raw"] is not None:
        print(output["raw"].content)
    print()

Llama 3.2 on Fireworks is getting a big fat 0 for all levels with output parsers. Why's that?


In [None]:
# Print the error message and raw model content of every non-"ok" output that
# fireworks_llama32 produced for ArticleResponse2XML in the output-parser run.
for output in (
    results_temp_sup["structure_support_by_model_parsers"]
    ["fireworks_llama32"]["ArticleResponse2XML"]["outputs"]
):
    if output["error_type"] == "ok":
        continue
    print("-" * 80)
    print(output["error_message"], "\n")
    if output["raw"] is not None:
        print(output["raw"].content)
    print()

--------------------------------------------------------------------------------
RuntimeError, ExtractError: End or start strings not found 

**The Oldest Recorded Fossil: A Window into the Past**

Imagine walking through a barren, ancient landscape, surrounded by nothing but sand and rock. Yet, in this desolate environment, a tiny fragment of life has been preserved for over 3.5 billion years. This is the story of the oldest recorded fossil, a testament to the enduring power of life on Earth.

The fossil in question is a 3.5 billion-year-old stromatolite, a layered structure created by ancient microorganisms. Discovered in Western Australia, this ancient relic is a window into the early history of life on our planet. The stromatolite, named "Stromatolites of the Apex Chert," is a remarkable find, providing insights into the evolution of life on Earth.

This ancient fossil is a reminder that life has been present on our planet for an incredibly long time, long before the emergence of c

In [57]:
# Quick check of which runs are loaded in results_temp_sup.
results_temp_sup.keys()

dict_keys(['structure_support_by_model_sys', 'structure_support_by_model_user', 'structure_support_by_model_parsers'])

In [None]:
# Inspect everything stored for fireworks_llama32 in the user-prompt run.
results_temp_sup["structure_support_by_model_user"]["fireworks_llama32"]