## Using LangChain to get structured outputs


## Setup


In [None]:
import pickle
import pandas as pd
import tabulate
from scipy import stats
from langchain.prompts import ChatPromptTemplate
from langchain_anthropic import ChatAnthropic
from langchain_ollama import ChatOllama
from langchain_fireworks import ChatFireworks

from pydantic_structure_definitions import *

from experiment_xml import (
    pydantic_to_xml_instructions,
    run_xml_experiment,
    load_single_experiment,
    load_experiment_summary,
    analyse_xml_experiment,
)

In [2]:
# Read the provider API keys from the environment so that real credentials never
# have to be pasted into (and saved with) the notebook. The "<API KEY>"
# placeholder is kept as the fallback when a variable is not set.
import os

ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY", "<API KEY>")
FIREWORKS_API_KEY = os.environ.get("FIREWORKS_API_KEY", "<API KEY>")

In [3]:
# Load the API keys from Streamlit's secrets store (the "api_keys" section).
# This rebinds ANTHROPIC_API_KEY and FIREWORKS_API_KEY, so whatever an earlier
# cell assigned to those names is replaced.
import streamlit as st

# NOTE(review): LANGSMITH_API_KEY is not referenced by any code visible in this
# notebook — confirm it is still needed.
LANGSMITH_API_KEY = st.secrets["api_keys"]["LANGSMITH_API_KEY"]
ANTHROPIC_API_KEY = st.secrets["api_keys"]["ANTHROPIC_API_KEY"]
FIREWORKS_API_KEY = st.secrets["api_keys"]["FIREWORKS_API_KEY"]

In [4]:
# Identifiers embedded in the names of the result files written and read below
# (e.g. exp{experiment_num}_..._{experiment_date}.pkl).
experiment_date = "28-02-25"
experiment_num = "6"
# Passed to run_xml_experiment as n_iter (presumably repeats per question — confirm).
n_iter = 1

Let's start by creating the LLMs that will run our structured output queries. Use a temperature of 0 to improve structured output generation (but at the cost of "creativity").


### Model setup


In [5]:
# LLM parameters
# temperature = 0.0
# (temperature is not fixed here: the model-setup cell loops over [0, 0.8])
# Passed as `timeout` to the ChatFireworks and ChatAnthropic clients
# (presumably seconds — confirm).
timeout = 30
# Passed as `num_ctx` to every ChatOllama model.
num_ctx = 8192
# Passed as `num_predict` to every ChatOllama model.
num_predict = 4096

In [6]:
# Short experiment name -> provider model identifier, one table per provider.
# Dictionary order is kept, so the model dictionaries built below list the
# models in the same order as the tables.
ollama_model_ids = {
    "Ollama_llama32": "llama3.2",
    "Ollama_nemotron": "nemotron-mini",
    "Ollama_phi3": "phi3",
    "Ollama_phi4": "phi4",
    "Ollama_deepseekr1": "deepseek-r1",
}
fireworks_model_ids = {
    "fireworks_llama31": "accounts/fireworks/models/llama-v3p1-70b-instruct",
    "fireworks_llama32": "accounts/fireworks/models/llama-v3p2-3b-instruct",
    "fireworks_llama33": "accounts/fireworks/models/llama-v3p3-70b-instruct",
    "fireworks_qwen25": "accounts/fireworks/models/qwen2p5-72b-instruct",
}
anthropic_model_ids = {
    "Anthropic_Sonnet_35": "claude-3-5-sonnet-20241022",
    "Anthropic_Haiku_35": "claude-3-5-haiku-20241022",
    "Anthropic_Haiku_3": "claude-3-haiku-20240307",
}

# One dictionary of models per temperature: index 0 -> temperature 0,
# index 1 -> temperature 0.8.
llm_models = []
llm_models_with_anthropic = []
for ii, temperature in enumerate([0, 0.8]):
    # Local Ollama models first, then the Fireworks-hosted ones.
    llm_models_ii = {}
    for model_key, model_id in ollama_model_ids.items():
        llm_models_ii[model_key] = ChatOllama(
            model=model_id,
            temperature=temperature,
            num_ctx=num_ctx,
            num_thread=1,
            num_predict=num_predict,
        )
    for model_key, model_id in fireworks_model_ids.items():
        llm_models_ii[model_key] = ChatFireworks(
            model_name=model_id,
            api_key=FIREWORKS_API_KEY,
            temperature=temperature,
            timeout=timeout,
        )

    # Same models again, followed by the Anthropic ones.
    llm_models_with_anthropic_ii = dict(llm_models_ii)
    for model_key, model_id in anthropic_model_ids.items():
        llm_models_with_anthropic_ii[model_key] = ChatAnthropic(
            model=model_id,
            api_key=ANTHROPIC_API_KEY,
            timeout=timeout,
            temperature=temperature,
        )

    llm_models.append(llm_models_ii)
    llm_models_with_anthropic.append(llm_models_with_anthropic_ii)

### Problem setup and prompt


In [7]:
# Task prompt shared by every prompt template below; {question} is filled with
# one entry of `questions` per request.
# NOTE(review): "relavent" is misspelled inside the prompt. It is left as-is here
# because the text is sent to the models verbatim, so correcting it would change
# the prompt that previously saved results were generated with.
test_science_prompt_txt = """
You are a professional science writer tasked with responding to members of
the general public who write in asking questions about science.
Write an article responding to a writer's question for publication in a
science magazine intended for a general readership with a high-school education.
You should write clearly and compellingly, include all relavent context,
and provide motivating stories where applicable.

Your response must be less than 200 words.

The question given to you is the following:
{question}
"""

# The 20 reader questions put to each model; len(questions) is used later as the
# number of trials per model/schema pair.
questions = [
    "What is the oldest recorded fossil?",
    "What is a black hole?",
    "How far away is the sun?",
    "Which other planet in the Solar System has a surface gravity closest to that of the Earth?",
    "Eris, Haumea, Makemake and Ceres are all examples of what?",
    "Why does earth have seasons? Do other planets exhibit seasons too?",
    "What causes the aurora borealis?",
    "Why is the sky blue?",
    "How do bees communicate?",
    "What is the smallest unit of life?",
    "How do plants make their own food?",
    "Why do we dream?",
    "What is the theory of relativity?",
    "How do volcanoes erupt?",
    "What is the speed of light?",
    "How do magnets work?",
    "What is the purpose of DNA?",
    "What are the different types of galaxies?",
    "Why do some animals hibernate?",
    "How do vaccines work?",
]

In [8]:
# The prompt variants differ only in where the format instructions are placed.

# System message shared by the two "system" variants.
format_system_message = (
    "system",
    "Return a publishable article in the requested format.\n{format_instructions}",
)

# Task text only, no format instructions.
prompt_direct = ChatPromptTemplate.from_template(test_science_prompt_txt)

# Format instructions in the system message, task text in the human message.
prompt_system_format = ChatPromptTemplate.from_messages(
    [format_system_message, ("human", test_science_prompt_txt)]
)

# As above, plus a closing reminder about XML in the human message.
human_with_reminder = test_science_prompt_txt + "\nYour response must be in valid XML."
prompt_system_plus_reminder_format = ChatPromptTemplate.from_messages(
    [format_system_message, ("human", human_with_reminder)]
)

# Format instructions appended to the task text in a single message.
prompt_user_format = ChatPromptTemplate.from_template(
    test_science_prompt_txt + "\n{format_instructions}"
)

### Schema using Pydantic XML


In [5]:
def _xml_formats(schemas):
    """Pair each schema with the XML format instructions generated from it."""
    return [
        {
            "pydantic": schema,
            "format_instructions": pydantic_to_xml_instructions(schema),
        }
        for schema in schemas
    ]


# Schemas used by the experiment runs below.
structured_formats_xml = _xml_formats(
    [
        ArticleResponse1XML,
        ArticleResponse1nointXML,
        ArticleResponse2XML,
        ArticleResponse3XML,
        ArticleResponse4XML,
    ]
)

# Variant set that swaps in the "alt" definitions of schemas 2 and 4.
structured_formats_xml_alt = _xml_formats(
    [
        ArticleResponse1XML,
        ArticleResponse2XMLalt,
        ArticleResponse3XML,
        ArticleResponse4XMLalt,
    ]
)

In [6]:
# Preview the format instructions generated for one schema.
print(pydantic_to_xml_instructions(ArticleResponse2XMLalt))

You must respond only in XML using the following schema.
Do not provide any output outside the first and last XML tags.

<article>
  <!--Structured article for publication answering a reader's question-->
  <title>
    {Title of the article - must be type str}
  </title>
  <answer>
    {Answer the writer's question - must be type str}
  </answer>
<further_questions>
  <!--A list of related questions of interest to the readers-->
<!-- Next list element -->
  <further_question>
    {A related question of interest to readers - must be type str}
  </further_question>
<!-- First list element -->
  <further_question>
    {A related question of interest to readers - must be type str}
  </further_question>
<!-- Etc -->
  <further_question>
  ...
  </further_question>
</further_questions>
</article>


In [7]:
from langchain.output_parsers import XMLOutputParser

# Tag lists handed to LangChain's XMLOutputParser, one per schema and in the same
# order as `parser_schemas` below (the two lists are paired by position).
parser_tag_sets = [
    ["article", "title", "answer", "number"],
    ["article", "title", "answer", "further_question"],
    [
        "article",
        "title",
        "historical_event_1",
        "year",
        "event",
        "historical_event_2",
        "year",
        "event",
    ],
    ["article", "title", "historical_event", "year", "event"],
]
xml_output_parsers = [
    XMLOutputParser(name="article", tags=tag_set) for tag_set in parser_tag_sets
]

parser_schemas = [
    ArticleResponse1nointXML,
    ArticleResponse2XML,
    ArticleResponse3XML,
    ArticleResponse4XML,
]

# Same layout as the other format lists, but the format instructions come from
# the output parser rather than from pydantic_to_xml_instructions.
structured_formats_output_parser_xml = [
    {"pydantic": schema, "format_instructions": parser.get_format_instructions()}
    for schema, parser in zip(parser_schemas, xml_output_parsers)
]

## Experiment


In [12]:
# Research questions for this experiment; stored with the results in the
# all-models pickle written further down.
hypotheses = [
    "1. Do small models including Phi3 and Nemotron improve their output by providing more explicit instructions on the format at the end of the prompt?",
    "2. Do output parsers perform worse for all models?",
    "3. What is the performance of the models using user parsers?",
]

### Experiment run


System prompt


In [13]:
# Format instructions placed in the system prompt, run on the first model set
# (llm_models_with_anthropic[0], built with temperature 0).
# Keep any results already present in the kernel instead of starting from an
# empty dictionary.
if "structure_support_by_model_sys" not in locals():
    structure_support_by_model_sys = {}

_ = run_xml_experiment(
    prompt_system_format,
    questions,
    llm_models_with_anthropic[0],
    structured_formats_xml,
    # Use the shared n_iter setting like the other experiment cells
    # (this one was hard-coded to 1).
    n_iter=n_iter,
    results_out=structure_support_by_model_sys,
    save_file_name=f"exp{experiment_num}_xml_output_sys_{experiment_date}.pkl",
)

Model: Ollama_llama32  Output: ArticleResponse1XML   Pos: 1
e.ee..e...eeeee.eeee
Model: Ollama_llama32  Output: ArticleResponse1nointXML   Pos: 2
......e.............
Model: Ollama_llama32  Output: ArticleResponse2XML   Pos: 3
....................
Model: Ollama_llama32  Output: ArticleResponse3XML   Pos: 4
.....e..............
Model: Ollama_llama32  Output: ArticleResponse4XML   Pos: 5
....e...............
Model: Ollama_nemotron  Output: ArticleResponse1XML   Pos: 6
eeee..eeeeeeeeeeeeee
Model: Ollama_nemotron  Output: ArticleResponse1nointXML   Pos: 7
eeee..eeee.eeeeeeeee
Model: Ollama_nemotron  Output: ArticleResponse2XML   Pos: 8
eeee.eeeeeeeeeeeeeee
Model: Ollama_nemotron  Output: ArticleResponse3XML   Pos: 9
eeee.eeeeeeeeeeeeeee
Model: Ollama_nemotron  Output: ArticleResponse4XML   Pos: 10
eeee.eeeeeeeeeeeeeee
Model: Ollama_phi3  Output: ArticleResponse1XML   Pos: 11
e.eee..eee....ee...e
Model: Ollama_phi3  Output: ArticleResponse1nointXML   Pos: 12
..eeee.......ee.....
Model: Olla

User prompt


In [14]:
# Format instructions appended to the user prompt, run on the first model set.
# Start from an empty results dictionary only when none exists in the kernel yet.
try:
    structure_support_by_model_user
except NameError:
    structure_support_by_model_user = {}

user_save_file = f"exp{experiment_num}_xml_output_user_{experiment_date}.pkl"
_ = run_xml_experiment(
    prompt_user_format,
    questions,
    llm_models_with_anthropic[0],
    structured_formats_xml,
    n_iter=n_iter,
    results_out=structure_support_by_model_user,
    save_file_name=user_save_file,
)

Model: Ollama_llama32  Output: ArticleResponse1XML   Pos: 1
.eee.e........e.....
Model: Ollama_llama32  Output: ArticleResponse1nointXML   Pos: 2
...........e........
Model: Ollama_llama32  Output: ArticleResponse2XML   Pos: 3
....................
Model: Ollama_llama32  Output: ArticleResponse3XML   Pos: 4
....................
Model: Ollama_llama32  Output: ArticleResponse4XML   Pos: 5
ee.e.e..e.e......e..
Model: Ollama_nemotron  Output: ArticleResponse1XML   Pos: 6
e.eeeeeee.e.eee..eee
Model: Ollama_nemotron  Output: ArticleResponse1nointXML   Pos: 7
ee.ee.ee....eeee.e.e
Model: Ollama_nemotron  Output: ArticleResponse2XML   Pos: 8
eeeee.e.ee..eee..e.e
Model: Ollama_nemotron  Output: ArticleResponse3XML   Pos: 9
e..ee.............e.
Model: Ollama_nemotron  Output: ArticleResponse4XML   Pos: 10
...ee.ee......e....e
Model: Ollama_phi3  Output: ArticleResponse1XML   Pos: 11
....e..e......ee...e
Model: Ollama_phi3  Output: ArticleResponse1nointXML   Pos: 12
...e.e...e....ee....
Model: Olla

Output parsers


In [15]:
# System-prompt template again, but with the format instructions produced by the
# LangChain XML output parsers.
# Start from an empty results dictionary only when none exists in the kernel yet.
try:
    structure_support_by_model_parsers
except NameError:
    structure_support_by_model_parsers = {}

parser_save_file = f"exp{experiment_num}_xml_output_parser_{experiment_date}.pkl"
_ = run_xml_experiment(
    prompt_system_format,
    questions,
    llm_models_with_anthropic[0],
    structured_formats_output_parser_xml,
    n_iter=n_iter,
    results_out=structure_support_by_model_parsers,
    save_file_name=parser_save_file,
)

Model: Ollama_llama32  Output: ArticleResponse1nointXML   Pos: 1
eeeeeeeeeee.eee.eeee
Model: Ollama_llama32  Output: ArticleResponse2XML   Pos: 2
eee.eee.eeeeeeeeeeee
Model: Ollama_llama32  Output: ArticleResponse3XML   Pos: 3
eeeeeeeeeeeeeeeeeeee
Model: Ollama_llama32  Output: ArticleResponse4XML   Pos: 4
eeeeeeeeeeeeeeeeeeee
Model: Ollama_nemotron  Output: ArticleResponse1nointXML   Pos: 5
eeeeeeeeeeeeeeeeeeee
Model: Ollama_nemotron  Output: ArticleResponse2XML   Pos: 6
eeeeeeeeeeeeeeeeeeee
Model: Ollama_nemotron  Output: ArticleResponse3XML   Pos: 7
eeeeeeeeeeeeeeeeeeee
Model: Ollama_nemotron  Output: ArticleResponse4XML   Pos: 8
eeeeeeeeeeeeeeeeeeee
Model: Ollama_phi3  Output: ArticleResponse1nointXML   Pos: 9
eeeeeee.eeeeeeeeeeee
Model: Ollama_phi3  Output: ArticleResponse2XML   Pos: 10
e.eeeeeeeeeee.e.eeee
Model: Ollama_phi3  Output: ArticleResponse3XML   Pos: 11
eeeeeeeeeeeeeeeeeeee
Model: Ollama_phi3  Output: ArticleResponse4XML   Pos: 12
eeeeeeeeeeeeeeeeeeee
Model: Ollama_phi4

Save all models


In [16]:
# Gather every results dictionary whose name starts with "structure_support" and
# pickle them together with the experiment settings.
namespace = locals()
all_model_outputs = [k for k in namespace.keys() if k.startswith("structure_support")]

# All runs above use llm_models_with_anthropic[0], i.e. the first value of the
# temperature sweep [0, 0.8]. The bare name `temperature` is the loop variable
# left over from model setup and holds the *last* sweep value (0.8), so saving it
# would mislabel these results.
experiment_temperature = 0

with open(file=f"exp{experiment_num}_all_models_{experiment_date}.pkl", mode="wb") as f:
    data = dict(
        hypotheses=hypotheses,
        temperature=experiment_temperature,
        num_ctx=num_ctx,
        num_predict=num_predict,
        questions=questions,
        # NOTE(review): the runs used prompt_system_format / prompt_user_format,
        # not prompt_direct — confirm this is the prompt that should be recorded.
        prompt=prompt_direct,
        models={k: namespace[k] for k in all_model_outputs},
    )
    pickle.dump(data, f)

### Results table


Load models


In [9]:
# Temperature 0 - sys, user, parsers experiment
# Includes Anthropic models
# The notebook namespace is handed to load_experiment_summary — presumably so it
# can define the structure_support_by_model_* dictionaries that the next cell
# reads; confirm in experiment_xml.
namespace = locals()
load_experiment_summary(experiment_num, experiment_date, namespace)

In [9]:
# Experiment label -> results dictionary for that experiment.
results_list = {
    "sys": structure_support_by_model_sys,
    "user": structure_support_by_model_user,
    "parsers": structure_support_by_model_parsers,
}

# Build one table per experiment: a row per model (the key is split at the first
# "_" into a two-part tuple), a column per schema, and the "valid" value
# multiplied by 100 in each cell.
df_results = {}
for name, ss_results in results_list.items():
    rows = {}
    for mname in ss_results.keys():
        row_key = tuple(mname.split("_", maxsplit=1))
        per_schema = ss_results[mname]
        rows[row_key] = {
            tname: per_schema[tname]["valid"] * 100 for tname in per_schema.keys()
        }
    df_results[name] = pd.DataFrame.from_dict(rows, orient="index")
    display(name)

'sys'

'user'

'parsers'

In [10]:
# Stack the per-experiment tables. pd.concat puts the experiment label in index
# level 0; move it to the last level so rows sort by the two model-name parts
# first and by experiment last.
stacked = pd.concat(df_results)
df = stacked.reorder_levels([1, 2, 0], axis=0).sort_index(axis=0)

# Save the summary table as JSON next to the notebook.
summary_path = f"exp{experiment_num}_summary_df_{experiment_date}.json"
with open(file=summary_path, mode="wb") as f:
    df.to_json(f)

In [11]:
# Keep an independent copy of the summary table as first computed.
df_orig = df.copy()

Reanalyse


In [22]:
# Re-run the analysis step over the stored results of each experiment; the
# originals are left untouched and the new results get a "_new" suffix.
# NOTE(review): the sys and user experiments were run with structured_formats_xml
# (five schemas) but are re-analysed here against
# structured_formats_output_parser_xml (four schemas, parser-generated
# instructions) — confirm this is intended.
structure_support_by_model_parsers_new = analyse_xml_experiment(
    structure_support_by_model_parsers, structured_formats_output_parser_xml
)
structure_support_by_model_user_new = analyse_xml_experiment(
    structure_support_by_model_user, structured_formats_output_parser_xml
)
structure_support_by_model_sys_new = analyse_xml_experiment(
    structure_support_by_model_sys, structured_formats_output_parser_xml
)

Model: Ollama_llama32  Structure: ArticleResponse1nointXML   Pos: 1
eeeeee..ee..e.e...ee
Model: Ollama_llama32  Structure: ArticleResponse2XML   Pos: 2
.e..eee..e.eeeee.e.e
Model: Ollama_llama32  Structure: ArticleResponse3XML   Pos: 3
eeeeeeeeeeeeeeeeeeee
Model: Ollama_llama32  Structure: ArticleResponse4XML   Pos: 4
eeeeeeeeeeeeeeeeeeee
Model: Ollama_nemotron  Structure: ArticleResponse1nointXML   Pos: 5
eeeeeeeeeeeeeeeeeeee
Model: Ollama_nemotron  Structure: ArticleResponse2XML   Pos: 6
eeeeeeeeeeeeeeeeeeee
Model: Ollama_nemotron  Structure: ArticleResponse3XML   Pos: 7
eeeeeeeeeeeeeeeeeeee
Model: Ollama_nemotron  Structure: ArticleResponse4XML   Pos: 8
eeeeeeeeeeeeeeeeeeee
Model: Ollama_phi3  Structure: ArticleResponse1nointXML   Pos: 9
eeeeeee.eeeeeeeeeeee
Model: Ollama_phi3  Structure: ArticleResponse2XML   Pos: 10
e.eeeeeeeeeee.e.eee.
Model: Ollama_phi3  Structure: ArticleResponse3XML   Pos: 11
eeeeeeeeeeeeeeeeeeee
Model: Ollama_phi3  Structure: ArticleResponse4XML   Pos: 12
eee

In [13]:
# Compare the re-analysed output-parser results ("unordered") with the results
# as originally scored ("ordered").
# The re-analysis cell stores its output as structure_support_by_model_parsers_new;
# the name `results_new` used here before is never defined in this notebook, so
# the cell raised NameError on a fresh kernel.
results_list = {
    "unordered": structure_support_by_model_parsers_new,
    "ordered": structure_support_by_model_parsers,
}

# One table per label: a row per model (key split at the first "_"), a column per
# schema, and the "valid" value multiplied by 100 in each cell.
df_results = {}
for name, ss_results in results_list.items():
    df_results[name] = pd.DataFrame.from_dict(
        {
            tuple(mname.split("_", maxsplit=1)): {
                tname: ss_results[mname][tname]["valid"] * 100
                for tname in ss_results[mname].keys()
            }
            for mname in ss_results.keys()
        },
        orient="index",
    )
    display(name)

'unordered'

'ordered'

In [17]:
# Stack the per-label tables; pd.concat puts the label in index level 0, which is
# moved to the last level before sorting the rows.
df_new = pd.concat(df_results).reorder_levels([1, 2, 0], axis=0).sort_index(axis=0)

In [18]:
import tabulate

# Print the comparison as a Markdown pipe table, with the index levels turned
# into ordinary columns.
df_new_flat = df_new.reset_index()
table_text = tabulate.tabulate(
    df_new_flat, headers="keys", tablefmt="pipe", showindex=False
)
print(table_text)

| level_0   | level_1    | level_2   |   ArticleResponse1nointXML |   ArticleResponse2XML |   ArticleResponse3XML |   ArticleResponse4XML |
|:----------|:-----------|:----------|---------------------------:|----------------------:|----------------------:|----------------------:|
| Anthropic | Haiku_3    | ordered   |                         25 |                   100 |                    85 |                     0 |
| Anthropic | Haiku_3    | unordered |                         30 |                   100 |                    85 |                     0 |
| Anthropic | Haiku_35   | ordered   |                          0 |                   100 |                   100 |                     0 |
| Anthropic | Haiku_35   | unordered |                        100 |                   100 |                   100 |                     0 |
| Anthropic | Sonnet_35  | ordered   |                        100 |                   100 |                    85 |                     0 |
| Anthropic | Sonnet

Let's combine these results over structures to gain more statistical power for testing the hypotheses.


In [20]:
from hypothesis_testing import wilson_score_ci

In [23]:
# Re-analysed results for the two prompt placements being compared.
results_list = {
    "sys": structure_support_by_model_sys_new,
    "user": structure_support_by_model_user_new,
}


def format_ci(p, num_total):
    """Format the Wilson score interval of a pass rate as "lower% — upper% ".

    Args:
        p: Observed proportion of valid responses.
        num_total: Number of trials the proportion was measured over.

    Returns:
        The interval as text: lower bound with one decimal place, upper bound
        rounded to a whole percent, followed by a trailing space.
    """
    # wilson_score_ci takes (successes, trials) and returns three values; only
    # the lower and upper bounds are shown.
    _, lb, ub = wilson_score_ci(p * num_total, num_total)
    return f"{lb*100:.1f}% — {ub*100:.0f}% "


# Each schema contributes one trial per question, so a model's pooled trial count
# is len(questions) * number of schemas (previously a hard-coded 20).
n_questions = len(questions)

df_results = {}
for name, ss_results in results_list.items():
    df_results[name] = {
        tuple(mname.split("_", maxsplit=1)): format_ci(
            # Mean of the per-schema "valid" values = pooled pass rate, because
            # every schema has the same number of trials.
            pd.Series(
                [
                    ss_results[mname][tname]["valid"]
                    for tname in ss_results[mname].keys()
                ]
            ).mean(),
            n_questions * len(ss_results[mname].keys()),
        )
        for mname in ss_results.keys()
    }

pd.DataFrame.from_dict(df_results)

Unnamed: 0,Unnamed: 1,sys,user
Ollama,llama32,89.5% — 99%,81.5% — 95%
Ollama,nemotron,3.5% — 15%,45.3% — 67%
Ollama,phi3,35.7% — 57%,46.6% — 68%
Ollama,phi4,95.4% — 100%,95.4% — 100%
Ollama,deepseekr1,67.2% — 85%,70.0% — 87%
fireworks,llama31,93.3% — 100%,95.4% — 100%
fireworks,llama32,93.3% — 100%,93.3% — 100%
fireworks,llama33,95.4% — 100%,91.3% — 99%
fireworks,qwen25,95.4% — 100%,95.4% — 100%
Anthropic,Sonnet_35,95.4% — 100%,95.4% — 100%


Approximate confidence interval


In [16]:
# Print the confidence-interval table as a Markdown pipe table, with the index
# levels turned into ordinary columns.
ci_table = pd.DataFrame.from_dict(df_results).reset_index()
ci_table_text = tabulate.tabulate(
    ci_table, headers="keys", tablefmt="pipe", showindex=False
)
print(ci_table_text)

| level_0   | level_1    | sys          | user         |
|:----------|:-----------|:-------------|:-------------|
| Ollama    | llama32    | 75.6% — 90%  | 79.0% — 92%  |
| Ollama    | nemotron   | 4.1% — 15%   | 40.4% — 60%  |
| Ollama    | phi3       | 37.5% — 57%  | 51.2% — 70%  |
| Ollama    | phi4       | 94.6% — 100% | 94.6% — 100% |
| Ollama    | deepseekr1 | 66.8% — 83%  | 72.2% — 87%  |
| fireworks | llama31    | 88.8% — 98%  | 88.8% — 98%  |
| fireworks | llama32    | 79.0% — 92%  | 82.6% — 94%  |
| fireworks | llama33    | 96.3% — 100% | 93.0% — 99%  |
| fireworks | qwen25     | 96.3% — 100% | 96.3% — 100% |
| Anthropic | Sonnet_35  | 96.3% — 100% | 96.3% — 100% |
| Anthropic | Haiku_35   | 94.6% — 100% | 96.3% — 100% |
| Anthropic | Haiku_3    | 88.8% — 98%  | 88.8% — 98%  |


### Error analysis


Extract all error messages & count


In [25]:
def analyse_errors_from_results(ss_results, method="code", combined=False):
    """Tally the errors recorded for each model in an experiment result set.

    Args:
        ss_results: Nested mapping ``model name -> structure name -> result``.
            Each result holds an ``"outputs"`` list whose items carry
            ``"error_type"`` and ``"error_message"``.
        method: ``"code"`` counts the ``error_type`` values as they are.
            ``"parse"`` labels every non-``None`` ``error_message`` by the first
            known phrase found in it; unrecognised messages are kept verbatim.
        combined: If true, counts are summed over structures and keyed by the
            label alone; otherwise they are keyed by ``(structure, label)``.

    Returns:
        A DataFrame with one row per model and one column per key.

    Raises:
        NameError: If ``method`` is neither ``"code"`` nor ``"parse"`` (raised
            when the first structure is visited).
    """
    # Checked in order against the lower-cased message; the first hit wins.
    lowercase_rules = (
        ("opening and ending tag mismatch", "XML tag mismatch"),
        ("extracterror", "Missing main tags"),
        ("input should be a valid integer", "Validation error (int)"),
        ("premature end of data in tag", "Premature end"),
        ("field required", "Missing field"),
        ("expected '>'", "Tag malformed"),
        ("extra content at the end of the document", "Tag malformed"),
    )
    # Tried only when no rule above matched; compared against the message as-is.
    exact_case_rules = (
        ("BadGatewayError", "Connection error"),
        ("XMLSyntaxError", "Other syntax error"),
    )

    def classify(message):
        """Return the label for one error message (the message itself if unknown)."""
        lowered = message.lower()
        for needle, label in lowercase_rules:
            if needle in lowered:
                return label
        for needle, label in exact_case_rules:
            if needle in message:
                return label
        return message

    error_counts = {}
    for mname, per_structure in ss_results.items():
        model_counts = error_counts[mname] = {}
        for tname, result in per_structure.items():
            if method == "code":
                # Count errors by their recorded error type
                code_counts = pd.Series(
                    output["error_type"] for output in result["outputs"]
                ).value_counts()
                for e_name, e_count in code_counts.items():
                    if combined:
                        model_counts[e_name] = model_counts.get(e_name, 0) + e_count
                    else:
                        model_counts[(tname, e_name)] = e_count

            elif method == "parse":
                # Count errors by parsing the error message
                for output in result["outputs"]:
                    message = output["error_message"]
                    if message is None:
                        continue
                    label = classify(message)
                    key = label if combined else (tname, label)
                    model_counts[key] = model_counts.get(key, 0) + 1

            else:
                raise NameError(f"Method {method} not supported")

    return pd.DataFrame.from_dict(error_counts, orient="index")

In [26]:
# Error tally for the output-parser experiment, classified from the error
# messages and summed over schemas. Missing counts are shown as 0 and the
# columns are sorted by label.
analyse_errors_from_results(
    structure_support_by_model_parsers, method="parse", combined=True
).fillna(0).sort_index(axis=1)

Unnamed: 0,Connection error,"InternalServerError, Error code: 529 - {'type': 'error', 'error': {'type': 'overloaded_error', 'message': 'Overloaded'}}",Missing field,Missing main tags,Other syntax error,Premature end,Tag malformed,XML tag mismatch
Ollama_llama32,0.0,0.0,66.0,6.0,0.0,0.0,0.0,4.0
Ollama_phi3,0.0,0.0,30.0,15.0,3.0,0.0,1.0,27.0
Ollama_phi4,0.0,0.0,31.0,0.0,0.0,0.0,0.0,0.0
Ollama_deepseekr1,0.0,0.0,19.0,35.0,3.0,3.0,0.0,13.0
fireworks_llama31,1.0,0.0,45.0,0.0,0.0,0.0,0.0,0.0
fireworks_llama33,0.0,0.0,55.0,0.0,0.0,0.0,0.0,0.0
fireworks_qwen25,0.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0
Anthropic_Sonnet_35,0.0,0.0,20.0,0.0,0.0,0.0,0.0,3.0
Anthropic_Haiku_35,0.0,0.0,40.0,0.0,0.0,0.0,0.0,0.0
Anthropic_Haiku_3,0.0,5.0,33.0,0.0,0.0,0.0,0.0,0.0


In [27]:
# Same tally for the user-prompt experiment: classified from the error messages,
# summed over schemas, missing counts shown as 0, columns sorted by label.
analyse_errors_from_results(
    structure_support_by_model_user, method="parse", combined=True
).fillna(0).sort_index(axis=1)

Unnamed: 0,Connection error,Missing field,Missing main tags,Other syntax error,Tag malformed,Validation error (int),XML tag mismatch
Ollama_llama32,0.0,8.0,0.0,0.0,0.0,5.0,0.0
Ollama_nemotron,0.0,1.0,44.0,0.0,0.0,5.0,0.0
Ollama_phi4,0.0,0.0,0.0,0.0,0.0,1.0,0.0
Ollama_deepseekr1,0.0,7.0,3.0,5.0,0.0,2.0,2.0
fireworks_llama31,1.0,0.0,0.0,0.0,0.0,4.0,0.0
fireworks_llama32,0.0,0.0,1.0,0.0,0.0,9.0,0.0
Anthropic_Haiku_3,0.0,0.0,0.0,0.0,0.0,5.0,0.0
Ollama_phi3,0.0,5.0,4.0,3.0,4.0,0.0,23.0
fireworks_llama33,1.0,0.0,0.0,0.0,0.0,0.0,1.0


In [25]:
# List the model names available in the output-parser results.
structure_support_by_model_parsers.keys()

dict_keys(['Ollama_llama32', 'Ollama_nemotron', 'Ollama_phi3', 'Ollama_phi4', 'Ollama_deepseekr1', 'fireworks_llama31', 'fireworks_llama32', 'fireworks_llama33', 'fireworks_qwen25', 'Anthropic_Sonnet_35', 'Anthropic_Haiku_35', 'Anthropic_Haiku_3'])

In [29]:
from experiment_xml import extract_substring

In [30]:
# Pick one stored response (Claude 3 Haiku, schema ArticleResponse1nointXML,
# output index 10) and pass its raw text to extract_substring together with the
# opening and closing <article> tags.
haiku3_outputs = structure_support_by_model_parsers["Anthropic_Haiku_3"][
    "ArticleResponse1nointXML"
]["outputs"]
haiku3_raw_text = haiku3_outputs[10]["raw"].content
output_xml = extract_substring(haiku3_raw_text, "<article>", "</article>")

In [31]:
# Show the extracted XML for manual inspection.
print(output_xml)

<article>
<title>How Plants Make Their Own Food</title>
<number>1</number>
<answer>
Plants are amazing living things that can create their own food through a process called photosynthesis. Using just sunlight, water, and carbon dioxide, plants are able to produce the nutrients they need to grow and thrive.

Here's how it works: Plants have special structures called chloroplasts that contain chlorophyll, a green pigment that absorbs sunlight. During photosynthesis, the chloroplasts use the energy from sunlight to convert carbon dioxide and water into glucose, a type of sugar that the plant can use for energy.

This process is like a plant's version of cooking or baking - they are essentially "making" their own food from basic raw ingredients. And just like our food, this glucose provides the plant with the nutrients it needs to grow its roots, stems, leaves, and flowers.

So the next time you see a lush green plant, remember that it is engaged in a remarkable feat of natural chemistry, 

In [41]:
# Print the error message and raw reply of every Claude 3.5 Haiku /
# ArticleResponse3XML output whose error_type is "ok".
haiku35_outputs = structure_support_by_model_parsers["Anthropic_Haiku_35"][
    "ArticleResponse3XML"
]["outputs"]
for output in haiku35_outputs:
    if output["error_type"] != "ok":
        continue
    print(output["error_message"], "\n")
    if output["raw"] is not None:
        print(output["raw"].content)
    print()

None 

<?xml version="1.0" encoding="UTF-8"?>
<article>
    <title>Tracing Life's Ancient Roots: The Oldest Known Fossil</title>
    
    <historical_event_1>
        <year>3.5 billion years ago</year>
        <event>Researchers discovered stromatolite fossils in Western Australia's Pilbara region, representing some of the earliest evidence of life on Earth. These layered rock formations were created by ancient microorganisms called cyanobacteria, which formed complex microbial mats in shallow marine environments.</event>
    </historical_event_1>
    
    <historical_event_2>
        <year>4.28 billion years ago</year>
        <event>Microscopic graphite deposits found in Quebec, Canada, suggest potential microbial life existed even earlier. These carbon-based remnants hint at primitive biological processes occurring near the planet's earliest geological periods, though scientists continue to debate their definitive biological origin.</event>
    </historical_event_2>
</article>

None

### Load previous results


First try all models file


In [None]:
# Disabled by default (`if 0:`); change to `if 1:` to restore results from the
# combined all-models pickle.
# NOTE(review): pickle.load can execute arbitrary code — only load files that
# were produced by this notebook or another trusted source.
if 0:
    import pickle

    with open(
        file=f"exp{experiment_num}_all_models_{experiment_date}.pkl", mode="rb"
    ) as f:
        data = pickle.load(f)

    # Inject into toplevel namespace
    # Names that already exist in the kernel are kept, not overwritten.
    namespace = locals()
    for key, value in data["models"].items():
        if key not in namespace:
            print(f"Loaded {key}")
            namespace[key] = value

In [None]:
# Disabled by default (`if 0:`); change to `if 1:` to restore results from the
# per-experiment pickle files.
# NOTE(review): pickle.load can execute arbitrary code — only load files that
# were produced by this notebook or another trusted source.
if 0:
    import pickle

    # Load individual models
    # NOTE(review): the run cells in this notebook write files with the
    # identifiers "sys", "user" and "parser"; no visible cell writes an "alt"
    # file — confirm this list.
    output_idents = ["sys", "alt"]

    namespace = locals()
    for ident in output_idents:
        with open(
            file=f"exp{experiment_num}_xml_output_{ident}_{experiment_date}.pkl",
            mode="rb",
        ) as f:
            data = pickle.load(f)

        # Names that already exist in the kernel are kept, not overwritten.
        key = f"structure_support_by_model_{ident}"
        if key not in namespace:
            print(f"Loaded {key}")
            namespace[key] = data["structure_support_by_model"]

## Hypothesis testing


Hypothesis:

- H0: The proportions of XML schema conformance are the same at both temperatures
- H1: Different temperatures lead to different rates of XML schema conformance


In [None]:
# Test every model that appears in all experiments of results_list. The model
# list used to come from structure_support_by_model_t0, which is never defined
# in this notebook, so the cell failed on a fresh kernel.
experiment_results = list(results_list.values())
first_experiment = experiment_results[0] if experiment_results else {}
model_list = [
    m for m in first_experiment.keys() if all(m in r for r in experiment_results[1:])
]

# Bonferroni correction is available but currently switched OFF: to apply it,
# divide alpha by n_tests.
n_tests = len(model_list)
alpha = 0.05  # / n_tests

for model in model_list:

    # Passed/failed counts for this model, pooled over schemas, per experiment.
    contingency_table = {}
    for name, ss_results in results_list.items():
        num_true = 0
        num_total = 0
        for tname in ss_results[model].keys():
            # "valid" is a pass rate; turn it back into a whole number of passes.
            # round() keeps the count exact even when the float product lands a
            # hair away from the integer.
            num_true += round(ss_results[model][tname]["valid"] * len(questions))
            num_total += len(questions)

        contingency_table[name] = {"Passed": num_true, "Failed": num_total - num_true}

    ct = pd.DataFrame.from_dict(contingency_table, orient="index")

    # Ensure ordering to match hypotheses
    # Columns are experiments
    # Rows are outcomes
    # Column marginals are constant
    ct_n = ct[["Passed", "Failed"]].to_numpy().T

    # One-sided tests (alternative="greater"); the decision below uses the
    # Barnard p-value, Fisher's test is printed for comparison.
    print(f"\n{model}")
    print(sf := stats.fisher_exact(ct_n, alternative="greater"))
    print(sb := stats.barnard_exact(ct_n, alternative="greater"))
    if sb.pvalue < alpha:
        print(f"Hypothesis test passed: {sb.pvalue:.3g} < {alpha:.3g}")


Ollama_llama32
SignificanceResult(statistic=np.float64(1.5681818181818181), pvalue=np.float64(0.14208888177506213))
BarnardExactResult(statistic=np.float64(1.249303038696861), pvalue=np.float64(0.12258240508091653))

Ollama_nemotron
SignificanceResult(statistic=np.float64(1.6521739130434783), pvalue=np.float64(0.28396268931110596))
BarnardExactResult(statistic=np.float64(0.8604859293176627), pvalue=np.float64(0.2645506113622348))

Ollama_phi3
SignificanceResult(statistic=np.float64(2.171112556929083), pvalue=np.float64(0.006510361002595765))
BarnardExactResult(statistic=np.float64(2.6222244603662923), pvalue=np.float64(0.004495770764296521))
Hypothesis test passed: 0.0045 < 0.05

Ollama_phi4
SignificanceResult(statistic=np.float64(0.0), pvalue=np.float64(1.0))
BarnardExactResult(statistic=np.float64(-1.0025094142341715), pvalue=np.float64(1.0))

Ollama_deepseekr1
SignificanceResult(statistic=np.float64(1.1126126126126126), pvalue=np.float64(0.43520490196213557))
BarnardExactResult(sta

### Combined levels: does temperature change the conformance to XML ?


In [None]:
from scipy import stats

# Pool the results of every model that appears in all experiments of
# results_list. The model list used to come from structure_support_by_model_t0,
# which is never defined in this notebook, so the cell failed on a fresh kernel.
experiment_results = list(results_list.values())
first_experiment = experiment_results[0] if experiment_results else {}
model_list = [
    m for m in first_experiment.keys() if all(m in r for r in experiment_results[1:])
]

# Passed/failed counts per experiment, summed over all models and schemas.
contingency_table = {}
for model in model_list:
    for name, ss_results in results_list.items():
        for tname in ss_results[model].keys():
            totals = contingency_table.setdefault(name, {"Passed": 0, "Failed": 0})
            num_total = len(questions)
            # "valid" is a pass rate; round() turns it back into an exact whole
            # number of passes despite floating-point error.
            num_true = round(ss_results[model][tname]["valid"] * num_total)

            totals["Passed"] += num_true
            totals["Failed"] += num_total - num_true

# Rows are experiments, columns are the outcomes "Passed" and "Failed".
ct = pd.DataFrame.from_dict(contingency_table, orient="index")

From the contingency table alone we see that there is some difference between the results for the two temperatures.


In [None]:
# Display the pooled contingency table (rows: experiments, columns: outcomes).
ct

Unnamed: 0,Passed,Failed
t=0,987.0,213.0
t=0.8,949.0,251.0


In [None]:
# Chi-Squared Test for multiple groups
# Tests whether the outcome (Passed/Failed) is independent of the experiment
# (the rows of ct).
res = stats.chi2_contingency(ct)
print(f"\nChi-Squared Test:\nStatistic={res.statistic:.4f}, p={res.pvalue:.4g}")


Chi-Squared Test:
Statistic=3.6576, p=0.05582


In [None]:
# Barnard's exact test on the same contingency table (SciPy requires a 2x2
# table here). No `alternative` is passed, so SciPy's default applies.
res = stats.barnard_exact(ct)
print(f"Barnard exact test:\nStatistic={res.statistic:.4f}, p={res.pvalue:.4g}")

Barnard exact test:
Statistic=1.9642, p=0.04995
