## Using LangChain to get structured outputs


## Setup


In [None]:
import pickle
import pandas as pd
import tabulate
from scipy import stats
from langchain.prompts import ChatPromptTemplate
from langchain_anthropic import ChatAnthropic
from langchain_ollama import ChatOllama
from langchain_fireworks import ChatFireworks

from pydantic_structure_definitions import *

from experiment_xml import (
    pydantic_to_xml_instructions,
    run_xml_experiment,
)

In [2]:
# Placeholder values only; real credentials must never be written into the notebook.
ANTHROPIC_API_KEY = FIREWORKS_API_KEY = "<API KEY>"

In [3]:
import streamlit as st

# Read the credentials from the "api_keys" section of st.secrets.
_api_keys = st.secrets["api_keys"]
LANGSMITH_API_KEY = _api_keys["LANGSMITH_API_KEY"]
ANTHROPIC_API_KEY = _api_keys["ANTHROPIC_API_KEY"]
FIREWORKS_API_KEY = _api_keys["FIREWORKS_API_KEY"]

In [4]:
# experiment_date and experiment_num are interpolated into every output file name.
experiment_date, experiment_num = "25-02-25", "4"
# Iteration count; presumably intended for run_xml_experiment's n_iter argument.
n_iter = 1

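Let's start by creating the LLM models used to run our structured output queries. Each model is created at two temperatures, 0 and 0.8: a temperature of 0 is expected to improve structured output generation (at the cost of "creativity"), and comparing the two settings lets us test that.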


### Model setup


In [5]:
# Settings shared by the chat models built below. The temperature is not fixed
# here: the model-construction loop creates every model at 0 and at 0.8.
timeout, num_ctx, num_predict = 30, 8_192, 4_096

In [None]:
# Registry label -> provider model identifier, grouped by client class.
ollama_model_ids = {
    "Ollama_llama32": "llama3.2",
    "Ollama_nemotron": "nemotron-mini",
    "Ollama_phi3": "phi3",
    "Ollama_phi4": "phi4",
    "Ollama_deepseekr1": "deepseek-r1",
}
fireworks_model_ids = {
    "fireworks_llama31": "accounts/fireworks/models/llama-v3p1-70b-instruct",
    "fireworks_llama32": "accounts/fireworks/models/llama-v3p2-3b-instruct",
    "fireworks_llama33": "accounts/fireworks/models/llama-v3p3-70b-instruct",
    "fireworks_qwen25": "accounts/fireworks/models/qwen2p5-72b-instruct",
    # Disabled: "fireworks_deepseekr1": "accounts/fireworks/models/deepseek-r1",
}
anthropic_model_ids = {
    "Anthropic_Sonnet_35": "claude-3-5-sonnet-20241022",
    "Anthropic_Haiku_35": "claude-3-5-haiku-20241022",
    "Anthropic_Haiku_3": "claude-3-haiku-20240307",
}

# One {label: chat model} dict per temperature: index 0 -> temperature 0,
# index 1 -> temperature 0.8. The "with_anthropic" variant holds the same
# Ollama/Fireworks models followed by the Anthropic ones.
llm_models = []
llm_models_with_anthropic = []
for ii, temperature in enumerate([0, 0.8]):
    llm_models_ii = {}
    for label, model_id in ollama_model_ids.items():
        llm_models_ii[label] = ChatOllama(
            model=model_id,
            temperature=temperature,
            num_ctx=num_ctx,
            num_thread=1,
            num_predict=num_predict,
        )
    for label, model_id in fireworks_model_ids.items():
        llm_models_ii[label] = ChatFireworks(
            model_name=model_id,
            api_key=FIREWORKS_API_KEY,
            temperature=temperature,
            timeout=timeout,
        )

    llm_models_with_anthropic_ii = dict(llm_models_ii)
    for label, model_id in anthropic_model_ids.items():
        llm_models_with_anthropic_ii[label] = ChatAnthropic(
            model=model_id,
            api_key=ANTHROPIC_API_KEY,
            timeout=timeout,
            temperature=temperature,
        )

    llm_models.append(llm_models_ii)
    llm_models_with_anthropic.append(llm_models_with_anthropic_ii)

### Problem setup and prompt


In [7]:
# Task prompt shared by every prompt template below. "{question}" is the only
# placeholder in this text; it is filled with one entry of `questions`.
# NOTE(review): "relavent" is misspelled in the prompt. It is left as-is here
# because changing the prompt text would change the experiment inputs.
test_science_prompt_txt = """
You are a professional science writer tasked with responding to members of
the general public who write in asking questions about science.
Write an article responding to a writer's question for publication in a
science magazine intended for a general readership with a high-school education.
You should write clearly and compellingly, include all relavent context,
and provide motivating stories where applicable.

Your response must be less than 200 words.

The question given to you is the following:
{question}
"""

# The 20 reader questions posed to every model/schema combination.
questions = [
    "What is the oldest recorded fossil?",
    "What is a black hole?",
    "How far away is the sun?",
    "Which other planet in the Solar System has a surface gravity closest to that of the Earth?",
    "Eris, Haumea, Makemake and Ceres are all examples of what?",
    "Why does earth have seasons? Do other planets exhibit seasons too?",
    "What causes the aurora borealis?",
    "Why is the sky blue?",
    "How do bees communicate?",
    "What is the smallest unit of life?",
    "How do plants make their own food?",
    "Why do we dream?",
    "What is the theory of relativity?",
    "How do volcanoes erupt?",
    "What is the speed of light?",
    "How do magnets work?",
    "What is the purpose of DNA?",
    "What are the different types of galaxies?",
    "Why do some animals hibernate?",
    "How do vaccines work?",
]

In [8]:
# System message shared by the two "system" prompt variants.
format_system_msg = (
    "system",
    "Return a publishable article in the requested format.\n{format_instructions}",
)

# Task prompt only, without any format instructions.
prompt_direct = ChatPromptTemplate.from_template(test_science_prompt_txt)

# Format instructions in the system message, task in the human message.
prompt_system_format = ChatPromptTemplate.from_messages(
    [format_system_msg, ("human", test_science_prompt_txt)]
)

# Same as above, with an XML reminder appended to the human message.
prompt_system_plus_reminder_format = ChatPromptTemplate.from_messages(
    [
        format_system_msg,
        ("human", test_science_prompt_txt + "\nYour response must be in valid XML."),
    ]
)

# Format instructions appended to the task prompt in a single message.
prompt_user_format = ChatPromptTemplate.from_template(
    test_science_prompt_txt + "\n{format_instructions}"
)

### Schema using Pydantic XML


In [10]:
def _xml_format_spec(schema):
    """Pair a schema class with the XML prompt instructions generated from it."""
    return {
        "pydantic": schema,
        "format_instructions": pydantic_to_xml_instructions(schema),
    }


# Schemas used by the experiments in this notebook.
structured_formats_xml = [
    _xml_format_spec(schema)
    for schema in (
        ArticleResponse1XML,
        ArticleResponse1nointXML,
        ArticleResponse2XML,
        ArticleResponse3XML,
        ArticleResponse4XML,
    )
]

# Variant set that uses the "alt" definitions of schemas 2 and 4.
structured_formats_xml_alt = [
    _xml_format_spec(schema)
    for schema in (
        ArticleResponse1XML,
        ArticleResponse2XMLalt,
        ArticleResponse3XML,
        ArticleResponse4XMLalt,
    )
]

In [11]:
# Print the generated prompt instructions for one schema as a visual check.
print(pydantic_to_xml_instructions(ArticleResponse2XMLalt))

You must respond only in XML using the following schema.
Do not provide any output outside the first and last XML tags.

<article>
  <!--Structured article for publication answering a reader's question-->
  <title>
    {Title of the article - must be type str}
  </title>
  <answer>
    {Answer the writer's question - must be type str}
  </answer>
<further_questions>
  <!--A list of related questions of interest to the readers-->
<!-- Next list element -->
  <further_question>
    {A related question of interest to readers - must be type str}
  </further_question>
<!-- First list element -->
  <further_question>
    {A related question of interest to readers - must be type str}
  </further_question>
<!-- Etc -->
  <further_question>
  ...
  </further_question>
</further_questions>
</article>


## Experiment

Hypotheses:

- Temperature influences the conformance to XML


First Temperature


In [12]:
# Reuse a results dict that already exists in the kernel so that re-running
# this cell does not discard earlier results.
if "structure_support_by_model_t0" not in locals():
    structure_support_by_model_t0 = {}

# Temperature-0 run: llm_models_with_anthropic[0] holds the models built with
# temperature=0. Results are collected in `results_out`; the return value is
# discarded.
_ = run_xml_experiment(
    prompt_system_format,
    questions,
    llm_models_with_anthropic[0],
    structured_formats_xml,
    n_iter=n_iter,  # notebook-level setting instead of a separate literal
    results_out=structure_support_by_model_t0,
    save_file_name=f"exp{experiment_num}_xml_output_t0_{experiment_date}.pkl",
)

Model: Ollama_llama32  Output: ArticleResponse1XML   Pos: 1
e.ee..e...eeeee.eeee
Model: Ollama_llama32  Output: ArticleResponse1nointXML   Pos: 2
......e.............
Model: Ollama_llama32  Output: ArticleResponse2XML   Pos: 3
....................
Model: Ollama_llama32  Output: ArticleResponse3XML   Pos: 4
.....e..............
Model: Ollama_llama32  Output: ArticleResponse4XML   Pos: 5
....e...............
Model: Ollama_nemotron  Output: ArticleResponse1XML   Pos: 6
eeee..eeeeeeeeeeeeee
Model: Ollama_nemotron  Output: ArticleResponse1nointXML   Pos: 7
eeee..eeee.eeeeeeeee
Model: Ollama_nemotron  Output: ArticleResponse2XML   Pos: 8
eeee.eeeeeeeeeeeeeee
Model: Ollama_nemotron  Output: ArticleResponse3XML   Pos: 9
eeee.eeeeeeeeeeeeeee
Model: Ollama_nemotron  Output: ArticleResponse4XML   Pos: 10
eeee.eeeeeeeeeeeeeee
Model: Ollama_phi3  Output: ArticleResponse1XML   Pos: 11
e.eee..eee....ee...e
Model: Ollama_phi3  Output: ArticleResponse1nointXML   Pos: 12
..eeee.......ee.....
Model: Olla

In [13]:
# Reuse a results dict that already exists in the kernel so that re-running
# this cell does not discard earlier results.
if "structure_support_by_model_t08" not in locals():
    structure_support_by_model_t08 = {}

# Temperature-0.8 run: llm_models_with_anthropic[1] holds the models built
# with temperature=0.8. Results are collected in `results_out`; the return
# value is discarded.
_ = run_xml_experiment(
    prompt_system_format,
    questions,
    llm_models_with_anthropic[1],
    structured_formats_xml,
    n_iter=n_iter,  # notebook-level setting instead of a separate literal
    results_out=structure_support_by_model_t08,
    save_file_name=f"exp{experiment_num}_xml_output_t08_{experiment_date}.pkl",
)

Model: Ollama_llama32  Output: ArticleResponse1XML   Pos: 1
eeee.......eeeee.eee
Model: Ollama_llama32  Output: ArticleResponse1nointXML   Pos: 2
....................
Model: Ollama_llama32  Output: ArticleResponse2XML   Pos: 3
.......e............
Model: Ollama_llama32  Output: ArticleResponse3XML   Pos: 4
.....e.......ee..e..
Model: Ollama_llama32  Output: ArticleResponse4XML   Pos: 5
...ee..eee.e........
Model: Ollama_nemotron  Output: ArticleResponse1XML   Pos: 6
eeee.eeeeeeeeeee.eee
Model: Ollama_nemotron  Output: ArticleResponse1nointXML   Pos: 7
eeee.eeeee.eeeeeeeee
Model: Ollama_nemotron  Output: ArticleResponse2XML   Pos: 8
eeeeeeeeeeeeeeeeeeee
Model: Ollama_nemotron  Output: ArticleResponse3XML   Pos: 9
eeeeeeeeeeeeeeeeeeee
Model: Ollama_nemotron  Output: ArticleResponse4XML   Pos: 10
eeeee.eeeeeeeeeeeeee
Model: Ollama_phi3  Output: ArticleResponse1XML   Pos: 11
e.eee....e.e..eee..e
Model: Ollama_phi3  Output: ArticleResponse1nointXML   Pos: 12
..e.e.e.ee.ee.eee...
Model: Olla

Save all models


In [14]:
# Collect every results dict whose name starts with "structure_support" from
# the notebook namespace and pickle them together with the run settings.
namespace = locals()
all_model_outputs = [k for k in namespace.keys() if k.startswith("structure_support")]

with open(file=f"exp{experiment_num}_all_models_{experiment_date}.pkl", mode="wb") as f:
    data = dict(
        # Kept for backward compatibility. This is the variable left over from
        # the model-setup loop, i.e. only the LAST temperature (0.8).
        temperature=temperature,
        # Temperature actually used for each saved results dict.
        temperatures={
            "structure_support_by_model_t0": 0,
            "structure_support_by_model_t08": 0.8,
        },
        num_ctx=num_ctx,
        num_predict=num_predict,
        questions=questions,
        # The experiments above were run with prompt_system_format, so record
        # that template rather than prompt_direct.
        prompt=prompt_system_format,
        models={k: namespace[k] for k in all_model_outputs},
    )
    pickle.dump(data, f)

### Results table


In [15]:
# Results dict for each temperature, keyed by the label used in the tables.
results_list = {
    "t=0": structure_support_by_model_t0,
    "t=0.8": structure_support_by_model_t08,
}

# One DataFrame per temperature: rows are the model name split at its first
# underscore, columns are schema names, values are the "valid" score times 100.
df_results = {}
for name, ss_results in results_list.items():
    percent_valid = {}
    for mname, by_schema in ss_results.items():
        row_key = tuple(mname.split("_", maxsplit=1))
        percent_valid[row_key] = {
            tname: outcome["valid"] * 100 for tname, outcome in by_schema.items()
        }
    df_results[name] = pd.DataFrame.from_dict(percent_valid, orient="index")
    display(name)

't=0'

't=0.8'

In [16]:
# Stack the per-temperature tables. The dict keys of df_results become the
# outermost index level, which is then moved to the innermost position.
df_stacked = pd.concat(df_results)
df = df_stacked.reorder_levels([1, 2, 0], axis=0).sort_index(axis=0)

# Write the combined table to a JSON file named after the experiment.
summary_path = f"exp{experiment_num}_summary_df_{experiment_date}.json"
with open(file=summary_path, mode="wb") as f:
    df.to_json(f)

In [29]:
# Display the combined results table.
df

Unnamed: 0,Unnamed: 1,Unnamed: 2,ArticleResponse1XML,ArticleResponse1nointXML,ArticleResponse2XML,ArticleResponse3XML,ArticleResponse4XML
Anthropic,Haiku_3,t=0,75.0,100.0,100.0,100.0,100.0
Anthropic,Haiku_3,t=0.8,65.0,100.0,100.0,100.0,100.0
Anthropic,Haiku_35,t=0,95.0,100.0,100.0,100.0,100.0
Anthropic,Haiku_35,t=0.8,100.0,100.0,100.0,95.0,100.0
Anthropic,Sonnet_35,t=0,100.0,100.0,100.0,100.0,100.0
Anthropic,Sonnet_35,t=0.8,100.0,100.0,100.0,100.0,100.0
Ollama,deepseekr1,t=0,70.0,60.0,80.0,90.0,80.0
Ollama,deepseekr1,t=0.8,70.0,75.0,80.0,80.0,65.0
Ollama,llama32,t=0,35.0,95.0,100.0,95.0,95.0
Ollama,llama32,t=0.8,40.0,100.0,95.0,80.0,70.0


In [17]:
import tabulate

# Print the summary as a pipe-format table, with the index levels as columns.
summary_rows = df.reset_index()
print(tabulate.tabulate(summary_rows, headers="keys", tablefmt="pipe", showindex=False))

| level_0   | level_1        | level_2   |   ArticleResponse1XML |   ArticleResponse1nointXML |   ArticleResponse2XML |   ArticleResponse3XML |   ArticleResponse4XML |
|:----------|:---------------|:----------|----------------------:|---------------------------:|----------------------:|----------------------:|----------------------:|
| Anthropic | Haiku_3        | t=0       |                    75 |                        100 |                   100 |                   100 |                   100 |
| Anthropic | Haiku_3        | t=0.8     |                    65 |                        100 |                   100 |                   100 |                   100 |
| Anthropic | Haiku_35       | t=0       |                    95 |                        100 |                   100 |                   100 |                   100 |
| Anthropic | Haiku_35       | t=0.8     |                   100 |                        100 |                   100 |                    95 |                 

Let's combine these results over structures to gain more statistical power for testing the hypothesis.


In [139]:
from hypothesis_testing import wilson_score_ci

In [None]:
# Results dict for each temperature, keyed by the label used as column header.
results_list = {
    "t=0": structure_support_by_model_t0,
    "t=0.8": structure_support_by_model_t08,
}


def format_ci(p, num_total):
    """Format a proportion as a whole-number percentage string.

    Args:
        p: Observed proportion; it is multiplied by ``num_total`` before being
            passed to ``wilson_score_ci``.
        num_total: Second argument passed to ``wilson_score_ci`` (presumably
            the number of trials behind ``p`` - confirm against that helper).

    Returns:
        The first of the three values returned by ``wilson_score_ci``, times
        100 and formatted with no decimals plus a ``%`` sign. The other two
        returned values are currently not rendered.
    """
    centre, _lower, _upper = wilson_score_ci(p * num_total, num_total)
    return "{:.0f}%".format(centre * 100)


# Pool the "valid" rate of each model over all schemas, per temperature.
df_results = {}
for name, ss_results in results_list.items():
    df_results[name] = {}
    for mname, by_schema in ss_results.items():
        valid_rates = pd.Series([outcome["valid"] for outcome in by_schema.values()])
        # Pooled trial count = schemas tested x questions per schema. This was
        # hard-coded as 80, which does not match the five schemas x 20
        # questions used in this notebook.
        num_total = len(by_schema) * len(questions)
        row_key = tuple(mname.split("_", maxsplit=1))
        df_results[name][row_key] = format_ci(valid_rates.mean(), num_total)

pd.DataFrame.from_dict(df_results)

Unnamed: 0,Unnamed: 1,t=0,t=0.8
Ollama,llama32,84%,77%
Ollama,nemotron,8%,5%
Ollama,phi3,47%,29%
Ollama,phi4,99%,100%
Ollama,deepseekr1,76%,74%
fireworks,llama31,94%,94%
fireworks,llama32,87%,80%
fireworks,llama33,98%,98%
fireworks,qwen25,100%,100%
Anthropic,Sonnet_35,100%,100%


Approximate confidence interval


In [152]:
# Print the pooled rates as a pipe-format table, with the index levels as columns.
pooled_rows = pd.DataFrame.from_dict(df_results).reset_index()
print(tabulate.tabulate(pooled_rows, headers="keys", tablefmt="pipe", showindex=False))

| level_0   | level_1    | t=0   | t=0.8   |
|:----------|:-----------|:------|:--------|
| Ollama    | llama32    | 84%   | 77%     |
| Ollama    | nemotron   | 8%    | 5%      |
| Ollama    | phi3       | 47%   | 29%     |
| Ollama    | phi4       | 99%   | 100%    |
| Ollama    | deepseekr1 | 76%   | 74%     |
| fireworks | llama31    | 94%   | 94%     |
| fireworks | llama32    | 87%   | 80%     |
| fireworks | llama33    | 98%   | 98%     |
| fireworks | qwen25     | 100%  | 100%    |
| Anthropic | Sonnet_35  | 100%  | 100%    |
| Anthropic | Haiku_35   | 99%   | 99%     |
| Anthropic | Haiku_3    | 95%   | 93%     |


### Error analysis


Extract all error messages & count


In [55]:
def analyse_errors_from_results(ss_results, method="code", combined=False):
    """Count failures per model from a nested experiment-results mapping.

    Args:
        ss_results: Mapping ``{model_name: {schema_name: {"outputs": [...]}}}``.
            Each output is indexed by ``"error_type"`` (``method="code"``) or
            ``"error_message"`` (``method="parse"``).
        method: ``"code"`` counts the ``error_type`` values exactly as stored.
            ``"parse"`` sorts each non-``None`` ``error_message`` into a
            category by substring match (first matching rule wins); messages
            matching no rule are counted under their own text.
        combined: If true, counts are summed over schemas and keyed by
            category. Otherwise they are keyed by ``(schema_name, category)``.

    Returns:
        ``pd.DataFrame.from_dict(counts, orient="index")``: rows are model
        names, columns are the count keys. A model with no counted entries
        contributes no cells and so may be missing from the frame.

    Raises:
        NameError: If ``method`` is neither ``"code"`` nor ``"parse"``. This is
            checked up front, so an unsupported method is also reported when
            ``ss_results`` is empty.
    """
    if method not in ("code", "parse"):
        raise NameError(f"Method {method} not supported")

    # Ordered (needle, label, match_lowercased) rules; order matters because
    # several needles can occur in the same message.
    parse_rules = (
        ("opening and ending tag mismatch", "XML tag mismatch", True),
        ("extracterror", "Missing main tags", True),
        ("input should be a valid integer", "Validation error (int)", True),
        ("premature end of data in tag", "Premature end", True),
        ("field required", "Missing field", True),
        ("expected '>'", "Tag malformed", True),
        ("extra content at the end of the document", "Tag malformed", True),
        # The last two needles are matched case-sensitively.
        ("BadGatewayError", "Connection error", False),
        ("XMLSyntaxError", "Other syntax error", False),
    )

    def _categorise(message):
        """Return the label of the first matching rule, else the message itself."""
        lowered = message.lower()
        for needle, label, match_lowercased in parse_rules:
            if needle in (lowered if match_lowercased else message):
                return label
        return message

    error_counts = {}
    for mname, by_schema in ss_results.items():
        model_counts = error_counts[mname] = {}
        for tname, result in by_schema.items():
            outputs = result["outputs"]

            if method == "code":
                code_counts = pd.Series(
                    [output["error_type"] for output in outputs]
                ).value_counts()
                for e_name, e_count in code_counts.items():
                    if combined:
                        model_counts[e_name] = model_counts.get(e_name, 0) + e_count
                    else:
                        model_counts[(tname, e_name)] = e_count
            else:
                for output in outputs:
                    message = output["error_message"]
                    if message is None:
                        continue
                    label = _categorise(message)
                    key = label if combined else (tname, label)
                    model_counts[key] = model_counts.get(key, 0) + 1

    return pd.DataFrame.from_dict(error_counts, orient="index")

In [None]:
# Parsed error categories per model at t=0.8, summed over schemas.
errors_t08 = analyse_errors_from_results(
    structure_support_by_model_t08, method="parse", combined=True
)
errors_t08.fillna(0).sort_index(axis=1)

Unnamed: 0,Missing field,Missing main tags,Other syntax error,Premature end,Tag malformed,Validation error (int),XML tag mismatch
Ollama_llama32,4.0,4.0,0.0,0.0,1.0,12.0,2.0
Ollama_phi3,7.0,32.0,4.0,2.0,1.0,3.0,22.0
Ollama_deepseekr1,1.0,11.0,3.0,0.0,1.0,1.0,9.0
fireworks_llama31,0.0,0.0,0.0,0.0,0.0,6.0,0.0
fireworks_llama32,1.0,5.0,1.0,0.0,0.0,11.0,2.0
fireworks_llama33,0.0,0.0,0.0,0.0,0.0,2.0,0.0
Anthropic_Haiku_3,0.0,0.0,0.0,0.0,0.0,7.0,0.0
Ollama_nemotron,1.0,94.0,0.0,0.0,0.0,0.0,0.0
Anthropic_Haiku_35,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [None]:
# Parsed error categories per model at t=0, summed over schemas.
errors_t0 = analyse_errors_from_results(
    structure_support_by_model_t0, method="parse", combined=True
)
errors_t0.fillna(0).sort_index(axis=1)

Unnamed: 0,Missing field,Missing main tags,Other syntax error,Tag malformed,Validation error (int),XML tag mismatch
Ollama_llama32,1.0,0.0,0.0,0.0,12.0,3.0
Ollama_phi3,6.0,21.0,0.0,1.0,3.0,22.0
Ollama_phi4,0.0,0.0,0.0,0.0,1.0,0.0
Ollama_deepseekr1,1.0,18.0,2.0,0.0,2.0,1.0
fireworks_llama31,0.0,0.0,0.0,0.0,6.0,0.0
fireworks_llama32,0.0,0.0,0.0,0.0,11.0,2.0
fireworks_llama33,0.0,1.0,0.0,0.0,1.0,0.0
Anthropic_Haiku_35,0.0,0.0,0.0,0.0,1.0,0.0
Anthropic_Haiku_3,0.0,0.0,0.0,0.0,5.0,0.0
Ollama_nemotron,0.0,92.0,0.0,0.0,0.0,0.0


In [58]:
# Print the error message and raw model output of every non-"ok" phi3 response
# for the ArticleResponse1XML schema at t=0.8.
phi3_outputs = structure_support_by_model_t08["Ollama_phi3"]["ArticleResponse1XML"]["outputs"]
for output in phi3_outputs:
    if output["error_type"] == "ok":
        continue
    print(output["error_message"], "\n")
    print(output["raw"].content)

ValidationError, 1 validation error for ArticleResponse1XML
number
  [line 4]: Input should be a valid integer, unable to parse string as an integer [type=int_parsing, input_value='3700 million years ago', input_type=str] 

 <article>
    <title>Unveiling Earth's Deep Time Treasures: The Journey of Ancient Life in Fossils</title>
    <answer>Dinosaurs may top most people’s minds when it comes to ancient life, but the fossil record tells a tale that begins even before their reign. Delving into our planet's past reveals stromatolites as Earth’s oldest known records of life; these lithified bacterial communities date back about 3.7 billion years ago in rocks from Western Australia and South Africa, serving not only as the earliest examples but also as some of its most convincing evidence for early microbial activity. These structures showcase how even the simplest organisms can influence their environment dramatically—in this case creating a habitat that facilitated further lifeforms to d

### Load previous results


First try all models file


In [None]:
# Disabled by default: change `if 0` to `if 1` to reload the combined results
# file written by the "Save all models" cell.
if 0:
    import pickle

    # SECURITY: pickle.load can execute arbitrary code; only load files that
    # this notebook produced itself.
    with open(
        file=f"exp{experiment_num}_all_models_{experiment_date}.pkl", mode="rb"
    ) as f:
        data = pickle.load(f)

    # Inject into toplevel namespace
    # Names that already exist are left untouched, so results currently held
    # in the kernel take precedence over the saved ones.
    namespace = locals()
    for key, value in data["models"].items():
        if key not in namespace:
            print(f"Loaded {key}")
            namespace[key] = value

In [None]:
# Disabled by default: change `if 0` to `if 1` to reload the per-run result
# files written by the experiment cells.
if 0:
    import pickle

    # Load individual models
    # These idents must match the suffixes used in the experiment cells'
    # save_file_name ("..._xml_output_t0_..." and "..._xml_output_t08_...").
    # The previous values ("sys", "alt") did not match any file this notebook
    # writes.
    output_idents = ["t0", "t08"]

    namespace = locals()
    for ident in output_idents:
        # SECURITY: pickle.load can execute arbitrary code; only load files
        # that this notebook produced itself.
        with open(
            file=f"exp{experiment_num}_xml_output_{ident}_{experiment_date}.pkl",
            mode="rb",
        ) as f:
            data = pickle.load(f)

        # Names that already exist in the kernel are left untouched.
        key = f"structure_support_by_model_{ident}"
        if key not in namespace:
            print(f"Loaded {key}")
            # NOTE(review): assumes the saved file stores the results under
            # "structure_support_by_model" - confirm against run_xml_experiment.
            namespace[key] = data["structure_support_by_model"]

## Hypothesis testing


Hypothesis:

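- H0: The proportion of outputs conforming to the XML schema is the same at both temperatures
- H1: The proportion of conforming outputs is higher at temperature 0 than at temperature 0.8 (one-sided, matching `alternative="greater"` in the tests below)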


In [None]:
# Per-model one-sided exact tests: is the pass rate at t=0 greater than at t=0.8?
# NOTE: the Bonferroni correction is currently DISABLED (the division by
# n_tests is commented out), so every model is tested at the uncorrected alpha.
model_list = structure_support_by_model_t0.keys()
n_tests = len(model_list)
alpha = 0.05  # / n_tests

for model in model_list:

    contingency_table = {}
    for name, ss_results in results_list.items():
        num_true = 0
        num_total = 0
        for tname in ss_results[model].keys():
            # "valid" is a pass fraction, so the product can carry floating
            # point error (e.g. 0.35 * 20 == 7.000000000000001). Round to an
            # exact integer count: the scipy tests below cast the table to
            # integers, which would otherwise truncate.
            num_true += int(round(ss_results[model][tname]["valid"] * len(questions)))
            num_total += len(questions)

        contingency_table[name] = {"Passed": num_true, "Failed": num_total - num_true}

    ct = pd.DataFrame.from_dict(contingency_table, orient="index")

    # Ensure ordering to match hypotheses
    # Columns are experiments
    # Rows are outcomes
    # Column marginals are constant
    ct_n = ct[["Passed", "Failed"]].to_numpy().T

    print(f"\n{model}")
    print(sf := stats.fisher_exact(ct_n, alternative="greater"))
    print(sb := stats.barnard_exact(ct_n, alternative="greater"))
    # Only Barnard's p-value decides the outcome; Fisher's is printed for comparison.
    if sb.pvalue < alpha:
        print(f"Hypothesis test passed: {sb.pvalue:.3g} < {alpha:.3g}")


Ollama_llama32
SignificanceResult(statistic=np.float64(1.5681818181818181), pvalue=np.float64(0.14208888177506213))
BarnardExactResult(statistic=np.float64(1.249303038696861), pvalue=np.float64(0.12258240508091653))

Ollama_nemotron
SignificanceResult(statistic=np.float64(1.6521739130434783), pvalue=np.float64(0.28396268931110596))
BarnardExactResult(statistic=np.float64(0.8604859293176627), pvalue=np.float64(0.2645506113622348))

Ollama_phi3
SignificanceResult(statistic=np.float64(2.171112556929083), pvalue=np.float64(0.006510361002595765))
BarnardExactResult(statistic=np.float64(2.6222244603662923), pvalue=np.float64(0.004495770764296521))
Hypothesis test passed: 0.0045 < 0.05

Ollama_phi4
SignificanceResult(statistic=np.float64(0.0), pvalue=np.float64(1.0))
BarnardExactResult(statistic=np.float64(-1.0025094142341715), pvalue=np.float64(1.0))

Ollama_deepseekr1
SignificanceResult(statistic=np.float64(1.1126126126126126), pvalue=np.float64(0.43520490196213557))
BarnardExactResult(sta

### Combined levels: does temperature change the conformance to XML ?


In [115]:
from scipy import stats

model_list = structure_support_by_model_t0.keys()

# Pool pass/fail counts over all models and schemas, one row per temperature.
contingency_table = {}
for model in model_list:
    for name, ss_results in results_list.items():
        for tname in ss_results[model].keys():
            pooled = contingency_table.setdefault(name, {"Passed": 0, "Failed": 0})
            num_total = len(questions)
            # "valid" is a pass fraction; round the product so the table holds
            # exact integer counts rather than floats with rounding error.
            num_true = int(round(ss_results[model][tname]["valid"] * num_total))

            pooled["Passed"] += num_true
            pooled["Failed"] += num_total - num_true

ct = pd.DataFrame.from_dict(contingency_table, orient="index")

From the contingency table alone we see that there is some difference between the results for the two temperatures.


In [116]:
# Display the pooled contingency table (rows: temperature, columns: outcome).
ct

Unnamed: 0,Passed,Failed
t=0,987.0,213.0
t=0.8,949.0,251.0


In [117]:
# Chi-Squared Test for multiple groups
# Applied here to the pooled 2x2 table (temperature x outcome).
# NOTE(review): scipy's chi2_contingency defaults to correction=True, which
# presumably applies a continuity correction to this 2x2 table - confirm.
res = stats.chi2_contingency(ct)
print(f"\nChi-Squared Test:\nStatistic={res.statistic:.4f}, p={res.pvalue:.4g}")


Chi-Squared Test:
Statistic=3.6576, p=0.05582


In [120]:
# Barnard's exact test (two-sided) on the pooled 2x2 table.
# barnard_exact treats each COLUMN as one binomial experiment, so the table is
# transposed to put the temperatures in columns and the outcomes in rows, as in
# the per-model tests above. Counts are rounded to exact integers first because
# the test casts its input to integers.
ct_counts = ct[["Passed", "Failed"]].to_numpy().round().astype(int).T
res = stats.barnard_exact(ct_counts)
print(f"Barnard exact test:\nStatistic={res.statistic:.4f}, p={res.pvalue:.4g}")

Barnard exact test:
Statistic=1.9642, p=0.04995
