## Using LangChain to get structured outputs


## Setup


In [8]:
import pickle
import pandas as pd
from langchain.prompts import ChatPromptTemplate
from langchain_anthropic import ChatAnthropic
from langchain_ollama import ChatOllama
from langchain_fireworks import ChatFireworks

from experiment_xml import (
    pydantic_to_xml_instructions,
    run_xml_experiment,
)

In [9]:
import os

# Provider credentials. Prefer keys exported in the environment so that real
# secrets never have to be typed into (and committed with) the notebook; the
# literal placeholder remains the fallback so the cell still runs unchanged
# when nothing is exported.
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY", "<API KEY>")
FIREWORKS_API_KEY = os.environ.get("FIREWORKS_API_KEY", "<API KEY>")

In [10]:
import streamlit as st

# Pull the provider credentials from the "api_keys" table of the Streamlit
# secrets store instead of keeping them in the notebook.
_api_keys = st.secrets["api_keys"]

LANGSMITH_API_KEY = _api_keys["LANGSMITH_API_KEY"]
ANTHROPIC_API_KEY = _api_keys["ANTHROPIC_API_KEY"]
FIREWORKS_API_KEY = _api_keys["FIREWORKS_API_KEY"]

In [44]:
# Identifiers for this run; both are interpolated into the names of the
# pickle/JSON files written and read further down.
experiment_date = "24-02-25"
experiment_num = "3"
# Number of iterations — presumably the `n_iter` argument of
# run_xml_experiment; verify against the experiment cells.
n_iter = 1

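Let's start by creating the LLM models that will run our structured output queries. A temperature of 0 generally improves structured output generation (at the cost of "creativity"); the temperature actually used for this experiment is set in the LLM parameters cell below.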


### Model setup


In [12]:
# LLM parameters
# Sampling temperature passed to the Ollama and Fireworks models.
# NOTE(review): the markdown above recommends a temperature of 0 for
# structured output, but 0.8 is set here — confirm which is intended.
temperature = 0.8
# Request timeout passed to the Fireworks and Anthropic clients
# (presumably seconds — TODO confirm).
timeout = 30
# Ollama-only options: context window size and maximum tokens to generate.
num_ctx = 8192
num_predict = 4096

In [13]:
def _ollama(model):
    """Build a local Ollama chat model with the shared sampling parameters."""
    return ChatOllama(
        model=model,
        temperature=temperature,
        num_ctx=num_ctx,
        num_thread=1,
        num_predict=num_predict,
    )


def _fireworks(model_id):
    """Build a Fireworks-hosted chat model with the shared sampling parameters."""
    return ChatFireworks(
        model_name=f"accounts/fireworks/models/{model_id}",
        api_key=FIREWORKS_API_KEY,
        temperature=temperature,
        timeout=timeout,
    )


def _anthropic(model):
    """Build an Anthropic chat model.

    NOTE(review): no `temperature` is passed here, unlike the Ollama and
    Fireworks models — confirm that the provider default is intended.
    """
    return ChatAnthropic(
        model=model,
        api_key=ANTHROPIC_API_KEY,
        timeout=timeout,
    )


# Small subset of local models for quick smoke tests.
llm_models_test = {
    "Ollama_llama32": _ollama("llama3.2"),
    "Ollama_phi3": _ollama("phi3"),
    "Ollama_deepseekr1": _ollama("deepseek-r1"),
}

# Full set of local (Ollama) and hosted (Fireworks) models.
llm_models = {
    "Ollama_llama32": _ollama("llama3.2"),
    "Ollama_nemotron": _ollama("nemotron-mini"),
    "Ollama_phi3": _ollama("phi3"),
    "Ollama_phi4": _ollama("phi4"),
    "Ollama_deepseekr1": _ollama("deepseek-r1"),
    "fireworks_llama31": _fireworks("llama-v3p1-70b-instruct"),
    "fireworks_llama32": _fireworks("llama-v3p2-3b-instruct"),
    "fireworks_llama33": _fireworks("llama-v3p3-70b-instruct"),
    "fireworks_qwen25": _fireworks("qwen2p5-72b-instruct"),
    # Disabled:
    # "fireworks_deepseekr1_70b": _fireworks("deepseek-r1-distill-llama-70b"),
}

# Everything above plus the Anthropic models.
llm_models_with_anthropic = {
    **llm_models,
    "Anthropic_Sonnet_35": _anthropic("claude-3-5-sonnet-20241022"),
    "Anthropic_Haiku_35": _anthropic("claude-3-5-haiku-20241022"),
    "Anthropic_Haiku_3": _anthropic("claude-3-haiku-20240307"),
}

### Problem setup and prompt


In [14]:
# Prompt template sent to every model; `{question}` is filled with one entry
# of `questions` per request.
test_science_prompt_txt = """
You are a professional science writer tasked with responding to members of
the general public who write in asking questions about science.
Write an article responding to a writer's question for publication in a
science magazine intended for a general readership with a high-school education.
You should write clearly and compellingly, include all relevant context,
and provide motivating stories where applicable.

Your response must be less than 200 words.

The question given to you is the following:
{question}
"""

# The fixed set of reader questions used for every model/schema combination.
questions = [
    "What is the oldest recorded fossil?",
    "What is a black hole?",
    "How far away is the sun?",
    "Which other planet in the Solar System has a surface gravity closest to that of the Earth?",
    "Eris, Haumea, Makemake and Ceres are all examples of what?",
    "Why does earth have seasons? Do other planets exhibit seasons too?",
    "What causes the aurora borealis?",
    "Why is the sky blue?",
    "How do bees communicate?",
    "What is the smallest unit of life?",
    "How do plants make their own food?",
    "Why do we dream?",
    "What is the theory of relativity?",
    "How do volcanoes erupt?",
    "What is the speed of light?",
    "How do magnets work?",
    "What is the purpose of DNA?",
    "What are the different types of galaxies?",
    "Why do some animals hibernate?",
    "How do vaccines work?",
]

In [15]:
# Shared system turn: asks for the requested format and carries the schema
# instructions supplied through the `format_instructions` variable.
_system_format_turn = (
    "system",
    "Return a publishable article in the requested format.\n{format_instructions}",
)
_xml_reminder = "\nYour response must be in valid XML."

# Plain prompt with no format instructions at all.
prompt_direct = ChatPromptTemplate.from_template(test_science_prompt_txt)

# Format instructions in the system message only.
prompt_system_format = ChatPromptTemplate.from_messages(
    [_system_format_turn, ("human", test_science_prompt_txt)]
)

# Format instructions in the system message plus an XML reminder appended to
# the human message.
prompt_system_plus_reminder_format = ChatPromptTemplate.from_messages(
    [_system_format_turn, ("human", test_science_prompt_txt + _xml_reminder)]
)

# Format instructions appended to the single user message.
_user_format_txt = test_science_prompt_txt + "\n{format_instructions}"
prompt_user_format = ChatPromptTemplate.from_template(_user_format_txt)

### Schema using Pydantic XML


In [16]:
from pydantic_xml import BaseXmlModel, element, attr


# Schema 1: a flat <article> with two text fields and one integer field.
# The class docstring and every `description` are runtime text: they appear in
# the format instructions built by pydantic_to_xml_instructions (see the
# printed example for ArticleResponse2XMLalt), so edit them with care.
class ArticleResponse1XML(BaseXmlModel, tag="article"):
    """Structured article for publication answering a reader's question"""

    title: str = element(description="Title of the article")
    answer: str = element(
        description="Provide a detailed description of historical events to answer the question"
    )
    # Typed as int, so a non-integer value in <number> fails validation.
    number: int = element(description="A number that is most relevant to the question.")


# Variant of ArticleResponse1XML whose `number` field is typed as str instead
# of int. It is not referenced by the experiment lists visible in this notebook.
class ArticleResponse1nointXML(BaseXmlModel, tag="article"):
    """Structured article for publication answering a reader's question"""

    title: str = element(description="Title of the article")
    answer: str = element(
        description="Provide a detailed description of historical events to answer the question"
    )
    number: str = element(description="A number that is most relevant to the question.")


# Lists of simple types
# "Standard" list form: the list field is declared directly on the article
# with the per-item tag "further_question" and no enclosing container element
# (contrast with ArticleResponse2XMLalt).
class ArticleResponse2XML(BaseXmlModel, tag="article"):
    """Structured article for publication answering a reader's question"""

    title: str = element(description="Title of the article")
    answer: str = element(description="Answer the writer's question")
    further_questions: list[str] = element(
        tag="further_question",
        description="A list of related questions that may be of interest to the readers.",
    )


# Container model for a list of strings; used as the type of the
# <further_questions> element in ArticleResponse2XMLalt. No tag is set on the
# class itself — the enclosing field supplies it.
class ListofStrXML(BaseXmlModel):
    """A list of related questions of interest to the readers"""

    further_questions: list[str] = element(
        tag="further_question",
        description="A related question of interest to readers",
    )


# Lists of simple types (encapsulated list)
# "Encapsulated" list form: the <further_question> items are wrapped in a
# <further_questions> container element, as shown by the printed format
# instructions for this class.
class ArticleResponse2XMLalt(BaseXmlModel, tag="article"):
    """Structured article for publication answering a reader's question"""

    title: str = element(description="Title of the article")
    answer: str = element(description="Answer the writer's question")
    further_questions: ListofStrXML = element(
        tag="further_questions",
        description="A list of related questions of interest to the readers",
    )


# Nested types
# Reusable sub-model embedded in the ArticleResponse3/4 schemas. No tag is set
# on the class; the enclosing field determines the element name.
class HistoricalEventXML(BaseXmlModel):
    """The year and explanation of a historical event."""

    # Kept as str (not int), so any text is accepted for the year.
    year: str = element(description="The year of the historical event")
    event: str = element(
        description="A clear and concise explanation of what happened in this event"
    )


# Schema 3: two fixed, separately named nested HistoricalEventXML elements
# (no list involved).
class ArticleResponse3XML(BaseXmlModel, tag="article"):
    """Structured article for publication answering a reader's question"""

    # NOTE(review): this description is wrapped in square brackets, unlike the
    # `title` description of every other schema — confirm whether intentional,
    # since the text is emitted in the format instructions.
    title: str = element(description="[Title of the article]")
    historical_event_1: HistoricalEventXML = element(
        description="A first historical event relevant to the question"
    )
    historical_event_2: HistoricalEventXML = element(
        description="A second historical event relevant to the question"
    )


# Lists of custom types
# "Standard" list form for nested models: the list is declared directly on the
# article. No explicit `tag` is given here (unlike `further_questions` in
# ArticleResponse2XML) — presumably each item is serialized under the field
# name; verify against pydantic-xml.
class ArticleResponse4XML(BaseXmlModel, tag="article"):
    """Structured article for publication answering a reader's question"""

    title: str = element(description="Title of the article")
    historical_timeline: list[HistoricalEventXML] = element(
        description="A list of historical events relevant to the question"
    )


# Container model for a list of HistoricalEventXML items, each tagged
# "historical_event"; used as the type of `historical_timeline` in
# ArticleResponse4XMLalt.
class ListofHistoricalEventXML(BaseXmlModel):
    """A list of historical events relevant to the question"""

    historical_event: list[HistoricalEventXML] = element(
        tag="historical_event",
        description="A relevant historical event",
    )


# Lists of custom types (encapsulated list)
# "Encapsulated" counterpart of ArticleResponse4XML: the events live inside a
# single ListofHistoricalEventXML container element.
class ArticleResponse4XMLalt(BaseXmlModel, tag="article"):
    """Structured article for publication answering a reader's question"""

    title: str = element(description="Title of the article")
    historical_timeline: ListofHistoricalEventXML = element(
        description="A list of historical events relevant to the question"
    )

In [17]:
def _format_spec(schema):
    """Pair a schema class with the XML format instructions generated for it."""
    return {
        "pydantic": schema,
        "format_instructions": pydantic_to_xml_instructions(schema),
    }


# "Standard" experiment: lists are declared directly on the article.
structured_formats_xml = list(
    map(
        _format_spec,
        (
            ArticleResponse1XML,
            ArticleResponse2XML,
            ArticleResponse3XML,
            ArticleResponse4XML,
        ),
    )
)

# "Encapsulated list" experiment: schemas 2 and 4 use the container variants.
structured_formats_xml_alt = list(
    map(
        _format_spec,
        (
            ArticleResponse1XML,
            ArticleResponse2XMLalt,
            ArticleResponse3XML,
            ArticleResponse4XMLalt,
        ),
    )
)

In [11]:
# Show the format instructions generated for the encapsulated-list schema.
_example_instructions = pydantic_to_xml_instructions(ArticleResponse2XMLalt)
print(_example_instructions)

You must respond only in XML using the following schema.
Do not provide any output outside the first and last XML tags.

<article>
  <!--Structured article for publication answering a reader's question-->
  <title>
    {Title of the article - must be type str}
  </title>
  <answer>
    {Answer the writer's question - must be type str}
  </answer>
<further_questions>
  <!--A list of related questions of interest to the readers-->
<!-- First list element -->
  <further_question>
    {A related question of interest to readers - must be type str}
  </further_question>
<!-- Next list element -->
  <further_question>
    {A related question of interest to readers - must be type str}
  </further_question>
<!-- Etc -->
  <further_question>
  ...
  </further_question>
</further_questions>
</article>


## Experiment

Hypotheses:

- Enclosing a list in another container improves XML conformance.


System prompt


In [None]:
# Keep results from an earlier (possibly interrupted) run so the experiment
# can accumulate into the same dictionary.
if "structure_support_by_model_sys" not in locals():
    structure_support_by_model_sys = {}

# "Standard" experiment: format instructions in the system prompt, lists
# declared directly on the article.
# NOTE(review): this run uses `llm_models_with_anthropic`, while the
# encapsulated-list run uses `llm_models` — confirm the model sets should differ.
_ = run_xml_experiment(
    prompt_system_format,
    questions,
    llm_models_with_anthropic,
    structured_formats_xml,
    n_iter=n_iter,  # use the configured iteration count instead of a literal 1
    results_out=structure_support_by_model_sys,
    save_file_name=f"exp3_xml_output_sys_{experiment_date}.pkl",
)

Model: Ollama_llama32  Output: ArticleResponse1XML   Pos: 1
eeeeee..eeee.e.eeeee
Model: Ollama_llama32  Output: ArticleResponse2XML   Pos: 2
........e.....

KeyboardInterrupt: 

In [13]:
# Keep results from an earlier (possibly interrupted) run so the experiment
# can accumulate into the same dictionary.
if "structure_support_by_model_alt" not in locals():
    structure_support_by_model_alt = {}

# "Encapsulated list" experiment: same system prompt, but schemas 2 and 4 wrap
# their lists in a container element.
_ = run_xml_experiment(
    prompt_system_format,
    questions,
    llm_models,
    structured_formats_xml_alt,
    n_iter=n_iter,  # use the configured iteration count instead of a literal 1
    results_out=structure_support_by_model_alt,
    save_file_name=f"exp3_xml_output_alt_{experiment_date}.pkl",
)

Model: Ollama_llama32  Output: ArticleResponse1XML   Pos: 1
eeee....eeee.ee.ee.e
Model: Ollama_llama32  Output: ArticleResponse2XMLalt   Pos: 2
e...................
Model: Ollama_llama32  Output: ArticleResponse3XML   Pos: 3
.....e..e.e..e..eee.
Model: Ollama_llama32  Output: ArticleResponse4XMLalt   Pos: 4
....................
Model: Ollama_nemotron  Output: ArticleResponse1XML   Pos: 5
eeee.eeeeeeeeeeeeeee
Model: Ollama_nemotron  Output: ArticleResponse2XMLalt   Pos: 6
eeeeeeeeeeeeeeeeeeee
Model: Ollama_nemotron  Output: ArticleResponse3XML   Pos: 7
eeeeeeeeeeeeeeeeeeee
Model: Ollama_nemotron  Output: ArticleResponse4XMLalt   Pos: 8
eeeeeeeeeeeeeeeeeeee
Model: Ollama_phi3  Output: ArticleResponse1XML   Pos: 9
eeeee..eee..eeee.eee
Model: Ollama_phi3  Output: ArticleResponse2XMLalt   Pos: 10
eeeeeeeeeeeeee..eeee
Model: Ollama_phi3  Output: ArticleResponse3XML   Pos: 11
.eeeeee.eee.eeeeeeee
Model: Ollama_phi3  Output: ArticleResponse4XMLalt   Pos: 12
eeeeeeeeeeeeeeeeeeee
Model: Ollama_p

Save all models


In [45]:
# Collect every top-level result dictionary (names starting with
# "structure_support") so they can be stored together in one file.
namespace = locals()
all_model_outputs = [k for k in namespace.keys() if k.startswith("structure_support")]

with open(file=f"exp{experiment_num}_all_models_{experiment_date}.pkl", mode="wb") as f:
    data = dict(
        temperature=temperature,
        num_ctx=num_ctx,
        num_predict=num_predict,
        questions=questions,
        # Record the prompt the experiments above were actually run with
        # (`prompt_system_format`), not the unused `prompt_direct`.
        prompt=prompt_system_format,
        models={k: namespace[k] for k in all_model_outputs},
    )
    pickle.dump(data, f)

### Results table


In [46]:
# Experiments to tabulate, keyed by the label used in the results table.
results_list = {
    "Standard": structure_support_by_model_sys,
    "Encapsulated List": structure_support_by_model_alt,
}

# One DataFrame per experiment: rows are (provider, model) pairs, columns are
# schema names, values are the percentage of valid outputs.
df_results = {}
for name, ss_results in results_list.items():
    validity_pct = {
        # Split on the first underscore only, so every key is a 2-tuple even
        # for names such as "Anthropic_Sonnet_35"; the later reorder_levels
        # call relies on a fixed number of index levels.
        tuple(mname.split("_", 1)): {
            tname: tresult["valid"] * 100 for tname, tresult in per_schema.items()
        }
        for mname, per_schema in ss_results.items()
    }
    df_results[name] = pd.DataFrame.from_dict(validity_pct, orient="index")
    display(name)

'Standard'

'Encapsulated List'

In [47]:
# Stack the per-experiment tables, then move the experiment label to the
# innermost index level: (provider, model, experiment).
stacked_results = pd.concat(df_results)
df = stacked_results.reorder_levels([1, 2, 0], axis=0).sort_index(axis=0)

# Persist the summary table next to the pickled raw results.
summary_path = f"exp{experiment_num}_summary_df_{experiment_date}.json"
with open(file=summary_path, mode="wb") as f:
    df.to_json(f)

In [None]:
# Coalesce each pair of alternative columns in the dataframe: the "Standard"
# rows hold values under the base name and the "Encapsulated List" rows under
# the "...alt" name, so fill the gaps in one from the other and drop the
# alternative column.
# The membership check makes the cell safe to re-run: once the "...alt"
# columns are gone there is nothing left to merge.
for base_col in ("ArticleResponse2XML", "ArticleResponse4XML"):
    alt_col = f"{base_col}alt"
    if alt_col in df.columns:
        df[base_col] = df[base_col].combine_first(df[alt_col])
        df = df.drop(columns=[alt_col])

In [62]:
# Display the summary with the schema columns in alphabetical order.
df.sort_index(axis="columns")

Unnamed: 0,Unnamed: 1,Unnamed: 2,ArticleResponse1XML,ArticleResponse2XML,ArticleResponse3XML,ArticleResponse4XML
Ollama,deepseekr1,Encapsulated List,60.0,75.0,50.0,60.0
Ollama,deepseekr1,Standard,70.0,80.0,60.0,50.0
Ollama,llama32,Encapsulated List,35.0,95.0,65.0,100.0
Ollama,llama32,Standard,35.0,85.0,65.0,60.0
Ollama,nemotron,Encapsulated List,5.0,0.0,0.0,0.0
Ollama,nemotron,Standard,5.0,0.0,5.0,10.0
Ollama,phi3,Encapsulated List,25.0,10.0,15.0,0.0
Ollama,phi3,Standard,35.0,25.0,5.0,5.0
Ollama,phi4,Encapsulated List,100.0,100.0,100.0,100.0
Ollama,phi4,Standard,95.0,100.0,100.0,100.0


In [63]:
import tabulate

# Render the summary as a pipe-style (Markdown) table for pasting into docs.
_markdown_table = tabulate.tabulate(
    df.reset_index(),
    headers="keys",
    tablefmt="pipe",
    showindex=False,
)
print(_markdown_table)

| level_0   | level_1    | level_2           |   ArticleResponse1XML |   ArticleResponse2XML |   ArticleResponse3XML |   ArticleResponse4XML |
|:----------|:-----------|:------------------|----------------------:|----------------------:|----------------------:|----------------------:|
| Ollama    | deepseekr1 | Encapsulated List |                    60 |                    75 |                    50 |                    60 |
| Ollama    | deepseekr1 | Standard          |                    70 |                    80 |                    60 |                    50 |
| Ollama    | llama32    | Encapsulated List |                    35 |                    95 |                    65 |                   100 |
| Ollama    | llama32    | Standard          |                    35 |                    85 |                    65 |                    60 |
| Ollama    | nemotron   | Encapsulated List |                     5 |                     0 |                     0 |                     0 |

### Error analysis


Extract all error messages & count


In [64]:
import pandas as pd

In [None]:
def analyse_errors_from_results(ss_results, method="code"):
    """Tabulate failure counts per model and schema.

    Args:
        ss_results: Nested mapping ``model name -> schema name -> record``,
            where each record has an ``"outputs"`` list of dicts carrying
            ``"error_type"`` and ``"error_message"`` entries.
        method: ``"code"`` counts the ``error_type`` values as they are;
            ``"parse"`` buckets each non-``None`` ``error_message`` into a
            readable category (unrecognised messages are counted verbatim).

    Returns:
        A DataFrame with one row per model and one column per
        ``(schema name, error label)`` pair; combinations that never occurred
        for a model are left as NaN.

    Raises:
        NameError: If ``method`` is neither ``"code"`` nor ``"parse"`` (only
            detected once there is at least one model/schema to process).
    """
    # Ordered (needle, label, match on lower-cased text?) rules; first hit wins.
    rules = (
        ("opening and ending tag mismatch", "XML tag mismatch", True),
        ("extracterror", "Missing main tags", True),
        ("input should be a valid integer", "Validation error (int)", True),
        ("premature end of data in tag", "Premature end", True),
        ("field required", "Missing field", True),
        ("expected '>'", "Tag malformed", True),
        ("extra content at the end of the document", "Tag malformed", True),
        ("BadGatewayError", "Connection error", False),
        ("XMLSyntaxError", "Other syntax error", False),
    )

    def label_for(message):
        """Map a raw error message to its category (or return it unchanged)."""
        lowered = message.lower()
        for needle, label, fold_case in rules:
            haystack = lowered if fold_case else message
            if needle in haystack:
                return label
        return message

    tallies = {}
    for model_name, per_schema in ss_results.items():
        row = tallies[model_name] = {}
        for schema_name, record in per_schema.items():
            if method == "code":
                # Count errors by their failure code
                codes = pd.Series(entry["error_type"] for entry in record["outputs"])
                for code, n_hits in codes.value_counts().items():
                    row[(schema_name, code)] = n_hits
            elif method == "parse":
                # Count errors by parsing the error message
                for entry in record["outputs"]:
                    message = entry["error_message"]
                    if message is None:
                        continue
                    key = (schema_name, label_for(message))
                    row[key] = row.get(key, 0) + 1
            else:
                raise NameError(f"Method {method} not supported")

    return pd.DataFrame.from_dict(tallies, orient="index")

In [None]:
# Error categories for the encapsulated-list experiment; combinations that
# never occurred are shown as 0 and the columns are sorted for readability.
_alt_parse_errors = analyse_errors_from_results(
    structure_support_by_model_alt, method="parse"
)
_alt_parse_errors.fillna(0).sort_index(axis=1)

Unnamed: 0_level_0,ArticleResponse1XML,ArticleResponse1XML,ArticleResponse1XML,ArticleResponse1XML,ArticleResponse1XML,ArticleResponse1XML,ArticleResponse2XMLalt,ArticleResponse2XMLalt,ArticleResponse2XMLalt,ArticleResponse3XML,ArticleResponse3XML,ArticleResponse3XML,ArticleResponse3XML,ArticleResponse3XML,ArticleResponse3XML,ArticleResponse3XML,ArticleResponse4XMLalt,ArticleResponse4XMLalt,ArticleResponse4XMLalt,ArticleResponse4XMLalt,ArticleResponse4XMLalt
Unnamed: 0_level_1,Missing field,Missing main tags,Other syntax error,Tag malformed,Validation error (int),XML tag mismatch,Missing field,Missing main tags,XML tag mismatch,Connection error,...,Missing main tags,Other syntax error,Premature end,Tag malformed,XML tag mismatch,Missing field,Missing main tags,Other syntax error,Premature end,XML tag mismatch
Ollama_llama32,0.0,0.0,0.0,0.0,11.0,2.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
Ollama_phi3,3.0,4.0,1.0,2.0,1.0,4.0,3.0,9.0,6.0,0.0,...,4.0,1.0,1.0,1.0,6.0,3.0,8.0,2.0,0.0,7.0
Ollama_deepseekr1,0.0,3.0,1.0,0.0,2.0,2.0,1.0,4.0,0.0,0.0,...,2.0,3.0,0.0,1.0,3.0,1.0,2.0,0.0,2.0,3.0
fireworks_llama31,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
fireworks_llama32,0.0,0.0,0.0,0.0,14.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
fireworks_llama33,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Ollama_nemotron,0.0,19.0,0.0,0.0,0.0,0.0,0.0,20.0,0.0,0.0,...,19.0,0.0,0.0,0.0,1.0,1.0,18.0,0.0,0.0,1.0


In [76]:
# Print the error message and raw model reply for every failed
# nemotron / ArticleResponse1XML output, for manual inspection.
_nemotron_outputs = structure_support_by_model_sys["Ollama_nemotron"][
    "ArticleResponse1XML"
]["outputs"]
for output in _nemotron_outputs:
    if output["error_type"] == "ok":
        continue
    print(output["error_message"], "\n")
    print(output["raw"].content)

RuntimeError, ExtractError: End or start strings not found 

 The oldest known fossil is around 3.5 billion years old and was found in Greenland's Isua Greenstone Belt, which suggests life may have originated soon after Earth's oceans formed.
RuntimeError, ExtractError: End or start strings not found 

 A black hole is a region in space with such strong gravity that nothing, not even light, can escape from it once it has passed within its boundary called the event horizon. The concept of black holes was first proposed by physicist John Wheeler over 50 years ago and has since been confirmed through observations made by scientists using radio telescopes like the Event Horizon Telescope project.
RuntimeError, ExtractError: End or start strings not found 

 The sun is about 93 million miles away from Earth!
RuntimeError, ExtractError: End or start strings not found 

 Mars! With approximately one-third of Earth's gravity, it would be easier for us to walk on Mars compared to the Moon (one-

### Load previous results


First try all models file


In [None]:
import pickle
import pandas as pd

# Read back the combined results file. The name mirrors the one used when
# saving (it is built from `experiment_num`, not a hard-coded "exp3").
# NOTE: pickle.load can execute arbitrary code — only load files written by
# this notebook.
all_models_path = f"exp{experiment_num}_all_models_{experiment_date}.pkl"
with open(file=all_models_path, mode="rb") as f:
    data = pickle.load(f)

# Inject into toplevel namespace. The per-experiment result dictionaries are
# stored one level down under "models", keyed by their variable names, so they
# are flattened here; otherwise only a `models` dict would be restored and the
# `structure_support_*` variables used by the analysis would stay undefined.
namespace = locals()
restored = {**data, **data.get("models", {})}
for key, value in restored.items():
    # Never overwrite values already present in the running kernel.
    if key not in namespace:
        print(f"Loaded {key}")
        namespace[key] = value

In [None]:
import pickle
import pandas as pd

# Load individual models
# Each experiment wrote its own pickle; restore the ones that are not already
# present in the running kernel.
# NOTE: pickle.load can execute arbitrary code — only load files written by
# this notebook.
output_idents = ["sys", "alt"]

namespace = locals()
for ident in output_idents:
    key = f"structure_support_by_model_{ident}"
    # Check first, so a file is not opened (and need not exist) when its
    # results are already loaded.
    if key in namespace:
        continue

    with open(file=f"exp3_xml_output_{ident}_{experiment_date}.pkl", mode="rb") as f:
        data = pickle.load(f)

    print(f"Loaded {key}")
    namespace[key] = data["structure_support_by_model"]

Loaded structure_support_by_model_sys
Loaded structure_support_by_model_alt


## Hypothesis testing


Hypothesis:

- H0: Results are the same
- H1: Encapsulated list produces more conformant output


In [78]:
# List the models that have results in the "Standard" experiment.
structure_support_by_model_sys.keys()

dict_keys(['Ollama_llama32', 'Ollama_nemotron', 'Ollama_phi3', 'Ollama_phi4', 'Ollama_deepseekr1', 'fireworks_llama31', 'fireworks_llama32', 'fireworks_llama33', 'fireworks_qwen25'])

In [None]:
from scipy import stats

experiments_list = {
    "Standard": structure_support_by_model_sys,
    "Encapsulated": structure_support_by_model_alt,
}

# Only models present in every experiment can be compared; indexing a model
# that is missing from one of them would raise KeyError.
model_list = [
    model
    for model in structure_support_by_model_sys
    if all(model in ss_results for ss_results in experiments_list.values())
]

# Bonferroni correction: one test per compared model.
n_tests = len(model_list)
alpha = 0.05 / max(n_tests, 1)  # guard against an empty model list

for model in model_list:

    # 2x2 table for this model: rows are experiments, columns Passed / Failed,
    # pooled over all schemas.
    contingency_table = {}
    for name, ss_results in experiments_list.items():
        num_true = 0
        num_total = 0
        for tname, tresult in ss_results[model].items():
            # "valid" is a fraction, so fraction * count can land just below
            # a whole number; round explicitly because the scipy tests cast
            # the table to integers (which truncates).
            # NOTE(review): assumes one trial per question per schema.
            num_true += int(round(tresult["valid"] * len(questions)))
            num_total += len(questions)

        contingency_table[name] = {"Passed": num_true, "Failed": num_total - num_true}

    ct = pd.DataFrame.from_dict(contingency_table, orient="index")

    print(f"\n{model}")
    # NOTE(review): both tests use their default two-sided alternative, while
    # H1 above is directional — confirm which alternative is intended.
    print(sf := stats.fisher_exact(ct.to_numpy()))
    print(sb := stats.barnard_exact(ct.to_numpy()))
    if sf.pvalue < alpha:
        print(f"Hypothesis test passed: {sf.pvalue:.3g} < {alpha:.3g}")


Ollama_llama32
SignificanceResult(statistic=np.float64(0.5626025150355386), pvalue=np.float64(0.12834009480419506))
BarnardExactResult(statistic=np.float64(-1.687898945139444), pvalue=np.float64(0.09692710210693593))

Ollama_nemotron
SignificanceResult(statistic=np.float64(4.157894736842105), pvalue=np.float64(0.3670632536153507))
BarnardExactResult(statistic=np.float64(1.3631084021929558), pvalue=np.float64(0.24286260716850405))

Ollama_phi3
SignificanceResult(statistic=np.float64(1.4848484848484849), pvalue=np.float64(0.507235943240366))
BarnardExactResult(statistic=np.float64(0.8856148855400957), pvalue=np.float64(0.43744823069801114))

Ollama_phi4
SignificanceResult(statistic=np.float64(0.0), pvalue=1.0)
BarnardExactResult(statistic=np.float64(-1.0031397251510383), pvalue=np.float64(0.5000000000000013))

Ollama_deepseekr1
SignificanceResult(statistic=np.float64(1.1749271137026238), pvalue=np.float64(0.743280409513128))
BarnardExactResult(statistic=np.float64(0.49158039400737397), 

### Combined experiments : which prompt style is better?


In [None]:
from scipy import stats

# Only models present in every experiment are pooled; indexing a model that is
# missing from one of them would raise KeyError.
model_list = [
    model
    for model in structure_support_by_model_sys
    if all(model in ss_results for ss_results in experiments_list.values())
]

# Pass/fail counts per schema, pooled over all models and both experiments.
# Schemas that appear in both experiments accumulate the counts of both runs.
contingency_table = {}
for model in model_list:
    for name, ss_results in experiments_list.items():
        for tname, tresult in ss_results[model].items():
            counts = contingency_table.setdefault(tname, {"Passed": 0, "Failed": 0})
            num_total = len(questions)
            # "valid" is a fraction; round to a whole count so the table holds
            # integers rather than floats with representation error.
            # NOTE(review): assumes one trial per question per schema.
            num_true = int(round(tresult["valid"] * num_total))

            counts["Passed"] += num_true
            counts["Failed"] += num_total - num_true

ct = pd.DataFrame.from_dict(contingency_table, orient="index")

From the contingency table alone, we see no obvious difference between the standard and encapsulated list schemas; the chi-squared tests below confirm that the difference is not significant.


In [82]:
# Display the pooled Passed/Failed counts per schema.
ct

Unnamed: 0,Passed,Failed
ArticleResponse1XML,208.0,152.0
ArticleResponse2XML,136.0,44.0
ArticleResponse3XML,248.0,112.0
ArticleResponse4XML,123.0,57.0
ArticleResponse2XMLalt,136.0,44.0
ArticleResponse4XMLalt,131.0,49.0


In [90]:
# Chi-squared test on the 2x2 table: standard vs encapsulated list of strings
_schema2_counts = ct.loc[["ArticleResponse2XML", "ArticleResponse2XMLalt"]]
res = stats.chi2_contingency(_schema2_counts)
print(f"\nChi-Squared Test:\nStatistic={res.statistic:.4f}, p={res.pvalue:.4g}")


Chi-Squared Test:
Statistic=0.0000, p=1


In [93]:
# Chi-squared test on the 2x2 table: standard vs encapsulated list of events
_schema4_counts = ct.loc[["ArticleResponse4XML", "ArticleResponse4XMLalt"]]
res = stats.chi2_contingency(_schema4_counts)
print(f"\nChi-Squared Test:\nStatistic={res.statistic:.4f}, p={res.pvalue:.4g}")


Chi-Squared Test:
Statistic=0.6552, p=0.4183
