## Evaluating structured outputs in LangChain


In [4]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_anthropic import ChatAnthropic
from langchain_ollama import ChatOllama
from langchain_fireworks import ChatFireworks
from langchain_core.output_parsers import JsonOutputParser, PydanticOutputParser

from pydantic import BaseModel, Field
from time import sleep

import pandas as pd

import streamlit as st

Experiment parameters


In [5]:
import os

# API keys are taken from the environment when available so real secrets never
# have to be pasted into the notebook; the placeholder remains the fallback.
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY", "<API KEY>")
FIREWORKS_API_KEY = os.environ.get("FIREWORKS_API_KEY", "<API KEY>")
# Date tag embedded in the names of the result files written by this notebook.
experiment_date = "09-02-25"
# Number of repeat passes over the questions (same meaning as the n_iter
# parameter of run_experiment).
n_iter = 1

### Prompt and problem setup

For this test I’m going to start with a substitute task: writing a magazine article that answers a reader’s question, with the response provided in a specific format.

Here we specify the prompt and any inputs to use to vary the problem (the list of questions).


In [6]:
# Base prompt for the science-writer task. "{question}" is the only template
# variable; the text starts and ends with a newline.
test_science_prompt_txt = (
    "\n"
    "You are a professional science writer tasked with responding to members of\n"
    "the general public who write in asking questions about science.\n"
    "Write an article responding to a writer's question for publication in a\n"
    "science magazine intended for a general readership with a high-school education.\n"
    "You should write clearly and compellingly, include all relavent context,\n"
    "and provide motivating stories where applicable.\n"
    "\n"
    "Your response must be less than 200 words.\n"
    "\n"
    "The question given to you is the following:\n"
    "{question}\n"
)

# Reader questions used to vary the problem: one question per line, in the
# order they are asked.
questions = """\
What is the oldest recorded fossil?
What is a black hole?
How far away is the sun?
Which other planet in the Solar System has a surface gravity closest to that of the Earth?
Eris, Haumea, Makemake and Ceres are all examples of what?
Why does earth have seasons? Do other planets exhibit seasons too?
What causes the aurora borealis?
Why is the sky blue?
How do bees communicate?
What is the smallest unit of life?
How do plants make their own food?
Why do we dream?
What is the theory of relativity?
How do volcanoes erupt?
What is the speed of light?
How do magnets work?
What is the purpose of DNA?
What are the different types of galaxies?
Why do some animals hibernate?
How do vaccines work?
""".splitlines()

# Question only: no format instructions anywhere in the prompt.
prompt_direct = ChatPromptTemplate.from_template(test_science_prompt_txt)

# Format instructions in the system message, task text in the human message.
_system_with_format = ("system", "Answer the user query.\n{format_instructions}")
_human_task = ("human", test_science_prompt_txt)
prompt_system_format = ChatPromptTemplate.from_messages(
    [_system_with_format, _human_task]
)

# Format instructions appended to the end of the task text itself.
prompt_user_format = ChatPromptTemplate.from_template(
    f"{test_science_prompt_txt}\n{{format_instructions}}"
)

### JSON output format specs


#### Pydantic structures

To answer the question of how these models and output methods differ with different complexities of schema, I’m defining five example schemas in increasing order of complexity.


In [7]:
# Simple types: a flat schema with two string fields and one integer field.
# NOTE(review): the class docstring and Field descriptions presumably end up in
# the schema / format instructions shown to the model, so they are left exactly
# as written (including the "arbitraty" typo) — confirm before rewording.
class ArticleResponse1(BaseModel):
    """Structured article for publication answering a reader's question."""

    title: str = Field(description="Title of the article")
    answer: str = Field(
        description="Provide a detailed description of historical events to answer the question."
    )
    number: int = Field(
        description="An arbitraty number that is most relevant to the question."
    )


# Lists of simple types: a string title plus a list of strings.
# NOTE(review): docstring and descriptions are presumably part of the schema
# sent to the model — confirm before rewording.
class ArticleResponse2(BaseModel):
    """Structured article for publication answering a reader's question."""

    title: str = Field(description="Title of the article")
    further_questions: list[str] = Field(
        description="A list of related questions that may be of interest to the readers."
    )


# Nested types: building block (an integer year plus a text description) that
# the more complex article schemas embed as a field or list element.
class HistoricalEvent(BaseModel):
    """The year and explanation of a historical event."""

    year: int = Field(description="The year of the historical event")
    description: str = Field(
        description="A clear description of what happened in this event"
    )


# Nested schema: a title plus exactly two HistoricalEvent sub-objects held in
# separate named fields (both fields carry the same description text).
# NOTE(review): descriptions are presumably part of the schema sent to the
# model, so the "one historical events" wording is left untouched — confirm.
class ArticleResponse3(BaseModel):
    """Structured article for publication answering a reader's question."""

    title: str = Field(description="Title of the article")
    historical_event_1: HistoricalEvent = Field(
        description="Provide a detailed description of one historical events to answer the question."
    )
    historical_event_2: HistoricalEvent = Field(
        description="Provide a detailed description of one historical events to answer the question."
    )


# Lists of custom types: a title plus a list of HistoricalEvent objects.
class ArticleResponse4(BaseModel):
    """Structured article for publication answering a reader's question."""

    title: str = Field(description="Title of the article")
    historical_timeline: list[HistoricalEvent] = Field(
        description="Provide a compelling account of the historical context of the question"
    )


# Nested types: wraps one HistoricalEvent together with two free-text fields.
# NOTE(review): the class name is misspelled ("Criical"); it is left unchanged
# because ArticleResponse5 refers to it by this name.
class CriicalAnalysis(BaseModel):
    """A critique of interpretations of historical events"""

    historical_event: HistoricalEvent = Field(
        description="Provide an overview of the facts of a historical event"
    )
    common_understanding: str = Field(description="Agreed interpretation of event")
    analysis: str = Field(
        description="Critical analysis of the event and opposing interpretations"
    )


# Multiple nested custom types: a title, a list of HistoricalEvent objects and
# a list of CriicalAnalysis objects (each of which itself nests a
# HistoricalEvent), i.e. two levels of nesting inside a list.
class ArticleResponse5(BaseModel):
    """Structured article for publication answering a reader's question."""

    title: str = Field(description="Title of the article")
    historical_timeline: list[HistoricalEvent] = Field(
        description="Provide a compelling account of the historical context of the question"
    )
    critique: list[CriicalAnalysis] = Field(
        description="A list of key historical events and historical analysis of them"
    )


# Schemas under test, in the order they are evaluated; each entry is a dict
# holding the schema class under the "pydantic" key.
structured_formats = [
    {"pydantic": schema}
    for schema in (
        ArticleResponse1,
        ArticleResponse2,
        ArticleResponse3,
        ArticleResponse4,
        ArticleResponse5,
    )
]

### Models to evaluate


In [5]:
# Shared settings referenced by the model definitions in this notebook.
# Default temperature
temperature: float = 0.0
# Request timeout passed to the hosted (Fireworks / Anthropic) clients;
# presumably in seconds — confirm against the client libraries.
timeout: int = 30
# Passed as num_ctx / num_predict to every ChatOllama instance.
num_ctx: int = 8192
num_predict: int = 4096

In [6]:
# Keyword arguments shared by every local Ollama model.
_ollama_common = dict(
    temperature=temperature,
    num_ctx=num_ctx,
    num_thread=1,
    num_predict=num_predict,
)
# Keyword arguments shared by every Fireworks-hosted model.
_fireworks_common = dict(
    api_key=FIREWORKS_API_KEY,
    temperature=temperature,
    timeout=timeout,
)

# Result label -> Ollama model tag.
_ollama_tags = {
    "Ollama_llama32": "llama3.2",
    "Ollama_nemotron": "nemotron-mini",
    "Ollama_phi3": "phi3",
    "Ollama_phi4": "phi4",
    "Ollama_deepseekr1": "deepseek-r1",
}
# Result label -> Fireworks model path.
_fireworks_paths = {
    "fireworks_llama31": "accounts/fireworks/models/llama-v3p1-70b-instruct",
    "fireworks_llama32": "accounts/fireworks/models/llama-v3p2-3b-instruct",
    "fireworks_llama33": "accounts/fireworks/models/llama-v3p3-70b-instruct",
    "fireworks_deepseekr1_70b": "accounts/fireworks/models/deepseek-r1",
}

# Models evaluated with every output method: Ollama entries first, then
# Fireworks, keyed by the label used in the result tables.
llm_models = {
    **{
        label: ChatOllama(model=tag, **_ollama_common)
        for label, tag in _ollama_tags.items()
    },
    **{
        label: ChatFireworks(model_name=path, **_fireworks_common)
        for label, path in _fireworks_paths.items()
    },
}
# All models from llm_models plus the Anthropic ones.
# The shared temperature is passed explicitly so the Anthropic models are
# sampled with the same setting as every other model in the comparison
# (and so the temperature saved with the results applies to all of them).
llm_models_with_anthropic = {
    **llm_models,
    "Anthropic_Sonnet_35": ChatAnthropic(
        model="claude-3-5-sonnet-20241022",
        api_key=ANTHROPIC_API_KEY,
        temperature=temperature,
        timeout=timeout,
    ),
    "Anthropic_Haiku_35": ChatAnthropic(
        model="claude-3-5-haiku-20241022",
        api_key=ANTHROPIC_API_KEY,
        temperature=temperature,
        timeout=timeout,
    ),
    "Anthropic_Haiku_3": ChatAnthropic(
        model="claude-3-haiku-20240307",
        api_key=ANTHROPIC_API_KEY,
        temperature=temperature,
        timeout=timeout,
    ),
}

## Evaluation

Let's loop over different structured outputs and check the adherence using the tool-calling API (structured output mode)


### Evaluate Structured Outputs across providers & models

Question - of the models that have tool calling, what complexity of structure can they support?


In [7]:
import pickle


def run_experiment(
    prompt_format,
    questions,
    llm_models,
    method,
    n_iter=1,
    resume=0,
    results_out=None,
    save_file_name=None,
):
    """Measure how often each model returns output matching each schema.

    For every model in ``llm_models`` and every schema in the module-level
    ``structured_formats`` list, each question is asked ``n_iter`` times through
    ``with_structured_output(..., include_raw=True)``. Every trial is recorded
    under exactly one outcome code: ``"ok"``, ``"parse_error"``,
    ``"no_output"``, ``"unexpected_error"`` or ``"other_error"``.

    Args:
        prompt_format: Prompt template. ``format_instructions`` is filled in
            via ``.partial`` and the result is invoked with a ``question``.
        questions: Sequence of question strings.
        llm_models: Mapping of result label -> chat model.
        method: Forwarded as ``method=`` to ``with_structured_output``.
        n_iter: Number of passes over the full question list.
        resume: (model, schema) combinations whose 1-based running position
            is lower than this value are skipped.
        results_out: Optional dict to update in place. Entries it already
            holds for skipped combinations are kept.
        save_file_name: If given, the method, last prompt, questions and
            results are pickled to this path.

    Returns:
        Dict of model label -> schema class name -> dict with keys ``valid``
        (fraction of trials that parsed into the schema), ``error_types``,
        ``errors`` and ``outputs``.
    """
    structure_support_by_model = {} if results_out is None else results_out
    n_questions = len(questions)
    n_trials = n_iter * n_questions

    # Bound up front so the pickle step below still works when every
    # combination is skipped or there are no models.
    prompt = prompt_format
    position = 0

    # Iterate over models
    for model_name, llm_model in llm_models.items():
        # setdefault (rather than resetting to {}) keeps earlier results for
        # combinations that are skipped when resuming.
        model_results = structure_support_by_model.setdefault(model_name, {})

        # Iterate over schemas
        for structure in structured_formats:
            pydantic_obj = structure["pydantic"]
            print(
                f"Model: {model_name}  Output: {pydantic_obj.__name__}   Pos: {position}"
            )

            position += 1
            if position < resume:
                continue

            # Format instructions if required
            parser = PydanticOutputParser(pydantic_object=pydantic_obj)
            prompt = prompt_format.partial(
                format_instructions=parser.get_format_instructions()
            )

            # Iterate over questions
            error_types = []
            error_messages = []
            outputs = []
            output_valid = 0
            for _ in range(n_iter):
                for question in questions:
                    try:
                        # The chain is built inside the try so that a failure
                        # to construct it is recorded as "other_error" for
                        # this trial instead of aborting the experiment.
                        test_chain = prompt | llm_model.with_structured_output(
                            pydantic_obj, method=method, include_raw=True
                        )
                        output = test_chain.invoke(dict(question=question))
                        outputs.append(output)

                        # Typically Pydantic validation failure
                        if output["parsing_error"] is not None:
                            error_types.append("parse_error")
                            error_messages.append(output["parsing_error"])
                            print("Error: Parse error")

                        # Typically function-calling failure
                        elif output["parsed"] is None:
                            error_types.append("no_output")
                            print("Error: No output")

                        # This is not expected to happen. It is recorded
                        # directly rather than raised: raising here would be
                        # caught by the handler below and the same trial would
                        # also be counted as "other_error".
                        elif not isinstance(output["parsed"], pydantic_obj):
                            error_types.append("unexpected_error")
                            error_messages.append(
                                "Unexpected parsed type "
                                f"{type(output['parsed']).__name__}"
                            )
                            print("Error: Unexpected error")

                        else:
                            error_types.append("ok")
                            output_valid += 1

                    # Other failures (typically function-calling not supported)
                    except Exception as e:
                        error_types.append("other_error")
                        print(f"Error: Other error {type(e).__name__}")
                        error_messages.append(f"{type(e).__name__}, {e}")

                    # Pause to avoid timeouts
                    print(".", end="")
                    sleep(1)
                print()

            model_results[pydantic_obj.__name__] = dict(
                # Guard against an empty question list / n_iter == 0.
                valid=output_valid / n_trials if n_trials else 0.0,
                error_types=error_types,
                errors=error_messages,
                outputs=outputs,
            )
    if save_file_name:
        with open(file=save_file_name, mode="wb") as f:
            pickle.dump(
                dict(
                    method=method,
                    prompt=prompt,
                    questions=questions,
                    structure_support_by_model=structure_support_by_model,
                ),
                f,
            )
    return structure_support_by_model

Function Calling: Include Anthropic models


In [None]:
# Function-calling run over every model, including the Anthropic ones.
# An existing results dict with this name is reused rather than replaced.
if "structure_support_by_model_fc" not in locals():
    structure_support_by_model_fc = {}
run_experiment(
    prompt_direct,
    questions,
    llm_models_with_anthropic,
    method="function_calling",
    # Use the shared experiment parameter instead of a hard-coded 1 so that
    # changing n_iter in the parameters cell actually takes effect.
    n_iter=n_iter,
    results_out=structure_support_by_model_fc,
    save_file_name=f"exp5_function_calling_{experiment_date}.pkl",
)

In [None]:
# JSON-schema run over the models in llm_models.
# An existing results dict with this name is reused rather than replaced.
if "structure_support_by_model_js" not in locals():
    structure_support_by_model_js = {}
run_experiment(
    prompt_direct,
    questions,
    llm_models,
    method="json_schema",
    # Use the shared experiment parameter instead of a hard-coded 1 so that
    # changing n_iter in the parameters cell actually takes effect.
    n_iter=n_iter,
    results_out=structure_support_by_model_js,
    save_file_name=f"exp5_json_schema_{experiment_date}.pkl",
)

In [None]:
# JSON-mode run over the models in llm_models; this one uses the prompt that
# appends the format instructions to the user message.
# An existing results dict with this name is reused rather than replaced.
if "structure_support_by_model_jm" not in locals():
    structure_support_by_model_jm = {}
run_experiment(
    prompt_user_format,
    questions,
    llm_models,
    method="json_mode",
    # Use the shared experiment parameter instead of a hard-coded 1 so that
    # changing n_iter in the parameters cell actually takes effect.
    n_iter=n_iter,
    results_out=structure_support_by_model_jm,
    save_file_name=f"exp5_json_mode_{experiment_date}.pkl",
)

In [9]:
def results_to_df(ss_results, key="valid"):
    """Tabulate one per-schema metric as a percentage, one row per model.

    Args:
        ss_results: Mapping of model name -> schema name -> result dict, in
            the shape returned by ``run_experiment``.
        key: Entry of each result dict to tabulate. The default ``"valid"``
            is already a fraction of all trials, so it is only scaled by 100
            (it must not be divided by the number of questions again).

    Returns:
        DataFrame indexed by model name with one column per schema name.
    """
    percentages = {
        model_name: {
            schema_name: stats[key] * 100
            for schema_name, stats in per_schema.items()
        }
        for model_name, per_schema in ss_results.items()
    }
    return pd.DataFrame.from_dict(percentages, orient="index")


def analyse_errors_from_results(ss_results, method="code"):
    """Count error outcomes per model and schema.

    Args:
        ss_results: Mapping of model name -> schema name -> result dict
            holding ``"error_types"`` (outcome codes) and ``"errors"``
            (error messages / exceptions).
        method: ``"code"`` counts the recorded outcome codes. ``"parse"``
            classifies each error message by substring into
            ``invalid_json``, ``validation`` or ``unknown``.

    Returns:
        DataFrame indexed by model name whose columns are
        ``(schema name, category)`` pairs. Combinations that never occurred
        are reported as 0 rather than NaN.

    Raises:
        NameError: If ``method`` is not supported (raised when the first
            result entry is processed).
    """
    error_counts = {}
    for model_name, per_schema in ss_results.items():
        row = error_counts[model_name] = {}
        for schema_name, stats in per_schema.items():
            if method == "code":
                # Count errors by the outcome code recorded for each trial
                code_counts = pd.Series(stats["error_types"]).value_counts()
                for code, count in code_counts.items():
                    row[(schema_name, code)] = count

            elif method == "parse":
                # Count errors by parsing error message
                json_error = validation_error = unknown_error = 0
                for error in stats["errors"]:
                    message = str(error).lower()
                    if "invalid json output" in message:
                        json_error += 1
                    elif "validation error" in message:
                        validation_error += 1
                    else:
                        unknown_error += 1
                row[(schema_name, "invalid_json")] = json_error
                row[(schema_name, "validation")] = validation_error
                row[(schema_name, "unknown")] = unknown_error

            else:
                raise NameError(f"Method {method} not supported")

    # A category missing for a model means zero occurrences, not missing data.
    return pd.DataFrame.from_dict(error_counts, orient="index").fillna(0)

In [16]:
def results_to_df(ss_results, key="valid"):
    """Tabulate one per-schema metric as a percentage, one row per model.

    Args:
        ss_results: Mapping of model name -> schema name -> result dict, in
            the shape returned by ``run_experiment``.
        key: Entry of each result dict to tabulate. The default ``"valid"``
            is already a fraction of all trials, so it is only scaled by 100
            (it must not be divided by the number of questions again).

    Returns:
        DataFrame indexed by model name with one column per schema name.
    """
    percentages = {
        model_name: {
            schema_name: stats[key] * 100
            for schema_name, stats in per_schema.items()
        }
        for model_name, per_schema in ss_results.items()
    }
    return pd.DataFrame.from_dict(percentages, orient="index")


def analyse_errors_from_results(ss_results, method="code"):
    """Count error outcomes per model and schema.

    Args:
        ss_results: Mapping of model name -> schema name -> result dict
            holding ``"error_types"`` (outcome codes) and ``"errors"``
            (error messages / exceptions).
        method: ``"code"`` counts the recorded outcome codes. ``"parse"``
            classifies each error message by substring into
            ``invalid_json``, ``validation`` or ``unknown``.

    Returns:
        DataFrame indexed by model name whose columns are
        ``(schema name, category)`` pairs, with missing combinations
        filled with 0.

    Raises:
        NameError: If ``method`` is not supported (raised when the first
            result entry is processed).
    """
    counts_by_model = {}
    for model_name, per_schema in ss_results.items():
        row = counts_by_model[model_name] = {}
        for schema_name, stats in per_schema.items():
            if method == "code":
                # Tally the outcome code recorded for each trial
                tally = pd.Series(stats["error_types"]).value_counts()
                for code, occurrences in tally.items():
                    row[(schema_name, code)] = occurrences

            elif method == "parse":
                # Classify each error by the text of its message
                n_json = n_validation = n_unknown = 0
                for err in stats["errors"]:
                    text = str(err).lower()
                    if "invalid json output" in text:
                        n_json += 1
                    elif "validation error" in text:
                        n_validation += 1
                    else:
                        n_unknown += 1
                row[(schema_name, "invalid_json")] = n_json
                row[(schema_name, "validation")] = n_validation
                row[(schema_name, "unknown")] = n_unknown

            else:
                raise NameError(f"Method {method} not supported")

    table = pd.DataFrame.from_dict(counts_by_model, orient="index")
    return table.fillna(0)

#### Function calling errors

- Fireworks Llama3.2 doesn't support function calling


In [None]:
# Outcome counts for the function-calling run, columns sorted for display.
errors_df = analyse_errors_from_results(structure_support_by_model_fc)
errors_df.sort_index(axis="columns")

#### JSON Mode


In [None]:
# Outcome counts for the JSON-mode run. Columns are sorted for display so the
# table lines up with the function-calling and JSON-schema tables.
errors_df = analyse_errors_from_results(structure_support_by_model_jm)
errors_df.sort_index(axis=1)

#### JSON Schema


In [None]:
# Outcome counts for the JSON-schema run, columns sorted for display.
errors_df = analyse_errors_from_results(structure_support_by_model_js)
errors_df.sort_index(axis="columns")

Let's look at the errors in the JSON schema method


In [None]:
# Every non-empty parsing error recorded in the JSON-schema run for models
# whose label starts with "Ollama", across all schemas.
all_ollama_errors = [
    record["parsing_error"]
    for model_name, per_schema in structure_support_by_model_js.items()
    if model_name.startswith("Ollama")
    for schema_result in per_schema.values()
    for record in schema_result["outputs"]
    if record["parsing_error"] is not None
]

In [None]:
# Show the collected parsing errors as the cell output.
all_ollama_errors

### Results


In [None]:
# Display label -> raw results for each structured-output method.
results_list = {
    "Function-calling": structure_support_by_model_fc,
    "JSON Schema": structure_support_by_model_js,
    "JSON Mode": structure_support_by_model_jm,
}

# One table per method: rows are models, columns are schemas, and values are
# the percentage of valid responses.
df_results = {}
for name, ss_results in results_list.items():
    percent_valid = {
        model_name: {
            schema_name: stats["valid"] * 100
            for schema_name, stats in per_schema.items()
        }
        for model_name, per_schema in ss_results.items()
    }
    df_results[name] = pd.DataFrame.from_dict(percent_valid, orient="index")
    display(name)

In [None]:
# Stack the per-method tables, then put the model label ahead of the method
# label in the row index and sort the rows.
df = (
    pd.concat(df_results)
    .swaplevel(axis="index")
    .sort_index(axis="index")
)
df

Save results


In [24]:
# Summary table as JSON, written through the open file handle.
with open(f"exp5_summary_df_{experiment_date}.json", "wb") as f:
    df.to_json(f)

# Settings, inputs and raw results of all three runs in a single pickle.
all_results = {
    "temperature": temperature,
    "num_ctx": num_ctx,
    "num_predict": num_predict,
    "questions": questions,
    "prompt": prompt_direct,
    "structure_support_by_model_fc": structure_support_by_model_fc,
    "structure_support_by_model_jm": structure_support_by_model_jm,
    "structure_support_by_model_js": structure_support_by_model_js,
}
with open(f"exp5_all_models_{experiment_date}.pkl", "wb") as f:
    pickle.dump(all_results, f)

Load results


In [None]:
import pickle
import pandas as pd

# with open(file=f"exp5_summary_df_{experiment_date}.json", mode="rb") as f:
#    df = pd.read_json(f)

# NOTE: unpickling can execute arbitrary code; only load result files that
# were produced by this notebook.
with open(f"exp5_all_models_{experiment_date}.pkl", "rb") as f:
    data = pickle.load(f)

# Inject into toplevel namespace, leaving names that already exist untouched
namespace = locals()
for key, value in data.items():
    if key in namespace:
        continue
    print(f"Loaded {key}")
    namespace[key] = value