## Evaluating structured outputs in LangChain


In [1]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_anthropic import ChatAnthropic
from langchain_ollama import ChatOllama
from langchain_fireworks import ChatFireworks
from langchain_core.output_parsers import JsonOutputParser, PydanticOutputParser

from pydantic import BaseModel, Field
from time import sleep

import pandas as pd

import streamlit as st

Experiment parameters


In [2]:
ANTHROPIC_API_KEY = "<API KEY>"
ANTHROPIC_API_KEY = st.secrets["api_keys"]["ANTHROPIC_API_KEY"]
FIREWORKS_API_KEY = st.secrets["api_keys"]["FIREWORKS_API_KEY"]
experiment_date = "06-02-25"
n_iter = 1

### Prompt and problem setup

For this test I’m going to start with a substitute task to write an article for a magazine and provide the response for different questions in a specific format.

Here we specify the prompt and any inputs to use to vary the problem (the list of questions).0


In [3]:
test_science_prompt_txt = """
You are a professional science writer tasked with responding to members of
the general public who write in asking questions about science.
Write an article responding to a writer's question for publication in a
science magazine intended for a general readership with a high-school education.
You should write clearly and compellingly, include all relavent context,
and provide motivating stories where applicable.

Your response must be less than 200 words.

The question given to you is the following:
{question}
"""

questions = [
    "What is the oldest recorded fossil?",
    "What is a black hole?",
    "How far away is the sun?",
    "Which other planet in the Solar System has a surface gravity closest to that of the Earth?",
    "Eris, Haumea, Makemake and Ceres are all examples of what?",
    "Why does earth have seasons? Do other planets exhibit seasons too?",
    "What causes the aurora borealis?",
    "Why is the sky blue?",
    "How do bees communicate?",
    "What is the smallest unit of life?",
    "How do plants make their own food?",
    "Why do we dream?",
    "What is the theory of relativity?",
    "How do volcanoes erupt?",
    "What is the speed of light?",
    "How do magnets work?",
    "What is the purpose of DNA?",
    "What are the different types of galaxies?",
    "Why do some animals hibernate?",
    "How do vaccines work?",
]

prompt_direct = ChatPromptTemplate.from_template(test_science_prompt_txt)

prompt_system_format = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "Answer the user query.\n{format_instructions}",
        ),
        ("human", test_science_prompt_txt),
    ]
)

prompt_user_format = ChatPromptTemplate.from_template(
    test_science_prompt_txt + "\n{format_instructions}"
)

### JSON output format specs


#### Pydantic structures

To answer the question of how these models and output methods differ with different complexities of schema I’m defining four example schema in increasing order of complexity


In [4]:
# Simple types
class ArticleResponse1(BaseModel):
    """Structured article for publication answering a reader's question."""

    title: str = Field(description="Title of the article")
    answer: str = Field(
        description="Provide a detailed description of historical events to answer the question."
    )
    number: int = Field(
        description="An arbitraty number that is most relevant to the question."
    )


# Lists of simple types
class ArticleResponse2(BaseModel):
    """Structured article for publication answering a reader's question."""

    title: str = Field(description="Title of the article")
    further_questions: list[str] = Field(
        description="A list of related questions that may be of interest to the readers."
    )


# Nested types
class HistoricalEvent(BaseModel):
    """The year and explanation of a historical event."""

    year: int = Field(description="The year of the historical event")
    description: str = Field(
        description="A clear description of what happened in this event"
    )


class ArticleResponse3(BaseModel):
    """Structured article for publication answering a reader's question."""

    title: str = Field(description="Title of the article")
    historical_event_1: HistoricalEvent = Field(
        description="Provide a detailed description of one historical events to answer the question."
    )
    historical_event_2: HistoricalEvent = Field(
        description="Provide a detailed description of one historical events to answer the question."
    )


# Lists of custom types
class ArticleResponse4(BaseModel):
    """Structured article for publication answering a reader's question."""

    title: str = Field(description="Title of the article")
    historical_timeline: list[HistoricalEvent] = Field(
        description="Provide a compelling account of the historical context of the question"
    )


# Nested types
class CriicalAnalysis(BaseModel):
    """A critique of interpretations of historical events"""

    historical_event: HistoricalEvent = Field(
        description="Provide an overview of the facts of a historical event"
    )
    common_understanding: str = Field(description="Agreed interpretation of event")
    analysis: str = Field(
        description="Critical analysis of the event and opposing interpretations"
    )


# Multiple nested custom types
class ArticleResponse5(BaseModel):
    """Structured article for publication answering a reader's question."""

    title: str = Field(description="Title of the article")
    historical_timeline: list[HistoricalEvent] = Field(
        description="Provide a compelling account of the historical context of the question"
    )
    critique: list[CriicalAnalysis] = Field(
        description="A list of key historical events and historical analysis of them"
    )


structured_formats = [
    dict(pydantic=ArticleResponse1),
    dict(pydantic=ArticleResponse2),
    dict(pydantic=ArticleResponse3),
    dict(pydantic=ArticleResponse4),
    dict(pydantic=ArticleResponse5),
]

### Models to evaluate


In [5]:
# Default temperature
temperature = 0.0
timeout = 30

In [6]:
llm_models = {
    "Anthropic_Sonnet": ChatAnthropic(
        model="claude-3-5-sonnet-20241022",
        api_key=claude_api_key,
        request_timeout=timeout,
    ),
    "Anthropic_Haiku": ChatAnthropic(
        model="claude-3-5-haiku-20241022", api_key=claude_api_key
    ),
    "Anthropic_Haiku": ChatAnthropic(
        model="claude-3-haiku-20240307", api_key=claude_api_key, request_timeout=timeout
    ),
    "Ollama_llama32": ChatOllama(
        model="llama3.2", temperature=temperature, request_timeout=timeout
    ),
    "Ollama_nemotron": ChatOllama(
        model="nemotron-mini", temperature=temperature, request_timeout=timeout
    ),
    "Ollama_phi3": ChatOllama(
        model="phi3", temperature=temperature, request_timeout=timeout
    ),
    "Ollama_phi4": ChatOllama(
        model="phi4", temperature=temperature, request_timeout=timeout
    ),
    "Ollama_deepseekr1": ChatOllama(
        model="deepseek-r1", temperature=temperature, request_timeout=timeout
    ),
    "fireworks_llama31": ChatFireworks(
        model_name="accounts/fireworks/models/llama-v3p1-70b-instruct",
        api_key=FIREWORKS_API_KEY,
        temperature=temperature,
        request_timeout=timeout,
    ),
    "fireworks_llama32": ChatFireworks(
        model_name="accounts/fireworks/models/llama-v3p2-3b-instruct",
        api_key=FIREWORKS_API_KEY,
        temperature=temperature,
        request_timeout=timeout,
    ),
    "fireworks_llama33": ChatFireworks(
        model_name="accounts/fireworks/models/llama-v3p3-70b-instruct",
        api_key=FIREWORKS_API_KEY,
        temperature=temperature,
        request_timeout=timeout,
    ),
}

## Evaluation

Let's loop over different structured outputs and check the adherence using the tool-calling API (structured output mode)


### Evaluate Structured Ouputs accross providers & models

Question - of the models that have tool calling, what complexity of structure can they support?


In [10]:
def run_experiment(prompt_format, questions, llm_models, method, n_iter=1):

    structure_support_by_model = {}
    n_questions = len(questions)

    # Iterate over models
    for model_name, llm_model in llm_models.items():
        structure_support_by_model[model_name] = {}

        # Iterate over schemas
        for structure in structured_formats:
            pydantic_obj = structure["pydantic"]
            print(f"Model: {model_name}  Output: {pydantic_obj.__name__}")

            # Format instructions if required
            parser = PydanticOutputParser(pydantic_object=pydantic_obj)
            prompt = prompt_format.partial(
                format_instructions=parser.get_format_instructions()
            )

            # Iterate over questions
            error_types = []
            error_messages = []
            outputs = []
            output_valid = 0
            for _ in range(n_iter):
                for ii in range(n_questions):
                    try:
                        test_chain = prompt | llm_model.with_structured_output(
                            pydantic_obj, method=method, include_raw=True
                        )
                        output = test_chain.invoke(dict(question=questions[ii]))
                        outputs.append(output)

                        # Typically Pydantic validation failure
                        if output["parsing_error"] is not None:
                            error_types.append("parse_error")
                            error_messages.append(output["parsing_error"])
                            print("Error: Parse error")

                        # Typically function-calling failure
                        elif output["parsed"] is None:
                            error_types.append("no_output")
                            print("Error: No output")

                        # This is not expected to happen
                        elif not isinstance(output["parsed"], pydantic_obj):
                            raise RuntimeError("Unexpected error")

                        else:
                            output_valid += 1

                    # Other failures (typically function-calling not supported)
                    except Exception as e:
                        error_types.append("other error")
                        print(f"Error: Other error {type(e).__name__}")
                        error_messages.append(f"{type(e).__name__}, {e}")

                    # Pause to avoid timeouts
                    print(".", end="")
                    sleep(1)
                print()

            structure_support_by_model[model_name][pydantic_obj.__name__] = dict(
                valid=output_valid / (n_iter * n_questions),
                error_types=error_types,
                errors=error_messages,
                outputs=outputs,
            )

    return structure_support_by_model

In [None]:
structure_support_by_model_fc = run_experiment(
    prompt_direct, questions, llm_models, method="function_calling", n_iter=1
)

Model: Ollama_llama32  Output: ArticleResponse1
..
Model: Ollama_llama32  Output: ArticleResponse2
..
Model: Ollama_llama32  Output: ArticleResponse3
Error: Parse error
.Error: Parse error
.
Model: Ollama_llama32  Output: ArticleResponse4
Error: Parse error
.Error: Parse error
.
Model: Ollama_llama32  Output: ArticleResponse5
Error: Parse error
.Error: Parse error
.
Model: Ollama_nemotron  Output: ArticleResponse1
Error: No output
.Error: No output
.
Model: Ollama_nemotron  Output: ArticleResponse2
Error: No output
.Error: No output
.
Model: Ollama_nemotron  Output: ArticleResponse3
Error: No output
.Error: No output
.
Model: Ollama_nemotron  Output: ArticleResponse4
Error: No output
.Error: No output
.
Model: Ollama_nemotron  Output: ArticleResponse5
Error: No output
.Error: No output
.
Model: Ollama_phi3  Output: ArticleResponse1
Error: Other error ResponseError
.Error: Other error ResponseError
.
Model: Ollama_phi3  Output: ArticleResponse2
Error: Other error ResponseError
.Error: O

In [None]:
structure_support_by_model_js = run_experiment(
    prompt_direct, questions, llm_models, method="json_schema", n_iter=1
)

Model: Ollama_llama32  Output: ArticleResponse1
..
Model: Ollama_llama32  Output: ArticleResponse2
..
Model: Ollama_llama32  Output: ArticleResponse3
..
Model: Ollama_llama32  Output: ArticleResponse4
..
Model: Ollama_llama32  Output: ArticleResponse5
..
Model: Ollama_nemotron  Output: ArticleResponse1
..
Model: Ollama_nemotron  Output: ArticleResponse2
..
Model: Ollama_nemotron  Output: ArticleResponse3
..
Model: Ollama_nemotron  Output: ArticleResponse4
..
Model: Ollama_nemotron  Output: ArticleResponse5
..
Model: Ollama_phi3  Output: ArticleResponse1
..
Model: Ollama_phi3  Output: ArticleResponse2
..
Model: Ollama_phi3  Output: ArticleResponse3
..
Model: Ollama_phi3  Output: ArticleResponse4
..
Model: Ollama_phi3  Output: ArticleResponse5
..
Model: Ollama_phi4  Output: ArticleResponse1
..
Model: Ollama_phi4  Output: ArticleResponse2
..
Model: Ollama_phi4  Output: ArticleResponse3
..
Model: Ollama_phi4  Output: ArticleResponse4
..
Model: Ollama_phi4  Output: ArticleResponse5
..
Model

ValueError: Unrecognized method argument. Expected one of 'function_calling' or 'json_mode'. Received: 'json_schema'

In [None]:
structure_support_by_model_jm = run_experiment(
    prompt_user_format, questions, llm_models, method="json_mode", n_iter=1
)

Model: Ollama_llama32  Output: ArticleResponse1
..
Model: Ollama_llama32  Output: ArticleResponse2
Error: Parse error
..
Model: Ollama_llama32  Output: ArticleResponse3
Error: Parse error
.

In [12]:
structure_support_by_model = structure_support_by_model_js

pd.DataFrame.from_dict(
    {
        mname: {
            tname: structure_support_by_model[mname][tname]["valid"] * 100
            for tname in structure_support_by_model[mname].keys()
        }
        for mname in structure_support_by_model.keys()
    },
    orient="index",
)

Unnamed: 0,ArticleResponse1,ArticleResponse2,ArticleResponse3,ArticleResponse4,ArticleResponse5
Ollama_nemotron,100.0,100.0,100.0,100.0,100.0


### Error analysis


In [59]:
def results_to_df(ss_results, key="valid"):
    df = pd.DataFrame.from_dict(
        {
            mname: {
                tname: ss_results[mname][tname][key] * 100 / n_questions
                for tname in ss_results[mname].keys()
            }
            for mname in ss_results.keys()
        },
        orient="index",
    )
    return df


def analyse_errors_from_results(ss_results, method="code"):
    error_counts = {}
    for mname in ss_results.keys():
        error_counts[mname] = {}
        for tname in ss_results[mname].keys():
            validation_error = 0
            json_error = 0
            unknown_error = 0

            # Count errors by failure code above
            if method == "code":
                error_types = pd.Series(ss_results[mname][tname]["error_types"])
                error_codes = error_types.value_counts()

                for e_name, e_count in error_codes.items():
                    error_counts[mname][(tname, e_name)] = e_count

            elif method == "parse":
                # Count errors by parsing error message
                errors = ss_results[mname][tname]["errors"]
                for error in errors:
                    error_str = str(error)
                    if error_str.lower().find("invalid json output") >= 0:
                        json_error += 1
                    elif error_str.lower().find("validation error") >= 0:
                        validation_error += 1
                    else:
                        unknown_error += 1
                error_counts[mname][(tname, "invalid_json")] = json_error
                error_counts[mname][(tname, "validation")] = validation_error
                error_counts[mname][(tname, "unknown")] = unknown_error

            else:
                raise NameError(f"Method {method} not supported")

    return pd.DataFrame.from_dict(error_counts, orient="index")

In [60]:
errors_df = analyse_errors_from_results(structure_support_by_model_jm)
errors_df

Unnamed: 0_level_0,ArticleResponse4,ArticleResponse5
Unnamed: 0_level_1,parse_error,parse_error
Ollama_nemotron,1,4


In [62]:
errors_df = analyse_errors_from_results(structure_support_by_model_fc, method="parse")
errors_df

Unnamed: 0_level_0,ArticleResponse1,ArticleResponse1,ArticleResponse1,ArticleResponse2,ArticleResponse2,ArticleResponse2,ArticleResponse3,ArticleResponse3,ArticleResponse3,ArticleResponse4,ArticleResponse4,ArticleResponse4,ArticleResponse5,ArticleResponse5,ArticleResponse5
Unnamed: 0_level_1,invalid_json,validation,unknown,invalid_json,validation,unknown,invalid_json,validation,unknown,invalid_json,validation,unknown,invalid_json,validation,unknown
Ollama_nemotron,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [64]:
errors_df = analyse_errors_from_results(structure_support_by_model_js)
errors_df

### Results


In [None]:
import pandas as pd

In [65]:
results_list = {
    "Function-calling": structure_support_by_model_fc,
    "JSON Schema": structure_support_by_model_js,
    "JSON Mode": structure_support_by_model_jm,
}

df_results = {}
for name, ss_results in results_list.items():
    df_results[name] = pd.DataFrame.from_dict(
        {
            mname: {
                tname: ss_results[mname][tname]["valid"] * 100
                for tname in ss_results[mname].keys()
            }
            for mname in ss_results.keys()
        },
        orient="index",
    )
    display(name)

'Function-calling'

'JSON Schema'

'JSON Mode'

In [66]:
df = pd.concat(df_results)
df

Unnamed: 0,Unnamed: 1,ArticleResponse1,ArticleResponse2,ArticleResponse3,ArticleResponse4,ArticleResponse5
Function-calling,Ollama_nemotron,0.0,0.0,0.0,0.0,0.0
JSON Schema,Ollama_nemotron,100.0,100.0,100.0,100.0,100.0
JSON Mode,Ollama_nemotron,100.0,100.0,100.0,80.0,20.0


In [None]:
import tabulate

print(
    tabulate.tabulate(
        df.reset_index(), headers="keys", tablefmt="pipe", showindex=False
    )
)

| level_0          | level_1         |   ArticleResponse1 |   ArticleResponse2 |   ArticleResponse3 |   ArticleResponse4 |   ArticleResponse5 |
|:-----------------|:----------------|-------------------:|-------------------:|-------------------:|-------------------:|-------------------:|
| Function-calling | Ollama_nemotron |                  0 |                  0 |                  0 |                  0 |                  0 |
| JSON Schema      | Ollama_nemotron |                100 |                100 |                100 |                100 |                100 |
| JSON Mode        | Ollama_nemotron |                100 |                100 |                100 |                 80 |                 20 |


Save results


In [67]:
import pickle

with open(file=f"exp5_summary_df_{experiment_date}.json", mode="wb") as f:
    df.to_json(f)

with open(file=f"exp5_all_models_{experiment_date}.pkl", mode="wb") as f:
    pickle.dump(
        dict(
            structure_support_by_model_fc=structure_support_by_model_fc,
            structure_support_by_model_jm=structure_support_by_model_jm,
            structure_support_by_model_js=structure_support_by_model_js,
        ),
        f,
    )

Load results


In [22]:
import pickle
import pandas as pd

with open(file=f"exp5_summary_df_{experiment_date}.json", mode="rb") as f:
    df = pd.read_json(f)

with open(file=f"exp5_all_models_{experiment_date}.pkl", mode="rb") as f:
    data = pickle.load(f)

# Inject into toplevel namespace
namespace = locals()
for key, value in data.items():
    if key not in namespace:
        print(f"Loaded {key}")
        namespace[key] = value

Loaded structure_support_by_model
Loaded structure_support_by_model_op
Loaded structure_support_by_model_op_system
Loaded structure_support_by_model_op_jsonmode


#### Output parsers [Deprecated]


In [None]:
stop

Let's do the same for the output parser formatting. Note that as a lot of models ignore these instructions, it can take a lot of time.


In [None]:
from langchain_core.output_parsers import PydanticOutputParser


def run_experiment_with_op(prompt_format, llm_models, n_iter):
    ss_results = {}
    n_questions = len(questions)

    for model_name, llm_model in llm_models.items():
        ss_results[model_name] = {}
        for structure in structured_formats:
            pydantic_obj = structure["pydantic"]
            print(f"Model: {model_name}  Output: {pydantic_obj.__name__}")

            # Iterate over questions
            output_valid = 0
            tool_use = 0
            error_messages = []
            outputs = []

            for kk in range(n_iter):
                for ii in range(n_questions):
                    parser = PydanticOutputParser(pydantic_object=pydantic_obj)
                    prompt = prompt_format.partial(
                        format_instructions=parser.get_format_instructions()
                    )
                    test_chain = prompt | llm_model | parser

                    try:
                        output = test_chain.invoke(dict(question=questions[ii]))
                        assert isinstance(output, pydantic_obj)
                        output_valid += 1
                        outputs.append(output)

                    except Exception as e:
                        print(f"  Invalid ouput ({type(e)})")
                        error_messages.append(f"{type(e).__name__}, {e}")

            ss_results[model_name][pydantic_obj.__name__] = dict(
                valid=output_valid / (n_iter * n_questions),
                tool_use=tool_use / (n_iter * n_questions),
                errors=error_messages,
                outputs=outputs,
            )
    return ss_results

In [None]:
structure_support_by_model_op = run_experiment_with_op(
    prompt_user_format, llm_models, n_iter
)

In [None]:
structure_support_by_model_op_jsonmode = run_experiment_with_op(
    prompt_user_format, llm_models_jsonmode, n_iter
)

In [None]:
structure_support_by_model_op_system = run_experiment_with_op(
    prompt_system_format, llm_models, n_iter
)