Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 0 additions & 6 deletions internal/golden/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,6 @@ cd langchain-py-v1
python langchain.py
```

```bash
cd pydantic-ai-v1
python pydantic_ai_test.py
```


## Requirements

Before running a suite, ensure you have the appropriate API keys set as environment variables for that provider, along with `BRAINTRUST_API_KEY` if you want to log traces to Braintrust.
Expand Down
913 changes: 0 additions & 913 deletions internal/golden/pydantic-ai-v1/pydantic_ai_test.py

This file was deleted.

13 changes: 0 additions & 13 deletions internal/golden/pydantic-ai-v1/pyproject.toml

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
interactions:
- request:
body: '{"messages":[{"role":"user","content":[{"type":"image_url","image_url":{"url":"data:image/png;base64,SU5WQUxJRF9QTkdfREFUQV9OT1RfQV9SRUFMX0lNQUdF"}},{"type":"text","text":"What''s
in this image?"}]}],"model":"gpt-4o-mini","max_completion_tokens":100,"stream":false}'
headers:
Accept:
- application/json
Accept-Encoding:
- gzip, deflate
Connection:
- keep-alive
Content-Type:
- application/json
Host:
- api.openai.com
User-Agent:
- python-httpx/0.28.1
method: POST
uri: https://api.openai.com/v1/chat/completions
response:
body:
string: '{"error":{"message":"Could not process image. The image data is not
valid.","type":"invalid_request_error","param":null,"code":"invalid_image"}}'
headers:
Content-Type:
- application/json
status:
code: 400
message: Bad Request
version: 1
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
interactions:
- request:
body: '{"messages":[{"role":"user","content":"Here is a long text:\n\nThe quick
brown fox jumps over the lazy dog. <...repeated 20 times...>\n\nHow many times
does the word ''fox'' appear?"}],"model":"gpt-4o-mini","max_completion_tokens":100,"stream":false}'
headers:
Accept:
- application/json
Accept-Encoding:
- gzip, deflate
Connection:
- keep-alive
Content-Type:
- application/json
Host:
- api.openai.com
User-Agent:
- python-httpx/0.28.1
method: POST
uri: https://api.openai.com/v1/chat/completions
response:
body:
string: '{"id":"chatcmpl-long001","object":"chat.completion","created":1735000000,"model":"gpt-4o-mini-2024-07-18","service_tier":"default","system_fingerprint":"fp_test","choices":[{"index":0,"message":{"role":"assistant","content":"The
word ''fox'' appears 20 times in the text.","refusal":null},"logprobs":null,"finish_reason":"stop"}],"usage":{"prompt_tokens":120,"completion_tokens":14,"total_tokens":134,"prompt_tokens_details":{"cached_tokens":0,"audio_tokens":0},"completion_tokens_details":{"reasoning_tokens":0,"audio_tokens":0,"accepted_prediction_tokens":0,"rejected_prediction_tokens":0}}}'
headers:
Content-Type:
- application/json
status:
code: 200
message: OK
version: 1
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
interactions:
- request:
body: '{"messages":[{"role":"user","content":"Write a haiku about coding."},{"role":"assistant","content":"Here
is a haiku:"},{"role":"user","content":"Write a haiku about coding."}],"model":"gpt-4o-mini","max_completion_tokens":200,"stream":false}'
headers:
Accept:
- application/json
Accept-Encoding:
- gzip, deflate
Connection:
- keep-alive
Content-Type:
- application/json
Host:
- api.openai.com
User-Agent:
- python-httpx/0.28.1
method: POST
uri: https://api.openai.com/v1/chat/completions
response:
body:
string: '{"id":"chatcmpl-pre001","object":"chat.completion","created":1735000000,"model":"gpt-4o-mini-2024-07-18","service_tier":"default","system_fingerprint":"fp_test","choices":[{"index":0,"message":{"role":"assistant","content":"Whitespace
tells a tale\nFunctions dance in loops and turns\nClean code brings freedom","refusal":null},"logprobs":null,"finish_reason":"stop"}],"usage":{"prompt_tokens":35,"completion_tokens":16,"total_tokens":51,"prompt_tokens_details":{"cached_tokens":0,"audio_tokens":0},"completion_tokens_details":{"reasoning_tokens":0,"audio_tokens":0,"accepted_prediction_tokens":0,"rejected_prediction_tokens":0}}}'
headers:
Content-Type:
- application/json
status:
code: 200
message: OK
version: 1
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
interactions:
- request:
body: '{"messages":[{"role":"user","content":"What is AI?"}],"model":"gpt-4o-mini","max_completion_tokens":5,"stream":false}'
headers:
Accept:
- application/json
Accept-Encoding:
- gzip, deflate
Connection:
- keep-alive
Content-Type:
- application/json
Host:
- api.openai.com
User-Agent:
- python-httpx/0.28.1
method: POST
uri: https://api.openai.com/v1/chat/completions
response:
body:
string: '{"id":"chatcmpl-short001","object":"chat.completion","created":1735000000,"model":"gpt-4o-mini-2024-07-18","service_tier":"default","system_fingerprint":"fp_test","choices":[{"index":0,"message":{"role":"assistant","content":"AI
stands for Artificial","refusal":null},"logprobs":null,"finish_reason":"length"}],"usage":{"prompt_tokens":11,"completion_tokens":5,"total_tokens":16,"prompt_tokens_details":{"cached_tokens":0,"audio_tokens":0},"completion_tokens_details":{"reasoning_tokens":0,"audio_tokens":0,"accepted_prediction_tokens":0,"rejected_prediction_tokens":0}}}'
headers:
Content-Type:
- application/json
status:
code: 200
message: OK
version: 1
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
interactions:
- request:
body: '{"messages":[{"role":"user","content":"Write a short story about a robot."}],"model":"gpt-4o-mini","max_completion_tokens":500,"stop":["END","\n\n"],"stream":false}'
headers:
Accept:
- application/json
Accept-Encoding:
- gzip, deflate
Connection:
- keep-alive
Content-Type:
- application/json
Host:
- api.openai.com
User-Agent:
- python-httpx/0.28.1
method: POST
uri: https://api.openai.com/v1/chat/completions
response:
body:
string: '{"id":"chatcmpl-stop001","object":"chat.completion","created":1735000000,"model":"gpt-4o-mini-2024-07-18","service_tier":"default","system_fingerprint":"fp_test","choices":[{"index":0,"message":{"role":"assistant","content":"Once
upon a time, there was a small robot named Bolt who loved exploring the world.
Every day, Bolt would venture out to discover new things.","refusal":null},"logprobs":null,"finish_reason":"stop"}],"usage":{"prompt_tokens":16,"completion_tokens":31,"total_tokens":47,"prompt_tokens_details":{"cached_tokens":0,"audio_tokens":0},"completion_tokens_details":{"reasoning_tokens":0,"audio_tokens":0,"accepted_prediction_tokens":0,"rejected_prediction_tokens":0}}}'
headers:
Content-Type:
- application/json
status:
code: 200
message: OK
version: 1
Original file line number Diff line number Diff line change
Expand Up @@ -1573,8 +1573,7 @@ def check_item(item):
async def test_agent_with_document_input(memory_logger):
"""Test that agents with document input (PDF) properly serialize attachments.

This specifically tests the scenario from test_document_input in the golden tests,
verifying that both agent_run and chat spans convert BinaryContent to Braintrust
Verifies that both agent_run and chat spans convert BinaryContent to Braintrust
attachments for document files like PDFs.
"""
from braintrust.logger import Attachment
Expand Down Expand Up @@ -2777,4 +2776,189 @@ def wrapped():
wrapper(wrapped, instance, (), {})

assert instance.call_count == 1
assert instance._async_producer == original_async_producer


@pytest.mark.vcr
@pytest.mark.asyncio
async def test_agent_with_stop_sequences(memory_logger):
    """Verify stop_sequences set on the Agent constructor surface in span metadata."""
    assert not memory_logger.pop()

    agent = Agent(
        MODEL,
        model_settings=ModelSettings(max_tokens=500, stop_sequences=["END", "\n\n"]),
    )

    t0 = time.time()
    result = await agent.run("Write a short story about a robot.")
    t1 = time.time()

    assert result.output

    spans = memory_logger.pop()
    assert len(spans) >= 2

    # Locate the top-level agent_run span (excluding chat spans).
    agent_span = None
    for span in spans:
        span_name = span["span_attributes"]["name"]
        if "agent_run" in span_name and "chat" not in span_name:
            agent_span = span
            break
    assert agent_span is not None, "agent_run span not found"
    assert agent_span["metadata"]["model"] == "gpt-4o-mini"

    # stop_sequences on the agent constructor → in metadata.model_settings
    assert "model_settings" in agent_span["metadata"]
    recorded_settings = agent_span["metadata"]["model_settings"]
    assert recorded_settings.get("stop_sequences") == ["END", "\n\n"]

    _assert_metrics_are_valid(agent_span["metrics"], t0, t1)


@pytest.mark.vcr
@pytest.mark.asyncio
async def test_agent_with_prefill(memory_logger):
    """Verify a partial assistant message at the end of message_history is traced.

    The 'prefill' pattern ends the history with an incomplete assistant message
    so the model continues from that point; the span input must include it.
    """
    from pydantic_ai.messages import ModelResponse, TextPart

    assert not memory_logger.pop()

    agent = Agent(MODEL, model_settings=ModelSettings(max_tokens=200))

    # History ending with a partial assistant response for the model to continue.
    history = [
        ModelRequest(parts=[UserPromptPart(content="Write a haiku about coding.")]),
        ModelResponse(parts=[TextPart(content="Here is a haiku:")]),
    ]

    t0 = time.time()
    result = await agent.run("Write a haiku about coding.", message_history=history)
    t1 = time.time()

    assert result.output

    spans = memory_logger.pop()
    assert len(spans) >= 2

    # Locate the top-level agent_run span (excluding chat spans).
    agent_span = None
    for span in spans:
        span_name = span["span_attributes"]["name"]
        if "agent_run" in span_name and "chat" not in span_name:
            agent_span = span
            break
    assert agent_span is not None, "agent_run span not found"
    assert agent_span["metadata"]["model"] == "gpt-4o-mini"

    # The prefill history (including the partial assistant TextPart) must appear
    # in the span input so that the trace is complete and auditable.
    serialized_input = str(agent_span["input"])
    assert "message_history" in serialized_input
    assert "Here is a haiku" in serialized_input
    assert agent_span["output"]

    _assert_metrics_are_valid(agent_span["metrics"], t0, t1)


@pytest.mark.vcr
@pytest.mark.asyncio
async def test_agent_with_short_max_tokens(memory_logger):
    """Verify a tiny max_tokens truncates the response without raising."""
    assert not memory_logger.pop()

    agent = Agent(MODEL)

    t0 = time.time()
    result = await agent.run("What is AI?", model_settings=ModelSettings(max_tokens=5))
    t1 = time.time()

    # Truncated responses are still valid output; no exception should be raised.
    assert result.output

    spans = memory_logger.pop()
    assert len(spans) >= 2

    # Locate the top-level agent_run span (excluding chat spans).
    agent_span = None
    for span in spans:
        span_name = span["span_attributes"]["name"]
        if "agent_run" in span_name and "chat" not in span_name:
            agent_span = span
            break
    assert agent_span is not None, "agent_run span not found"
    assert agent_span["metadata"]["model"] == "gpt-4o-mini"

    # max_tokens passed to run() → in input.model_settings
    assert "model_settings" in agent_span["input"]
    assert agent_span["input"]["model_settings"].get("max_tokens") == 5

    assert agent_span["output"]
    _assert_metrics_are_valid(agent_span["metrics"], t0, t1)


@pytest.mark.vcr
@pytest.mark.asyncio
async def test_agent_with_long_context(memory_logger):
    """Verify a large input prompt is handled and captured in the span input."""
    assert not memory_logger.pop()

    agent = Agent(MODEL, model_settings=ModelSettings(max_tokens=100))

    # Build a deliberately repetitive, oversized prompt.
    long_text = "The quick brown fox jumps over the lazy dog. " * 20
    prompt = f"Here is a long text:\n\n{long_text}\n\nHow many times does the word 'fox' appear?"

    t0 = time.time()
    result = await agent.run(prompt)
    t1 = time.time()

    assert result.output

    spans = memory_logger.pop()
    assert len(spans) >= 2

    # Locate the top-level agent_run span (excluding chat spans).
    agent_span = None
    for span in spans:
        span_name = span["span_attributes"]["name"]
        if "agent_run" in span_name and "chat" not in span_name:
            agent_span = span
            break
    assert agent_span is not None, "agent_run span not found"
    assert agent_span["metadata"]["model"] == "gpt-4o-mini"
    # The long prompt should be captured in the span input
    assert "fox" in str(agent_span["input"]).lower()
    assert agent_span["output"]

    _assert_metrics_are_valid(agent_span["metrics"], t0, t1)


@pytest.mark.vcr
@pytest.mark.asyncio
async def test_agent_with_error_handling(memory_logger):
    """Verify API errors from invalid binary content propagate to the caller.

    Sends corrupted image bytes; the API responds with a 400 error. The
    exception must reach the caller instead of being silently swallowed.
    """
    # NOTE(review): BinaryContent is imported via pydantic_ai.models.function
    # here — confirm this re-export is stable across pydantic_ai versions.
    from pydantic_ai.models.function import BinaryContent

    assert not memory_logger.pop()

    agent = Agent(MODEL, model_settings=ModelSettings(max_tokens=100))

    bogus_image_bytes = b"INVALID_PNG_DATA_NOT_A_REAL_IMAGE"

    # Broad Exception on purpose: the test asserts propagation, not the
    # concrete error type raised by the provider client.
    with pytest.raises(Exception):
        await agent.run(
            [
                BinaryContent(data=bogus_image_bytes, media_type="image/png"),
                "What's in this image?",
            ]
        )
Loading