diff --git a/braintrust-sdk/instrumentation/anthropic_2_2_0/src/test/java/dev/braintrust/instrumentation/anthropic/v2_2_0/BraintrustAnthropicTest.java b/braintrust-sdk/instrumentation/anthropic_2_2_0/src/test/java/dev/braintrust/instrumentation/anthropic/v2_2_0/BraintrustAnthropicTest.java index 4b1695dc..36975fce 100644 --- a/braintrust-sdk/instrumentation/anthropic_2_2_0/src/test/java/dev/braintrust/instrumentation/anthropic/v2_2_0/BraintrustAnthropicTest.java +++ b/braintrust-sdk/instrumentation/anthropic_2_2_0/src/test/java/dev/braintrust/instrumentation/anthropic/v2_2_0/BraintrustAnthropicTest.java @@ -67,7 +67,7 @@ void testWrapAnthropic() { assertEquals(1, spans.size()); var span = spans.get(0); - assertEquals("anthropic.messages.create", span.getName()); + assertFalse(span.getName().isEmpty(), "span name should be non-empty"); // Verify span_attributes String spanAttributesJson = @@ -156,7 +156,7 @@ void testWrapAnthropicStreaming() { assertEquals(1, spans.size()); var span = spans.get(0); - assertEquals("anthropic.messages.create", span.getName()); + assertFalse(span.getName().isEmpty(), "span name should be non-empty"); // Verify metadata String metadataJson = @@ -220,7 +220,7 @@ void testWrapAnthropicAsync() { assertEquals(1, spans.size()); var span = spans.get(0); - assertEquals("anthropic.messages.create", span.getName()); + assertFalse(span.getName().isEmpty(), "span name should be non-empty"); String spanAttributesJson = span.getAttributes().get(AttributeKey.stringKey("braintrust.span_attributes")); @@ -290,7 +290,7 @@ void testWrapAnthropicAsyncStreaming() { assertEquals(1, spans.size()); var span = spans.get(0); - assertEquals("anthropic.messages.create", span.getName()); + assertFalse(span.getName().isEmpty(), "span name should be non-empty"); assertNotNull(span.getAttributes().get(AttributeKey.stringKey("braintrust.input_json"))); @@ -342,7 +342,7 @@ void testWrapAnthropicBeta() { assertEquals(1, spans.size()); var span = spans.get(0); - 
assertEquals("anthropic.messages.create", span.getName()); + assertFalse(span.getName().isEmpty(), "span name should be non-empty"); // Verify span_attributes String spanAttributesJson = @@ -431,7 +431,7 @@ void testWrapAnthropicBetaStreaming() { assertEquals(1, spans.size()); var span = spans.get(0); - assertEquals("anthropic.messages.create", span.getName()); + assertFalse(span.getName().isEmpty(), "span name should be non-empty"); // Verify metadata String metadataJson = diff --git a/braintrust-sdk/src/main/java/dev/braintrust/instrumentation/InstrumentationSemConv.java b/braintrust-sdk/src/main/java/dev/braintrust/instrumentation/InstrumentationSemConv.java index fd0209fc..d1ffd1a7 100644 --- a/braintrust-sdk/src/main/java/dev/braintrust/instrumentation/InstrumentationSemConv.java +++ b/braintrust-sdk/src/main/java/dev/braintrust/instrumentation/InstrumentationSemConv.java @@ -211,6 +211,11 @@ private static void tagAnthropicRequest( if (requestBody != null) { JsonNode requestJson = BraintrustJsonMapper.get().readTree(requestBody); + if (requestJson.has("stream") + && requestJson.get("stream").isBoolean() + && requestJson.get("stream").asBoolean()) { + span.updateName(getStreamingSpanName(providerName, pathSegments)); + } if (requestJson.has("model")) { metadata.put("model", requestJson.get("model").asText()); } @@ -503,4 +508,15 @@ private static String getSpanName(String providerName, List pathSegments default -> lastSegment; }; } + + private static String getStreamingSpanName(String providerName, List pathSegments) { + if (pathSegments.isEmpty()) { + return UNSET_LLM_SPAN_NAME; + } + String lastSegment = pathSegments.get(pathSegments.size() - 1); + return switch (providerName + ":" + lastSegment) { + case PROVIDER_NAME_ANTHROPIC + ":messages" -> "anthropic.messages.stream"; + default -> getSpanName(providerName, pathSegments); + }; + } } diff --git a/braintrust-sdk/src/main/java/dev/braintrust/trace/AttachmentProcessor.java 
b/braintrust-sdk/src/main/java/dev/braintrust/trace/AttachmentProcessor.java index ed25a46d..0f624ff2 100644 --- a/braintrust-sdk/src/main/java/dev/braintrust/trace/AttachmentProcessor.java +++ b/braintrust-sdk/src/main/java/dev/braintrust/trace/AttachmentProcessor.java @@ -3,32 +3,127 @@ import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.node.ArrayNode; import com.fasterxml.jackson.databind.node.ObjectNode; -import com.fasterxml.jackson.databind.node.TextNode; import dev.braintrust.config.BraintrustConfig; import dev.braintrust.json.BraintrustJsonMapper; import java.time.Duration; import java.util.Base64; +import java.util.LinkedHashSet; +import java.util.List; import java.util.concurrent.atomic.AtomicBoolean; -import java.util.regex.Matcher; +import java.util.function.BiFunction; import java.util.regex.Pattern; +import java.util.stream.Collectors; +import javax.annotation.Nonnull; +import javax.annotation.Nullable; import lombok.SneakyThrows; import lombok.extern.slf4j.Slf4j; /** - * Scans JSON content for base64 data URI attachments and replaces them with attachment references - * after uploading to S3. + * Scans JSON content for base64 attachments across multiple LLM provider formats (OpenAI, Bedrock, + * Anthropic, Gemini) and replaces them with attachment references after uploading to object + * storage. Handles all attachment types: images, documents, video, audio, etc. * *
<p>
Package-private; not exposed in the public API. */ @Slf4j class AttachmentProcessor { + private static final String DATA_URI_PREFIX = "data:([\\w/\\-.+]+);base64,"; + private static final String BASE64STRING = "([A-Za-z0-9+/=]{20,})"; + + /** Matches data URIs in unquoted text node values — used by the OpenAI matcher/replacer. */ + private static final Pattern DATA_URI_PATTERN = + Pattern.compile("%s%s".formatted(DATA_URI_PREFIX, BASE64STRING)); + + /** Matches data URIs in the raw JSON string (with surrounding quotes) for the heuristic. */ + private static final Pattern DATA_URI_HEURISTIC_PATTERN = + Pattern.compile("\"%s\"".formatted(DATA_URI_PATTERN.pattern())); + + private static final Pattern BYTE_TEXT_VALUE_PATTERN = + Pattern.compile("\"(bytes|data)\"\\s*:\\s*\"%s\"".formatted(BASE64STRING)); + /** - * quick heuristic to determine if the json payload contains a base64 encoded file - * - *
<p>
This is used for performance reasons as a fail-fast to avoid doing a json parse. + * Supported provider attachment formats, checked in order during tree traversal. To add a new + * provider, append an {@link AttachmentFormat} entry here. + */ + static final List ATTACHMENT_FORMATS = + List.of( + // OpenAI: data URI in a text node value + // e.g. image_url.url = "data:image/png;base64,..." or + // file.file_data = "data:application/pdf;base64,..." + new AttachmentFormat( + "openai", + DATA_URI_HEURISTIC_PATTERN.pattern(), + (key, node) -> + node.isTextual() + && isEntirelyDataUri(node.asText()) + && DATA_URI_PATTERN.matcher(node.asText()).find(), + AttachmentProcessor::replaceOpenAIDataUri), + + // Bedrock Converse: parent block has a key like "image", "document", + // "video", or "audio" wrapping {"format": "png", "source": {"bytes": "..."}} + // We match at the parent level to use the block type key for MIME resolution, + // so ambiguous formats like "mp4" get the correct media category. 
+ new AttachmentFormat( + "bedrock", + BYTE_TEXT_VALUE_PATTERN.pattern(), + (key, node) -> { + if (!node.isObject()) return false; + return getConverseBlock((ObjectNode) node) != null; + }, + AttachmentProcessor::replaceBedrockAttachment), + + // Anthropic: {"type": "base64", "media_type": "image/png", "data": ""} + // Applies to image and document source objects + new AttachmentFormat( + "anthropic", + BYTE_TEXT_VALUE_PATTERN.pattern(), + (key, node) -> { + if (!node.isObject()) return false; + JsonNode type = node.get("type"); + JsonNode mediaType = node.get("media_type"); + JsonNode data = node.get("data"); + return type != null + && "base64".equals(type.asText()) + && mediaType != null + && mediaType.isTextual() + && data != null + && data.isTextual() + && data.asText().length() >= 20; + }, + AttachmentProcessor::replaceAnthropicAttachment), + + // Gemini: {"inlineData": {"mimeType": "image/png", "data": ""}} + // Applies to all inline binary content (images, PDFs, audio, video) + new AttachmentFormat( + "gemini", + BYTE_TEXT_VALUE_PATTERN.pattern(), + (key, node) -> { + if (!node.isObject()) return false; + JsonNode inlineData = node.get("inlineData"); + if (inlineData == null || !inlineData.isObject()) return false; + JsonNode mimeType = inlineData.get("mimeType"); + JsonNode data = inlineData.get("data"); + return mimeType != null + && mimeType.isTextual() + && data != null + && data.isTextual() + && data.asText().length() >= 20; + }, + AttachmentProcessor::replaceGeminiAttachment)); + + /** + * Fast-path heuristic compiled from all {@link #ATTACHMENT_FORMATS} entries. If this doesn't + * match the raw JSON string, we skip JSON parsing entirely. 
*/ - static final Pattern BASE64_DATA_URI_PATTERN = - Pattern.compile("data:([\\w/\\-.+]+);base64,([A-Za-z0-9+/=]{20,})"); + static final Pattern BASE64_HEURISTIC = buildHeuristic(); + + private static Pattern buildHeuristic() { + var fragments = new LinkedHashSet(); + for (var fmt : ATTACHMENT_FORMATS) { + fragments.add(fmt.heuristicFragment()); + } + return Pattern.compile(fragments.stream().collect(Collectors.joining("|"))); + } private final BraintrustConfig config; private final AttachmentUploader uploader; @@ -39,25 +134,21 @@ class AttachmentProcessor { } /** - * Scans a JSON string for base64 data URIs, uploads them, and returns the modified JSON with + * Scans a JSON string for base64 attachments, uploads them, and returns the modified JSON with * attachment references. - * - * @param json the JSON string to scan - * @return the modified JSON with base64 data replaced by attachment references, or the original - * JSON if no base64 data was found */ String processAndUpload(String json) { if ((!config.autoConvertAIAttachments()) || json == null || uploader.isShutdown() - || !BASE64_DATA_URI_PATTERN.matcher(json).find()) { + || !BASE64_HEURISTIC.matcher(json).find()) { return json; } try { JsonNode root = BraintrustJsonMapper.get().readTree(json); AtomicBoolean modified = new AtomicBoolean(false); - JsonNode result = replaceBase64Attachments(root, modified); + JsonNode result = walkAndReplace(null, null, root, modified); return modified.get() ? BraintrustJsonMapper.get().writeValueAsString(result) : json; } catch (UploaderRejectionException e) { log.debug( @@ -71,66 +162,186 @@ String processAndUpload(String json) { } } - // NOTE: not concerned with recursion blowing the stack because we're mutating AI vendor - // messages which are not deep enough for this to be an issue. 
- private JsonNode replaceBase64Attachments(JsonNode node, AtomicBoolean modified) { - if (node.isTextual()) { - return replaceInText((TextNode) node, modified); - } else if (node.isObject()) { + /** + * Walks the JSON tree. For each node, checks all {@link #ATTACHMENT_FORMATS} matchers. If one + * matches, calls its replacer and returns the result (no further recursion into that subtree). + * Otherwise recurses into children. + */ + private JsonNode walkAndReplace( + @Nullable ObjectNode parent, + @Nullable String fieldName, + @Nonnull JsonNode node, + @Nonnull AtomicBoolean modified) { + + // Check each registered format + for (var fmt : ATTACHMENT_FORMATS) { + if (fmt.matcher().apply(fieldName, node)) { + JsonNode replacement = fmt.replacer().apply(parent, fieldName, node, uploader); + if (replacement != null) { + modified.set(true); + return replacement; + } + } + } + + // No format matched — recurse into children + if (node.isObject()) { ObjectNode objectNode = (ObjectNode) node; ObjectNode result = BraintrustJsonMapper.get().createObjectNode(); - var fieldNames = objectNode.fieldNames(); - while (fieldNames.hasNext()) { - String fieldName = fieldNames.next(); - JsonNode child = objectNode.get(fieldName); - result.set(fieldName, replaceBase64Attachments(child, modified)); + var fields = objectNode.fieldNames(); + while (fields.hasNext()) { + String childField = fields.next(); + JsonNode child = objectNode.get(childField); + result.set(childField, walkAndReplace(objectNode, childField, child, modified)); } return result; } else if (node.isArray()) { ArrayNode arrayNode = (ArrayNode) node; ArrayNode result = BraintrustJsonMapper.get().createArrayNode(); for (int i = 0; i < arrayNode.size(); i++) { - result.add(replaceBase64Attachments(arrayNode.get(i), modified)); + result.add(walkAndReplace(null, null, arrayNode.get(i), modified)); } return result; } return node; } + // ── Replacer implementations ────────────────────────────────────── + + /** OpenAI: replace a 
data URI text node with an attachment reference object. */ @SneakyThrows - private JsonNode replaceInText(TextNode textNode, AtomicBoolean modified) { - String value = textNode.asText(); - Matcher matcher = BASE64_DATA_URI_PATTERN.matcher(value); - if (!matcher.find()) { - return textNode; - } - if (!isEntirelyDataUri(value)) { - log.debug("found base64 string but text contained extra content {}", value); - return textNode; - } + private static JsonNode replaceOpenAIDataUri( + ObjectNode parent, String fieldName, JsonNode node, AttachmentUploader uploader) { + var matcher = DATA_URI_PATTERN.matcher(node.asText()); + if (!matcher.find()) return null; + + String contentType = matcher.group(1); + String base64Data = matcher.group(2); + return uploadAndCreateRef(contentType, base64Data, uploader); + } - matcher.reset(); - StringBuilder sb = new StringBuilder(); - while (matcher.find()) { - String contentType = matcher.group(1); - String base64Data = matcher.group(2); - byte[] data = Base64.getDecoder().decode(base64Data); + /** + * Bedrock Converse: find and replace the attachment block within the parent content block. + * Matches at parent level so the block type key (image/video/audio/document) determines the + * MIME category. 
+ */ + @SneakyThrows + private static JsonNode replaceBedrockAttachment( + ObjectNode parent, String fieldName, JsonNode node, AttachmentUploader uploader) { + ObjectNode obj = (ObjectNode) node; + ConverseBlock block = getConverseBlock(obj); + if (block == null) return null; - String extension = contentTypeToExtension(contentType); - String filename = "attachment" + extension; - AttachmentReference ref = AttachmentReference.create(filename, contentType); + ObjectNode inner = block.inner; + String format = inner.get("format").asText(); + String contentType = block.formatMap.get(format.toLowerCase()); + if (contentType == null) return null; - if (!uploader.enqueue(ref, data)) { - throw new UploaderRejectionException("uploader rejected attachment upload"); + String base64Data = inner.get("source").get("bytes").asText(); + JsonNode refNode = uploadAndCreateRef(contentType, base64Data, uploader); + if (refNode == null) return null; + + // Rebuild parent: copy all fields, but replace source.bytes in the matched block + ObjectNode result = BraintrustJsonMapper.get().createObjectNode(); + var fields = obj.fieldNames(); + while (fields.hasNext()) { + String f = fields.next(); + if (f.equals(block.blockTypeKey)) { + ObjectNode newInner = BraintrustJsonMapper.get().createObjectNode(); + var innerFields = inner.fieldNames(); + while (innerFields.hasNext()) { + String inf = innerFields.next(); + if ("source".equals(inf)) { + ObjectNode origSource = (ObjectNode) inner.get("source"); + ObjectNode newSource = BraintrustJsonMapper.get().createObjectNode(); + var sourceFields = origSource.fieldNames(); + while (sourceFields.hasNext()) { + String sf = sourceFields.next(); + newSource.set(sf, "bytes".equals(sf) ? 
refNode : origSource.get(sf)); + } + newInner.set("source", newSource); + } else { + newInner.set(inf, inner.get(inf)); + } + } + result.set(f, newInner); + } else { + result.set(f, obj.get(f)); } + } + return result; + } - matcher.appendReplacement(sb, Matcher.quoteReplacement(ref.toJson())); + /** Anthropic: replace the entire source object with the attachment ref. */ + @SneakyThrows + private static JsonNode replaceAnthropicAttachment( + ObjectNode parent, String fieldName, JsonNode node, AttachmentUploader uploader) { + ObjectNode obj = (ObjectNode) node; + String contentType = obj.get("media_type").asText(); + String base64Data = obj.get("data").asText(); + return uploadAndCreateRef(contentType, base64Data, uploader); + } + + /** + * Gemini: replace {@code inlineData} with an attachment reference wrapper. Images use {@code + * image_url: {url: ref}}, all other content types use {@code file: {file_data: ref}}. + */ + @SneakyThrows + private static JsonNode replaceGeminiAttachment( + ObjectNode parent, String fieldName, JsonNode node, AttachmentUploader uploader) { + ObjectNode obj = (ObjectNode) node; + ObjectNode inlineData = (ObjectNode) obj.get("inlineData"); + String contentType = inlineData.get("mimeType").asText(); + String base64Data = inlineData.get("data").asText(); + + JsonNode refNode = uploadAndCreateRef(contentType, base64Data, uploader); + if (refNode == null) return null; + + boolean isImage = contentType.startsWith("image/"); + + // Rebuild: swap inlineData for the appropriate wrapper + ObjectNode result = BraintrustJsonMapper.get().createObjectNode(); + var fields = obj.fieldNames(); + while (fields.hasNext()) { + String f = fields.next(); + if ("inlineData".equals(f)) { + if (isImage) { + ObjectNode imageUrl = BraintrustJsonMapper.get().createObjectNode(); + imageUrl.set("url", refNode); + result.set("image_url", imageUrl); + } else { + ObjectNode file = BraintrustJsonMapper.get().createObjectNode(); + file.set("file_data", refNode); + 
result.set("file", file); + } + } else { + result.set(f, obj.get(f)); + } } - matcher.appendTail(sb); + return result; + } + + // ── Shared helpers ──────────────────────────────────────────────── - modified.set(true); + /** Decodes base64 data, uploads it, and returns the attachment reference as a JsonNode. */ + @SneakyThrows + private static JsonNode uploadAndCreateRef( + String contentType, String base64Data, AttachmentUploader uploader) { + byte[] data; + try { + data = Base64.getDecoder().decode(base64Data); + } catch (IllegalArgumentException e) { + log.debug("Failed to decode base64 data, skipping"); + return null; + } - return BraintrustJsonMapper.get().readTree(sb.toString()); + String extension = contentTypeToExtension(contentType); + String filename = "attachment" + extension; + AttachmentReference ref = AttachmentReference.create(filename, contentType); + if (!uploader.enqueue(ref, data)) { + throw new UploaderRejectionException("uploader rejected attachment upload"); + } + return BraintrustJsonMapper.get().readTree(ref.toJson()); } static boolean isEntirelyDataUri(String value) { @@ -141,31 +352,152 @@ static boolean isEntirelyDataUri(String value) { && !trimmed.contains(" "); } - private static String contentTypeToExtension(String contentType) { - switch (contentType.toLowerCase()) { - case "image/png": - return ".png"; - case "image/jpeg": - case "image/jpg": - return ".jpg"; - case "image/gif": - return ".gif"; - case "image/webp": - return ".webp"; - case "image/svg+xml": - return ".svg"; - case "application/pdf": - return ".pdf"; - case "text/plain": - return ".txt"; - case "application/json": - return ".json"; - default: + // ── Bedrock Converse block detection ───────────────────────────── + + /** Per-block-type format-to-MIME mappings for the AWS Bedrock Converse API. 
*/ + private static final java.util.Map CONVERSE_IMAGE_FORMATS = + java.util.Map.of( + "gif", "image/gif", + "jpeg", "image/jpeg", + "png", "image/png", + "webp", "image/webp"); + + private static final java.util.Map CONVERSE_VIDEO_FORMATS = + java.util.Map.ofEntries( + java.util.Map.entry("flv", "video/x-flv"), + java.util.Map.entry("mkv", "video/x-matroska"), + java.util.Map.entry("mov", "video/quicktime"), + java.util.Map.entry("mp4", "video/mp4"), + java.util.Map.entry("mpeg", "video/mpeg"), + java.util.Map.entry("mpg", "video/mpeg"), + java.util.Map.entry("three_gp", "video/3gpp"), + java.util.Map.entry("webm", "video/webm"), + java.util.Map.entry("wmv", "video/x-ms-wmv")); + + private static final java.util.Map CONVERSE_AUDIO_FORMATS = + java.util.Map.ofEntries( + java.util.Map.entry("aac", "audio/aac"), + java.util.Map.entry("flac", "audio/flac"), + java.util.Map.entry("m4a", "audio/mp4"), + java.util.Map.entry("mka", "audio/x-matroska"), + java.util.Map.entry("mkv", "audio/x-matroska"), + java.util.Map.entry("mp3", "audio/mpeg"), + java.util.Map.entry("mp4", "audio/mp4"), + java.util.Map.entry("mpeg", "audio/mpeg"), + java.util.Map.entry("mpga", "audio/mpeg"), + java.util.Map.entry("ogg", "audio/ogg"), + java.util.Map.entry("opus", "audio/opus"), + java.util.Map.entry("pcm", "audio/pcm"), + java.util.Map.entry("wav", "audio/wav"), + java.util.Map.entry("webm", "audio/webm"), + java.util.Map.entry("x-aac", "audio/aac")); + + private static final java.util.Map CONVERSE_DOCUMENT_FORMATS = + java.util.Map.ofEntries( + java.util.Map.entry("csv", "text/csv"), + java.util.Map.entry("doc", "application/msword"), + java.util.Map.entry( + "docx", + "application/vnd.openxmlformats-officedocument.wordprocessingml.document"), + java.util.Map.entry("html", "text/html"), + java.util.Map.entry("md", "text/markdown"), + java.util.Map.entry("pdf", "application/pdf"), + java.util.Map.entry("txt", "text/plain"), + java.util.Map.entry("xls", "application/vnd.ms-excel"), + 
java.util.Map.entry( + "xlsx", + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")); + + /** Maps Bedrock Converse block type keys to their format-to-MIME maps. */ + private static final java.util.Map> + CONVERSE_BLOCK_TYPE_FORMATS = + java.util.Map.of( + "image", CONVERSE_IMAGE_FORMATS, + "video", CONVERSE_VIDEO_FORMATS, + "audio", CONVERSE_AUDIO_FORMATS, + "document", CONVERSE_DOCUMENT_FORMATS); + + private record ConverseBlock( + String blockTypeKey, ObjectNode inner, java.util.Map formatMap) {} + + /** + * Checks whether a JSON object is a Bedrock Converse content block containing a recognized + * block type key (image/video/audio/document) wrapping {@code {format, source: {bytes}}}. + */ + @Nullable + private static ConverseBlock getConverseBlock(ObjectNode obj) { + for (var entry : CONVERSE_BLOCK_TYPE_FORMATS.entrySet()) { + String blockKey = entry.getKey(); + var formatMap = entry.getValue(); + JsonNode inner = obj.get(blockKey); + if (inner == null || !inner.isObject()) continue; + JsonNode fmt = inner.get("format"); + JsonNode src = inner.get("source"); + if (fmt == null || !fmt.isTextual() || src == null || !src.isObject()) continue; + if (!formatMap.containsKey(fmt.asText().toLowerCase())) continue; + JsonNode bytes = src.get("bytes"); + if (bytes == null || !bytes.isTextual() || bytes.asText().length() < 20) continue; + return new ConverseBlock(blockKey, (ObjectNode) inner, formatMap); + } + return null; + } + + static String contentTypeToExtension(String contentType) { + return switch (contentType.toLowerCase()) { + case "image/png" -> ".png"; + case "image/jpeg", "image/jpg" -> ".jpg"; + case "image/gif" -> ".gif"; + case "image/webp" -> ".webp"; + case "image/svg+xml" -> ".svg"; + case "application/pdf" -> ".pdf"; + case "text/plain" -> ".txt"; + case "text/csv" -> ".csv"; + case "text/html" -> ".html"; + case "application/json" -> ".json"; + case "application/msword" -> ".doc"; + case 
"application/vnd.openxmlformats-officedocument.wordprocessingml.document" -> + ".docx"; + case "application/vnd.ms-excel" -> ".xls"; + case "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" -> ".xlsx"; + case "video/mp4" -> ".mp4"; + case "video/webm" -> ".webm"; + case "video/quicktime" -> ".mov"; + case "audio/mpeg", "audio/mp3" -> ".mp3"; + case "audio/wav" -> ".wav"; + default -> { String[] parts = contentType.split("/"); - if (parts.length == 2) { - return "." + parts[1].split("[;\\-]")[0]; - } - return ""; + yield (parts.length == 2) ? "." + parts[1].split("[;\\-]")[0] : ""; + } + }; + } + + /** + * Describes how to detect and replace a provider-specific base64 attachment structure. To add a + * new provider format, append an entry to {@link #ATTACHMENT_FORMATS}. + * + * @param name human-readable name for logging/debugging and test coverage tracking + * @param heuristicFragment regex fragment for the fast-path heuristic. Multiple formats' + * fragments are combined with {@code |} into a single pattern. Duplicates are removed. + * @param matcher predicate called on every node during tree traversal. Returns {@code true} if + * this format should handle the node. Parameters: ({@code fieldName} of this node in its + * parent — null for root/array elements, {@code node} the current node). + * @param replacer builds the replacement node. Parameters: ({@code parent} the parent + * ObjectNode or null, {@code fieldName} of this node in the parent, {@code node} the + * matched node). Returns the replacement node, or {@code null} to skip. 
+ */ + record AttachmentFormat( + @Nonnull String name, + @Nonnull String heuristicFragment, + @Nonnull BiFunction matcher, + @Nonnull ReplacerFunction replacer) { + + @FunctionalInterface + interface ReplacerFunction { + JsonNode apply( + @Nullable ObjectNode parent, + @Nullable String fieldName, + @Nonnull JsonNode node, + @Nonnull AttachmentUploader uploader); } } diff --git a/braintrust-sdk/src/test/java/dev/braintrust/trace/AttachmentProcessorTest.java b/braintrust-sdk/src/test/java/dev/braintrust/trace/AttachmentProcessorTest.java index e19d7770..1f2adb45 100644 --- a/braintrust-sdk/src/test/java/dev/braintrust/trace/AttachmentProcessorTest.java +++ b/braintrust-sdk/src/test/java/dev/braintrust/trace/AttachmentProcessorTest.java @@ -7,16 +7,21 @@ import dev.braintrust.json.BraintrustJsonMapper; import io.opentelemetry.api.common.AttributeKey; import io.opentelemetry.api.trace.Tracer; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.function.Consumer; +import java.util.stream.Stream; import lombok.SneakyThrows; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.MethodSource; /** * Integration test for the attachment processing pipeline. * *
<p>
Uses the {@link TestHarness} which wires the {@link dev.braintrust.UnitTestSpanExporter} as an * additional delegate inside the {@link BraintrustSpanProcessor}. This means spans retrieved via - * {@code awaitExportedSpans} reflect post-processing (base64 data URIs replaced with attachment + * {@code awaitExportedSpans} reflect post-processing (base64 data replaced with attachment * references). The VCR layer stubs the S3 upload flow (login, signed URL, PUT, status). */ public class AttachmentProcessorTest { @@ -25,11 +30,15 @@ public class AttachmentProcessorTest { private static final String BASE64_PNG = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mP8z8DwHwAFBQIAX8jx0gAAAABJRU5ErkJggg=="; + /** Fake base64 content standing in for a PDF document. */ + private static final String BASE64_PDF = "JVBERi0xLjQKMSAwIG9iago="; + private static final AttributeKey INPUT_JSON = AttributeKey.stringKey("braintrust.input_json"); private static TestHarness testHarness; private static Tracer tracer; + private static final AtomicInteger spanCounter = new AtomicInteger(); @BeforeAll static void initHarness() { @@ -37,96 +46,201 @@ static void initHarness() { tracer = testHarness.openTelemetry().getTracer("attachment-processor-test"); } - @Test + // ── Parameterized: one case per provider attachment format ──────── + + record FormatTestCase(String name, String inputJson, Consumer assertions) { + @Override + public String toString() { + return name; + } + } + + static Stream attachmentFormatCases() { + return Stream.of( + // OpenAI: data URI in image_url.url + new FormatTestCase( + "openai-image", + "[{\"role\":\"user\",\"content\":[{\"type\":\"text\",\"text\":\"describe" + + " this\"}," + + "{\"type\":\"image_url\",\"image_url\":{\"url\":\"data:image/png;base64," + + BASE64_PNG + + "\"}}]}]", + root -> { + JsonNode url = + root.get(0).get("content").get(1).get("image_url").get("url"); + assertAttachmentRef(url, "image/png"); + }), + + // Bedrock image: {"format": "png", 
"source": {"bytes": ""}} + new FormatTestCase( + "bedrock-image", + "[{\"role\":\"user\",\"content\":[{\"type\":\"text\",\"text\":\"describe" + + " this\"},{\"type\":\"image\",\"image\":{\"format\":\"png\"," + + "\"source\":{\"bytes\":\"" + + BASE64_PNG + + "\"}}}]}]", + root -> { + JsonNode image = root.get(0).get("content").get(1).get("image"); + assertEquals("png", image.get("format").asText()); + assertAttachmentRef(image.get("source").get("bytes"), "image/png"); + }), + + // Bedrock document: {"format": "pdf", "name": "doc", "source": {"bytes": "..."}} + new FormatTestCase( + "bedrock-document", + "[{\"role\":\"user\",\"content\":[{\"type\":\"text\",\"text\":\"summarize" + + " this\"},{\"type\":\"document\",\"document\":{\"format\":\"pdf\"," + + "\"name\":\"report\",\"source\":{\"bytes\":\"" + + BASE64_PDF + + "\"}}}]}]", + root -> { + JsonNode doc = root.get(0).get("content").get(1).get("document"); + assertEquals("pdf", doc.get("format").asText()); + assertEquals("report", doc.get("name").asText()); + assertAttachmentRef(doc.get("source").get("bytes"), "application/pdf"); + }), + + // Bedrock audio: uses block type key to resolve mp4 as audio/mp4 (not video/mp4) + new FormatTestCase( + "bedrock-audio", + "[{\"role\":\"user\",\"content\":[{\"type\":\"text\",\"text\":\"transcribe" + + " this\"},{\"type\":\"audio\",\"audio\":{\"format\":\"mp4\"," + + "\"source\":{\"bytes\":\"" + + BASE64_PDF + + "\"}}}]}]", + root -> { + JsonNode audio = root.get(0).get("content").get(1).get("audio"); + assertEquals("mp4", audio.get("format").asText()); + assertAttachmentRef(audio.get("source").get("bytes"), "audio/mp4"); + }), + + // Anthropic image: {"type":"base64","media_type":"image/png","data":""} + new FormatTestCase( + "anthropic-image", + "[{\"role\":\"user\",\"content\":[{\"type\":\"text\",\"text\":\"describe" + + " this\"},{\"type\":\"image\",\"source\":{\"type\":\"base64\"," + + "\"media_type\":\"image/png\",\"data\":\"" + + BASE64_PNG + + "\"}}]}]", + root -> { + 
JsonNode source = root.get(0).get("content").get(1).get("source"); + assertAttachmentRef(source, "image/png"); + }), + + // Anthropic document: same source structure, different media_type + new FormatTestCase( + "anthropic-document", + "[{\"role\":\"user\",\"content\":[{\"type\":\"text\",\"text\":\"summarize" + + " this\"},{\"type\":\"document\",\"source\":{\"type\":\"base64\"," + + "\"media_type\":\"application/pdf\",\"data\":\"" + + BASE64_PDF + + "\"}}]}]", + root -> { + JsonNode source = root.get(0).get("content").get(1).get("source"); + assertAttachmentRef(source, "application/pdf"); + }), + + // Gemini image: {"inlineData": {"mimeType": "image/png", "data": ""}} + new FormatTestCase( + "gemini-image", + "{\"contents\":[{\"role\":\"user\",\"parts\":[{\"text\":\"describe" + + " this\"},{\"inlineData\":{\"mimeType\":\"image/png\",\"data\":\"" + + BASE64_PNG + + "\"}}]}]}", + root -> { + JsonNode part = root.get("contents").get(0).get("parts").get(1); + assertNull(part.get("inlineData"), "inlineData should be removed"); + assertAttachmentRef(part.get("image_url").get("url"), "image/png"); + }), + + // Gemini PDF: non-image content uses file: {file_data: ref} + new FormatTestCase( + "gemini-document", + "{\"contents\":[{\"role\":\"user\",\"parts\":[{\"text\":\"summarize" + + " this\"},{\"inlineData\":{\"mimeType\":\"application/pdf\"," + + "\"data\":\"" + + BASE64_PDF + + "\"}}]}]}", + root -> { + JsonNode part = root.get("contents").get(0).get("parts").get(1); + assertNull(part.get("inlineData"), "inlineData should be removed"); + assertNull(part.get("image_url"), "non-image should not use image_url"); + assertAttachmentRef( + part.get("file").get("file_data"), "application/pdf"); + })); + } + + @ParameterizedTest(name = "{0}") + @MethodSource("attachmentFormatCases") @SneakyThrows - void base64DataUriInImageUrlIsReplacedWithAttachmentReference() { - String inputJson = - "[{\"role\":\"user\",\"content\":[{\"type\":\"text\",\"text\":\"What is in this" - + " 
image?\"}," - + "{\"type\":\"image_url\",\"image_url\":{\"url\":\"data:image/png;base64," - + BASE64_PNG - + "\"}}]}]"; + void attachmentFormatReplacesBase64WithRef(FormatTestCase testCase) { + assertTrue( + AttachmentProcessor.BASE64_HEURISTIC.matcher(testCase.inputJson).find(), + "BASE64_HEURISTIC should match test data"); - var span = tracer.spanBuilder("attachment-test-image-url").startSpan(); - span.setAttribute("braintrust.input_json", inputJson); + String spanName = "fmt-" + testCase.name + "-" + spanCounter.incrementAndGet(); + var span = tracer.spanBuilder(spanName).startSpan(); + span.setAttribute("braintrust.input_json", testCase.inputJson); span.setAttribute("braintrust.parent", "project_name:" + TestHarness.defaultProjectName()); span.end(); var spans = testHarness.awaitExportedSpans(1); var exported = spans.stream() - .filter(s -> s.getName().equals("attachment-test-image-url")) + .filter(s -> s.getName().equals(spanName)) .findFirst() .orElseThrow(() -> new AssertionError("span not found")); - String exportedInputJson = exported.getAttributes().get(INPUT_JSON); - assertNotNull(exportedInputJson, "exported span should have braintrust.input_json"); - assertNotEquals(inputJson, exportedInputJson, "base64 data should have been replaced"); - - // Parse and verify the attachment reference - JsonNode root = BraintrustJsonMapper.get().readTree(exportedInputJson); - JsonNode urlNode = root.get(0).get("content").get(1).get("image_url").get("url"); + String exportedJson = exported.getAttributes().get(INPUT_JSON); + assertNotNull(exportedJson, "should have braintrust.input_json"); + assertNotEquals(testCase.inputJson, exportedJson, "base64 data should have been replaced"); - assertTrue( - urlNode.isObject(), "url should be an object (attachment reference), not a string"); - assertEquals("braintrust_attachment", urlNode.get("type").asText()); - assertEquals("image/png", urlNode.get("content_type").asText()); - assertEquals("attachment.png", 
urlNode.get("filename").asText()); - assertNotNull(urlNode.get("key"), "attachment key must be present"); - assertFalse(urlNode.get("key").asText().isEmpty(), "attachment key must not be empty"); - - // Verify the rest of the message structure is intact - assertEquals("user", root.get(0).get("role").asText()); - assertEquals("text", root.get(0).get("content").get(0).get("type").asText()); - assertEquals( - "What is in this image?", root.get(0).get("content").get(0).get("text").asText()); - assertEquals("image_url", root.get(0).get("content").get(1).get("type").asText()); + testCase.assertions.accept(BraintrustJsonMapper.get().readTree(exportedJson)); } + // ── Negative cases ──────────────────────────────────────────────── + @Test void nonDataUriInputIsUnchanged() { String inputJson = "[{\"role\":\"user\",\"content\":\"Hello, how are you?\"}]"; - - var span = tracer.spanBuilder("attachment-test-no-data-uri").startSpan(); - span.setAttribute("braintrust.input_json", inputJson); - span.setAttribute("braintrust.parent", "project_name:" + TestHarness.defaultProjectName()); - span.end(); - - var spans = testHarness.awaitExportedSpans(1); - var exported = - spans.stream() - .filter(s -> s.getName().equals("attachment-test-no-data-uri")) - .findFirst() - .orElseThrow(() -> new AssertionError("span not found")); - - String exportedInputJson = exported.getAttributes().get(INPUT_JSON); - assertEquals(inputJson, exportedInputJson, "input without base64 data should be unchanged"); + assertEquals(inputJson, sendAndGetExportedInput(inputJson)); } @Test void partialDataUriInTextIsNotReplaced() { - // A data URI embedded in surrounding text should NOT be replaced (isEntirelyDataUri check) String inputJson = "[{\"role\":\"user\",\"content\":\"Check this: data:image/png;base64," + BASE64_PNG + " please\"}]"; + assertEquals(inputJson, sendAndGetExportedInput(inputJson)); + } - var span = tracer.spanBuilder("attachment-test-partial-data-uri").startSpan(); + // ── Helpers 
─────────────────────────────────────────────────────── + + private static String sendAndGetExportedInput(String inputJson) { + String spanName = "negative-" + spanCounter.incrementAndGet(); + var span = tracer.spanBuilder(spanName).startSpan(); span.setAttribute("braintrust.input_json", inputJson); span.setAttribute("braintrust.parent", "project_name:" + TestHarness.defaultProjectName()); span.end(); var spans = testHarness.awaitExportedSpans(1); - var exported = - spans.stream() - .filter(s -> s.getName().equals("attachment-test-partial-data-uri")) - .findFirst() - .orElseThrow(() -> new AssertionError("span not found")); + return spans.stream() + .filter(s -> s.getName().equals(spanName)) + .findFirst() + .orElseThrow(() -> new AssertionError("span not found")) + .getAttributes() + .get(INPUT_JSON); + } - String exportedInputJson = exported.getAttributes().get(INPUT_JSON); - assertEquals( - inputJson, - exportedInputJson, - "partial data URI embedded in text should not be replaced"); + private static void assertAttachmentRef(JsonNode node, String expectedContentType) { + assertNotNull(node, "attachment ref node should not be null"); + assertTrue(node.isObject(), "attachment ref should be an object, got: " + node); + assertEquals("braintrust_attachment", node.get("type").asText()); + assertEquals(expectedContentType, node.get("content_type").asText()); + assertNotNull(node.get("filename")); + assertFalse(node.get("filename").asText().isEmpty()); + assertNotNull(node.get("key")); + assertFalse(node.get("key").asText().isEmpty()); } } diff --git a/gradle.properties b/gradle.properties index 3f3be967..3c6d0cc5 100644 --- a/gradle.properties +++ b/gradle.properties @@ -8,7 +8,7 @@ org.gradle.daemon=true org.gradle.warning.mode=summary # braintrust-spec git ref (SHA or tag) used by btx tests -braintrustSpecRef=v0.0.5 +braintrustSpecRef=v0.0.7 # braintrust-openapi commit SHA used by braintrust-api braintrustOpenApiRef=64b79cb9122f50a74eac98ea86c3ec1858c0cdd1 diff --git 
a/test-harness/src/testFixtures/java/dev/braintrust/TestHarness.java b/test-harness/src/testFixtures/java/dev/braintrust/TestHarness.java index 16523b24..e95ef5a0 100644 --- a/test-harness/src/testFixtures/java/dev/braintrust/TestHarness.java +++ b/test-harness/src/testFixtures/java/dev/braintrust/TestHarness.java @@ -47,6 +47,7 @@ import io.opentelemetry.sdk.trace.SdkTracerProvider; import io.opentelemetry.sdk.trace.data.SpanData; import io.opentelemetry.sdk.trace.export.SimpleSpanProcessor; +import java.lang.ref.Cleaner; import java.util.*; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicReference; @@ -56,7 +57,8 @@ import lombok.SneakyThrows; import lombok.experimental.Accessors; -public class TestHarness { +public class TestHarness implements AutoCloseable { + private static final Cleaner CLEANER = Cleaner.create(); private static final String TEST_HARNESS_CREATED_TAG = "test-harness-created"; private static final VCR vcr; private static final BraintrustConfig envConfig = @@ -164,6 +166,8 @@ private static synchronized TestHarness setup(BraintrustConfig config) { @Accessors(fluent = true) private final OpenTelemetrySdk openTelemetry; + private final Cleaner.Cleanable cleaner; + @Getter @Accessors(fluent = true) private final Braintrust braintrust; @@ -179,7 +183,7 @@ private TestHarness(@Nonnull Braintrust braintrust) { // Wire the in-memory span exporter as an additional delegate inside the // BraintrustSpanProcessor so it sees post-processed spans (attachment references // instead of raw base64 data URIs, etc.). 
- dev.braintrust.trace.HarnessShim.enableTracing( + HarnessShim.enableTracing( braintrust.config(), tracerBuilder, List.of(SimpleSpanProcessor.create(this.spanExporter)), @@ -190,14 +194,19 @@ private TestHarness(@Nonnull Braintrust braintrust) { TextMapPropagator.composite( W3CTraceContextPropagator.getInstance(), W3CBaggagePropagator.getInstance())); - var openTelemetry = + this.openTelemetry = OpenTelemetrySdk.builder() .setTracerProvider(tracerBuilder.build()) .setLoggerProvider(loggerBuilder.build()) .setMeterProvider(meterBuilder.build()) .setPropagators(contextPropagator) .build(); - this.openTelemetry = openTelemetry; + cleaner = CLEANER.register(this, this.openTelemetry::close); + } + + @Override + public void close() throws Exception { + cleaner.clean(); } private static String apiKeyFromEnv() {