From 26bde102efb4b38da1a88a590e106afe0b0bf14e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=C3=B8ydahl?= Date: Fri, 19 Sep 2025 15:15:16 +0200 Subject: [PATCH 01/47] Introduce ExtractionBackend interface --- .../extraction/DummyExtractionBackend.java | 41 +++ .../extraction/ExtractingDocumentLoader.java | 333 ++++++++++++------ .../extraction/ExtractingRequestHandler.java | 28 +- .../handler/extraction/ExtractionBackend.java | 31 ++ .../extraction/ExtractionMetadata.java | 31 ++ .../handler/extraction/ExtractionRequest.java | 48 +++ .../handler/extraction/ExtractionResult.java | 38 ++ .../LocalTikaExtractionBackend.java | 118 +++++++ .../extraction/SimpleExtractionMetadata.java | 52 +++ .../extraction/SolrContentHandler.java | 13 +- .../extraction/SolrContentHandlerFactory.java | 3 +- .../ExtractingRequestHandlerTest.java | 36 ++ 12 files changed, 649 insertions(+), 123 deletions(-) create mode 100644 solr/modules/extraction/src/java/org/apache/solr/handler/extraction/DummyExtractionBackend.java create mode 100644 solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackend.java create mode 100644 solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionMetadata.java create mode 100644 solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionRequest.java create mode 100644 solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionResult.java create mode 100644 solr/modules/extraction/src/java/org/apache/solr/handler/extraction/LocalTikaExtractionBackend.java create mode 100644 solr/modules/extraction/src/java/org/apache/solr/handler/extraction/SimpleExtractionMetadata.java diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/DummyExtractionBackend.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/DummyExtractionBackend.java new file mode 100644 index 00000000000..ddaefadf5d2 --- /dev/null +++ 
b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/DummyExtractionBackend.java @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.handler.extraction; + +import java.io.InputStream; + +/** Dummy backend that emits predictable test data without actually parsing input content. */ +public class DummyExtractionBackend implements ExtractionBackend { + @Override + public String name() { + return "dummy"; + } + + @Override + public ExtractionResult extract(InputStream inputStream, ExtractionRequest request) { + ExtractionMetadata metadata = new SimpleExtractionMetadata(); + metadata.add("Dummy-Backend", "true"); + metadata.add( + "Content-Type", + request.contentType != null ? 
request.contentType : "application/octet-stream"); + if (request.resourceName != null) { + metadata.add("resourcename", request.resourceName); + } + String text = "This is dummy extracted content"; + return new ExtractionResult(text, metadata); + } +} diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java index 5040abc6425..b60ac3ac9c6 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java @@ -18,7 +18,6 @@ import java.io.IOException; import java.io.InputStream; -import java.io.StringWriter; import java.lang.invoke.MethodHandles; import java.util.Locale; import org.apache.solr.common.SolrException; @@ -34,29 +33,12 @@ import org.apache.solr.update.AddUpdateCommand; import org.apache.solr.update.processor.UpdateRequestProcessor; import org.apache.tika.config.TikaConfig; -import org.apache.tika.exception.TikaException; -import org.apache.tika.metadata.HttpHeaders; -import org.apache.tika.metadata.Metadata; -import org.apache.tika.metadata.TikaMetadataKeys; -import org.apache.tika.mime.MediaType; import org.apache.tika.parser.AutoDetectParser; -import org.apache.tika.parser.DefaultParser; -import org.apache.tika.parser.ParseContext; -import org.apache.tika.parser.Parser; -import org.apache.tika.parser.PasswordProvider; import org.apache.tika.parser.html.HtmlMapper; import org.apache.tika.sax.XHTMLContentHandler; -import org.apache.tika.sax.xpath.Matcher; -import org.apache.tika.sax.xpath.MatchingContentHandler; import org.apache.tika.sax.xpath.XPathParser; -import org.apache.xml.serialize.BaseMarkupSerializer; -import org.apache.xml.serialize.OutputFormat; -import org.apache.xml.serialize.TextSerializer; -import org.apache.xml.serialize.XMLSerializer; import 
org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; /** The class responsible for loading extracted content into Solr. */ public class ExtractingDocumentLoader extends ContentStreamLoader { @@ -83,13 +65,15 @@ public class ExtractingDocumentLoader extends ContentStreamLoader { protected TikaConfig config; protected ParseContextConfig parseContextConfig; protected SolrContentHandlerFactory factory; + protected ExtractionBackend backend; public ExtractingDocumentLoader( SolrQueryRequest req, UpdateRequestProcessor processor, TikaConfig config, ParseContextConfig parseContextConfig, - SolrContentHandlerFactory factory) { + SolrContentHandlerFactory factory, + ExtractionBackend backend) { this.params = req.getParams(); this.core = req.getCore(); this.config = config; @@ -103,6 +87,7 @@ public ExtractingDocumentLoader( // this is lightweight autoDetectParser = new AutoDetectParser(config); this.factory = factory; + this.backend = backend; ignoreTikaException = params.getBool(ExtractingParams.IGNORE_TIKA_EXCEPTION, false); } @@ -125,119 +110,243 @@ public void load( ContentStream stream, UpdateRequestProcessor processor) throws Exception { - Parser parser = null; String streamType = req.getParams().get(ExtractingParams.STREAM_TYPE, null); - if (streamType != null) { - // Cache? 
Parsers are lightweight to construct and thread-safe, so I'm told - MediaType mt = MediaType.parse(streamType.trim().toLowerCase(Locale.ROOT)); - parser = new DefaultParser(config.getMediaTypeRegistry()).getParsers().get(mt); - } else { - parser = autoDetectParser; - } - if (parser != null) { - Metadata metadata = new Metadata(); - - // If you specify the resource name (the filename, roughly) with this parameter, - // then Tika can make use of it in guessing the appropriate MIME type: - String resourceName = req.getParams().get(ExtractingParams.RESOURCE_NAME, null); - if (resourceName != null) { - metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, resourceName); - } - // Provide stream's content type as hint for auto detection - if (stream.getContentType() != null) { - metadata.add(HttpHeaders.CONTENT_TYPE, stream.getContentType()); - } + // If you specify the resource name (the filename, roughly) with this parameter, + // some backends can make use of it in guessing the appropriate MIME type: + String resourceName = req.getParams().get(ExtractingParams.RESOURCE_NAME, null); + + try (InputStream inputStream = stream.getStream()) { + // HtmlParser and TXTParser regard Metadata.CONTENT_ENCODING in metadata + String charset = ContentStreamBase.getCharsetFromContentType(stream.getContentType()); + + String xpathExpr = params.get(ExtractingParams.XPATH_EXPRESSION); + boolean extractOnly = params.getBool(ExtractingParams.EXTRACT_ONLY, false); + + ExtractionRequest extractionRequest = + new ExtractionRequest( + streamType, + resourceName, + stream.getContentType(), + charset, + stream.getName(), + stream.getSourceInfo(), + stream.getSize(), + params.get(ExtractingParams.RESOURCE_PASSWORD, null)); - try (InputStream inputStream = stream.getStream()) { - metadata.add(ExtractingMetadataConstants.STREAM_NAME, stream.getName()); - metadata.add(ExtractingMetadataConstants.STREAM_SOURCE_INFO, stream.getSourceInfo()); - metadata.add(ExtractingMetadataConstants.STREAM_SIZE, 
String.valueOf(stream.getSize())); - metadata.add(ExtractingMetadataConstants.STREAM_CONTENT_TYPE, stream.getContentType()); - // HtmlParser and TXTParser regard Metadata.CONTENT_ENCODING in metadata - String charset = ContentStreamBase.getCharsetFromContentType(stream.getContentType()); + // Determine if we must use the legacy SAX/XHTML pipeline (needed for + // capture/xpath/extractOnly) + boolean captureAttr = params.getBool(ExtractingParams.CAPTURE_ATTRIBUTES, false); + String[] captureElems = params.getParams(ExtractingParams.CAPTURE_ELEMENTS); + boolean needLegacySax = + extractOnly + || xpathExpr != null + || captureAttr + || (captureElems != null && captureElems.length > 0) + || (params.get(ExtractingParams.RESOURCE_PASSWORD) != null); + + if (backend instanceof LocalTikaExtractionBackend) { + // Use in-process Tika and SAX pipeline to preserve legacy behavior & test expectations + org.apache.tika.metadata.Metadata md = new org.apache.tika.metadata.Metadata(); + if (resourceName != null) { + md.add(org.apache.tika.metadata.TikaMetadataKeys.RESOURCE_NAME_KEY, resourceName); + } + if (stream.getContentType() != null) { + md.add(org.apache.tika.metadata.HttpHeaders.CONTENT_TYPE, stream.getContentType()); + md.add(ExtractingMetadataConstants.STREAM_CONTENT_TYPE, stream.getContentType()); + } if (charset != null) { - metadata.add(HttpHeaders.CONTENT_ENCODING, charset); + md.add(org.apache.tika.metadata.HttpHeaders.CONTENT_ENCODING, charset); + } + if (stream.getName() != null) { + md.add(ExtractingMetadataConstants.STREAM_NAME, stream.getName()); + } + if (stream.getSourceInfo() != null) { + md.add(ExtractingMetadataConstants.STREAM_SOURCE_INFO, stream.getSourceInfo()); + } + if (stream.getSize() != null) { + md.add(ExtractingMetadataConstants.STREAM_SIZE, String.valueOf(stream.getSize())); } - String xpathExpr = params.get(ExtractingParams.XPATH_EXPRESSION); - boolean extractOnly = params.getBool(ExtractingParams.EXTRACT_ONLY, false); - SolrContentHandler 
handler = - factory.createSolrContentHandler(metadata, params, req.getSchema()); - ContentHandler parsingHandler = handler; + org.apache.tika.parser.Parser parser; + if (streamType != null) { + org.apache.tika.mime.MediaType mt = + org.apache.tika.mime.MediaType.parse(streamType.trim().toLowerCase(Locale.ROOT)); + parser = + new org.apache.tika.parser.DefaultParser(config.getMediaTypeRegistry()) + .getParsers() + .get(mt); + } else { + parser = autoDetectParser; + } + if (parser == null) { + throw new IllegalArgumentException("No Tika parser for stream type: " + streamType); + } - StringWriter writer = null; - BaseMarkupSerializer serializer = null; - if (extractOnly == true) { - String extractFormat = params.get(ExtractingParams.EXTRACT_FORMAT, "xml"); - writer = new StringWriter(); - if (extractFormat.equals(TEXT_FORMAT)) { - serializer = new TextSerializer(); - serializer.setOutputCharStream(writer); - serializer.setOutputFormat(new OutputFormat("Text", "UTF-8", true)); - } else { - serializer = new XMLSerializer(writer, new OutputFormat("XML", "UTF-8", true)); + org.apache.tika.parser.ParseContext context = parseContextConfig.create(); + context.set(org.apache.tika.parser.Parser.class, parser); + context.set( + org.apache.tika.parser.html.HtmlMapper.class, MostlyPassthroughHtmlMapper.INSTANCE); + RegexRulesPasswordProvider pwd = new RegexRulesPasswordProvider(); + String explicitPwd = params.get(ExtractingParams.RESOURCE_PASSWORD); + if (explicitPwd != null) pwd.setExplicitPassword(explicitPwd); + String passwordsFile = params.get("passwordsFile"); + if (passwordsFile != null) { + try (java.io.InputStream is = core.getResourceLoader().openResource(passwordsFile)) { + pwd.parse(is); } + } + context.set(org.apache.tika.parser.PasswordProvider.class, pwd); + + if (extractOnly) { + String extractFormat = params.get(ExtractingParams.EXTRACT_FORMAT, XML_FORMAT); + if (xpathExpr != null) { - Matcher matcher = PARSER.parse(xpathExpr); - serializer - .startDocument(); 
// The MatchingContentHandler does not invoke startDocument. See - // https://lists.apache.org/thread.html/5ec63e104e564a2363e45f74d5aced6520b7d32b4b625762ef56cb86%401226775505%40%3Cdev.tika.apache.org%3E - parsingHandler = new MatchingContentHandler(serializer, matcher); - } else { - parsingHandler = serializer; + // Always return text when xpath is provided, matching legacy behavior + org.apache.tika.sax.ToTextContentHandler textHandler = + new org.apache.tika.sax.ToTextContentHandler(); + org.apache.tika.sax.xpath.Matcher matcher = PARSER.parse(xpathExpr); + org.xml.sax.ContentHandler ch = + new org.apache.tika.sax.xpath.MatchingContentHandler(textHandler, matcher); + try { + parser.parse(inputStream, ch, md, context); + } catch (Exception e) { + if (ignoreTikaException) { + if (log.isWarnEnabled()) + log.warn("skip extracting text due to {}.", e.getLocalizedMessage(), e); + return; + } else { + throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e); + } + } + rsp.add(stream.getName(), textHandler.toString()); + + } else if (XML_FORMAT.equals(extractFormat)) { + org.apache.tika.sax.ToXMLContentHandler toXml = + new org.apache.tika.sax.ToXMLContentHandler(); + org.xml.sax.ContentHandler ch = toXml; + if (xpathExpr != null) { + org.apache.tika.sax.xpath.Matcher matcher = PARSER.parse(xpathExpr); + ch = new org.apache.tika.sax.xpath.MatchingContentHandler(toXml, matcher); + } + try { + parser.parse(inputStream, ch, md, context); + } catch (Exception e) { + if (ignoreTikaException) { + if (log.isWarnEnabled()) + log.warn("skip extracting text due to {}.", e.getLocalizedMessage(), e); + return; + } else { + throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e); + } + } + String xml = toXml.toString(); + if (!xml.startsWith("\n" + xml; + } + rsp.add(stream.getName(), xml); + } else { // TEXT_FORMAT + org.apache.tika.sax.ToTextContentHandler textHandler = + new org.apache.tika.sax.ToTextContentHandler(); + try { + if (xpathExpr != null) { + 
org.apache.tika.sax.xpath.Matcher matcher = PARSER.parse(xpathExpr); + org.xml.sax.ContentHandler ch = + new org.apache.tika.sax.xpath.MatchingContentHandler(textHandler, matcher); + parser.parse(inputStream, ch, md, context); + } else { + parser.parse(inputStream, textHandler, md, context); + } + } catch (Exception e) { + if (ignoreTikaException) { + if (log.isWarnEnabled()) + log.warn("skip extracting text due to {}.", e.getLocalizedMessage(), e); + return; + } else { + throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e); + } + } + rsp.add(stream.getName(), textHandler.toString()); } - } else if (xpathExpr != null) { - Matcher matcher = PARSER.parse(xpathExpr); - parsingHandler = new MatchingContentHandler(handler, matcher); - } // else leave it as is - try { - // potentially use a wrapper handler for parsing, but we still need the SolrContentHandler - // for getting the document. - ParseContext context = parseContextConfig.create(); - - context.set(Parser.class, parser); - context.set(HtmlMapper.class, MostlyPassthroughHtmlMapper.INSTANCE); - - // Password handling - RegexRulesPasswordProvider epp = new RegexRulesPasswordProvider(); - String pwMapFile = params.get(ExtractingParams.PASSWORD_MAP_FILE); - if (pwMapFile != null && pwMapFile.length() > 0) { - InputStream is = req.getCore().getResourceLoader().openResource(pwMapFile); - if (is != null) { - log.debug("Password file supplied: {}", pwMapFile); - epp.parse(is); + // Add metadata to the response + NamedList metadataNL = new NamedList<>(); + for (String name : md.names()) { + String[] vals = md.getValues(name); + metadataNL.add(name, vals); + } + rsp.add(stream.getName() + "_metadata", metadataNL); + } else { + // Indexing with capture/captureAttr etc. 
+ SimpleExtractionMetadata neutral = new SimpleExtractionMetadata(); + SolrContentHandler handler = + factory.createSolrContentHandler(neutral, params, req.getSchema()); + try { + parser.parse(inputStream, handler, md, context); + } catch (Exception e) { + if (ignoreTikaException) { + if (log.isWarnEnabled()) + log.warn("skip extracting text due to {}.", e.getLocalizedMessage(), e); + // Index a document with literals only (no extracted content/metadata) + addDoc(handler); + return; + } else { + throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e); } } - context.set(PasswordProvider.class, epp); - String resourcePassword = params.get(ExtractingParams.RESOURCE_PASSWORD); - if (resourcePassword != null) { - epp.setExplicitPassword(resourcePassword); - log.debug("Literal password supplied for file {}", resourceName); + // After parsing, transfer metadata into neutral and index + for (String name : md.names()) { + String[] vals = md.getValues(name); + if (vals != null) { + for (String v : vals) neutral.add(name, v); + } } - parser.parse(inputStream, parsingHandler, metadata, context); - } catch (TikaException e) { + addDoc(handler); + } + } else { + // Default backend-neutral path + ExtractionResult result; + try { + result = backend.extract(inputStream, extractionRequest); + } catch (Exception e) { if (ignoreTikaException) { if (log.isWarnEnabled()) { - log.warn( - "skip extracting text due to {}. 
metadata={}", - e.getLocalizedMessage(), - metadata, - e); + log.warn("skip extracting text due to {}.", e.getLocalizedMessage(), e); } + // Index a document with literals only (no extracted content/metadata) + SolrContentHandler handler = + factory.createSolrContentHandler( + new SimpleExtractionMetadata(), params, req.getSchema()); + addDoc(handler); + return; } else { throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e); } } + + ExtractionMetadata metadata = result.getMetadata(); + String content = result.getContent(); + if (extractOnly == false) { + SolrContentHandler handler = + factory.createSolrContentHandler(metadata, params, req.getSchema()); + handler.appendToContent(content); addDoc(handler); } else { - // serializer is not null, so we need to call endDoc on it if using xpath if (xpathExpr != null) { - serializer.endDocument(); + throw new SolrException( + SolrException.ErrorCode.BAD_REQUEST, + "XPath filtering is not supported with the backend-neutral extraction API."); + } + String extractFormat = params.get(ExtractingParams.EXTRACT_FORMAT, "xml"); + String out; + if (extractFormat.equals(TEXT_FORMAT)) { + out = content != null ? content : ""; + } else { + // wrap content in basic XML with CDATA to avoid escaping + String safe = content == null ? "" : content.replace("]]>", "]]]]>\u003c![CDATA[>"); + out = ""; } - rsp.add(stream.getName(), writer.toString()); - writer.close(); + rsp.add(stream.getName(), out); String[] names = metadata.names(); NamedList metadataNL = new NamedList<>(); for (int i = 0; i < names.length; i++) { @@ -246,17 +355,7 @@ public void load( } rsp.add(stream.getName() + "_metadata", metadataNL); } - } catch (SAXException e) { - throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e); } - } else { - throw new SolrException( - SolrException.ErrorCode.BAD_REQUEST, - "Stream type of " - + streamType - + " didn't match any known parsers. 
Please supply the " - + ExtractingParams.STREAM_TYPE - + " parameter."); } } diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java index 6caef96cf62..45449f31929 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java @@ -44,6 +44,7 @@ public class ExtractingRequestHandler extends ContentStreamHandlerBase protected ParseContextConfig parseContextConfig; protected SolrContentHandlerFactory factory; + protected ExtractionBackend backend; @Override public PermissionNameProvider.Name getPermissionName(AuthorizationContext request) { @@ -82,6 +83,19 @@ public void inform(SolrCore core) { } factory = createFactory(); + + // Choose backend implementation + String backendName = (String) initArgs.get("extraction.backend"); + if (backendName == null + || backendName.trim().isEmpty() + || backendName.equalsIgnoreCase("local")) { + backend = new LocalTikaExtractionBackend(config, parseContextConfig); + } else if (backendName.equalsIgnoreCase("dummy")) { + backend = new DummyExtractionBackend(); + } else { + // Fallback to local if unknown + backend = new LocalTikaExtractionBackend(config, parseContextConfig); + } } protected SolrContentHandlerFactory createFactory() { @@ -90,7 +104,19 @@ protected SolrContentHandlerFactory createFactory() { @Override protected ContentStreamLoader newLoader(SolrQueryRequest req, UpdateRequestProcessor processor) { - return new ExtractingDocumentLoader(req, processor, config, parseContextConfig, factory); + // Allow per-request override of backend via request param "extraction.backend" + ExtractionBackend backendToUse = this.backend; + String backendParam = req.getParams().get("extraction.backend"); + if (backendParam != null) { + if 
(backendParam.equalsIgnoreCase("dummy")) { + backendToUse = new DummyExtractionBackend(); + } else if (backendParam.equalsIgnoreCase("local")) { + backendToUse = new LocalTikaExtractionBackend(config, parseContextConfig); + } + // unknown values fall back to the handler-configured backend + } + return new ExtractingDocumentLoader( + req, processor, config, parseContextConfig, factory, backendToUse); } // ////////////////////// SolrInfoMBeans methods ////////////////////// diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackend.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackend.java new file mode 100644 index 00000000000..e4758336383 --- /dev/null +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackend.java @@ -0,0 +1,31 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.handler.extraction; + +import java.io.InputStream; + +/** Strategy interface for content extraction backends. */ +public interface ExtractionBackend { + /** + * Extract plain text and metadata from the inputStream. Implementations should not close the + * inputStream. 
This API is backend-neutral and does not expose SAX or XML-specific types. + */ + ExtractionResult extract(InputStream inputStream, ExtractionRequest request) throws Exception; + + /** A short name for debugging/config, e.g., "local" or "dummy". */ + String name(); +} diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionMetadata.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionMetadata.java new file mode 100644 index 00000000000..b5864ec05c3 --- /dev/null +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionMetadata.java @@ -0,0 +1,31 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.handler.extraction; + +/** + * Neutral metadata container used by extraction backends. Provides minimal operations needed by + * SolrContentHandler and response building without depending on Apache Tika's Metadata class. 
+ */ +public interface ExtractionMetadata { + void add(String name, String value); + + String[] getValues(String name); + + String get(String name); + + String[] names(); +} diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionRequest.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionRequest.java new file mode 100644 index 00000000000..4a72e89e4b0 --- /dev/null +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionRequest.java @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.handler.extraction; + +/** Immutable request info needed by extraction backends. 
*/ +public class ExtractionRequest { + public final String streamType; // explicit MIME type (optional) + public final String resourceName; // filename hint + public final String contentType; // HTTP content-type header + public final String charset; // derived charset if available + public final String streamName; + public final String streamSourceInfo; + public final Long streamSize; + public final String resourcePassword; // optional password for encrypted docs + + public ExtractionRequest( + String streamType, + String resourceName, + String contentType, + String charset, + String streamName, + String streamSourceInfo, + Long streamSize, + String resourcePassword) { + this.streamType = streamType; + this.resourceName = resourceName; + this.contentType = contentType; + this.charset = charset; + this.streamName = streamName; + this.streamSourceInfo = streamSourceInfo; + this.streamSize = streamSize; + this.resourcePassword = resourcePassword; + } +} diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionResult.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionResult.java new file mode 100644 index 00000000000..97767d15367 --- /dev/null +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionResult.java @@ -0,0 +1,38 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.handler.extraction; + +/** Immutable extraction result with plain text content and neutral metadata. */ +public final class ExtractionResult { + private final String content; + private final ExtractionMetadata metadata; + + public ExtractionResult(String content, ExtractionMetadata metadata) { + this.content = content == null ? "" : content; + this.metadata = metadata; + } + + /** Extracted textual content (plain text). */ + public String getContent() { + return content; + } + + /** Extracted metadata in neutral, backend-agnostic form. */ + public ExtractionMetadata getMetadata() { + return metadata; + } +} diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/LocalTikaExtractionBackend.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/LocalTikaExtractionBackend.java new file mode 100644 index 00000000000..85fef5b7252 --- /dev/null +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/LocalTikaExtractionBackend.java @@ -0,0 +1,118 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.handler.extraction; + +import java.io.InputStream; +import java.util.Locale; +import org.apache.tika.config.TikaConfig; +import org.apache.tika.metadata.HttpHeaders; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaMetadataKeys; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AutoDetectParser; +import org.apache.tika.parser.DefaultParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.apache.tika.parser.PasswordProvider; +import org.apache.tika.parser.html.HtmlMapper; +import org.apache.tika.sax.BodyContentHandler; + +/** + * Extraction backend using local in-process Apache Tika. This encapsulates the previous direct + * usage of Tika from the loader. 
+ */ +public class LocalTikaExtractionBackend implements ExtractionBackend { + private final TikaConfig tikaConfig; + private final ParseContextConfig parseContextConfig; + private final AutoDetectParser autoDetectParser; + + public LocalTikaExtractionBackend(TikaConfig config, ParseContextConfig parseContextConfig) { + this.tikaConfig = config; + this.parseContextConfig = parseContextConfig; + this.autoDetectParser = new AutoDetectParser(config); + } + + @Override + public String name() { + return "local"; + } + + @Override + public ExtractionResult extract(InputStream inputStream, ExtractionRequest request) + throws Exception { + Parser parser = null; + if (request.streamType != null) { + MediaType mt = MediaType.parse(request.streamType.trim().toLowerCase(Locale.ROOT)); + parser = new DefaultParser(tikaConfig.getMediaTypeRegistry()).getParsers().get(mt); + } else { + parser = autoDetectParser; + } + if (parser == null) { + throw new IllegalArgumentException("No Tika parser for stream type: " + request.streamType); + } + + Metadata md = new Metadata(); + if (request.resourceName != null) { + md.add(TikaMetadataKeys.RESOURCE_NAME_KEY, request.resourceName); + } + if (request.contentType != null) { + md.add(HttpHeaders.CONTENT_TYPE, request.contentType); + } + if (request.streamName != null) { + md.add(ExtractingMetadataConstants.STREAM_NAME, request.streamName); + } + if (request.streamSourceInfo != null) { + md.add(ExtractingMetadataConstants.STREAM_SOURCE_INFO, request.streamSourceInfo); + } + if (request.streamSize != null) { + md.add(ExtractingMetadataConstants.STREAM_SIZE, String.valueOf(request.streamSize)); + } + if (request.contentType != null) { + md.add(ExtractingMetadataConstants.STREAM_CONTENT_TYPE, request.contentType); + } + if (request.charset != null) { + md.add(HttpHeaders.CONTENT_ENCODING, request.charset); + } + + ParseContext context = parseContextConfig.create(); + context.set(Parser.class, parser); + context.set(HtmlMapper.class, 
ExtractingDocumentLoader.MostlyPassthroughHtmlMapper.INSTANCE); + + // Password handling: allow passing explicit and map via params in future if needed. + PasswordProvider epp = new RegexRulesPasswordProvider(); + if (request.resourcePassword != null && epp instanceof RegexRulesPasswordProvider) { + ((RegexRulesPasswordProvider) epp).setExplicitPassword(request.resourcePassword); + } + context.set(PasswordProvider.class, epp); + + BodyContentHandler textHandler = new BodyContentHandler(-1); + parser.parse(inputStream, textHandler, md, context); + + // copy metadata to neutral container + ExtractionMetadata outMetadata = new SimpleExtractionMetadata(); + for (String name : md.names()) { + String[] vals = md.getValues(name); + if (vals != null) { + for (String v : vals) { + outMetadata.add(name, v); + } + } + } + String content = textHandler.toString(); + return new ExtractionResult(content, outMetadata); + } +} diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/SimpleExtractionMetadata.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/SimpleExtractionMetadata.java new file mode 100644 index 00000000000..d414b2eb05b --- /dev/null +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/SimpleExtractionMetadata.java @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.handler.extraction; + +import java.util.ArrayList; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; + +/** Simple in-memory implementation of ExtractionMetadata. */ +public class SimpleExtractionMetadata implements ExtractionMetadata { + private final Map> map = new LinkedHashMap<>(); + + @Override + public void add(String name, String value) { + if (name == null || value == null) return; + map.computeIfAbsent(name, k -> new ArrayList<>()).add(value); + } + + @Override + public String[] getValues(String name) { + List vals = map.get(name); + if (vals == null) return new String[0]; + return vals.toArray(new String[0]); + } + + @Override + public String get(String name) { + List vals = map.get(name); + if (vals == null || vals.isEmpty()) return null; + return vals.get(0); + } + + @Override + public String[] names() { + return map.keySet().toArray(new String[0]); + } +} diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/SolrContentHandler.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/SolrContentHandler.java index 9edba0e925e..22be163c816 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/SolrContentHandler.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/SolrContentHandler.java @@ -30,7 +30,7 @@ import org.apache.solr.common.params.SolrParams; import org.apache.solr.schema.IndexSchema; import org.apache.solr.schema.SchemaField; -import org.apache.tika.metadata.Metadata; +// 
note: decoupled from Tika Metadata import org.apache.tika.metadata.TikaMetadataKeys; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -57,7 +57,7 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara protected final SolrInputDocument document; - protected final Metadata metadata; + protected final ExtractionMetadata metadata; protected final SolrParams params; protected final StringBuilder catchAllBuilder = new StringBuilder(2048); protected final IndexSchema schema; @@ -74,7 +74,7 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara private Set literalFieldNames = null; - public SolrContentHandler(Metadata metadata, SolrParams params, IndexSchema schema) { + public SolrContentHandler(ExtractionMetadata metadata, SolrParams params, IndexSchema schema) { this.document = new SolrInputDocument(); this.metadata = metadata; this.params = params; @@ -152,6 +152,13 @@ protected void addContent() { addField(contentFieldName, catchAllBuilder.toString(), null); } + /** Append pre-extracted plain text content to the catch-all builder. */ + public void appendToContent(String text) { + if (text != null && !text.isEmpty()) { + catchAllBuilder.append(text); + } + } + /** * Add in the literals to the document using the {@link #params} and the {@link #LITERALS_PREFIX}. 
*/ diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/SolrContentHandlerFactory.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/SolrContentHandlerFactory.java index 1070e744d84..b4fe031a068 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/SolrContentHandlerFactory.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/SolrContentHandlerFactory.java @@ -18,7 +18,6 @@ import org.apache.solr.common.params.SolrParams; import org.apache.solr.schema.IndexSchema; -import org.apache.tika.metadata.Metadata; /** */ public class SolrContentHandlerFactory { @@ -26,7 +25,7 @@ public class SolrContentHandlerFactory { public SolrContentHandlerFactory() {} public SolrContentHandler createSolrContentHandler( - Metadata metadata, SolrParams params, IndexSchema schema) { + ExtractionMetadata metadata, SolrParams params, IndexSchema schema) { return new SolrContentHandler(metadata, params, schema); } } diff --git a/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java index 0097b86e818..68426bbc7d2 100644 --- a/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java +++ b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java @@ -38,6 +38,13 @@ /** */ public class ExtractingRequestHandlerTest extends SolrTestCaseJ4 { + static { + // Allow the SecureRandom algorithm used in this environment to avoid class configuration + // failure in tests. + // This mirrors passing -Dtest.solr.allowed.securerandom=NativePRNG at JVM startup. + System.setProperty("test.solr.allowed.securerandom", "NativePRNG"); + } + @BeforeClass public static void beforeClass() throws Exception { // Is the JDK/env affected by a known bug? 
@@ -1142,6 +1149,35 @@ SolrQueryResponse loadLocalFromHandler(String handler, String filename, String.. } } + @Test + public void testDummyBackendExtractOnly() throws Exception { + ExtractingRequestHandler handler = + (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract"); + assertNotNull("handler is null and it shouldn't be", handler); + SolrQueryResponse rsp = + loadLocal( + "extraction/version_control.txt", + "extraction.backend", + "dummy", + ExtractingParams.EXTRACT_ONLY, + "true", + ExtractingParams.EXTRACT_FORMAT, + ExtractingDocumentLoader.TEXT_FORMAT); + assertNotNull("rsp is null and it shouldn't be", rsp); + NamedList list = rsp.getValues(); + String extraction = (String) list.get("version_control.txt"); + assertNotNull("extraction is null and it shouldn't be", extraction); + assertEquals("This is dummy extracted content", extraction); + + NamedList nl = (NamedList) list.get("version_control.txt_metadata"); + assertNotNull("metadata is null and it shouldn't be", nl); + Object dummyFlag = nl.get("Dummy-Backend"); + assertNotNull("Dummy-Backend metadata missing", dummyFlag); + if (dummyFlag instanceof String[]) { + assertEquals("true", ((String[]) dummyFlag)[0]); + } + } + SolrQueryResponse loadLocal(String filename, String... 
args) throws Exception { return loadLocalFromHandler("/update/extract", filename, args); } From 57d8d4ece153cd5fa809a6dc032c84e4d217c679 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=C3=B8ydahl?= Date: Fri, 19 Sep 2025 15:27:02 +0200 Subject: [PATCH 02/47] Move some tika tests to new test file --- .../ExtractingRequestHandlerTest.java | 75 ---------- .../LocalTikaExtractionBackendTest.java | 138 ++++++++++++++++++ 2 files changed, 138 insertions(+), 75 deletions(-) create mode 100644 solr/modules/extraction/src/test/org/apache/solr/handler/extraction/LocalTikaExtractionBackendTest.java diff --git a/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java index 68426bbc7d2..fa23833d918 100644 --- a/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java +++ b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java @@ -796,82 +796,7 @@ public void testArabicPDF() throws Exception { assertQ(req("wdf_nocase:السلم"), "//result[@numFound=1]"); } - @Test - public void testTikaExceptionHandling() throws Exception { - ExtractingRequestHandler handler = - (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract"); - assertNotNull("handler is null and it shouldn't be", handler); - - expectThrows( - Exception.class, - () -> { - loadLocal("extraction/password-is-solrcell.docx", "literal.id", "one"); - }); - assertU(commit()); - assertQ(req("*:*"), "//result[@numFound=0]"); - - try { - loadLocal( - "extraction/password-is-solrcell.docx", - "fmap.created", - "extractedDate", - "fmap.producer", - "extractedProducer", - "fmap.creator", - "extractedCreator", - "fmap.Keywords", - "extractedKeywords", - "fmap.Creation-Date", - "extractedDate", - "uprefix", - "ignored_", - "fmap.Author", - "extractedAuthor", - "fmap.content", - 
"wdf_nocase", - "literal.id", - "one", - "ignoreTikaException", - "true", // set ignore flag - "fmap.Last-Modified", - "extractedDate"); - } catch (Exception e) { - fail("TikaException should be ignored."); - } - assertU(commit()); - assertQ(req("*:*"), "//result[@numFound=1]"); - } - - @Test - public void testWrongStreamType() throws Exception { - ExtractingRequestHandler handler = - (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract"); - assertNotNull("handler is null and it shouldn't be", handler); - expectThrows( - Exception.class, - () -> { - // Load plain text specifying another mime type, should fail - loadLocal( - "extraction/version_control.txt", - "literal.id", - "one", - ExtractingParams.STREAM_TYPE, - "application/pdf"); - }); - - expectThrows( - Exception.class, - () -> { - // Load plain text specifying non existing mimetype, should fail - loadLocal( - "extraction/version_control.txt", - "literal.id", - "one", - ExtractingParams.STREAM_TYPE, - "foo/bar"); - }); - } public void testLiteralsOverride() throws Exception { ExtractingRequestHandler handler = diff --git a/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/LocalTikaExtractionBackendTest.java b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/LocalTikaExtractionBackendTest.java new file mode 100644 index 00000000000..4110713ea66 --- /dev/null +++ b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/LocalTikaExtractionBackendTest.java @@ -0,0 +1,138 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.handler.extraction; + +import java.io.InputStream; +import java.nio.file.Files; +import org.apache.solr.SolrTestCaseJ4; +import org.apache.tika.config.TikaConfig; +import org.junit.BeforeClass; +import org.junit.Test; + +/** Unit tests for LocalTikaExtractionBackend independent of the HTTP handler. */ +public class LocalTikaExtractionBackendTest extends SolrTestCaseJ4 { + + private static TikaConfig tikaConfig; + private static ParseContextConfig parseContextConfig; + + @BeforeClass + public static void setupClass() throws Exception { + try (InputStream is = LocalTikaExtractionBackendTest.class + .getClassLoader() + .getResourceAsStream("solr-default-tika-config.xml")) { + assertNotNull("solr-default-tika-config.xml not on classpath", is); + tikaConfig = new TikaConfig(is); + } + parseContextConfig = new ParseContextConfig(); + } + + private LocalTikaExtractionBackend newBackend() { + return new LocalTikaExtractionBackend(tikaConfig, parseContextConfig); + } + + private ExtractionRequest newRequest( + String resourceName, + String streamType, + String contentType, + String charset, + String streamName, + String streamSourceInfo, + Long streamSize, + String resourcePassword) { + return new ExtractionRequest( + streamType, + resourceName, + contentType, + charset, + streamName, + streamSourceInfo, + streamSize, + resourcePassword); + } + + @Test + public void testWrongStreamTypeThrows() throws Exception { + LocalTikaExtractionBackend backend = newBackend(); + try (InputStream in = 
Files.newInputStream(getFile("extraction/version_control.txt"))) { + // Non-existing type -> no parser available + ExtractionRequest req = newRequest( + "version_control.txt", + "foo/bar", + null, + null, + "version_control.txt", + null, + null, + null); + expectThrows(IllegalArgumentException.class, () -> backend.extract(in, req)); + } + + try (InputStream in = Files.newInputStream(getFile("extraction/version_control.txt"))) { + // Wrong but existing type -> likely to fail when parsing + ExtractionRequest req = newRequest( + "version_control.txt", + "application/pdf", + null, + null, + "version_control.txt", + null, + null, + null); + expectThrows(Exception.class, () -> backend.extract(in, req)); + } + } + + @Test + public void testPasswordProtectedDocxWithoutPasswordThrows() throws Exception { + LocalTikaExtractionBackend backend = newBackend(); + try (InputStream in = Files.newInputStream(getFile("extraction/password-is-Word2010.docx"))) { + ExtractionRequest req = newRequest( + "password-is-Word2010.docx", + null, + null, + null, + "password-is-Word2010.docx", + null, + null, + null); + expectThrows(Exception.class, () -> backend.extract(in, req)); + } + } + + @Test + public void testPasswordProtectedDocxWithPasswordSucceeds() throws Exception { + LocalTikaExtractionBackend backend = newBackend(); + try (InputStream in = Files.newInputStream(getFile("extraction/password-is-Word2010.docx"))) { + ExtractionRequest req = newRequest( + "password-is-Word2010.docx", + null, + null, + null, + "password-is-Word2010.docx", + null, + null, + "Word2010"); + ExtractionResult res = backend.extract(in, req); + assertNotNull(res); + assertNotNull(res.getMetadata()); + String content = res.getContent(); + assertNotNull(content); + assertTrue("Content should mention password-protected doc text", + content.contains("Test password protected word doc")); + } + } +} From dc151c5999e948920711ac2d5b7e101e7aa6aebe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=C3=B8ydahl?= Date: 
Fri, 19 Sep 2025 15:42:32 +0200 Subject: [PATCH 03/47] ExtractingRequestHandler and ExtractingDocumentLoader not depend on Tika API Refactor some tests to LocalTikaExtractionBackendTest --- .../extraction/DummyExtractionBackend.java | 25 ++ .../extraction/ExtractingDocumentLoader.java | 316 ++++-------------- .../extraction/ExtractingRequestHandler.java | 59 ++-- .../handler/extraction/ExtractionBackend.java | 21 ++ .../handler/extraction/ExtractionRequest.java | 6 +- .../LocalTikaExtractionBackend.java | 195 ++++++++--- .../ExtractingRequestHandlerTest.java | 2 - .../LocalTikaExtractionBackendTest.java | 89 ++--- 8 files changed, 348 insertions(+), 365 deletions(-) diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/DummyExtractionBackend.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/DummyExtractionBackend.java index ddaefadf5d2..c9cdf724ef2 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/DummyExtractionBackend.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/DummyExtractionBackend.java @@ -38,4 +38,29 @@ public ExtractionResult extract(InputStream inputStream, ExtractionRequest reque String text = "This is dummy extracted content"; return new ExtractionResult(text, metadata); } + + @Override + public ExtractionResult extractOnly( + InputStream inputStream, ExtractionRequest request, String extractFormat, String xpathExpr) { + if (xpathExpr != null) { + throw new UnsupportedOperationException("XPath not supported by dummy backend"); + } + return extract(inputStream, request); + } + + @Override + public void parseToSolrContentHandler( + InputStream inputStream, + ExtractionRequest request, + SolrContentHandler handler, + ExtractionMetadata outMetadata) { + // Fill metadata + ExtractionResult r = extract(inputStream, request); + for (String name : r.getMetadata().names()) { + String[] vals = r.getMetadata().getValues(name); + if (vals != null) for 
(String v : vals) outMetadata.add(name, v); + } + // Append content + handler.appendToContent(r.getContent()); + } } diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java index b60ac3ac9c6..b6a74008ff5 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java @@ -19,7 +19,8 @@ import java.io.IOException; import java.io.InputStream; import java.lang.invoke.MethodHandles; -import java.util.Locale; +import java.util.LinkedHashMap; +import java.util.regex.Pattern; import org.apache.solr.common.SolrException; import org.apache.solr.common.params.SolrParams; import org.apache.solr.common.params.UpdateParams; @@ -32,11 +33,6 @@ import org.apache.solr.response.SolrQueryResponse; import org.apache.solr.update.AddUpdateCommand; import org.apache.solr.update.processor.UpdateRequestProcessor; -import org.apache.tika.config.TikaConfig; -import org.apache.tika.parser.AutoDetectParser; -import org.apache.tika.parser.html.HtmlMapper; -import org.apache.tika.sax.XHTMLContentHandler; -import org.apache.tika.sax.xpath.XPathParser; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -51,41 +47,29 @@ public class ExtractingDocumentLoader extends ContentStreamLoader { /** Extract Only supported format. Default */ public static final String XML_FORMAT = "xml"; - /** XHTML XPath parser. 
*/ - private static final XPathParser PARSER = new XPathParser("xhtml", XHTMLContentHandler.XHTML); - final SolrCore core; final SolrParams params; final UpdateRequestProcessor processor; final boolean ignoreTikaException; - protected AutoDetectParser autoDetectParser; private final AddUpdateCommand templateAdd; - protected TikaConfig config; - protected ParseContextConfig parseContextConfig; protected SolrContentHandlerFactory factory; protected ExtractionBackend backend; public ExtractingDocumentLoader( SolrQueryRequest req, UpdateRequestProcessor processor, - TikaConfig config, - ParseContextConfig parseContextConfig, SolrContentHandlerFactory factory, ExtractionBackend backend) { this.params = req.getParams(); this.core = req.getCore(); - this.config = config; - this.parseContextConfig = parseContextConfig; this.processor = processor; templateAdd = new AddUpdateCommand(req); templateAdd.overwrite = params.getBool(UpdateParams.OVERWRITE, true); templateAdd.commitWithin = params.getInt(UpdateParams.COMMIT_WITHIN, -1); - // this is lightweight - autoDetectParser = new AutoDetectParser(config); this.factory = factory; this.backend = backend; @@ -111,17 +95,23 @@ public void load( UpdateRequestProcessor processor) throws Exception { String streamType = req.getParams().get(ExtractingParams.STREAM_TYPE, null); - // If you specify the resource name (the filename, roughly) with this parameter, - // some backends can make use of it in guessing the appropriate MIME type: String resourceName = req.getParams().get(ExtractingParams.RESOURCE_NAME, null); try (InputStream inputStream = stream.getStream()) { - // HtmlParser and TXTParser regard Metadata.CONTENT_ENCODING in metadata String charset = ContentStreamBase.getCharsetFromContentType(stream.getContentType()); String xpathExpr = params.get(ExtractingParams.XPATH_EXPRESSION); boolean extractOnly = params.getBool(ExtractingParams.EXTRACT_ONLY, false); + // Parse optional passwords file into a map (keeps Tika usages out of 
this class) + LinkedHashMap pwMap = null; + String passwordsFile = params.get("passwordsFile"); + if (passwordsFile != null) { + try (java.io.InputStream is = core.getResourceLoader().openResource(passwordsFile)) { + pwMap = RegexRulesPasswordProvider.parseRulesFile(is); + } + } + ExtractionRequest extractionRequest = new ExtractionRequest( streamType, @@ -131,10 +121,9 @@ public void load( stream.getName(), stream.getSourceInfo(), stream.getSize(), - params.get(ExtractingParams.RESOURCE_PASSWORD, null)); + params.get(ExtractingParams.RESOURCE_PASSWORD, null), + pwMap); - // Determine if we must use the legacy SAX/XHTML pipeline (needed for - // capture/xpath/extractOnly) boolean captureAttr = params.getBool(ExtractingParams.CAPTURE_ATTRIBUTES, false); String[] captureElems = params.getParams(ExtractingParams.CAPTURE_ELEMENTS); boolean needLegacySax = @@ -142,251 +131,84 @@ public void load( || xpathExpr != null || captureAttr || (captureElems != null && captureElems.length > 0) - || (params.get(ExtractingParams.RESOURCE_PASSWORD) != null); - - if (backend instanceof LocalTikaExtractionBackend) { - // Use in-process Tika and SAX pipeline to preserve legacy behavior & test expectations - org.apache.tika.metadata.Metadata md = new org.apache.tika.metadata.Metadata(); - if (resourceName != null) { - md.add(org.apache.tika.metadata.TikaMetadataKeys.RESOURCE_NAME_KEY, resourceName); - } - if (stream.getContentType() != null) { - md.add(org.apache.tika.metadata.HttpHeaders.CONTENT_TYPE, stream.getContentType()); - md.add(ExtractingMetadataConstants.STREAM_CONTENT_TYPE, stream.getContentType()); - } - if (charset != null) { - md.add(org.apache.tika.metadata.HttpHeaders.CONTENT_ENCODING, charset); - } - if (stream.getName() != null) { - md.add(ExtractingMetadataConstants.STREAM_NAME, stream.getName()); - } - if (stream.getSourceInfo() != null) { - md.add(ExtractingMetadataConstants.STREAM_SOURCE_INFO, stream.getSourceInfo()); - } - if (stream.getSize() != null) { - 
md.add(ExtractingMetadataConstants.STREAM_SIZE, String.valueOf(stream.getSize())); - } + || (params.get(ExtractingParams.RESOURCE_PASSWORD) != null) + || (passwordsFile != null); - org.apache.tika.parser.Parser parser; - if (streamType != null) { - org.apache.tika.mime.MediaType mt = - org.apache.tika.mime.MediaType.parse(streamType.trim().toLowerCase(Locale.ROOT)); - parser = - new org.apache.tika.parser.DefaultParser(config.getMediaTypeRegistry()) - .getParsers() - .get(mt); - } else { - parser = autoDetectParser; - } - if (parser == null) { - throw new IllegalArgumentException("No Tika parser for stream type: " + streamType); - } - - org.apache.tika.parser.ParseContext context = parseContextConfig.create(); - context.set(org.apache.tika.parser.Parser.class, parser); - context.set( - org.apache.tika.parser.html.HtmlMapper.class, MostlyPassthroughHtmlMapper.INSTANCE); - RegexRulesPasswordProvider pwd = new RegexRulesPasswordProvider(); - String explicitPwd = params.get(ExtractingParams.RESOURCE_PASSWORD); - if (explicitPwd != null) pwd.setExplicitPassword(explicitPwd); - String passwordsFile = params.get("passwordsFile"); - if (passwordsFile != null) { - try (java.io.InputStream is = core.getResourceLoader().openResource(passwordsFile)) { - pwd.parse(is); - } - } - context.set(org.apache.tika.parser.PasswordProvider.class, pwd); - - if (extractOnly) { - String extractFormat = params.get(ExtractingParams.EXTRACT_FORMAT, XML_FORMAT); - - if (xpathExpr != null) { - // Always return text when xpath is provided, matching legacy behavior - org.apache.tika.sax.ToTextContentHandler textHandler = - new org.apache.tika.sax.ToTextContentHandler(); - org.apache.tika.sax.xpath.Matcher matcher = PARSER.parse(xpathExpr); - org.xml.sax.ContentHandler ch = - new org.apache.tika.sax.xpath.MatchingContentHandler(textHandler, matcher); - try { - parser.parse(inputStream, ch, md, context); - } catch (Exception e) { - if (ignoreTikaException) { - if (log.isWarnEnabled()) - 
log.warn("skip extracting text due to {}.", e.getLocalizedMessage(), e); - return; - } else { - throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e); - } - } - rsp.add(stream.getName(), textHandler.toString()); - - } else if (XML_FORMAT.equals(extractFormat)) { - org.apache.tika.sax.ToXMLContentHandler toXml = - new org.apache.tika.sax.ToXMLContentHandler(); - org.xml.sax.ContentHandler ch = toXml; - if (xpathExpr != null) { - org.apache.tika.sax.xpath.Matcher matcher = PARSER.parse(xpathExpr); - ch = new org.apache.tika.sax.xpath.MatchingContentHandler(toXml, matcher); - } - try { - parser.parse(inputStream, ch, md, context); - } catch (Exception e) { - if (ignoreTikaException) { - if (log.isWarnEnabled()) - log.warn("skip extracting text due to {}.", e.getLocalizedMessage(), e); - return; - } else { - throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e); - } - } - String xml = toXml.toString(); - if (!xml.startsWith("\n" + xml; - } - rsp.add(stream.getName(), xml); - } else { // TEXT_FORMAT - org.apache.tika.sax.ToTextContentHandler textHandler = - new org.apache.tika.sax.ToTextContentHandler(); - try { - if (xpathExpr != null) { - org.apache.tika.sax.xpath.Matcher matcher = PARSER.parse(xpathExpr); - org.xml.sax.ContentHandler ch = - new org.apache.tika.sax.xpath.MatchingContentHandler(textHandler, matcher); - parser.parse(inputStream, ch, md, context); - } else { - parser.parse(inputStream, textHandler, md, context); - } - } catch (Exception e) { - if (ignoreTikaException) { - if (log.isWarnEnabled()) - log.warn("skip extracting text due to {}.", e.getLocalizedMessage(), e); - return; - } else { - throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e); - } - } - rsp.add(stream.getName(), textHandler.toString()); - } - - // Add metadata to the response + if (extractOnly) { + String extractFormat = params.get(ExtractingParams.EXTRACT_FORMAT, XML_FORMAT); + try { + ExtractionResult result = + backend.extractOnly(inputStream, 
extractionRequest, extractFormat, xpathExpr); + // Write content + rsp.add(stream.getName(), result.getContent()); + // Write metadata NamedList metadataNL = new NamedList<>(); - for (String name : md.names()) { - String[] vals = md.getValues(name); - metadataNL.add(name, vals); + for (String name : result.getMetadata().names()) { + metadataNL.add(name, result.getMetadata().getValues(name)); } rsp.add(stream.getName() + "_metadata", metadataNL); - } else { - // Indexing with capture/captureAttr etc. - SimpleExtractionMetadata neutral = new SimpleExtractionMetadata(); - SolrContentHandler handler = - factory.createSolrContentHandler(neutral, params, req.getSchema()); - try { - parser.parse(inputStream, handler, md, context); - } catch (Exception e) { - if (ignoreTikaException) { - if (log.isWarnEnabled()) - log.warn("skip extracting text due to {}.", e.getLocalizedMessage(), e); - // Index a document with literals only (no extracted content/metadata) - addDoc(handler); - return; - } else { - throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e); - } - } - // After parsing, transfer metadata into neutral and index - for (String name : md.names()) { - String[] vals = md.getValues(name); - if (vals != null) { - for (String v : vals) neutral.add(name, v); - } + } catch (UnsupportedOperationException uoe) { + // For backends that don't support xpath + throw new SolrException( + SolrException.ErrorCode.BAD_REQUEST, + "XPath filtering is not supported by backend '" + backend.name() + "'."); + } catch (Exception e) { + if (ignoreTikaException) { + if (log.isWarnEnabled()) + log.warn("skip extracting text due to {}.", e.getLocalizedMessage(), e); + return; } - addDoc(handler); + throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e); } - } else { - // Default backend-neutral path - ExtractionResult result; + return; + } + + if (needLegacySax) { + // Indexing with capture/xpath/etc: delegate SAX parse to backend + SimpleExtractionMetadata neutral = new 
SimpleExtractionMetadata(); + SolrContentHandler handler = + factory.createSolrContentHandler(neutral, params, req.getSchema()); try { - result = backend.extract(inputStream, extractionRequest); + backend.parseToSolrContentHandler(inputStream, extractionRequest, handler, neutral); } catch (Exception e) { if (ignoreTikaException) { - if (log.isWarnEnabled()) { + if (log.isWarnEnabled()) log.warn("skip extracting text due to {}.", e.getLocalizedMessage(), e); - } // Index a document with literals only (no extracted content/metadata) - SolrContentHandler handler = - factory.createSolrContentHandler( - new SimpleExtractionMetadata(), params, req.getSchema()); addDoc(handler); return; - } else { - throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e); } + throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e); } + addDoc(handler); + return; + } - ExtractionMetadata metadata = result.getMetadata(); - String content = result.getContent(); - - if (extractOnly == false) { + // Default simple backend-neutral path + ExtractionResult result; + try { + result = backend.extract(inputStream, extractionRequest); + } catch (Exception e) { + if (ignoreTikaException) { + if (log.isWarnEnabled()) + log.warn("skip extracting text due to {}.", e.getLocalizedMessage(), e); + // Index a document with literals only (no extracted content/metadata) SolrContentHandler handler = - factory.createSolrContentHandler(metadata, params, req.getSchema()); - handler.appendToContent(content); + factory.createSolrContentHandler( + new SimpleExtractionMetadata(), params, req.getSchema()); addDoc(handler); - } else { - if (xpathExpr != null) { - throw new SolrException( - SolrException.ErrorCode.BAD_REQUEST, - "XPath filtering is not supported with the backend-neutral extraction API."); - } - String extractFormat = params.get(ExtractingParams.EXTRACT_FORMAT, "xml"); - String out; - if (extractFormat.equals(TEXT_FORMAT)) { - out = content != null ? 
content : ""; - } else { - // wrap content in basic XML with CDATA to avoid escaping - String safe = content == null ? "" : content.replace("]]>", "]]]]>\u003c![CDATA[>"); - out = ""; - } - rsp.add(stream.getName(), out); - String[] names = metadata.names(); - NamedList metadataNL = new NamedList<>(); - for (int i = 0; i < names.length; i++) { - String[] vals = metadata.getValues(names[i]); - metadataNL.add(names[i], vals); - } - rsp.add(stream.getName() + "_metadata", metadataNL); + return; } + throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e); } - } - } - - public static class MostlyPassthroughHtmlMapper implements HtmlMapper { - public static final HtmlMapper INSTANCE = new MostlyPassthroughHtmlMapper(); - /** - * Keep all elements and their content. - * - *

Apparently <SCRIPT> and <STYLE> elements are blocked elsewhere - */ - @Override - public boolean isDiscardElement(String name) { - return false; - } - - /** Lowercases the attribute name */ - @Override - public String mapSafeAttribute(String elementName, String attributeName) { - return attributeName.toLowerCase(Locale.ENGLISH); - } + ExtractionMetadata metadata = result.getMetadata(); + String content = result.getContent(); - /** - * Lowercases the element name, but returns null for <BR>, which suppresses the - * start-element event for lt;BR> tags. This also suppresses the <BODY> tags because - * those are handled internally by Tika's XHTMLContentHandler. - */ - @Override - public String mapSafeElement(String name) { - String lowerName = name.toLowerCase(Locale.ROOT); - return (lowerName.equals("br") || lowerName.equals("body")) ? null : lowerName; + SolrContentHandler handler = + factory.createSolrContentHandler(metadata, params, req.getSchema()); + handler.appendToContent(content); + addDoc(handler); } } } diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java index 45449f31929..6250601d6b1 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java @@ -16,8 +16,6 @@ */ package org.apache.solr.handler.extraction; -import java.io.InputStream; -import java.nio.file.Path; import org.apache.solr.common.SolrException; import org.apache.solr.common.SolrException.ErrorCode; import org.apache.solr.core.SolrCore; @@ -28,7 +26,6 @@ import org.apache.solr.security.PermissionNameProvider; import org.apache.solr.update.processor.UpdateRequestProcessor; import org.apache.solr.util.plugin.SolrCoreAware; -import org.apache.tika.config.TikaConfig; /** * Handler for rich 
documents like PDF or Word or any other file format that Tika handles that need @@ -40,7 +37,7 @@ public class ExtractingRequestHandler extends ContentStreamHandlerBase public static final String PARSE_CONTEXT_CONFIG = "parseContext.config"; public static final String CONFIG_LOCATION = "tika.config"; - protected TikaConfig config; + protected String tikaConfigLoc; protected ParseContextConfig parseContextConfig; protected SolrContentHandlerFactory factory; @@ -54,22 +51,8 @@ public PermissionNameProvider.Name getPermissionName(AuthorizationContext reques @Override public void inform(SolrCore core) { try { - String tikaConfigLoc = (String) initArgs.get(CONFIG_LOCATION); - if (tikaConfigLoc == null) { // default - ClassLoader classLoader = core.getResourceLoader().getClassLoader(); - try (InputStream is = classLoader.getResourceAsStream("solr-default-tika-config.xml")) { - config = new TikaConfig(is); - } - } else { - Path configFile = Path.of(tikaConfigLoc); - if (configFile.isAbsolute()) { - config = new TikaConfig(configFile); - } else { // in conf/ - try (InputStream is = core.getResourceLoader().openResource(tikaConfigLoc)) { - config = new TikaConfig(is); - } - } - } + // Store tika config location (backend-specific) + this.tikaConfigLoc = (String) initArgs.get(CONFIG_LOCATION); String parseContextConfigLoc = (String) initArgs.get(PARSE_CONTEXT_CONFIG); if (parseContextConfigLoc == null) { // default: @@ -79,22 +62,27 @@ public void inform(SolrCore core) { new ParseContextConfig(core.getResourceLoader(), parseContextConfigLoc); } } catch (Exception e) { - throw new SolrException(ErrorCode.SERVER_ERROR, "Unable to load Tika Config", e); + throw new SolrException( + ErrorCode.SERVER_ERROR, "Unable to initialize ExtractingRequestHandler", e); } factory = createFactory(); // Choose backend implementation String backendName = (String) initArgs.get("extraction.backend"); - if (backendName == null - || backendName.trim().isEmpty() - || 
backendName.equalsIgnoreCase("local")) { - backend = new LocalTikaExtractionBackend(config, parseContextConfig); - } else if (backendName.equalsIgnoreCase("dummy")) { - backend = new DummyExtractionBackend(); - } else { - // Fallback to local if unknown - backend = new LocalTikaExtractionBackend(config, parseContextConfig); + try { + if (backendName == null + || backendName.trim().isEmpty() + || backendName.equalsIgnoreCase("local")) { + backend = new LocalTikaExtractionBackend(core, tikaConfigLoc, parseContextConfig); + } else if (backendName.equalsIgnoreCase("dummy")) { + backend = new DummyExtractionBackend(); + } else { + // Fallback to local if unknown + backend = new LocalTikaExtractionBackend(core, tikaConfigLoc, parseContextConfig); + } + } catch (Exception e) { + throw new SolrException(ErrorCode.SERVER_ERROR, "Unable to initialize extraction backend", e); } } @@ -111,12 +99,17 @@ protected ContentStreamLoader newLoader(SolrQueryRequest req, UpdateRequestProce if (backendParam.equalsIgnoreCase("dummy")) { backendToUse = new DummyExtractionBackend(); } else if (backendParam.equalsIgnoreCase("local")) { - backendToUse = new LocalTikaExtractionBackend(config, parseContextConfig); + try { + backendToUse = + new LocalTikaExtractionBackend(req.getCore(), tikaConfigLoc, parseContextConfig); + } catch (Exception e) { + throw new SolrException( + ErrorCode.SERVER_ERROR, "Unable to initialize extraction backend", e); + } } // unknown values fall back to the handler-configured backend } - return new ExtractingDocumentLoader( - req, processor, config, parseContextConfig, factory, backendToUse); + return new ExtractingDocumentLoader(req, processor, factory, backendToUse); } // ////////////////////// SolrInfoMBeans methods ////////////////////// diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackend.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackend.java index e4758336383..3a253dc1ec3 
100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackend.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackend.java @@ -26,6 +26,27 @@ public interface ExtractionBackend { */ ExtractionResult extract(InputStream inputStream, ExtractionRequest request) throws Exception; + /** + * Perform extractOnly operation. If extractFormat equals ExtractingDocumentLoader.TEXT_FORMAT, + * return plain text. If XML, return XML body as string. Implementations may support optional + * xpathExpr; if unsupported and xpathExpr is not null, they should throw + * UnsupportedOperationException. + */ + ExtractionResult extractOnly( + InputStream inputStream, ExtractionRequest request, String extractFormat, String xpathExpr) + throws Exception; + + /** + * Parse the content and stream SAX events into the provided SolrContentHandler, while also + * filling outMetadata with extracted metadata. + */ + void parseToSolrContentHandler( + InputStream inputStream, + ExtractionRequest request, + SolrContentHandler handler, + ExtractionMetadata outMetadata) + throws Exception; + /** A short name for debugging/config, e.g., "local" or "dummy". 
*/ String name(); } diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionRequest.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionRequest.java index 4a72e89e4b0..f1af3029193 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionRequest.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionRequest.java @@ -26,6 +26,8 @@ public class ExtractionRequest { public final String streamSourceInfo; public final Long streamSize; public final String resourcePassword; // optional password for encrypted docs + public final java.util.LinkedHashMap + passwordsMap; // optional passwords map public ExtractionRequest( String streamType, @@ -35,7 +37,8 @@ public ExtractionRequest( String streamName, String streamSourceInfo, Long streamSize, - String resourcePassword) { + String resourcePassword, + java.util.LinkedHashMap passwordsMap) { this.streamType = streamType; this.resourceName = resourceName; this.contentType = contentType; @@ -44,5 +47,6 @@ public ExtractionRequest( this.streamSourceInfo = streamSourceInfo; this.streamSize = streamSize; this.resourcePassword = resourcePassword; + this.passwordsMap = passwordsMap; } } diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/LocalTikaExtractionBackend.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/LocalTikaExtractionBackend.java index 85fef5b7252..315a582ea2a 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/LocalTikaExtractionBackend.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/LocalTikaExtractionBackend.java @@ -17,7 +17,9 @@ package org.apache.solr.handler.extraction; import java.io.InputStream; +import java.nio.file.Path; import java.util.Locale; +import org.apache.solr.core.SolrCore; import org.apache.tika.config.TikaConfig; import 
org.apache.tika.metadata.HttpHeaders; import org.apache.tika.metadata.Metadata; @@ -40,79 +42,190 @@ public class LocalTikaExtractionBackend implements ExtractionBackend { private final ParseContextConfig parseContextConfig; private final AutoDetectParser autoDetectParser; + // Local HtmlMapper moved from ExtractingDocumentLoader + private static class MostlyPassthroughHtmlMapper implements HtmlMapper { + static final HtmlMapper INSTANCE = new MostlyPassthroughHtmlMapper(); + + @Override + public boolean isDiscardElement(String name) { + return false; + } + + @Override + public String mapSafeAttribute(String elementName, String attributeName) { + return attributeName.toLowerCase(java.util.Locale.ENGLISH); + } + + @Override + public String mapSafeElement(String name) { + String lowerName = name.toLowerCase(java.util.Locale.ROOT); + return (lowerName.equals("br") || lowerName.equals("body")) ? null : lowerName; + } + } + public LocalTikaExtractionBackend(TikaConfig config, ParseContextConfig parseContextConfig) { this.tikaConfig = config; this.parseContextConfig = parseContextConfig; this.autoDetectParser = new AutoDetectParser(config); } + /** + * Construct backend by loading TikaConfig based on handler/core configuration without exposing + * Tika types to the handler. 
+ */ + public LocalTikaExtractionBackend( + SolrCore core, String tikaConfigLoc, ParseContextConfig parseContextConfig) throws Exception { + TikaConfig cfg; + if (tikaConfigLoc == null) { // default + ClassLoader classLoader = core.getResourceLoader().getClassLoader(); + try (InputStream is = classLoader.getResourceAsStream("solr-default-tika-config.xml")) { + cfg = new TikaConfig(is); + } + } else { + Path configFile = Path.of(tikaConfigLoc); + if (configFile.isAbsolute()) { + cfg = new TikaConfig(configFile); + } else { // in conf/ + try (InputStream is = core.getResourceLoader().openResource(tikaConfigLoc)) { + cfg = new TikaConfig(is); + } + } + } + this.tikaConfig = cfg; + this.parseContextConfig = parseContextConfig; + this.autoDetectParser = new AutoDetectParser(cfg); + } + @Override public String name() { return "local"; } - @Override - public ExtractionResult extract(InputStream inputStream, ExtractionRequest request) - throws Exception { - Parser parser = null; + private Parser selectParser(ExtractionRequest request) { if (request.streamType != null) { MediaType mt = MediaType.parse(request.streamType.trim().toLowerCase(Locale.ROOT)); - parser = new DefaultParser(tikaConfig.getMediaTypeRegistry()).getParsers().get(mt); - } else { - parser = autoDetectParser; - } - if (parser == null) { - throw new IllegalArgumentException("No Tika parser for stream type: " + request.streamType); + return new DefaultParser(tikaConfig.getMediaTypeRegistry()).getParsers().get(mt); } + return autoDetectParser; + } + private Metadata buildMetadata(ExtractionRequest request) { Metadata md = new Metadata(); - if (request.resourceName != null) { + if (request.resourceName != null) md.add(TikaMetadataKeys.RESOURCE_NAME_KEY, request.resourceName); - } - if (request.contentType != null) { - md.add(HttpHeaders.CONTENT_TYPE, request.contentType); - } - if (request.streamName != null) { + if (request.contentType != null) md.add(HttpHeaders.CONTENT_TYPE, request.contentType); + if 
(request.streamName != null) md.add(ExtractingMetadataConstants.STREAM_NAME, request.streamName); - } - if (request.streamSourceInfo != null) { + if (request.streamSourceInfo != null) md.add(ExtractingMetadataConstants.STREAM_SOURCE_INFO, request.streamSourceInfo); - } - if (request.streamSize != null) { + if (request.streamSize != null) md.add(ExtractingMetadataConstants.STREAM_SIZE, String.valueOf(request.streamSize)); - } - if (request.contentType != null) { + if (request.contentType != null) md.add(ExtractingMetadataConstants.STREAM_CONTENT_TYPE, request.contentType); - } - if (request.charset != null) { - md.add(HttpHeaders.CONTENT_ENCODING, request.charset); - } + if (request.charset != null) md.add(HttpHeaders.CONTENT_ENCODING, request.charset); + return md; + } + private ParseContext buildContext(Parser parser, ExtractionRequest request) { ParseContext context = parseContextConfig.create(); context.set(Parser.class, parser); - context.set(HtmlMapper.class, ExtractingDocumentLoader.MostlyPassthroughHtmlMapper.INSTANCE); + context.set(HtmlMapper.class, MostlyPassthroughHtmlMapper.INSTANCE); + PasswordProvider pwd = new RegexRulesPasswordProvider(); + if (request.resourcePassword != null && pwd instanceof RegexRulesPasswordProvider) { + ((RegexRulesPasswordProvider) pwd).setExplicitPassword(request.resourcePassword); + } + if (request.passwordsMap != null && pwd instanceof RegexRulesPasswordProvider) { + ((RegexRulesPasswordProvider) pwd).setPasswordMap(request.passwordsMap); + } + context.set(PasswordProvider.class, pwd); + return context; + } - // Password handling: allow passing explicit and map via params in future if needed. 
- PasswordProvider epp = new RegexRulesPasswordProvider(); - if (request.resourcePassword != null && epp instanceof RegexRulesPasswordProvider) { - ((RegexRulesPasswordProvider) epp).setExplicitPassword(request.resourcePassword); + private static ExtractionMetadata copyToNeutral(Metadata md) { + ExtractionMetadata out = new SimpleExtractionMetadata(); + for (String name : md.names()) { + String[] vals = md.getValues(name); + if (vals != null) for (String v : vals) out.add(name, v); } - context.set(PasswordProvider.class, epp); + return out; + } + @Override + public ExtractionResult extract(InputStream inputStream, ExtractionRequest request) + throws Exception { + Parser parser = selectParser(request); + if (parser == null) { + throw new IllegalArgumentException("No Tika parser for stream type: " + request.streamType); + } + Metadata md = buildMetadata(request); + ParseContext context = buildContext(parser, request); BodyContentHandler textHandler = new BodyContentHandler(-1); parser.parse(inputStream, textHandler, md, context); + return new ExtractionResult(textHandler.toString(), copyToNeutral(md)); + } + + @Override + public ExtractionResult extractOnly( + InputStream inputStream, ExtractionRequest request, String extractFormat, String xpathExpr) + throws Exception { + Parser parser = selectParser(request); + if (parser == null) { + throw new IllegalArgumentException("No Tika parser for stream type: " + request.streamType); + } + Metadata md = buildMetadata(request); + ParseContext context = buildContext(parser, request); + + String content; + if (ExtractingDocumentLoader.TEXT_FORMAT.equals(extractFormat) || xpathExpr != null) { + org.apache.tika.sax.ToTextContentHandler textHandler = + new org.apache.tika.sax.ToTextContentHandler(); + org.xml.sax.ContentHandler ch = textHandler; + if (xpathExpr != null) { + org.apache.tika.sax.xpath.XPathParser xparser = + new org.apache.tika.sax.xpath.XPathParser( + "xhtml", org.apache.tika.sax.XHTMLContentHandler.XHTML); + 
org.apache.tika.sax.xpath.Matcher matcher = xparser.parse(xpathExpr); + ch = new org.apache.tika.sax.xpath.MatchingContentHandler(textHandler, matcher); + } + parser.parse(inputStream, ch, md, context); + content = textHandler.toString(); + } else { // XML format + org.apache.tika.sax.ToXMLContentHandler toXml = new org.apache.tika.sax.ToXMLContentHandler(); + org.xml.sax.ContentHandler ch = toXml; + if (xpathExpr != null) { + org.apache.tika.sax.xpath.XPathParser xparser = + new org.apache.tika.sax.xpath.XPathParser( + "xhtml", org.apache.tika.sax.XHTMLContentHandler.XHTML); + org.apache.tika.sax.xpath.Matcher matcher = xparser.parse(xpathExpr); + ch = new org.apache.tika.sax.xpath.MatchingContentHandler(toXml, matcher); + } + parser.parse(inputStream, ch, md, context); + content = toXml.toString(); + if (!content.startsWith("\n" + content; + } + } + return new ExtractionResult(content, copyToNeutral(md)); + } - // copy metadata to neutral container - ExtractionMetadata outMetadata = new SimpleExtractionMetadata(); + @Override + public void parseToSolrContentHandler( + InputStream inputStream, + ExtractionRequest request, + SolrContentHandler handler, + ExtractionMetadata outMetadata) + throws Exception { + Parser parser = selectParser(request); + if (parser == null) { + throw new IllegalArgumentException("No Tika parser for stream type: " + request.streamType); + } + Metadata md = buildMetadata(request); + ParseContext context = buildContext(parser, request); + parser.parse(inputStream, handler, md, context); + // populate outMetadata for (String name : md.names()) { String[] vals = md.getValues(name); - if (vals != null) { - for (String v : vals) { - outMetadata.add(name, v); - } - } + if (vals != null) for (String v : vals) outMetadata.add(name, v); } - String content = textHandler.toString(); - return new ExtractionResult(content, outMetadata); } } diff --git 
a/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java index fa23833d918..acff92e1071 100644 --- a/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java +++ b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java @@ -796,8 +796,6 @@ public void testArabicPDF() throws Exception { assertQ(req("wdf_nocase:السلم"), "//result[@numFound=1]"); } - - public void testLiteralsOverride() throws Exception { ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract"); diff --git a/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/LocalTikaExtractionBackendTest.java b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/LocalTikaExtractionBackendTest.java index 4110713ea66..df365f2bedf 100644 --- a/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/LocalTikaExtractionBackendTest.java +++ b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/LocalTikaExtractionBackendTest.java @@ -31,9 +31,10 @@ public class LocalTikaExtractionBackendTest extends SolrTestCaseJ4 { @BeforeClass public static void setupClass() throws Exception { - try (InputStream is = LocalTikaExtractionBackendTest.class - .getClassLoader() - .getResourceAsStream("solr-default-tika-config.xml")) { + try (InputStream is = + LocalTikaExtractionBackendTest.class + .getClassLoader() + .getResourceAsStream("solr-default-tika-config.xml")) { assertNotNull("solr-default-tika-config.xml not on classpath", is); tikaConfig = new TikaConfig(is); } @@ -61,7 +62,8 @@ private ExtractionRequest newRequest( streamName, streamSourceInfo, streamSize, - resourcePassword); + resourcePassword, + null); } @Test @@ -69,29 +71,31 @@ public void testWrongStreamTypeThrows() throws 
Exception { LocalTikaExtractionBackend backend = newBackend(); try (InputStream in = Files.newInputStream(getFile("extraction/version_control.txt"))) { // Non-existing type -> no parser available - ExtractionRequest req = newRequest( - "version_control.txt", - "foo/bar", - null, - null, - "version_control.txt", - null, - null, - null); + ExtractionRequest req = + newRequest( + "version_control.txt", + "foo/bar", + null, + null, + "version_control.txt", + null, + null, + null); expectThrows(IllegalArgumentException.class, () -> backend.extract(in, req)); } try (InputStream in = Files.newInputStream(getFile("extraction/version_control.txt"))) { // Wrong but existing type -> likely to fail when parsing - ExtractionRequest req = newRequest( - "version_control.txt", - "application/pdf", - null, - null, - "version_control.txt", - null, - null, - null); + ExtractionRequest req = + newRequest( + "version_control.txt", + "application/pdf", + null, + null, + "version_control.txt", + null, + null, + null); expectThrows(Exception.class, () -> backend.extract(in, req)); } } @@ -100,15 +104,16 @@ public void testWrongStreamTypeThrows() throws Exception { public void testPasswordProtectedDocxWithoutPasswordThrows() throws Exception { LocalTikaExtractionBackend backend = newBackend(); try (InputStream in = Files.newInputStream(getFile("extraction/password-is-Word2010.docx"))) { - ExtractionRequest req = newRequest( - "password-is-Word2010.docx", - null, - null, - null, - "password-is-Word2010.docx", - null, - null, - null); + ExtractionRequest req = + newRequest( + "password-is-Word2010.docx", + null, + null, + null, + "password-is-Word2010.docx", + null, + null, + null); expectThrows(Exception.class, () -> backend.extract(in, req)); } } @@ -117,21 +122,23 @@ public void testPasswordProtectedDocxWithoutPasswordThrows() throws Exception { public void testPasswordProtectedDocxWithPasswordSucceeds() throws Exception { LocalTikaExtractionBackend backend = newBackend(); try 
(InputStream in = Files.newInputStream(getFile("extraction/password-is-Word2010.docx"))) { - ExtractionRequest req = newRequest( - "password-is-Word2010.docx", - null, - null, - null, - "password-is-Word2010.docx", - null, - null, - "Word2010"); + ExtractionRequest req = + newRequest( + "password-is-Word2010.docx", + null, + null, + null, + "password-is-Word2010.docx", + null, + null, + "Word2010"); ExtractionResult res = backend.extract(in, req); assertNotNull(res); assertNotNull(res.getMetadata()); String content = res.getContent(); assertNotNull(content); - assertTrue("Content should mention password-protected doc text", + assertTrue( + "Content should mention password-protected doc text", content.contains("Test password protected word doc")); } } From 5a19251c1f39e8cfe2b3220df4dc875728bcc0e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=C3=B8ydahl?= Date: Fri, 19 Sep 2025 15:53:29 +0200 Subject: [PATCH 04/47] Use a factory to create the backend to keep it DRY --- .../extraction/ExtractingRequestHandler.java | 57 +++++---------- .../extraction/ExtractionBackendFactory.java | 72 +++++++++++++++++++ 2 files changed, 90 insertions(+), 39 deletions(-) create mode 100644 solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackendFactory.java diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java index 6250601d6b1..5f1b6f2be3f 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java @@ -41,7 +41,8 @@ public class ExtractingRequestHandler extends ContentStreamHandlerBase protected ParseContextConfig parseContextConfig; protected SolrContentHandlerFactory factory; - protected ExtractionBackend backend; + protected 
ExtractionBackendFactory backendFactory; + protected String defaultBackendName; @Override public PermissionNameProvider.Name getPermissionName(AuthorizationContext request) { @@ -61,55 +62,33 @@ public void inform(SolrCore core) { parseContextConfig = new ParseContextConfig(core.getResourceLoader(), parseContextConfigLoc); } - } catch (Exception e) { - throw new SolrException( - ErrorCode.SERVER_ERROR, "Unable to initialize ExtractingRequestHandler", e); - } - factory = createFactory(); + // Initialize backend factory once; backends are created lazily on demand + backendFactory = new ExtractionBackendFactory(core, tikaConfigLoc, parseContextConfig); + + // Choose default backend name (do not instantiate yet) + String backendName = (String) initArgs.get("extraction.backend"); + defaultBackendName = + (backendName == null || backendName.trim().isEmpty()) ? "local" : backendName; - // Choose backend implementation - String backendName = (String) initArgs.get("extraction.backend"); - try { - if (backendName == null - || backendName.trim().isEmpty() - || backendName.equalsIgnoreCase("local")) { - backend = new LocalTikaExtractionBackend(core, tikaConfigLoc, parseContextConfig); - } else if (backendName.equalsIgnoreCase("dummy")) { - backend = new DummyExtractionBackend(); - } else { - // Fallback to local if unknown - backend = new LocalTikaExtractionBackend(core, tikaConfigLoc, parseContextConfig); - } } catch (Exception e) { - throw new SolrException(ErrorCode.SERVER_ERROR, "Unable to initialize extraction backend", e); + throw new SolrException( + ErrorCode.SERVER_ERROR, "Unable to initialize ExtractingRequestHandler", e); } - } - protected SolrContentHandlerFactory createFactory() { - return new SolrContentHandlerFactory(); + factory = new SolrContentHandlerFactory(); } @Override protected ContentStreamLoader newLoader(SolrQueryRequest req, UpdateRequestProcessor processor) { // Allow per-request override of backend via request param "extraction.backend" - 
ExtractionBackend backendToUse = this.backend; String backendParam = req.getParams().get("extraction.backend"); - if (backendParam != null) { - if (backendParam.equalsIgnoreCase("dummy")) { - backendToUse = new DummyExtractionBackend(); - } else if (backendParam.equalsIgnoreCase("local")) { - try { - backendToUse = - new LocalTikaExtractionBackend(req.getCore(), tikaConfigLoc, parseContextConfig); - } catch (Exception e) { - throw new SolrException( - ErrorCode.SERVER_ERROR, "Unable to initialize extraction backend", e); - } - } - // unknown values fall back to the handler-configured backend - } - return new ExtractingDocumentLoader(req, processor, factory, backendToUse); + String nameToUse = + (backendParam != null && !backendParam.trim().isEmpty()) + ? backendParam + : defaultBackendName; + ExtractionBackend extractionBackend = backendFactory.getBackend(nameToUse); + return new ExtractingDocumentLoader(req, processor, factory, extractionBackend); } // ////////////////////// SolrInfoMBeans methods ////////////////////// diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackendFactory.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackendFactory.java new file mode 100644 index 00000000000..234a6064c62 --- /dev/null +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackendFactory.java @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.handler.extraction; + +import java.util.Locale; +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; +import org.apache.solr.core.SolrCore; + +/** + * Factory for ExtractionBackend instances. Lazily constructs backends by short name (e.g., "local", + * "dummy") and caches them for reuse. + */ +public class ExtractionBackendFactory { + private final SolrCore core; + private final String tikaConfigLoc; + private final ParseContextConfig parseContextConfig; + private final Map cache = new ConcurrentHashMap<>(); + + public ExtractionBackendFactory( + SolrCore core, String tikaConfigLoc, ParseContextConfig parseContextConfig) { + this.core = core; + this.tikaConfigLoc = tikaConfigLoc; + this.parseContextConfig = parseContextConfig; + } + + /** Returns a backend instance for the given name, creating it if necessary. */ + public ExtractionBackend getBackend(String name) { + String key = normalize(name); + return cache.computeIfAbsent( + key, + k -> { + try { + return create(k); + } catch (Exception e) { + throw new RuntimeException("Failed to create extraction backend '" + k + "'", e); + } + }); + } + + private String normalize(String name) { + if (name == null || name.trim().isEmpty()) return "local"; + return name.trim().toLowerCase(Locale.ROOT); + } + + /** Creates a new backend instance for the given normalized name. 
*/ + protected ExtractionBackend create(String normalizedName) throws Exception { + switch (normalizedName) { + case "local": + return new LocalTikaExtractionBackend(core, tikaConfigLoc, parseContextConfig); + case "dummy": + return new DummyExtractionBackend(); + default: + // Fallback to local for unknown names + return new LocalTikaExtractionBackend(core, tikaConfigLoc, parseContextConfig); + } + } +} From 35fef11f3c78b48d6965f0c05b6bc21fd9719e7c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=C3=B8ydahl?= Date: Fri, 19 Sep 2025 16:25:27 +0200 Subject: [PATCH 05/47] Add TikaServerExtractionBackend --- .../extraction/DummyExtractionBackend.java | 3 +- .../handler/extraction/ExtractingParams.java | 3 + .../extraction/ExtractingRequestHandler.java | 12 +- .../extraction/ExtractionBackendFactory.java | 12 +- .../LocalTikaExtractionBackend.java | 4 +- .../TikaServerExtractionBackend.java | 210 ++++++++++++++++++ .../ExtractingRequestHandlerTest.java | 4 +- .../TikaServerExtractionBackendTest.java | 199 +++++++++++++++++ 8 files changed, 434 insertions(+), 13 deletions(-) create mode 100644 solr/modules/extraction/src/java/org/apache/solr/handler/extraction/TikaServerExtractionBackend.java create mode 100644 solr/modules/extraction/src/test/org/apache/solr/handler/extraction/TikaServerExtractionBackendTest.java diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/DummyExtractionBackend.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/DummyExtractionBackend.java index c9cdf724ef2..4d3955b4b1b 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/DummyExtractionBackend.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/DummyExtractionBackend.java @@ -20,9 +20,10 @@ /** Dummy backend that emits predictable test data without actually parsing input content. 
*/ public class DummyExtractionBackend implements ExtractionBackend { + public static final String ID = "dummy"; @Override public String name() { - return "dummy"; + return ID; } @Override diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingParams.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingParams.java index a7d159678f1..840af280243 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingParams.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingParams.java @@ -136,4 +136,7 @@ public interface ExtractingParams { * .*=<defaultmypassword> at the end */ public static final String PASSWORD_MAP_FILE = "passwordsFile"; + + /** Backend selection parameter name. */ + public static final String EXTRACTION_BACKEND = "extraction.backend"; } diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java index 5f1b6f2be3f..ff4bddd0039 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java @@ -36,6 +36,7 @@ public class ExtractingRequestHandler extends ContentStreamHandlerBase public static final String PARSE_CONTEXT_CONFIG = "parseContext.config"; public static final String CONFIG_LOCATION = "tika.config"; + public static final String TIKASERVER_URL = "tikaserver.url"; protected String tikaConfigLoc; protected ParseContextConfig parseContextConfig; @@ -64,12 +65,13 @@ public void inform(SolrCore core) { } // Initialize backend factory once; backends are created lazily on demand - backendFactory = new ExtractionBackendFactory(core, tikaConfigLoc, parseContextConfig); + String tikaServerUrl = (String) initArgs.get(TIKASERVER_URL);
+ backendFactory = new ExtractionBackendFactory(core, tikaConfigLoc, parseContextConfig, tikaServerUrl); // Choose default backend name (do not instantiate yet) - String backendName = (String) initArgs.get("extraction.backend"); + String backendName = (String) initArgs.get(ExtractingParams.EXTRACTION_BACKEND); defaultBackendName = - (backendName == null || backendName.trim().isEmpty()) ? "local" : backendName; + (backendName == null || backendName.trim().isEmpty()) ? LocalTikaExtractionBackend.ID : backendName; } catch (Exception e) { throw new SolrException( @@ -81,8 +83,8 @@ public void inform(SolrCore core) { @Override protected ContentStreamLoader newLoader(SolrQueryRequest req, UpdateRequestProcessor processor) { - // Allow per-request override of backend via request param "extraction.backend" - String backendParam = req.getParams().get("extraction.backend"); + // Allow per-request override of backend via request param + String backendParam = req.getParams().get(ExtractingParams.EXTRACTION_BACKEND); String nameToUse = (backendParam != null && !backendParam.trim().isEmpty()) ? 
backendParam diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackendFactory.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackendFactory.java index 234a6064c62..558e5cd7f72 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackendFactory.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackendFactory.java @@ -29,13 +29,15 @@ public class ExtractionBackendFactory { private final SolrCore core; private final String tikaConfigLoc; private final ParseContextConfig parseContextConfig; + private final String tikaServerUrl; private final Map cache = new ConcurrentHashMap<>(); public ExtractionBackendFactory( - SolrCore core, String tikaConfigLoc, ParseContextConfig parseContextConfig) { + SolrCore core, String tikaConfigLoc, ParseContextConfig parseContextConfig, String tikaServerUrl) { this.core = core; this.tikaConfigLoc = tikaConfigLoc; this.parseContextConfig = parseContextConfig; + this.tikaServerUrl = tikaServerUrl; } /** Returns a backend instance for the given name, creating it if necessary. */ @@ -53,17 +55,19 @@ public ExtractionBackend getBackend(String name) { } private String normalize(String name) { - if (name == null || name.trim().isEmpty()) return "local"; + if (name == null || name.trim().isEmpty()) return LocalTikaExtractionBackend.ID; return name.trim().toLowerCase(Locale.ROOT); } /** Creates a new backend instance for the given normalized name. */ protected ExtractionBackend create(String normalizedName) throws Exception { switch (normalizedName) { - case "local": + case LocalTikaExtractionBackend.ID: return new LocalTikaExtractionBackend(core, tikaConfigLoc, parseContextConfig); - case "dummy": + case DummyExtractionBackend.ID: return new DummyExtractionBackend(); + case TikaServerExtractionBackend.ID: + return new TikaServerExtractionBackend(tikaServerUrl != null ? 
tikaServerUrl : "http://localhost:9998"); default: // Fallback to local for unknown names return new LocalTikaExtractionBackend(core, tikaConfigLoc, parseContextConfig); diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/LocalTikaExtractionBackend.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/LocalTikaExtractionBackend.java index 315a582ea2a..687f0e6cc1e 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/LocalTikaExtractionBackend.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/LocalTikaExtractionBackend.java @@ -96,9 +96,11 @@ public LocalTikaExtractionBackend( this.autoDetectParser = new AutoDetectParser(cfg); } + public static final String ID = "local"; + @Override public String name() { - return "local"; + return ID; } private Parser selectParser(ExtractionRequest request) { diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/TikaServerExtractionBackend.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/TikaServerExtractionBackend.java new file mode 100644 index 00000000000..33ac66e7d86 --- /dev/null +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/TikaServerExtractionBackend.java @@ -0,0 +1,210 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.handler.extraction; + +import java.io.IOException; +import java.io.InputStream; +import java.net.URI; +import java.net.http.HttpClient; +import java.net.http.HttpRequest; +import java.net.http.HttpResponse; +import java.nio.charset.StandardCharsets; +import java.time.Duration; +import org.noggit.JSONParser; + +/** + * Extraction backend that delegates parsing to a remote Apache Tika Server. + * + *

This backend uses Java 11 HttpClient to call Tika Server endpoints. It supports + * backend-neutral extract() and extractOnly() operations. Legacy SAX-based parsing + * is not supported and will throw UnsupportedOperationException. + */ +public class TikaServerExtractionBackend implements ExtractionBackend { + private final HttpClient httpClient; + private final String baseUrl; // e.g., http://localhost:9998 + private final Duration timeout = Duration.ofSeconds(30); + + public TikaServerExtractionBackend(String baseUrl) { + this(HttpClient.newBuilder().connectTimeout(Duration.ofSeconds(10)).build(), baseUrl); + } + + // Visible for tests + TikaServerExtractionBackend(HttpClient httpClient, String baseUrl) { + if (baseUrl.endsWith("/")) { + this.baseUrl = baseUrl.substring(0, baseUrl.length() - 1); + } else { + this.baseUrl = baseUrl; + } + this.httpClient = httpClient; + } + + public static final String ID = "tikaserver"; + + @Override + public String name() { + return ID; + } + + @Override + public ExtractionResult extract(InputStream inputStream, ExtractionRequest request) + throws Exception { + // 1) Extract text + String text = requestText(inputStream, request, false, null); + + // 2) Fetch metadata as JSON and convert to neutral metadata + ExtractionMetadata md = fetchMetadata(request); + + return new ExtractionResult(text, md); + } + + @Override + public ExtractionResult extractOnly( + InputStream inputStream, ExtractionRequest request, String extractFormat, String xpathExpr) + throws Exception { + if (xpathExpr != null) { + throw new UnsupportedOperationException("XPath filtering is not supported by TikaServer backend"); + } + boolean wantXml = !ExtractingDocumentLoader.TEXT_FORMAT.equalsIgnoreCase(extractFormat); + String content = requestText(inputStream, request, wantXml, xpathExpr); + ExtractionMetadata md = fetchMetadata(request); + return new ExtractionResult(content, md); + } + + @Override + public void parseToSolrContentHandler( + InputStream 
inputStream, + ExtractionRequest request, + SolrContentHandler handler, + ExtractionMetadata outMetadata) + throws Exception { + throw new UnsupportedOperationException( + "Legacy SAX-based parsing is not supported by TikaServer backend"); + } + + private String requestText( + InputStream inputStream, ExtractionRequest request, boolean wantXml, String xpath) + throws IOException, InterruptedException { + String url = baseUrl + "/tika"; + HttpRequest.Builder b = HttpRequest.newBuilder(URI.create(url)).timeout(timeout).POST(HttpRequest.BodyPublishers.ofInputStream(() -> inputStream)); + // Content-Type + String contentType = firstNonNull(request.streamType, request.contentType); + if (contentType != null) { + b.header("Content-Type", contentType); + } + // Filename hint + if (request.resourceName != null) { + b.header("Content-Disposition", "attachment; filename=\"" + request.resourceName + "\""); + } + // Response type + b.header("Accept", wantXml ? "application/xml" : "text/plain"); + + HttpResponse resp = httpClient.send(b.build(), HttpResponse.BodyHandlers.ofByteArray()); + int code = resp.statusCode(); + if (code < 200 || code >= 300) { + throw new IOException("TikaServer /tika returned status " + code); + } + return new String(resp.body(), StandardCharsets.UTF_8); + } + + private ExtractionMetadata fetchMetadata(ExtractionRequest request) + throws IOException, InterruptedException { + // Call /meta to get metadata. Ask JSON form; Tika Server returns application/json map. 
+ String url = baseUrl + "/meta"; + HttpRequest.Builder b = HttpRequest.newBuilder(URI.create(url)).timeout(timeout).POST(HttpRequest.BodyPublishers.noBody()); + String contentType = firstNonNull(request.streamType, request.contentType); + if (contentType != null) { + b.header("Content-Type", contentType); + } + if (request.resourceName != null) { + b.header("Content-Disposition", "attachment; filename=\"" + request.resourceName + "\""); + } + b.header("Accept", "application/json"); + + HttpResponse resp = httpClient.send(b.build(), HttpResponse.BodyHandlers.ofString(StandardCharsets.UTF_8)); + int code = resp.statusCode(); + if (code < 200 || code >= 300) { + throw new IOException("TikaServer /meta returned status " + code); + } + return parseJsonToMetadata(resp.body()); + } + + private static String firstNonNull(String a, String b) { + return a != null ? a : b; + } + + // Parse Tika Server metadata JSON using Noggit JSONParser. Supports values as strings, + // arrays of strings, and basic scalars (numbers/booleans) which are coerced to String. 
+ private static ExtractionMetadata parseJsonToMetadata(String json) { + SimpleExtractionMetadata md = new SimpleExtractionMetadata(); + if (json == null) return md; + try { + JSONParser p = new JSONParser(json); + int ev = p.nextEvent(); + if (ev != JSONParser.OBJECT_START) { + return md; + } + String currentKey = null; + while (true) { + ev = p.nextEvent(); + if (ev == JSONParser.OBJECT_END || ev == JSONParser.EOF) { + break; + } + if (ev == JSONParser.STRING && p.wasKey()) { + currentKey = p.getString(); + // Next event is the value for this key + ev = p.nextEvent(); + if (ev == JSONParser.STRING) { + md.add(currentKey, p.getString()); + } else if (ev == JSONParser.ARRAY_START) { + // Read array elements + while (true) { + ev = p.nextEvent(); + if (ev == JSONParser.ARRAY_END) break; + if (ev == JSONParser.STRING) { + md.add(currentKey, p.getString()); + } else if (ev == JSONParser.LONG + || ev == JSONParser.NUMBER + || ev == JSONParser.BIGNUMBER) { + md.add(currentKey, p.getNumberChars().toString()); + } else if (ev == JSONParser.BOOLEAN) { + md.add(currentKey, String.valueOf(p.getBoolean())); + } else if (ev == JSONParser.NULL) { + // ignore nulls + } else { + // skip nested objects or unsupported types within arrays + } + } + } else if (ev == JSONParser.LONG + || ev == JSONParser.NUMBER + || ev == JSONParser.BIGNUMBER) { + md.add(currentKey, p.getNumberChars().toString()); + } else if (ev == JSONParser.BOOLEAN) { + md.add(currentKey, String.valueOf(p.getBoolean())); + } else if (ev == JSONParser.NULL) { + // ignore nulls + } else { + // skip nested objects or unsupported value types + } + } + } + } catch (java.io.IOException ioe) { + // Fall back to empty metadata on parsing error + return md; + } + return md; + } +} diff --git a/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java index 
acff92e1071..c7098665027 100644 --- a/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java +++ b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java @@ -1080,8 +1080,8 @@ public void testDummyBackendExtractOnly() throws Exception { SolrQueryResponse rsp = loadLocal( "extraction/version_control.txt", - "extraction.backend", - "dummy", + ExtractingParams.EXTRACTION_BACKEND, + DummyExtractionBackend.ID, ExtractingParams.EXTRACT_ONLY, "true", ExtractingParams.EXTRACT_FORMAT, diff --git a/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/TikaServerExtractionBackendTest.java b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/TikaServerExtractionBackendTest.java new file mode 100644 index 00000000000..19846da1142 --- /dev/null +++ b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/TikaServerExtractionBackendTest.java @@ -0,0 +1,199 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.solr.handler.extraction; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.net.CookieHandler; +import java.net.ProxySelector; +import java.net.URI; +import java.net.http.HttpClient; +import java.net.http.HttpHeaders; +import java.net.http.HttpRequest; +import java.net.http.HttpResponse; +import java.security.SecureRandom; +import java.time.Duration; +import java.util.Optional; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.Executor; +import javax.net.ssl.SSLContext; +import javax.net.ssl.SSLParameters; +import org.apache.solr.SolrTestCaseJ4; +import org.junit.Test; + +/** Unit tests for TikaServerExtractionBackend using a mocked HttpClient (no networking). */ +public class TikaServerExtractionBackendTest extends SolrTestCaseJ4 { + + static { + // Allow the SecureRandom algorithm used in this environment to avoid class configuration + // failure in tests. + System.setProperty("test.solr.allowed.securerandom", "NativePRNG"); + } + + private static class FakeHttpClient extends HttpClient { + @Override + public Optional cookieHandler() { return Optional.empty(); } + + @Override + public Optional connectTimeout() { return Optional.of(Duration.ofSeconds(5)); } + + @Override + public Redirect followRedirects() { return Redirect.NEVER; } + + @Override + public Optional proxy() { return Optional.empty(); } + + @Override + public SSLContext sslContext() { try { return SSLContext.getDefault(); } catch (Exception e) { throw new RuntimeException(e);} } + + @Override + public SSLParameters sslParameters() { return new SSLParameters(); } + + @Override + public Optional executor() { return Optional.empty(); } + + @Override + public Optional authenticator() { return Optional.empty(); } + + @Override + public Version version() { return Version.HTTP_1_1; } + + @Override + public HttpResponse send(HttpRequest request, HttpResponse.BodyHandler responseBodyHandler) + 
throws IOException, InterruptedException { + return respond(request, responseBodyHandler); + } + + @Override + public CompletableFuture> sendAsync(HttpRequest request, HttpResponse.BodyHandler responseBodyHandler) { + return CompletableFuture.completedFuture(respond(request, responseBodyHandler)); + } + + @Override + public CompletableFuture> sendAsync(HttpRequest request, HttpResponse.BodyHandler responseBodyHandler, HttpResponse.PushPromiseHandler pushPromiseHandler) { + return CompletableFuture.completedFuture(respond(request, responseBodyHandler)); + } + + private HttpResponse respond(HttpRequest request, HttpResponse.BodyHandler handler) { + try { + URI uri = request.uri(); + String path = uri.getPath(); + byte[] body; + String ct; + int sc = 200; + if ("/tika".equals(path)) { + String accept = request.headers().firstValue("Accept").orElse("text/plain"); + if ("application/xml".equalsIgnoreCase(accept)) { + String xml = "XML OUT"; + body = xml.getBytes(java.nio.charset.StandardCharsets.UTF_8); + ct = "application/xml"; + } else { + body = "TEXT OUT".getBytes(java.nio.charset.StandardCharsets.UTF_8); + ct = "text/plain"; + } + } else if ("/meta".equals(path)) { + String json = + "{\"Content-Type\":[\"text/plain\"],\"resourcename\":[\"test.txt\"],\"X-Parsed-By\":[\"SomeParser\"]}"; + body = json.getBytes(java.nio.charset.StandardCharsets.UTF_8); + ct = "application/json"; + } else { + body = "Not Found".getBytes(java.nio.charset.StandardCharsets.UTF_8); + sc = 404; + ct = "text/plain"; + } + final int status = sc; + final String contentType = ct; + // Decide expected body type based on endpoint (mimics our backend usage) + final Object bodyObj = + "/meta".equals(path) + ? 
new String(body, java.nio.charset.StandardCharsets.UTF_8) + : body; // /tika returns bytes + return new HttpResponse<>() { + @Override public int statusCode() { return status; } + @Override public HttpRequest request() { return request; } + @Override public Optional> previousResponse() { return Optional.empty(); } + @Override public HttpHeaders headers() { return HttpHeaders.of(java.util.Map.of("Content-Type", java.util.List.of(contentType)), (k,v)->true); } + @Override public T body() { @SuppressWarnings("unchecked") T t = (T) bodyObj; return t; } + @Override public Optional sslSession() { return Optional.empty(); } + @Override public URI uri() { return uri; } + @Override public Version version() { return Version.HTTP_1_1; } + }; + } catch (Exception e) { + throw new RuntimeException(e); + } + } + } + + private static ExtractionRequest newRequest(String resourceName, String contentType) { + return new ExtractionRequest( + contentType, // streamType + resourceName, // resourceName + contentType, // contentType + null, // charset + resourceName, // streamName + null, // sourceInfo + null, // size + null, // resourcePassword + null // passwordsMap + ); + } + + @Test + public void testExtractTextAndMetadata() throws Exception { + TikaServerExtractionBackend backend = new TikaServerExtractionBackend(new FakeHttpClient(), "http://example"); + byte[] data = "dummy".getBytes(java.nio.charset.StandardCharsets.UTF_8); + try (ByteArrayInputStream in = new ByteArrayInputStream(data)) { + ExtractionResult res = backend.extract(in, newRequest("test.txt", "text/plain")); + assertNotNull(res); + assertEquals("TEXT OUT", res.getContent()); + assertNotNull(res.getMetadata()); + assertArrayEquals(new String[] {"text/plain"}, res.getMetadata().getValues("Content-Type")); + assertArrayEquals(new String[] {"test.txt"}, res.getMetadata().getValues("resourcename")); + } + } + + @Test + public void testExtractOnlyXml() throws Exception { + TikaServerExtractionBackend backend = new 
TikaServerExtractionBackend(new FakeHttpClient(), "http://example"); + byte[] data = "dummy".getBytes(java.nio.charset.StandardCharsets.UTF_8); + try (ByteArrayInputStream in = new ByteArrayInputStream(data)) { + ExtractionResult res = + backend.extractOnly( + in, newRequest("test.txt", "text/plain"), ExtractingDocumentLoader.XML_FORMAT, null); + assertNotNull(res); + assertTrue(res.getContent().contains(" + backend.parseToSolrContentHandler( + in, + newRequest("test.txt", "text/plain"), + new SolrContentHandler(new SimpleExtractionMetadata(), params(), null), + new SimpleExtractionMetadata())); + } + } +} From 196dcdc1ca7c8d435f1f4d8f30a0df5e46cf6466 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=C3=B8ydahl?= Date: Fri, 19 Sep 2025 16:58:50 +0200 Subject: [PATCH 06/47] Change testing to use TestContainers --- gradle/libs.versions.toml | 2 + solr/licenses/docker-java-LICENSE-ASL.txt | 176 +++++++++++++++ solr/licenses/docker-java-NOTICE.txt | 7 + solr/licenses/docker-java-api-3.4.0.jar.sha1 | 1 + .../docker-java-transport-3.4.0.jar.sha1 | 1 + ...cker-java-transport-zerodep-3.4.0.jar.sha1 | 1 + solr/licenses/duct-tape-1.0.8.jar.sha1 | 1 + solr/licenses/duct-tape-LICENSE-MIT.txt | 19 ++ solr/licenses/jna-5.13.0.jar.sha1 | 1 + solr/licenses/testcontainers-1.20.4.jar.sha1 | 1 + solr/licenses/testcontainers-LICENSE-MIT.txt | 19 ++ solr/modules/extraction/build.gradle | 5 +- solr/modules/extraction/gradle.lockfile | 9 +- .../extraction/DummyExtractionBackend.java | 1 + .../extraction/ExtractingRequestHandler.java | 7 +- .../extraction/ExtractionBackendFactory.java | 8 +- .../TikaServerExtractionBackend.java | 47 ++-- .../TikaServerExtractionBackendTest.java | 205 ++++++++---------- 18 files changed, 374 insertions(+), 137 deletions(-) create mode 100644 solr/licenses/docker-java-LICENSE-ASL.txt create mode 100644 solr/licenses/docker-java-NOTICE.txt create mode 100644 solr/licenses/docker-java-api-3.4.0.jar.sha1 create mode 100644 
solr/licenses/docker-java-transport-3.4.0.jar.sha1 create mode 100644 solr/licenses/docker-java-transport-zerodep-3.4.0.jar.sha1 create mode 100644 solr/licenses/duct-tape-1.0.8.jar.sha1 create mode 100644 solr/licenses/duct-tape-LICENSE-MIT.txt create mode 100644 solr/licenses/jna-5.13.0.jar.sha1 create mode 100644 solr/licenses/testcontainers-1.20.4.jar.sha1 create mode 100644 solr/licenses/testcontainers-LICENSE-MIT.txt diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index ea14d91ce6b..396befbab3c 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -194,6 +194,7 @@ squareup-okhttp3-okhttp = "4.12.0" stephenc-jcip = "1.0-1" swagger3 = "2.2.22" tdunning-tdigest = "3.3" +testcontainers = "1.20.4" thetaphi-forbiddenapis = "3.9" thisptr-jacksonjq = "0.0.13" threeten-bp = "1.6.8" @@ -512,6 +513,7 @@ stephenc-jcip-annotations = { module = "com.github.stephenc.jcip:jcip-annotation swagger3-annotations-jakarta = { module = "io.swagger.core.v3:swagger-annotations-jakarta", version.ref = "swagger3" } swagger3-jaxrs2-jakarta = { module = "io.swagger.core.v3:swagger-jaxrs2-jakarta", version.ref = "swagger3" } tdunning-tdigest = { module = "com.tdunning:t-digest", version.ref = "tdunning-tdigest" } +testcontainers = { module = "org.testcontainers:testcontainers", version.ref = "testcontainers" } thisptr-jacksonjq = { module = "net.thisptr:jackson-jq", version.ref = "thisptr-jacksonjq" } threeten-bp = { module = "org.threeten:threetenbp", version.ref = "threeten-bp" } xerces-impl = { module = "xerces:xercesImpl", version.ref = "xerces" } diff --git a/solr/licenses/docker-java-LICENSE-ASL.txt b/solr/licenses/docker-java-LICENSE-ASL.txt new file mode 100644 index 00000000000..492933f08c2 --- /dev/null +++ b/solr/licenses/docker-java-LICENSE-ASL.txt @@ -0,0 +1,176 @@ +Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. 
+ +"License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + +"Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + +"Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + +"You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + +"Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + +"Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + +"Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + +"Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ +"Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + +"Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + +2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + +4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the 
following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + +5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + +6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + +8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ +END OF TERMS AND CONDITIONS diff --git a/solr/licenses/docker-java-NOTICE.txt b/solr/licenses/docker-java-NOTICE.txt new file mode 100644 index 00000000000..49a9e022cce --- /dev/null +++ b/solr/licenses/docker-java-NOTICE.txt @@ -0,0 +1,7 @@ +This product includes software developed by the docker-java project. + +Copyright (c) 2013-2025, docker-java project contributors + +Project: https://github.com/docker-java/docker-java + +Licensed under the Apache License, Version 2.0. diff --git a/solr/licenses/docker-java-api-3.4.0.jar.sha1 b/solr/licenses/docker-java-api-3.4.0.jar.sha1 new file mode 100644 index 00000000000..bf5ca0d6db4 --- /dev/null +++ b/solr/licenses/docker-java-api-3.4.0.jar.sha1 @@ -0,0 +1 @@ +9ef23dcc93693f15e69b64632be096c38e31bc44 diff --git a/solr/licenses/docker-java-transport-3.4.0.jar.sha1 b/solr/licenses/docker-java-transport-3.4.0.jar.sha1 new file mode 100644 index 00000000000..c1232d24a6b --- /dev/null +++ b/solr/licenses/docker-java-transport-3.4.0.jar.sha1 @@ -0,0 +1 @@ +c058705684d782effc4b2edfdef1a87544ba4af8 diff --git a/solr/licenses/docker-java-transport-zerodep-3.4.0.jar.sha1 b/solr/licenses/docker-java-transport-zerodep-3.4.0.jar.sha1 new file mode 100644 index 00000000000..b658f8f0810 --- /dev/null +++ b/solr/licenses/docker-java-transport-zerodep-3.4.0.jar.sha1 @@ -0,0 +1 @@ +c4ce6d8695cfdb0027872f99cc20f8f679f8a969 diff --git a/solr/licenses/duct-tape-1.0.8.jar.sha1 b/solr/licenses/duct-tape-1.0.8.jar.sha1 new file mode 100644 index 00000000000..8ccb86d64ea --- /dev/null +++ b/solr/licenses/duct-tape-1.0.8.jar.sha1 @@ -0,0 +1 @@ +92edc22a9ab2f3e17c9bf700aaee377d50e8b530 diff --git a/solr/licenses/duct-tape-LICENSE-MIT.txt b/solr/licenses/duct-tape-LICENSE-MIT.txt new file mode 100644 index 00000000000..9cf106272ac --- /dev/null +++ b/solr/licenses/duct-tape-LICENSE-MIT.txt @@ -0,0 +1,19 @@ +MIT License + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation 
files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/solr/licenses/jna-5.13.0.jar.sha1 b/solr/licenses/jna-5.13.0.jar.sha1 new file mode 100644 index 00000000000..93b456b9293 --- /dev/null +++ b/solr/licenses/jna-5.13.0.jar.sha1 @@ -0,0 +1 @@ +1200e7ebeedbe0d10062093f32925a912020e747 diff --git a/solr/licenses/testcontainers-1.20.4.jar.sha1 b/solr/licenses/testcontainers-1.20.4.jar.sha1 new file mode 100644 index 00000000000..29746a98e88 --- /dev/null +++ b/solr/licenses/testcontainers-1.20.4.jar.sha1 @@ -0,0 +1 @@ +ee2fe3afc9fa6cb2e6a43233998f3633f761692f diff --git a/solr/licenses/testcontainers-LICENSE-MIT.txt b/solr/licenses/testcontainers-LICENSE-MIT.txt new file mode 100644 index 00000000000..9cf106272ac --- /dev/null +++ b/solr/licenses/testcontainers-LICENSE-MIT.txt @@ -0,0 +1,19 @@ +MIT License + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or 
sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/solr/modules/extraction/build.gradle b/solr/modules/extraction/build.gradle index da6ebaccd68..0cbb4c0174f 100644 --- a/solr/modules/extraction/build.gradle +++ b/solr/modules/extraction/build.gradle @@ -35,11 +35,10 @@ dependencies { exclude group: 'org.quartz-scheduler', module: 'quartz' exclude group: 'xml-apis', module: 'xml-apis' }) - implementation (libs.xerces.impl, { - exclude group: 'xml-apis', module: 'xml-apis' - }) testImplementation project(':solr:test-framework') testImplementation libs.apache.lucene.testframework testImplementation libs.junit.junit + testImplementation libs.testcontainers + testImplementation libs.carrotsearch.randomizedtesting.runner } diff --git a/solr/modules/extraction/gradle.lockfile b/solr/modules/extraction/gradle.lockfile index 458aae19c39..5e498280731 100644 --- a/solr/modules/extraction/gradle.lockfile +++ b/solr/modules/extraction/gradle.lockfile @@ -15,6 +15,9 @@ com.fasterxml.jackson.module:jackson-module-jakarta-xmlbind-annotations:2.20.0=j com.fasterxml.jackson:jackson-bom:2.20.0=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,solrPlatformLibs,testCompileClasspath,testRuntimeClasspath 
com.fasterxml.woodstox:woodstox-core:7.0.0=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,solrPlatformLibs,testCompileClasspath,testRuntimeClasspath com.github.ben-manes.caffeine:caffeine:3.2.2=annotationProcessor,errorprone,jarValidation,runtimeClasspath,runtimeLibs,solrPlatformLibs,testAnnotationProcessor,testRuntimeClasspath +com.github.docker-java:docker-java-api:3.4.0=jarValidation,testCompileClasspath,testRuntimeClasspath +com.github.docker-java:docker-java-transport-zerodep:3.4.0=jarValidation,testCompileClasspath,testRuntimeClasspath +com.github.docker-java:docker-java-transport:3.4.0=jarValidation,testCompileClasspath,testRuntimeClasspath com.github.jai-imageio:jai-imageio-core:1.4.0=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,testCompileClasspath,testRuntimeClasspath com.github.junrar:junrar:7.5.3=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,testCompileClasspath,testRuntimeClasspath com.github.kevinstern:software-and-algorithms:1.0=annotationProcessor,errorprone,testAnnotationProcessor @@ -99,7 +102,8 @@ javax.inject:javax.inject:1=annotationProcessor,errorprone,testAnnotationProcess javax.measure:unit-api:1.0=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,testCompileClasspath,testRuntimeClasspath joda-time:joda-time:2.14.0=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,testCompileClasspath,testRuntimeClasspath junit:junit:4.13.2=jarValidation,testCompileClasspath,testRuntimeClasspath -net.java.dev.jna:jna:5.12.1=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,testCompileClasspath,testRuntimeClasspath +net.java.dev.jna:jna:5.12.1=compileClasspath,runtimeClasspath,runtimeLibs +net.java.dev.jna:jna:5.13.0=jarValidation,testCompileClasspath,testRuntimeClasspath net.sf.ehcache:ehcache-core:2.6.2=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,testCompileClasspath,testRuntimeClasspath 
org.antlr:antlr4-runtime:4.13.2=jarValidation,runtimeClasspath,runtimeLibs,solrPlatformLibs,testRuntimeClasspath org.apache.commons:commons-collections4:4.5.0=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,testCompileClasspath,testRuntimeClasspath @@ -215,6 +219,7 @@ org.hamcrest:hamcrest:3.0=jarValidation,testCompileClasspath,testRuntimeClasspat org.itadaki:bzip2:0.9.1=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,testCompileClasspath,testRuntimeClasspath org.javassist:javassist:3.30.2-GA=jarValidation,runtimeClasspath,runtimeLibs,solrPlatformLibs,testRuntimeClasspath org.jdom:jdom2:2.0.6.1=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,testCompileClasspath,testRuntimeClasspath +org.jetbrains:annotations:26.0.2=jarValidation,testCompileClasspath,testRuntimeClasspath org.jspecify:jspecify:1.0.0=annotationProcessor,compileClasspath,errorprone,jarValidation,runtimeClasspath,runtimeLibs,solrPlatformLibs,testAnnotationProcessor,testCompileClasspath,testRuntimeClasspath org.junit.jupiter:junit-jupiter-api:5.6.2=jarValidation,testRuntimeClasspath org.junit.platform:junit-platform-commons:1.6.2=jarValidation,testRuntimeClasspath @@ -226,6 +231,7 @@ org.ow2.asm:asm-commons:9.8=jarValidation,runtimeClasspath,runtimeLibs,solrPlatf org.ow2.asm:asm-tree:9.8=jarValidation,runtimeClasspath,runtimeLibs,solrPlatformLibs,testRuntimeClasspath org.ow2.asm:asm:9.8=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,solrPlatformLibs,testCompileClasspath,testRuntimeClasspath org.pcollections:pcollections:4.0.1=annotationProcessor,errorprone,testAnnotationProcessor +org.rnorth.duct-tape:duct-tape:1.0.8=jarValidation,testCompileClasspath,testRuntimeClasspath org.semver4j:semver4j:6.0.0=jarValidation,runtimeClasspath,runtimeLibs,solrPlatformLibs,testRuntimeClasspath org.slf4j:jcl-over-slf4j:2.0.17=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,solrPlatformLibs,testCompileClasspath,testRuntimeClasspath 
org.slf4j:jul-to-slf4j:2.0.17=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,solrPlatformLibs,testCompileClasspath,testRuntimeClasspath @@ -234,6 +240,7 @@ org.tallison.xmp:xmpcore-shaded:6.1.10=compileClasspath,jarValidation,runtimeCla org.tallison:isoparser:1.9.41.7=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,testCompileClasspath,testRuntimeClasspath org.tallison:jmatio:1.5=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,testCompileClasspath,testRuntimeClasspath org.tallison:metadata-extractor:2.17.1.0=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,testCompileClasspath,testRuntimeClasspath +org.testcontainers:testcontainers:1.20.4=jarValidation,testCompileClasspath,testRuntimeClasspath org.tukaani:xz:1.9=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,testCompileClasspath,testRuntimeClasspath org.xerial.snappy:snappy-java:1.1.10.8=jarValidation,runtimeClasspath,runtimeLibs,solrPlatformLibs,testRuntimeClasspath xerces:xercesImpl:2.12.2=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,testCompileClasspath,testRuntimeClasspath diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/DummyExtractionBackend.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/DummyExtractionBackend.java index 4d3955b4b1b..864bba00fdd 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/DummyExtractionBackend.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/DummyExtractionBackend.java @@ -21,6 +21,7 @@ /** Dummy backend that emits predictable test data without actually parsing input content. 
*/ public class DummyExtractionBackend implements ExtractionBackend { public static final String ID = "dummy"; + @Override public String name() { return ID; diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java index ff4bddd0039..224ee54f0ac 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java @@ -66,12 +66,15 @@ public void inform(SolrCore core) { // Initialize backend factory once; backends are created lazily on demand String tikaServerUrl = (String) initArgs.get(TIKASERVER_URL); - backendFactory = new ExtractionBackendFactory(core, tikaConfigLoc, parseContextConfig, tikaServerUrl); + backendFactory = + new ExtractionBackendFactory(core, tikaConfigLoc, parseContextConfig, tikaServerUrl); // Choose default backend name (do not instantiate yet) String backendName = (String) initArgs.get(ExtractingParams.EXTRACTION_BACKEND); defaultBackendName = - (backendName == null || backendName.trim().isEmpty()) ? LocalTikaExtractionBackend.ID : backendName; + (backendName == null || backendName.trim().isEmpty()) + ? 
LocalTikaExtractionBackend.ID + : backendName; } catch (Exception e) { throw new SolrException( diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackendFactory.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackendFactory.java index 558e5cd7f72..38033d8b935 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackendFactory.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackendFactory.java @@ -33,7 +33,10 @@ public class ExtractionBackendFactory { private final Map cache = new ConcurrentHashMap<>(); public ExtractionBackendFactory( - SolrCore core, String tikaConfigLoc, ParseContextConfig parseContextConfig, String tikaServerUrl) { + SolrCore core, + String tikaConfigLoc, + ParseContextConfig parseContextConfig, + String tikaServerUrl) { this.core = core; this.tikaConfigLoc = tikaConfigLoc; this.parseContextConfig = parseContextConfig; @@ -67,7 +70,8 @@ protected ExtractionBackend create(String normalizedName) throws Exception { case DummyExtractionBackend.ID: return new DummyExtractionBackend(); case TikaServerExtractionBackend.ID: - return new TikaServerExtractionBackend(tikaServerUrl != null ? tikaServerUrl : "http://localhost:9998"); + return new TikaServerExtractionBackend( + tikaServerUrl != null ? 
tikaServerUrl : "http://localhost:9998"); default: // Fallback to local for unknown names return new LocalTikaExtractionBackend(core, tikaConfigLoc, parseContextConfig); diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/TikaServerExtractionBackend.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/TikaServerExtractionBackend.java index 33ac66e7d86..c37cd1ba76c 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/TikaServerExtractionBackend.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/TikaServerExtractionBackend.java @@ -30,8 +30,8 @@ * Extraction backend that delegates parsing to a remote Apache Tika Server. * *

This backend uses Java 11 HttpClient to call Tika Server endpoints. It supports - * backend-neutral extract() and extractOnly() operations. Legacy SAX-based parsing - * is not supported and will throw UnsupportedOperationException. + * backend-neutral extract() and extractOnly() operations. Legacy SAX-based parsing is not supported + * and will throw UnsupportedOperationException. */ public class TikaServerExtractionBackend implements ExtractionBackend { private final HttpClient httpClient; @@ -62,11 +62,14 @@ public String name() { @Override public ExtractionResult extract(InputStream inputStream, ExtractionRequest request) throws Exception { + // Buffer the input so we can send it to multiple Tika Server endpoints + byte[] data = inputStream.readAllBytes(); + // 1) Extract text - String text = requestText(inputStream, request, false, null); + String text = requestText(data, request, false, null); // 2) Fetch metadata as JSON and convert to neutral metadata - ExtractionMetadata md = fetchMetadata(request); + ExtractionMetadata md = fetchMetadata(data, request); return new ExtractionResult(text, md); } @@ -76,11 +79,15 @@ public ExtractionResult extractOnly( InputStream inputStream, ExtractionRequest request, String extractFormat, String xpathExpr) throws Exception { if (xpathExpr != null) { - throw new UnsupportedOperationException("XPath filtering is not supported by TikaServer backend"); + throw new UnsupportedOperationException( + "XPath filtering is not supported by TikaServer backend"); } + // Buffer the input so we can send it to multiple Tika Server endpoints + byte[] data = inputStream.readAllBytes(); + boolean wantXml = !ExtractingDocumentLoader.TEXT_FORMAT.equalsIgnoreCase(extractFormat); - String content = requestText(inputStream, request, wantXml, xpathExpr); - ExtractionMetadata md = fetchMetadata(request); + String content = requestText(data, request, wantXml, xpathExpr); + ExtractionMetadata md = fetchMetadata(data, request); return new 
ExtractionResult(content, md); } @@ -95,11 +102,14 @@ public void parseToSolrContentHandler( "Legacy SAX-based parsing is not supported by TikaServer backend"); } - private String requestText( - InputStream inputStream, ExtractionRequest request, boolean wantXml, String xpath) + private String requestText(byte[] data, ExtractionRequest request, boolean wantXml, String xpath) throws IOException, InterruptedException { - String url = baseUrl + "/tika"; - HttpRequest.Builder b = HttpRequest.newBuilder(URI.create(url)).timeout(timeout).POST(HttpRequest.BodyPublishers.ofInputStream(() -> inputStream)); + String path = wantXml ? "/tika/xhtml" : "/tika/text"; + String url = baseUrl + path; + HttpRequest.Builder b = + HttpRequest.newBuilder(URI.create(url)) + .timeout(timeout) + .PUT(HttpRequest.BodyPublishers.ofByteArray(data)); // Content-Type String contentType = firstNonNull(request.streamType, request.contentType); if (contentType != null) { @@ -109,8 +119,7 @@ private String requestText( if (request.resourceName != null) { b.header("Content-Disposition", "attachment; filename=\"" + request.resourceName + "\""); } - // Response type - b.header("Accept", wantXml ? "application/xml" : "text/plain"); + // Do not set Accept, let server choose default representation for the endpoint HttpResponse resp = httpClient.send(b.build(), HttpResponse.BodyHandlers.ofByteArray()); int code = resp.statusCode(); @@ -120,11 +129,14 @@ private String requestText( return new String(resp.body(), StandardCharsets.UTF_8); } - private ExtractionMetadata fetchMetadata(ExtractionRequest request) + private ExtractionMetadata fetchMetadata(byte[] data, ExtractionRequest request) throws IOException, InterruptedException { - // Call /meta to get metadata. Ask JSON form; Tika Server returns application/json map. + // Call /meta to get metadata for the provided content. Ask JSON form. 
String url = baseUrl + "/meta"; - HttpRequest.Builder b = HttpRequest.newBuilder(URI.create(url)).timeout(timeout).POST(HttpRequest.BodyPublishers.noBody()); + HttpRequest.Builder b = + HttpRequest.newBuilder(URI.create(url)) + .timeout(timeout) + .PUT(HttpRequest.BodyPublishers.ofByteArray(data)); String contentType = firstNonNull(request.streamType, request.contentType); if (contentType != null) { b.header("Content-Type", contentType); @@ -134,7 +146,8 @@ private ExtractionMetadata fetchMetadata(ExtractionRequest request) } b.header("Accept", "application/json"); - HttpResponse resp = httpClient.send(b.build(), HttpResponse.BodyHandlers.ofString(StandardCharsets.UTF_8)); + HttpResponse resp = + httpClient.send(b.build(), HttpResponse.BodyHandlers.ofString(StandardCharsets.UTF_8)); int code = resp.statusCode(); if (code < 200 || code >= 300) { throw new IOException("TikaServer /meta returned status " + code); diff --git a/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/TikaServerExtractionBackendTest.java b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/TikaServerExtractionBackendTest.java index 19846da1142..5c51ccba38f 100644 --- a/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/TikaServerExtractionBackendTest.java +++ b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/TikaServerExtractionBackendTest.java @@ -16,127 +16,95 @@ */ package org.apache.solr.handler.extraction; +import com.carrotsearch.randomizedtesting.ThreadFilter; +import com.carrotsearch.randomizedtesting.annotations.ThreadLeakFilters; import java.io.ByteArrayInputStream; -import java.io.IOException; -import java.io.InputStream; -import java.net.CookieHandler; -import java.net.ProxySelector; -import java.net.URI; import java.net.http.HttpClient; -import java.net.http.HttpHeaders; -import java.net.http.HttpRequest; -import java.net.http.HttpResponse; -import java.security.SecureRandom; -import java.time.Duration; -import 
java.util.Optional; -import java.util.concurrent.CompletableFuture; -import java.util.concurrent.Executor; -import javax.net.ssl.SSLContext; -import javax.net.ssl.SSLParameters; +import java.util.concurrent.ExecutorService; +import org.apache.lucene.tests.util.QuickPatchThreadsFilter; +import org.apache.solr.SolrIgnoredThreadsFilter; import org.apache.solr.SolrTestCaseJ4; +import org.apache.solr.common.util.ExecutorUtil; +import org.junit.AfterClass; +import org.junit.Assume; +import org.junit.BeforeClass; import org.junit.Test; +import org.testcontainers.containers.GenericContainer; -/** Unit tests for TikaServerExtractionBackend using a mocked HttpClient (no networking). */ +/** + * Integration tests for TikaServerExtractionBackend using a real Tika Server via Testcontainers. + */ +@ThreadLeakFilters( + defaultFilters = true, + filters = { + SolrIgnoredThreadsFilter.class, + QuickPatchThreadsFilter.class, + TikaServerExtractionBackendTest.TestcontainersThreadsFilter.class + }) public class TikaServerExtractionBackendTest extends SolrTestCaseJ4 { + // Ignore known non-daemon threads spawned by Testcontainers and Java HttpClient in this test + public static class TestcontainersThreadsFilter implements ThreadFilter { + @Override + public boolean reject(Thread t) { + if (t == null || t.getName() == null) return false; + String n = t.getName(); + return n.startsWith("testcontainers-ryuk") + || n.startsWith("testcontainers-wait-") + || n.startsWith("HttpClient-") + || n.startsWith("HttpClient-TestContainers"); + } + } + static { // Allow the SecureRandom algorithm used in this environment to avoid class configuration // failure in tests. 
System.setProperty("test.solr.allowed.securerandom", "NativePRNG"); } - private static class FakeHttpClient extends HttpClient { - @Override - public Optional cookieHandler() { return Optional.empty(); } - - @Override - public Optional connectTimeout() { return Optional.of(Duration.ofSeconds(5)); } - - @Override - public Redirect followRedirects() { return Redirect.NEVER; } - - @Override - public Optional proxy() { return Optional.empty(); } - - @Override - public SSLContext sslContext() { try { return SSLContext.getDefault(); } catch (Exception e) { throw new RuntimeException(e);} } - - @Override - public SSLParameters sslParameters() { return new SSLParameters(); } - - @Override - public Optional executor() { return Optional.empty(); } - - @Override - public Optional authenticator() { return Optional.empty(); } - - @Override - public Version version() { return Version.HTTP_1_1; } - - @Override - public HttpResponse send(HttpRequest request, HttpResponse.BodyHandler responseBodyHandler) - throws IOException, InterruptedException { - return respond(request, responseBodyHandler); - } - - @Override - public CompletableFuture> sendAsync(HttpRequest request, HttpResponse.BodyHandler responseBodyHandler) { - return CompletableFuture.completedFuture(respond(request, responseBodyHandler)); + private static GenericContainer tika; + private static String baseUrl; + private static ExecutorService httpExec; + private static HttpClient client; + + @BeforeClass + public static void startTikaServer() { + try { + httpExec = + ExecutorUtil.newMDCAwareFixedThreadPool( + 2, + r -> { + Thread t = new Thread(r, "HttpClient-TestContainers"); + t.setDaemon(true); + return t; + }); + client = HttpClient.newBuilder().executor(httpExec).build(); + tika = new GenericContainer<>("apache/tika:3.2.3.0-full").withExposedPorts(9998); + tika.start(); + baseUrl = "http://" + tika.getHost() + ":" + tika.getMappedPort(9998); + } catch (Throwable t) { + // Skip tests if Docker/Testcontainers are not 
available in the environment + Assume.assumeNoException("Docker/Testcontainers not available; skipping TikaServer tests", t); } + } - @Override - public CompletableFuture> sendAsync(HttpRequest request, HttpResponse.BodyHandler responseBodyHandler, HttpResponse.PushPromiseHandler pushPromiseHandler) { - return CompletableFuture.completedFuture(respond(request, responseBodyHandler)); + @AfterClass + public static void stopTikaServer() { + if (tika != null) { + try { + tika.stop(); + } catch (Throwable ignore) { + } + tika = null; } - - private HttpResponse respond(HttpRequest request, HttpResponse.BodyHandler handler) { + if (httpExec != null) { try { - URI uri = request.uri(); - String path = uri.getPath(); - byte[] body; - String ct; - int sc = 200; - if ("/tika".equals(path)) { - String accept = request.headers().firstValue("Accept").orElse("text/plain"); - if ("application/xml".equalsIgnoreCase(accept)) { - String xml = "XML OUT"; - body = xml.getBytes(java.nio.charset.StandardCharsets.UTF_8); - ct = "application/xml"; - } else { - body = "TEXT OUT".getBytes(java.nio.charset.StandardCharsets.UTF_8); - ct = "text/plain"; - } - } else if ("/meta".equals(path)) { - String json = - "{\"Content-Type\":[\"text/plain\"],\"resourcename\":[\"test.txt\"],\"X-Parsed-By\":[\"SomeParser\"]}"; - body = json.getBytes(java.nio.charset.StandardCharsets.UTF_8); - ct = "application/json"; - } else { - body = "Not Found".getBytes(java.nio.charset.StandardCharsets.UTF_8); - sc = 404; - ct = "text/plain"; - } - final int status = sc; - final String contentType = ct; - // Decide expected body type based on endpoint (mimics our backend usage) - final Object bodyObj = - "/meta".equals(path) - ? 
new String(body, java.nio.charset.StandardCharsets.UTF_8) - : body; // /tika returns bytes - return new HttpResponse<>() { - @Override public int statusCode() { return status; } - @Override public HttpRequest request() { return request; } - @Override public Optional> previousResponse() { return Optional.empty(); } - @Override public HttpHeaders headers() { return HttpHeaders.of(java.util.Map.of("Content-Type", java.util.List.of(contentType)), (k,v)->true); } - @Override public T body() { @SuppressWarnings("unchecked") T t = (T) bodyObj; return t; } - @Override public Optional sslSession() { return Optional.empty(); } - @Override public URI uri() { return uri; } - @Override public Version version() { return Version.HTTP_1_1; } - }; - } catch (Exception e) { - throw new RuntimeException(e); + httpExec.shutdownNow(); + } catch (Throwable ignore) { } + httpExec = null; } + client = null; } private static ExtractionRequest newRequest(String resourceName, String contentType) { @@ -155,35 +123,48 @@ private static ExtractionRequest newRequest(String resourceName, String contentT @Test public void testExtractTextAndMetadata() throws Exception { - TikaServerExtractionBackend backend = new TikaServerExtractionBackend(new FakeHttpClient(), "http://example"); - byte[] data = "dummy".getBytes(java.nio.charset.StandardCharsets.UTF_8); + Assume.assumeTrue("Tika server container not started", tika != null); + TikaServerExtractionBackend backend = new TikaServerExtractionBackend(client, baseUrl); + byte[] data = "Hello TestContainers".getBytes(java.nio.charset.StandardCharsets.UTF_8); try (ByteArrayInputStream in = new ByteArrayInputStream(data)) { ExtractionResult res = backend.extract(in, newRequest("test.txt", "text/plain")); assertNotNull(res); - assertEquals("TEXT OUT", res.getContent()); + assertNotNull(res.getContent()); + assertTrue(res.getContent().contains("Hello TestContainers")); assertNotNull(res.getMetadata()); - assertArrayEquals(new String[] {"text/plain"}, 
res.getMetadata().getValues("Content-Type")); - assertArrayEquals(new String[] {"test.txt"}, res.getMetadata().getValues("resourcename")); + String[] cts = res.getMetadata().getValues("Content-Type"); + assertNotNull(cts); + assertTrue(cts.length >= 1); + // Tika may append charset; be flexible + assertTrue(cts[0].startsWith("text/plain")); } } @Test public void testExtractOnlyXml() throws Exception { - TikaServerExtractionBackend backend = new TikaServerExtractionBackend(new FakeHttpClient(), "http://example"); - byte[] data = "dummy".getBytes(java.nio.charset.StandardCharsets.UTF_8); + Assume.assumeTrue("Tika server container not started", tika != null); + TikaServerExtractionBackend backend = new TikaServerExtractionBackend(client, baseUrl); + byte[] data = "Hello XML".getBytes(java.nio.charset.StandardCharsets.UTF_8); try (ByteArrayInputStream in = new ByteArrayInputStream(data)) { ExtractionResult res = backend.extractOnly( in, newRequest("test.txt", "text/plain"), ExtractingDocumentLoader.XML_FORMAT, null); assertNotNull(res); - assertTrue(res.getContent().contains(" Date: Fri, 19 Sep 2025 17:57:19 +0200 Subject: [PATCH 07/47] Draft docs --- .../pages/indexing-with-tika.adoc | 33 +++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/solr/solr-ref-guide/modules/indexing-guide/pages/indexing-with-tika.adoc b/solr/solr-ref-guide/modules/indexing-guide/pages/indexing-with-tika.adoc index b0cdb7eba30..183af23e30b 100644 --- a/solr/solr-ref-guide/modules/indexing-guide/pages/indexing-with-tika.adoc +++ b/solr/solr-ref-guide/modules/indexing-guide/pages/indexing-with-tika.adoc @@ -18,9 +18,9 @@ If the documents you need to index are in a binary format, such as Word, Excel, PDFs, etc., Solr includes a request handler which uses http://tika.apache.org/[Apache Tika] to extract text for indexing to Solr. 
-Solr uses code from the Tika project to provide a framework for incorporating many different file-format parsers such as http://pdfbox.apache.org/[Apache PDFBox] and http://poi.apache.org/index.html[Apache POI] into Solr itself. +There are two backends for this module. The `local` backend uses code from the Tika project to provide a framework for incorporating many different file-format parsers such as http://pdfbox.apache.org/[Apache PDFBox] and http://poi.apache.org/index.html[Apache POI] into Solr itself. The `tikaserver` backend uses an external Tika server process to do the extraction. -Working with this framework, Solr's `ExtractingRequestHandler` uses Tika internally to support uploading binary files +Working with this framework, Solr's `ExtractingRequestHandler` uses Tika to support uploading binary files for data extraction and indexing. Downloading Tika is not required to use Solr Cell. @@ -49,6 +49,9 @@ By default it maps to the same name but several parameters control how this is d * When Solr Cell finishes creating the internal `SolrInputDocument`, the rest of the indexing stack takes over. The next step after any update handler is the xref:configuration-guide:update-request-processors.adoc[Update Request Processor] chain. +== Tika Server + +TODO: Add documentation about Tika Server backend. == Module @@ -170,6 +173,32 @@ The following parameters are accepted by the `ExtractingRequestHandler`. These parameters can be set for each indexing request (as request parameters), or they can be set for all requests to the request handler by defining them in <>. +`extraction.backend`:: ++ +[%autowidth,frame=none] +|=== +|Optional |Default: local +|=== ++ +Choose the backend to use for extraction. The options are `local` or `tikaserver`. +The `local` backend uses Tika libraries included with Solr to do the extraction, and is the default in Solr 9. +The `tikaserver` backend uses an external Tika server process to do the extraction. 
+**The `local` backend is deprecated and will be removed in a future release.** ++ +Example: In `solrconfig.xml`: `tikaserver`. + +`tikaserver.url`:: ++ +[%autowidth,frame=none] +|=== +|Optional |Default: none +|=== ++ +Specifies the URL of the Tika server to use when the `extraction.backend` parameter is set to `tikaserver`. +This parameter is required when using the `tikaserver` backend. ++ +Example: In `solrconfig.xml`: `http://my.tika.server`. + `capture`:: + [%autowidth,frame=none] From a3794cee910514be522f9661b41386a10479c9e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=C3=B8ydahl?= Date: Fri, 19 Sep 2025 18:58:50 +0200 Subject: [PATCH 08/47] Use json response from Tika Cleanup TestContainer Refactor ExtractionMetadata Add returnType to ExtractionRequest Remove static initializers --- .../extraction/DummyExtractionBackend.java | 4 +- .../extraction/ExtractingDocumentLoader.java | 14 +- .../handler/extraction/ExtractionBackend.java | 3 +- .../extraction/ExtractionMetadata.java | 66 ++++- .../handler/extraction/ExtractionRequest.java | 5 +- .../LocalTikaExtractionBackend.java | 7 +- .../extraction/SimpleExtractionMetadata.java | 52 ---- .../TikaServerExtractionBackend.java | 257 ++++++++++-------- .../solr/collection1/conf/solrconfig.xml | 2 + .../ExtractingRequestHandlerTest.java | 72 ++++- .../LocalTikaExtractionBackendTest.java | 18 +- .../TikaServerExtractionBackendTest.java | 23 +- 12 files changed, 304 insertions(+), 219 deletions(-) delete mode 100644 solr/modules/extraction/src/java/org/apache/solr/handler/extraction/SimpleExtractionMetadata.java diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/DummyExtractionBackend.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/DummyExtractionBackend.java index 864bba00fdd..33ae55c63c8 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/DummyExtractionBackend.java +++ 
b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/DummyExtractionBackend.java @@ -29,7 +29,7 @@ public String name() { @Override public ExtractionResult extract(InputStream inputStream, ExtractionRequest request) { - ExtractionMetadata metadata = new SimpleExtractionMetadata(); + ExtractionMetadata metadata = new ExtractionMetadata(); metadata.add("Dummy-Backend", "true"); metadata.add( "Content-Type", @@ -43,7 +43,7 @@ public ExtractionResult extract(InputStream inputStream, ExtractionRequest reque @Override public ExtractionResult extractOnly( - InputStream inputStream, ExtractionRequest request, String extractFormat, String xpathExpr) { + InputStream inputStream, ExtractionRequest request, String xpathExpr) { if (xpathExpr != null) { throw new UnsupportedOperationException("XPath not supported by dummy backend"); } diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java index b6a74008ff5..2214059e2f9 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java @@ -102,6 +102,8 @@ public void load( String xpathExpr = params.get(ExtractingParams.XPATH_EXPRESSION); boolean extractOnly = params.getBool(ExtractingParams.EXTRACT_ONLY, false); + String extractFormat = + params.get(ExtractingParams.EXTRACT_FORMAT, extractOnly ? 
XML_FORMAT : TEXT_FORMAT); // Parse optional passwords file into a map (keeps Tika usages out of this class) LinkedHashMap pwMap = null; @@ -122,7 +124,8 @@ public void load( stream.getSourceInfo(), stream.getSize(), params.get(ExtractingParams.RESOURCE_PASSWORD, null), - pwMap); + pwMap, + extractFormat); boolean captureAttr = params.getBool(ExtractingParams.CAPTURE_ATTRIBUTES, false); String[] captureElems = params.getParams(ExtractingParams.CAPTURE_ELEMENTS); @@ -135,10 +138,8 @@ public void load( || (passwordsFile != null); if (extractOnly) { - String extractFormat = params.get(ExtractingParams.EXTRACT_FORMAT, XML_FORMAT); try { - ExtractionResult result = - backend.extractOnly(inputStream, extractionRequest, extractFormat, xpathExpr); + ExtractionResult result = backend.extractOnly(inputStream, extractionRequest, xpathExpr); // Write content rsp.add(stream.getName(), result.getContent()); // Write metadata @@ -165,7 +166,7 @@ public void load( if (needLegacySax) { // Indexing with capture/xpath/etc: delegate SAX parse to backend - SimpleExtractionMetadata neutral = new SimpleExtractionMetadata(); + ExtractionMetadata neutral = new ExtractionMetadata(); SolrContentHandler handler = factory.createSolrContentHandler(neutral, params, req.getSchema()); try { @@ -194,8 +195,7 @@ public void load( log.warn("skip extracting text due to {}.", e.getLocalizedMessage(), e); // Index a document with literals only (no extracted content/metadata) SolrContentHandler handler = - factory.createSolrContentHandler( - new SimpleExtractionMetadata(), params, req.getSchema()); + factory.createSolrContentHandler(new ExtractionMetadata(), params, req.getSchema()); addDoc(handler); return; } diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackend.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackend.java index 3a253dc1ec3..715c73636e2 100644 --- 
a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackend.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackend.java @@ -32,8 +32,7 @@ public interface ExtractionBackend { * xpathExpr; if unsupported and xpathExpr is not null, they should throw * UnsupportedOperationException. */ - ExtractionResult extractOnly( - InputStream inputStream, ExtractionRequest request, String extractFormat, String xpathExpr) + ExtractionResult extractOnly(InputStream inputStream, ExtractionRequest request, String xpathExpr) throws Exception; /** diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionMetadata.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionMetadata.java index b5864ec05c3..6229089d502 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionMetadata.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionMetadata.java @@ -16,16 +16,64 @@ */ package org.apache.solr.handler.extraction; -/** - * Neutral metadata container used by extraction backends. Provides minimal operations needed by - * SolrContentHandler and response building without depending on Apache Tika's Metadata class. 
- */ -public interface ExtractionMetadata { - void add(String name, String value); +import java.util.ArrayList; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Objects; + +/** Simple metadata bean */ +public class ExtractionMetadata { + private final Map> map = new LinkedHashMap<>(); + + public void add(String name, String value) { + if (name == null || value == null) return; + map.computeIfAbsent(name, k -> new ArrayList<>()).add(value); + } + + public String[] getValues(String name) { + List vals = map.get(name); + if (vals == null) return new String[0]; + return vals.toArray(new String[0]); + } + + public String get(String name) { + List vals = map.get(name); + if (vals == null || vals.isEmpty()) return null; + return vals.get(0); + } + + public String[] names() { + return map.keySet().toArray(new String[0]); + } + + public void remove(String name) { + map.remove(name); + } - String[] getValues(String name); + @Override + public String toString() { + StringBuilder sb = new StringBuilder("ExtractionMetadata{"); + boolean first = true; + for (Map.Entry> e : map.entrySet()) { + if (!first) sb.append(", "); + first = false; + sb.append(e.getKey()).append('=').append(e.getValue()); + } + sb.append('}'); + return sb.toString(); + } - String get(String name); + @Override + public boolean equals(Object obj) { + if (this == obj) return true; + if (!(obj instanceof ExtractionMetadata)) return false; + ExtractionMetadata that = (ExtractionMetadata) obj; + return Objects.equals(this.map, that.map); + } - String[] names(); + @Override + public int hashCode() { + return Objects.hash(map); + } } diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionRequest.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionRequest.java index f1af3029193..010f6633472 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionRequest.java +++ 
b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionRequest.java @@ -28,6 +28,7 @@ public class ExtractionRequest { public final String resourcePassword; // optional password for encrypted docs public final java.util.LinkedHashMap passwordsMap; // optional passwords map + public final String extractFormat; public ExtractionRequest( String streamType, @@ -38,7 +39,8 @@ public ExtractionRequest( String streamSourceInfo, Long streamSize, String resourcePassword, - java.util.LinkedHashMap passwordsMap) { + java.util.LinkedHashMap passwordsMap, + String extractFormat) { this.streamType = streamType; this.resourceName = resourceName; this.contentType = contentType; @@ -48,5 +50,6 @@ public ExtractionRequest( this.streamSize = streamSize; this.resourcePassword = resourcePassword; this.passwordsMap = passwordsMap; + this.extractFormat = extractFormat; } } diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/LocalTikaExtractionBackend.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/LocalTikaExtractionBackend.java index 687f0e6cc1e..e91716e1652 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/LocalTikaExtractionBackend.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/LocalTikaExtractionBackend.java @@ -144,7 +144,7 @@ private ParseContext buildContext(Parser parser, ExtractionRequest request) { } private static ExtractionMetadata copyToNeutral(Metadata md) { - ExtractionMetadata out = new SimpleExtractionMetadata(); + ExtractionMetadata out = new ExtractionMetadata(); for (String name : md.names()) { String[] vals = md.getValues(name); if (vals != null) for (String v : vals) out.add(name, v); @@ -168,8 +168,7 @@ public ExtractionResult extract(InputStream inputStream, ExtractionRequest reque @Override public ExtractionResult extractOnly( - InputStream inputStream, ExtractionRequest request, String extractFormat, String xpathExpr) - 
throws Exception { + InputStream inputStream, ExtractionRequest request, String xpathExpr) throws Exception { Parser parser = selectParser(request); if (parser == null) { throw new IllegalArgumentException("No Tika parser for stream type: " + request.streamType); @@ -178,7 +177,7 @@ public ExtractionResult extractOnly( ParseContext context = buildContext(parser, request); String content; - if (ExtractingDocumentLoader.TEXT_FORMAT.equals(extractFormat) || xpathExpr != null) { + if (ExtractingDocumentLoader.TEXT_FORMAT.equals(request.extractFormat) || xpathExpr != null) { org.apache.tika.sax.ToTextContentHandler textHandler = new org.apache.tika.sax.ToTextContentHandler(); org.xml.sax.ContentHandler ch = textHandler; diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/SimpleExtractionMetadata.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/SimpleExtractionMetadata.java deleted file mode 100644 index d414b2eb05b..00000000000 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/SimpleExtractionMetadata.java +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.solr.handler.extraction; - -import java.util.ArrayList; -import java.util.LinkedHashMap; -import java.util.List; -import java.util.Map; - -/** Simple in-memory implementation of ExtractionMetadata. */ -public class SimpleExtractionMetadata implements ExtractionMetadata { - private final Map> map = new LinkedHashMap<>(); - - @Override - public void add(String name, String value) { - if (name == null || value == null) return; - map.computeIfAbsent(name, k -> new ArrayList<>()).add(value); - } - - @Override - public String[] getValues(String name) { - List vals = map.get(name); - if (vals == null) return new String[0]; - return vals.toArray(new String[0]); - } - - @Override - public String get(String name) { - List vals = map.get(name); - if (vals == null || vals.isEmpty()) return null; - return vals.get(0); - } - - @Override - public String[] names() { - return map.keySet().toArray(new String[0]); - } -} diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/TikaServerExtractionBackend.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/TikaServerExtractionBackend.java index c37cd1ba76c..ba12680ce7b 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/TikaServerExtractionBackend.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/TikaServerExtractionBackend.java @@ -24,6 +24,8 @@ import java.net.http.HttpResponse; import java.nio.charset.StandardCharsets; import java.time.Duration; +import java.util.Arrays; +import java.util.Set; import org.noggit.JSONParser; /** @@ -62,33 +64,43 @@ public String name() { @Override public ExtractionResult extract(InputStream inputStream, ExtractionRequest request) throws Exception { - // Buffer the input so we can send it to multiple Tika Server endpoints - byte[] data = inputStream.readAllBytes(); - - // 1) Extract text - String text = requestText(data, request, false, null); - - // 2) Fetch metadata as 
JSON and convert to neutral metadata - ExtractionMetadata md = fetchMetadata(data, request); + String url = + baseUrl + + "/tika/" + + (Set.of("html", "xml").contains(request.extractFormat) ? "html" : "text"); + HttpRequest.Builder b = + HttpRequest.newBuilder(URI.create(url)) + .timeout(timeout) + .header("Accept", "application/json"); + String contentType = firstNonNull(request.streamType, request.contentType); + if (contentType != null) { + b.header("Content-Type", contentType); + } + if (request.resourceName != null) { + b.header("Content-Disposition", "attachment; filename=\"" + request.resourceName + "\""); + } + b.PUT(HttpRequest.BodyPublishers.ofInputStream(() -> inputStream)); - return new ExtractionResult(text, md); + HttpResponse resp = + httpClient.send(b.build(), HttpResponse.BodyHandlers.ofString(StandardCharsets.UTF_8)); + int code = resp.statusCode(); + if (code < 200 || code >= 300) { + String body = resp.body(); + String preview = body == null ? "" : body.substring(0, Math.min(body.length(), 512)); + throw new IOException("TikaServer " + url + " returned status " + code + " body: " + preview); + } + String body = resp.body(); + return parseCombinedJson(body); } @Override public ExtractionResult extractOnly( - InputStream inputStream, ExtractionRequest request, String extractFormat, String xpathExpr) - throws Exception { + InputStream inputStream, ExtractionRequest request, String xpathExpr) throws Exception { if (xpathExpr != null) { throw new UnsupportedOperationException( "XPath filtering is not supported by TikaServer backend"); } - // Buffer the input so we can send it to multiple Tika Server endpoints - byte[] data = inputStream.readAllBytes(); - - boolean wantXml = !ExtractingDocumentLoader.TEXT_FORMAT.equalsIgnoreCase(extractFormat); - String content = requestText(data, request, wantXml, xpathExpr); - ExtractionMetadata md = fetchMetadata(data, request); - return new ExtractionResult(content, md); + return extract(inputStream, request); } 
@Override @@ -96,128 +108,145 @@ public void parseToSolrContentHandler( InputStream inputStream, ExtractionRequest request, SolrContentHandler handler, - ExtractionMetadata outMetadata) - throws Exception { + ExtractionMetadata outMetadata) { throw new UnsupportedOperationException( "Legacy SAX-based parsing is not supported by TikaServer backend"); } - private String requestText(byte[] data, ExtractionRequest request, boolean wantXml, String xpath) - throws IOException, InterruptedException { - String path = wantXml ? "/tika/xhtml" : "/tika/text"; - String url = baseUrl + path; - HttpRequest.Builder b = - HttpRequest.newBuilder(URI.create(url)) - .timeout(timeout) - .PUT(HttpRequest.BodyPublishers.ofByteArray(data)); - // Content-Type - String contentType = firstNonNull(request.streamType, request.contentType); - if (contentType != null) { - b.header("Content-Type", contentType); - } - // Filename hint - if (request.resourceName != null) { - b.header("Content-Disposition", "attachment; filename=\"" + request.resourceName + "\""); - } - // Do not set Accept, let server choose default representation for the endpoint - - HttpResponse resp = httpClient.send(b.build(), HttpResponse.BodyHandlers.ofByteArray()); - int code = resp.statusCode(); - if (code < 200 || code >= 300) { - throw new IOException("TikaServer /tika returned status " + code); - } - return new String(resp.body(), StandardCharsets.UTF_8); + private static String firstNonNull(String a, String b) { + return a != null ? a : b; } - private ExtractionMetadata fetchMetadata(byte[] data, ExtractionRequest request) - throws IOException, InterruptedException { - // Call /meta to get metadata for the provided content. Ask JSON form. 
- String url = baseUrl + "/meta"; - HttpRequest.Builder b = - HttpRequest.newBuilder(URI.create(url)) - .timeout(timeout) - .PUT(HttpRequest.BodyPublishers.ofByteArray(data)); - String contentType = firstNonNull(request.streamType, request.contentType); - if (contentType != null) { - b.header("Content-Type", contentType); - } - if (request.resourceName != null) { - b.header("Content-Disposition", "attachment; filename=\"" + request.resourceName + "\""); - } - b.header("Accept", "application/json"); - - HttpResponse resp = - httpClient.send(b.build(), HttpResponse.BodyHandlers.ofString(StandardCharsets.UTF_8)); - int code = resp.statusCode(); - if (code < 200 || code >= 300) { - throw new IOException("TikaServer /meta returned status " + code); + // Reads key-values of the current object into md. Assumes the parser is positioned + // right after OBJECT_START of that object. + private static ExtractionMetadata parseMetadataObject(JSONParser p) throws java.io.IOException { + ExtractionMetadata md = new ExtractionMetadata(); + String currentKey; + while (true) { + int ev = p.nextEvent(); + if (ev == JSONParser.OBJECT_END || ev == JSONParser.EOF) { + break; + } + if (ev == JSONParser.STRING && p.wasKey()) { + currentKey = p.getString(); + ev = p.nextEvent(); + if (ev == JSONParser.STRING) { + md.add(currentKey, p.getString()); + } else if (ev == JSONParser.ARRAY_START) { + while (true) { + ev = p.nextEvent(); + if (ev == JSONParser.ARRAY_END) break; + if (ev == JSONParser.STRING) { + md.add(currentKey, p.getString()); + } else if (ev == JSONParser.LONG + || ev == JSONParser.NUMBER + || ev == JSONParser.BIGNUMBER) { + md.add(currentKey, p.getNumberChars().toString()); + } else if (ev == JSONParser.BOOLEAN) { + md.add(currentKey, String.valueOf(p.getBoolean())); + } else if (ev == JSONParser.NULL) { + // ignore nulls + } else { + // skip nested objects or unsupported types within arrays + } + } + } else if (ev == JSONParser.LONG || ev == JSONParser.NUMBER || ev == 
JSONParser.BIGNUMBER) { + md.add(currentKey, p.getNumberChars().toString()); + } else if (ev == JSONParser.BOOLEAN) { + md.add(currentKey, String.valueOf(p.getBoolean())); + } else if (ev == JSONParser.NULL) { + // ignore nulls + } else if (ev == JSONParser.OBJECT_START) { + // Unexpected nested object; skip it entirely + skipObject(p); + } else { + // skip unsupported value types + } + } } - return parseJsonToMetadata(resp.body()); + return md; } - private static String firstNonNull(String a, String b) { - return a != null ? a : b; + private static void skipObject(JSONParser p) throws java.io.IOException { + int depth = 1; + while (depth > 0) { + int ev = p.nextEvent(); + if (ev == JSONParser.OBJECT_START) depth++; + else if (ev == JSONParser.OBJECT_END) depth--; + else if (ev == JSONParser.EOF) break; + } } - // Parse Tika Server metadata JSON using Noggit JSONParser. Supports values as strings, - // arrays of strings, and basic scalars (numbers/booleans) which are coerced to String. - private static ExtractionMetadata parseJsonToMetadata(String json) { - SimpleExtractionMetadata md = new SimpleExtractionMetadata(); - if (json == null) return md; + // Parses combined JSON from /tika/text with Accept: application/json and returns both content + // and metadata. Supports two shapes: + // 1) {"content": "...", "metadata": { ... 
}} + // 2) {"content": "...", } + private static ExtractionResult parseCombinedJson(String json) { + String content = ""; + ExtractionMetadata md = new ExtractionMetadata(); + if (json == null) return new ExtractionResult(content, md); try { JSONParser p = new JSONParser(json); int ev = p.nextEvent(); if (ev != JSONParser.OBJECT_START) { - return md; + return new ExtractionResult(content, md); } - String currentKey = null; while (true) { ev = p.nextEvent(); - if (ev == JSONParser.OBJECT_END || ev == JSONParser.EOF) { - break; - } + if (ev == JSONParser.OBJECT_END || ev == JSONParser.EOF) break; if (ev == JSONParser.STRING && p.wasKey()) { - currentKey = p.getString(); - // Next event is the value for this key + String key = p.getString(); ev = p.nextEvent(); - if (ev == JSONParser.STRING) { - md.add(currentKey, p.getString()); - } else if (ev == JSONParser.ARRAY_START) { - // Read array elements - while (true) { - ev = p.nextEvent(); - if (ev == JSONParser.ARRAY_END) break; - if (ev == JSONParser.STRING) { - md.add(currentKey, p.getString()); - } else if (ev == JSONParser.LONG - || ev == JSONParser.NUMBER - || ev == JSONParser.BIGNUMBER) { - md.add(currentKey, p.getNumberChars().toString()); - } else if (ev == JSONParser.BOOLEAN) { - md.add(currentKey, String.valueOf(p.getBoolean())); - } else if (ev == JSONParser.NULL) { - // ignore nulls - } else { - // skip nested objects or unsupported types within arrays - } + if ("X-TIKA:content".equals(key)) { + if (ev == JSONParser.STRING) { + content = p.getString(); + } else { + // Skip non-string content + if (ev == JSONParser.OBJECT_START) skipObject(p); + } + } else if ("metadata".equals(key)) { + if (ev == JSONParser.OBJECT_START) { + md = parseMetadataObject(p); + } else { + // unexpected shape; skip + if (ev == JSONParser.OBJECT_START) skipObject(p); } - } else if (ev == JSONParser.LONG - || ev == JSONParser.NUMBER - || ev == JSONParser.BIGNUMBER) { - md.add(currentKey, p.getNumberChars().toString()); - } else if 
(ev == JSONParser.BOOLEAN) { - md.add(currentKey, String.valueOf(p.getBoolean())); - } else if (ev == JSONParser.NULL) { - // ignore nulls } else { - // skip nested objects or unsupported value types + // Treat as flat metadata field + if (ev == JSONParser.STRING) { + md.add(key, p.getString()); + } else if (ev == JSONParser.ARRAY_START) { + while (true) { + ev = p.nextEvent(); + if (ev == JSONParser.ARRAY_END) break; + if (ev == JSONParser.STRING) md.add(key, p.getString()); + else if (ev == JSONParser.LONG + || ev == JSONParser.NUMBER + || ev == JSONParser.BIGNUMBER) md.add(key, p.getNumberChars().toString()); + else if (ev == JSONParser.BOOLEAN) md.add(key, String.valueOf(p.getBoolean())); + else if (ev == JSONParser.NULL) { + // ignore + } + } + } else if (ev == JSONParser.LONG + || ev == JSONParser.NUMBER + || ev == JSONParser.BIGNUMBER) { + md.add(key, p.getNumberChars().toString()); + } else if (ev == JSONParser.BOOLEAN) { + md.add(key, String.valueOf(p.getBoolean())); + } else if (ev == JSONParser.NULL) { + // ignore + } else if (ev == JSONParser.OBJECT_START) { + // skip nested object for unknown key + skipObject(p); + } } } } } catch (java.io.IOException ioe) { - // Fall back to empty metadata on parsing error - return md; + // ignore, return what we have } - return md; + Arrays.stream(md.names()).filter(k -> k.startsWith("X-TIKA:Parsed-")).forEach(md::remove); + return new ExtractionResult(content, md); } } diff --git a/solr/modules/extraction/src/test-files/extraction/solr/collection1/conf/solrconfig.xml b/solr/modules/extraction/src/test-files/extraction/solr/collection1/conf/solrconfig.xml index 2c52f4591e8..f8a227b8cf9 100644 --- a/solr/modules/extraction/src/test-files/extraction/solr/collection1/conf/solrconfig.xml +++ b/solr/modules/extraction/src/test-files/extraction/solr/collection1/conf/solrconfig.xml @@ -152,6 +152,8 @@ parseContext.xml + ${solr.test.extraction.backend:local} + ${solr.test.tikaserver.url:} diff --git 
a/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java index c7098665027..1983f4e34e2 100644 --- a/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java +++ b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java @@ -16,10 +16,15 @@ */ package org.apache.solr.handler.extraction; +import com.carrotsearch.randomizedtesting.ThreadFilter; +import com.carrotsearch.randomizedtesting.annotations.ThreadLeakFilters; +import java.lang.invoke.MethodHandles; import java.util.ArrayList; import java.util.List; import java.util.Locale; import java.util.TimeZone; +import org.apache.lucene.tests.util.QuickPatchThreadsFilter; +import org.apache.solr.SolrIgnoredThreadsFilter; import org.apache.solr.SolrTestCaseJ4; import org.apache.solr.common.SolrException; import org.apache.solr.common.util.ContentStream; @@ -31,20 +36,44 @@ import org.apache.solr.response.SolrQueryResponse; import org.apache.solr.update.AddUpdateCommand; import org.apache.solr.update.processor.BufferingRequestProcessor; +import org.junit.AfterClass; import org.junit.Before; import org.junit.BeforeClass; import org.junit.Test; - -/** */ +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.testcontainers.containers.GenericContainer; +import org.testcontainers.containers.wait.strategy.Wait; + +/** Generic tests, randomized between local and tikaserver backends */ +@ThreadLeakFilters( + defaultFilters = true, + filters = { + SolrIgnoredThreadsFilter.class, + QuickPatchThreadsFilter.class, + ExtractingRequestHandlerTest.TestcontainersThreadsFilter.class + }) public class ExtractingRequestHandlerTest extends SolrTestCaseJ4 { - - static { - // Allow the SecureRandom algorithm used in this environment to avoid class configuration - // failure in tests. 
- // This mirrors passing -Dtest.solr.allowed.securerandom=NativePRNG at JVM startup. - System.setProperty("test.solr.allowed.securerandom", "NativePRNG"); + // Ignore known non-daemon threads spawned by Testcontainers and Java HttpClient in this test + @SuppressWarnings("NewClassNamingConvention") + public static class TestcontainersThreadsFilter implements ThreadFilter { + @Override + public boolean reject(Thread t) { + if (t == null || t.getName() == null) return false; + String n = t.getName(); + return n.startsWith("testcontainers-ryuk") + || n.startsWith("testcontainers-wait-") + || n.startsWith("HttpClient-") + || n.startsWith("HttpClient-TestContainers"); + } } + private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + + private static GenericContainer tika; + private static boolean useTikaServer; + + @SuppressWarnings("resource") @BeforeClass public static void beforeClass() throws Exception { // Is the JDK/env affected by a known bug? @@ -59,9 +88,36 @@ public static void beforeClass() throws Exception { false); } + useTikaServer = random().nextBoolean(); + if (useTikaServer) { + String baseUrl; + tika = + new GenericContainer<>("apache/tika:3.2.3.0-full") + .withExposedPorts(9998) + .waitingFor(Wait.forListeningPort()); + tika.start(); + baseUrl = "http://" + tika.getHost() + ":" + tika.getMappedPort(9998); + System.setProperty("solr.test.tikaserver.url", baseUrl); + System.setProperty("solr.test.extraction.backend", "tikaserver"); + log.info("Using extraction backend 'tikaserver'. 
Tika server running on {}", baseUrl); + } else { + log.info("Using extraction backend 'local'"); + } + initCore("solrconfig.xml", "schema.xml", getFile("extraction/solr")); } + @AfterClass + public static void afterClass() throws Exception { + System.clearProperty("solr.test.tikaserver.url"); + System.clearProperty("solr.test.extraction.backend"); + if (useTikaServer && tika != null) { + tika.stop(); + tika.close(); + tika = null; + } + } + @Override @Before public void setUp() throws Exception { diff --git a/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/LocalTikaExtractionBackendTest.java b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/LocalTikaExtractionBackendTest.java index df365f2bedf..4974f5a1903 100644 --- a/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/LocalTikaExtractionBackendTest.java +++ b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/LocalTikaExtractionBackendTest.java @@ -53,7 +53,8 @@ private ExtractionRequest newRequest( String streamName, String streamSourceInfo, Long streamSize, - String resourcePassword) { + String resourcePassword, + String returnType) { return new ExtractionRequest( streamType, resourceName, @@ -63,7 +64,8 @@ private ExtractionRequest newRequest( streamSourceInfo, streamSize, resourcePassword, - null); + null, + returnType); } @Test @@ -80,7 +82,8 @@ public void testWrongStreamTypeThrows() throws Exception { "version_control.txt", null, null, - null); + null, + "text"); expectThrows(IllegalArgumentException.class, () -> backend.extract(in, req)); } @@ -95,7 +98,8 @@ public void testWrongStreamTypeThrows() throws Exception { "version_control.txt", null, null, - null); + null, + "text"); expectThrows(Exception.class, () -> backend.extract(in, req)); } } @@ -113,7 +117,8 @@ public void testPasswordProtectedDocxWithoutPasswordThrows() throws Exception { "password-is-Word2010.docx", null, null, - null); + null, + "text"); 
expectThrows(Exception.class, () -> backend.extract(in, req)); } } @@ -131,7 +136,8 @@ public void testPasswordProtectedDocxWithPasswordSucceeds() throws Exception { "password-is-Word2010.docx", null, null, - "Word2010"); + "Word2010", + "text"); ExtractionResult res = backend.extract(in, req); assertNotNull(res); assertNotNull(res.getMetadata()); diff --git a/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/TikaServerExtractionBackendTest.java b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/TikaServerExtractionBackendTest.java index 5c51ccba38f..15f54707638 100644 --- a/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/TikaServerExtractionBackendTest.java +++ b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/TikaServerExtractionBackendTest.java @@ -56,12 +56,6 @@ public boolean reject(Thread t) { } } - static { - // Allow the SecureRandom algorithm used in this environment to avoid class configuration - // failure in tests. 
- System.setProperty("test.solr.allowed.securerandom", "NativePRNG"); - } - private static GenericContainer tika; private static String baseUrl; private static ExecutorService httpExec; @@ -107,7 +101,8 @@ public static void stopTikaServer() { client = null; } - private static ExtractionRequest newRequest(String resourceName, String contentType) { + private static ExtractionRequest newRequest( + String resourceName, String contentType, String extractFormat) { return new ExtractionRequest( contentType, // streamType resourceName, // resourceName @@ -117,7 +112,8 @@ private static ExtractionRequest newRequest(String resourceName, String contentT null, // sourceInfo null, // size null, // resourcePassword - null // passwordsMap + null, // passwordsMap + extractFormat // extraction format xml or text ); } @@ -127,7 +123,7 @@ public void testExtractTextAndMetadata() throws Exception { TikaServerExtractionBackend backend = new TikaServerExtractionBackend(client, baseUrl); byte[] data = "Hello TestContainers".getBytes(java.nio.charset.StandardCharsets.UTF_8); try (ByteArrayInputStream in = new ByteArrayInputStream(data)) { - ExtractionResult res = backend.extract(in, newRequest("test.txt", "text/plain")); + ExtractionResult res = backend.extract(in, newRequest("test.txt", "text/plain", "text")); assertNotNull(res); assertNotNull(res.getContent()); assertTrue(res.getContent().contains("Hello TestContainers")); @@ -147,8 +143,7 @@ public void testExtractOnlyXml() throws Exception { byte[] data = "Hello XML".getBytes(java.nio.charset.StandardCharsets.UTF_8); try (ByteArrayInputStream in = new ByteArrayInputStream(data)) { ExtractionResult res = - backend.extractOnly( - in, newRequest("test.txt", "text/plain"), ExtractingDocumentLoader.XML_FORMAT, null); + backend.extractOnly(in, newRequest("test.txt", "text/plain", "xml"), null); assertNotNull(res); String c = res.getContent(); assertNotNull(c); @@ -172,9 +167,9 @@ public void testParseToSolrContentHandlerUnsupported() 
throws Exception { () -> backend.parseToSolrContentHandler( in, - newRequest("test.txt", "text/plain"), - new SolrContentHandler(new SimpleExtractionMetadata(), params(), null), - new SimpleExtractionMetadata())); + newRequest("test.txt", "text/plain", "text"), + new SolrContentHandler(new ExtractionMetadata(), params(), null), + new ExtractionMetadata())); } } } From cf971699209f3761a44c404aae8de6970aae4c59 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=C3=B8ydahl?= Date: Sun, 21 Sep 2025 01:44:14 +0200 Subject: [PATCH 09/47] Allow testcontainers to read config --- gradle/testing/randomization/policies/solr-tests.policy | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/gradle/testing/randomization/policies/solr-tests.policy b/gradle/testing/randomization/policies/solr-tests.policy index 2d3246c6d9b..7eb635db831 100644 --- a/gradle/testing/randomization/policies/solr-tests.policy +++ b/gradle/testing/randomization/policies/solr-tests.policy @@ -31,6 +31,9 @@ grant { permission java.io.FilePermission "${java.io.tmpdir}", "read,write"; permission java.io.FilePermission "${java.io.tmpdir}${/}-", "read,write,delete"; + // Allow Testcontainers to read user-level configuration + permission java.io.FilePermission "${user.home}${/}.testcontainers.properties", "read"; + permission java.io.FilePermission "${tests.linedocsfile}", "read"; // DirectoryFactoryTest messes with these (wtf?) 
permission java.io.FilePermission "/tmp/inst1/conf/solrcore.properties", "read"; @@ -130,11 +133,11 @@ grant { permission javax.management.MBeanServerPermission "findMBeanServer"; permission javax.management.MBeanServerPermission "releaseMBeanServer"; permission javax.management.MBeanTrustPermission "register"; - + // needed by crossdc permission javax.security.auth.AuthPermission "getLoginConfiguration"; permission javax.security.auth.AuthPermission "setLoginConfiguration"; - + // needed by benchmark permission java.security.SecurityPermission "insertProvider"; @@ -206,7 +209,7 @@ grant { // additional permissions based on system properties set by /bin/solr // NOTE: if the property is not set, the permission entry is ignored. -grant { +grant { permission java.io.FilePermission "${solr.jetty.keystore}", "read,write,delete,readlink"; permission java.io.FilePermission "${solr.jetty.keystore}${/}-", "read,write,delete,readlink"; From 87cb45c3f9b9912db7c8a48b5b576cdf1644621f Mon Sep 17 00:00:00 2001 From: Eric Pugh Date: Mon, 22 Sep 2025 10:27:45 -0400 Subject: [PATCH 10/47] Disable JSM Java Security Manager and Testcontainers do not play nicely together. 
We prefer Testcontainers, so disable JSM --- solr/modules/extraction/build.gradle | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/solr/modules/extraction/build.gradle b/solr/modules/extraction/build.gradle index 0cbb4c0174f..66f3c2d0c00 100644 --- a/solr/modules/extraction/build.gradle +++ b/solr/modules/extraction/build.gradle @@ -19,6 +19,11 @@ apply plugin: 'java-library' description = 'Solr Integration with Tika for extracting content from binary file formats such as Microsoft Word and Adobe PDF' +ext { + // Disable security manager for extraction module tests + useSecurityManager = false +} + dependencies { implementation platform(project(':platform')) implementation project(':solr:core') From 7ebed82375d6e4b29f3128a7a0acb3e96e10fd16 Mon Sep 17 00:00:00 2001 From: Eric Pugh Date: Mon, 22 Sep 2025 10:28:02 -0400 Subject: [PATCH 11/47] IntelliJ prompted me.. and I couldn't resist. --- .../ExtractingRequestHandlerTest.java | 33 ++++++++----------- 1 file changed, 14 insertions(+), 19 deletions(-) diff --git a/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java index 1983f4e34e2..2bd099ebe78 100644 --- a/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java +++ b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java @@ -108,7 +108,7 @@ public static void beforeClass() throws Exception { } @AfterClass - public static void afterClass() throws Exception { + public static void afterClass() { System.clearProperty("solr.test.tikaserver.url"); System.clearProperty("solr.test.extraction.backend"); if (useTikaServer && tika != null) { @@ -404,20 +404,18 @@ public void testDefaultField() throws Exception { ignoreException("unknown field 'meta'"); // TODO: should this exception be happening? 
expectThrows( SolrException.class, - () -> { - loadLocal( - "extraction/simple.html", - "literal.id", - "simple2", - "lowernames", - "true", - "captureAttr", - "true", - // "fmap.content_type", "abcxyz", - "commit", - "true" // test immediate commit - ); - }); + () -> loadLocal( + "extraction/simple.html", + "literal.id", + "simple2", + "lowernames", + "true", + "captureAttr", + "true", + // "fmap.content_type", "abcxyz", + "commit", + "true" // test immediate commit + )); } finally { resetExceptionIgnores(); } @@ -1115,16 +1113,13 @@ public void testPasswordProtected() throws Exception { SolrQueryResponse loadLocalFromHandler(String handler, String filename, String... args) throws Exception { - LocalSolrQueryRequest req = (LocalSolrQueryRequest) req(args); - try { + try (LocalSolrQueryRequest req = (LocalSolrQueryRequest) req(args)) { // TODO: stop using locally defined streams once stream.file and // stream.body work everywhere List cs = new ArrayList<>(); cs.add(new ContentStreamBase.FileStream(getFile(filename))); req.setContentStreams(cs); return h.queryAndResponse(handler, req); - } finally { - req.close(); } } From f25631ded66d0bfcd18736de5984c14443fd8829 Mon Sep 17 00:00:00 2001 From: Eric Pugh Date: Mon, 22 Sep 2025 10:46:29 -0400 Subject: [PATCH 12/47] lint --- .../ExtractingRequestHandlerTest.java | 25 ++++++++++--------- 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java index 2bd099ebe78..947860337f8 100644 --- a/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java +++ b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java @@ -404,18 +404,19 @@ public void testDefaultField() throws Exception { ignoreException("unknown field 'meta'"); // TODO: 
should this exception be happening? expectThrows( SolrException.class, - () -> loadLocal( - "extraction/simple.html", - "literal.id", - "simple2", - "lowernames", - "true", - "captureAttr", - "true", - // "fmap.content_type", "abcxyz", - "commit", - "true" // test immediate commit - )); + () -> + loadLocal( + "extraction/simple.html", + "literal.id", + "simple2", + "lowernames", + "true", + "captureAttr", + "true", + // "fmap.content_type", "abcxyz", + "commit", + "true" // test immediate commit + )); } finally { resetExceptionIgnores(); } From 5aa381f072283f524da8f0b92c29599e1a01dedd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=C3=B8ydahl?= Date: Tue, 23 Sep 2025 19:16:40 +0200 Subject: [PATCH 13/47] Split test in two sub classes Add common metadata Adjust some tests with dc:title instead of title Support passwords in TikaServer backend --- .../extraction/DummyExtractionBackend.java | 2 +- .../handler/extraction/ExtractionBackend.java | 20 +++ .../LocalTikaExtractionBackend.java | 19 +-- .../RegexRulesPasswordProvider.java | 11 ++ .../TikaServerExtractionBackend.java | 20 ++- .../ExtractingRequestHandlerLocalTest.java | 19 +++ ...ExtractingRequestHandlerTestAbstract.java} | 81 ++---------- ...xtractingRequestHandlerTikaServerTest.java | 117 ++++++++++++++++++ 8 files changed, 198 insertions(+), 91 deletions(-) create mode 100644 solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerLocalTest.java rename solr/modules/extraction/src/test/org/apache/solr/handler/extraction/{ExtractingRequestHandlerTest.java => ExtractingRequestHandlerTestAbstract.java} (92%) create mode 100644 solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTikaServerTest.java diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/DummyExtractionBackend.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/DummyExtractionBackend.java index 33ae55c63c8..745216eb31a 
100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/DummyExtractionBackend.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/DummyExtractionBackend.java @@ -29,7 +29,7 @@ public String name() { @Override public ExtractionResult extract(InputStream inputStream, ExtractionRequest request) { - ExtractionMetadata metadata = new ExtractionMetadata(); + ExtractionMetadata metadata = buildMetadataFromRequest(request); metadata.add("Dummy-Backend", "true"); metadata.add( "Content-Type", diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackend.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackend.java index 715c73636e2..fd5c5409113 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackend.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackend.java @@ -17,6 +17,8 @@ package org.apache.solr.handler.extraction; import java.io.InputStream; +import org.apache.tika.metadata.HttpHeaders; +import org.apache.tika.metadata.TikaMetadataKeys; /** Strategy interface for content extraction backends. 
*/ public interface ExtractionBackend { @@ -46,6 +48,24 @@ void parseToSolrContentHandler( ExtractionMetadata outMetadata) throws Exception; + /** Build ExtractionMetadata from the request context */ + default ExtractionMetadata buildMetadataFromRequest(ExtractionRequest request) { + ExtractionMetadata md = new ExtractionMetadata(); + if (request.resourceName != null) + md.add(TikaMetadataKeys.RESOURCE_NAME_KEY, request.resourceName); + if (request.contentType != null) md.add(HttpHeaders.CONTENT_TYPE, request.contentType); + if (request.streamName != null) + md.add(ExtractingMetadataConstants.STREAM_NAME, request.streamName); + if (request.streamSourceInfo != null) + md.add(ExtractingMetadataConstants.STREAM_SOURCE_INFO, request.streamSourceInfo); + if (request.streamSize != null) + md.add(ExtractingMetadataConstants.STREAM_SIZE, String.valueOf(request.streamSize)); + if (request.contentType != null) + md.add(ExtractingMetadataConstants.STREAM_CONTENT_TYPE, request.contentType); + if (request.charset != null) md.add(HttpHeaders.CONTENT_ENCODING, request.charset); + return md; + } + /** A short name for debugging/config, e.g., "local" or "dummy". 
*/ String name(); } diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/LocalTikaExtractionBackend.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/LocalTikaExtractionBackend.java index e91716e1652..d39011cf5a2 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/LocalTikaExtractionBackend.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/LocalTikaExtractionBackend.java @@ -21,9 +21,7 @@ import java.util.Locale; import org.apache.solr.core.SolrCore; import org.apache.tika.config.TikaConfig; -import org.apache.tika.metadata.HttpHeaders; import org.apache.tika.metadata.Metadata; -import org.apache.tika.metadata.TikaMetadataKeys; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.DefaultParser; @@ -112,19 +110,12 @@ private Parser selectParser(ExtractionRequest request) { } private Metadata buildMetadata(ExtractionRequest request) { + ExtractionMetadata extractionMetadata = buildMetadataFromRequest(request); Metadata md = new Metadata(); - if (request.resourceName != null) - md.add(TikaMetadataKeys.RESOURCE_NAME_KEY, request.resourceName); - if (request.contentType != null) md.add(HttpHeaders.CONTENT_TYPE, request.contentType); - if (request.streamName != null) - md.add(ExtractingMetadataConstants.STREAM_NAME, request.streamName); - if (request.streamSourceInfo != null) - md.add(ExtractingMetadataConstants.STREAM_SOURCE_INFO, request.streamSourceInfo); - if (request.streamSize != null) - md.add(ExtractingMetadataConstants.STREAM_SIZE, String.valueOf(request.streamSize)); - if (request.contentType != null) - md.add(ExtractingMetadataConstants.STREAM_CONTENT_TYPE, request.contentType); - if (request.charset != null) md.add(HttpHeaders.CONTENT_ENCODING, request.charset); + for (String name : extractionMetadata.names()) { + String[] vals = extractionMetadata.getValues(name); + if (vals != null) 
for (String v : vals) md.add(name, v); + } return md; } diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/RegexRulesPasswordProvider.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/RegexRulesPasswordProvider.java index 84b4e94171c..8e7f876da83 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/RegexRulesPasswordProvider.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/RegexRulesPasswordProvider.java @@ -55,6 +55,17 @@ public String getPassword(Metadata meta) { return null; } + public String getPassword(ExtractionMetadata extractionMetadata) { + if (getExplicitPassword() != null) { + return getExplicitPassword(); + } + + if (passwordMap.size() > 0) + return lookupPasswordFromMap(extractionMetadata.get(TikaMetadataKeys.RESOURCE_NAME_KEY)); + + return null; + } + private String lookupPasswordFromMap(String fileName) { if (fileName != null && fileName.length() > 0) { for (Entry e : passwordMap.entrySet()) { diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/TikaServerExtractionBackend.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/TikaServerExtractionBackend.java index ba12680ce7b..c7b0adaf0f5 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/TikaServerExtractionBackend.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/TikaServerExtractionBackend.java @@ -76,6 +76,21 @@ public ExtractionResult extract(InputStream inputStream, ExtractionRequest reque if (contentType != null) { b.header("Content-Type", contentType); } + ExtractionMetadata md = buildMetadataFromRequest(request); + if (request.resourcePassword != null || request.passwordsMap != null) { + RegexRulesPasswordProvider passwordProvider = new RegexRulesPasswordProvider(); + if (request.resourcePassword != null) { + passwordProvider.setExplicitPassword(request.resourcePassword); + 
} + if (request.passwordsMap != null) { + passwordProvider.setPasswordMap(request.passwordsMap); + } + + String pwd = passwordProvider.getPassword(md); + if (pwd != null) { + b.header("Password", pwd); + } + } if (request.resourceName != null) { b.header("Content-Disposition", "attachment; filename=\"" + request.resourceName + "\""); } @@ -90,7 +105,7 @@ public ExtractionResult extract(InputStream inputStream, ExtractionRequest reque throw new IOException("TikaServer " + url + " returned status " + code + " body: " + preview); } String body = resp.body(); - return parseCombinedJson(body); + return parseCombinedJson(body, md); } @Override @@ -181,9 +196,8 @@ private static void skipObject(JSONParser p) throws java.io.IOException { // and metadata. Supports two shapes: // 1) {"content": "...", "metadata": { ... }} // 2) {"content": "...", } - private static ExtractionResult parseCombinedJson(String json) { + private ExtractionResult parseCombinedJson(String json, ExtractionMetadata md) { String content = ""; - ExtractionMetadata md = new ExtractionMetadata(); if (json == null) return new ExtractionResult(content, md); try { JSONParser p = new JSONParser(json); diff --git a/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerLocalTest.java b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerLocalTest.java new file mode 100644 index 00000000000..64dc90c1b50 --- /dev/null +++ b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerLocalTest.java @@ -0,0 +1,19 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.handler.extraction; + +public class ExtractingRequestHandlerLocalTest extends ExtractingRequestHandlerTestAbstract {} diff --git a/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTestAbstract.java similarity index 92% rename from solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java rename to solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTestAbstract.java index 947860337f8..b9ed368e1ed 100644 --- a/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java +++ b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTestAbstract.java @@ -16,108 +16,39 @@ */ package org.apache.solr.handler.extraction; -import com.carrotsearch.randomizedtesting.ThreadFilter; -import com.carrotsearch.randomizedtesting.annotations.ThreadLeakFilters; import java.lang.invoke.MethodHandles; import java.util.ArrayList; import java.util.List; import java.util.Locale; import java.util.TimeZone; -import org.apache.lucene.tests.util.QuickPatchThreadsFilter; -import org.apache.solr.SolrIgnoredThreadsFilter; import org.apache.solr.SolrTestCaseJ4; import org.apache.solr.common.SolrException; import org.apache.solr.common.util.ContentStream; import org.apache.solr.common.util.ContentStreamBase; -import org.apache.solr.common.util.EnvUtils; 
import org.apache.solr.common.util.NamedList; import org.apache.solr.request.LocalSolrQueryRequest; import org.apache.solr.request.SolrQueryRequest; import org.apache.solr.response.SolrQueryResponse; import org.apache.solr.update.AddUpdateCommand; import org.apache.solr.update.processor.BufferingRequestProcessor; -import org.junit.AfterClass; import org.junit.Before; import org.junit.BeforeClass; import org.junit.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.testcontainers.containers.GenericContainer; -import org.testcontainers.containers.wait.strategy.Wait; - -/** Generic tests, randomized between local and tikaserver backends */ -@ThreadLeakFilters( - defaultFilters = true, - filters = { - SolrIgnoredThreadsFilter.class, - QuickPatchThreadsFilter.class, - ExtractingRequestHandlerTest.TestcontainersThreadsFilter.class - }) -public class ExtractingRequestHandlerTest extends SolrTestCaseJ4 { - // Ignore known non-daemon threads spawned by Testcontainers and Java HttpClient in this test - @SuppressWarnings("NewClassNamingConvention") - public static class TestcontainersThreadsFilter implements ThreadFilter { - @Override - public boolean reject(Thread t) { - if (t == null || t.getName() == null) return false; - String n = t.getName(); - return n.startsWith("testcontainers-ryuk") - || n.startsWith("testcontainers-wait-") - || n.startsWith("HttpClient-") - || n.startsWith("HttpClient-TestContainers"); - } - } +public abstract class ExtractingRequestHandlerTestAbstract extends SolrTestCaseJ4 { private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); - private static GenericContainer tika; - private static boolean useTikaServer; - @SuppressWarnings("resource") @BeforeClass public static void beforeClass() throws Exception { // Is the JDK/env affected by a known bug? 
final String tzDisplayName = TimeZone.getDefault().getDisplayName(false, TimeZone.SHORT, Locale.US); - if (!tzDisplayName.matches("[A-Za-z]{3,}([+-]\\d\\d(:\\d\\d)?)?")) { - assertTrue( - "Is some other JVM affected? Or bad regex? TzDisplayName: " + tzDisplayName, - EnvUtils.getProperty("java.version").startsWith("11")); - assumeTrue( - "SOLR-12759 JDK 11 (1st release) and Tika 1.x can result in extracting dates in a bad format.", - false); - } - - useTikaServer = random().nextBoolean(); - if (useTikaServer) { - String baseUrl; - tika = - new GenericContainer<>("apache/tika:3.2.3.0-full") - .withExposedPorts(9998) - .waitingFor(Wait.forListeningPort()); - tika.start(); - baseUrl = "http://" + tika.getHost() + ":" + tika.getMappedPort(9998); - System.setProperty("solr.test.tikaserver.url", baseUrl); - System.setProperty("solr.test.extraction.backend", "tikaserver"); - log.info("Using extraction backend 'tikaserver'. Tika server running on {}", baseUrl); - } else { - log.info("Using extraction backend 'local'"); - } - initCore("solrconfig.xml", "schema.xml", getFile("extraction/solr")); } - @AfterClass - public static void afterClass() { - System.clearProperty("solr.test.tikaserver.url"); - System.clearProperty("solr.test.extraction.backend"); - if (useTikaServer && tika != null) { - tika.stop(); - tika.close(); - tika = null; - } - } - @Override @Before public void setUp() throws Exception { @@ -754,9 +685,12 @@ public void testExtractOnly() throws Exception { NamedList nl = (NamedList) list.get("solr-word.pdf_metadata"); assertNotNull("nl is null and it shouldn't be", nl); - Object title = nl.get("title"); + // TODO: Tika server v3.x has normalized metadata and do not return the 'title' key. Consider + // backcompat mode mapping dc:title to title??? 
+ Object title = nl.get("dc:title"); assertNotNull("title is null and it shouldn't be", title); - assertTrue(extraction.contains(") list.get("solr-word.pdf_metadata"); assertNotNull("nl is null and it shouldn't be", nl); - title = nl.get("title"); + // TODO: See above + title = nl.get("dc:title"); assertNotNull("title is null and it shouldn't be", title); } diff --git a/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTikaServerTest.java b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTikaServerTest.java new file mode 100644 index 00000000000..14cc89d0cc6 --- /dev/null +++ b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTikaServerTest.java @@ -0,0 +1,117 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.solr.handler.extraction; + +import com.carrotsearch.randomizedtesting.ThreadFilter; +import com.carrotsearch.randomizedtesting.annotations.ThreadLeakFilters; +import java.lang.invoke.MethodHandles; +import org.apache.lucene.tests.util.QuickPatchThreadsFilter; +import org.apache.solr.SolrIgnoredThreadsFilter; +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.testcontainers.containers.GenericContainer; +import org.testcontainers.containers.wait.strategy.Wait; + +/** Generic tests, randomized between local and tikaserver backends */ +@ThreadLeakFilters( + defaultFilters = true, + filters = { + SolrIgnoredThreadsFilter.class, + QuickPatchThreadsFilter.class, + ExtractingRequestHandlerTikaServerTest.TestcontainersThreadsFilter.class + }) +public class ExtractingRequestHandlerTikaServerTest extends ExtractingRequestHandlerTestAbstract { + private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + private static GenericContainer tika; + + // Ignore known non-daemon threads spawned by Testcontainers and Java HttpClient in this test + @SuppressWarnings("NewClassNamingConvention") + public static class TestcontainersThreadsFilter implements ThreadFilter { + @Override + public boolean reject(Thread t) { + if (t == null || t.getName() == null) return false; + String n = t.getName(); + return n.startsWith("testcontainers-ryuk") + || n.startsWith("testcontainers-wait-") + || n.startsWith("HttpClient-") + || n.startsWith("HttpClient-TestContainers"); + } + } + + @BeforeClass + public static void beforeClassTika() throws Exception { + String baseUrl = null; + tika = + new GenericContainer<>("apache/tika:3.2.3.0-full") + .withExposedPorts(9998) + .waitingFor(Wait.forListeningPort()); + try { + tika.start(); + baseUrl = "http://" + tika.getHost() + ":" + tika.getMappedPort(9998); + System.setProperty("solr.test.tikaserver.url", baseUrl); 
+ System.setProperty("solr.test.extraction.backend", "tikaserver"); + log.info("Using extraction backend 'tikaserver'. Tika server running on {}", baseUrl); + ExtractingRequestHandlerTestAbstract.beforeClass(); + } catch (Throwable t) { + // Best-effort cleanup to avoid leaking resources if class initialization fails + try { + System.clearProperty("solr.test.tikaserver.url"); + System.clearProperty("solr.test.extraction.backend"); + } catch (Throwable ignored) { + } + try { + // Ensure any partially initialized core and clients are released + org.apache.solr.SolrTestCaseJ4.deleteCore(); + } catch (Throwable ignored) { + } + if (tika != null) { + try { + tika.stop(); + } catch (Throwable ignored) { + } + try { + tika.close(); + } catch (Throwable ignored) { + } + tika = null; + } + throw t; + } + } + + @AfterClass + public static void afterClassTika() throws Exception { + // TODO: There are still thread leaks after these tests, probably due to failing tests + deleteCore(); + // Stop and dispose of the Tika container if it was started + if (tika != null) { + try { + tika.stop(); + } finally { + try { + tika.close(); + } catch (Throwable ignore2) { + } + tika = null; + } + } + System.clearProperty("solr.test.tikaserver.url"); + System.clearProperty("solr.test.extraction.backend"); + } +} From ef7850d0cb0c95ef1f6d5ec358be8eba4a5478cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=C3=B8ydahl?= Date: Tue, 23 Sep 2025 21:32:02 +0200 Subject: [PATCH 14/47] Some error handling --- .../handler/extraction/ExtractingDocumentLoader.java | 6 ++++++ .../extraction/TikaServerExtractionBackend.java | 10 ++++++---- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java index 2214059e2f9..770076cd548 100644 --- 
a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java @@ -171,6 +171,12 @@ public void load( factory.createSolrContentHandler(neutral, params, req.getSchema()); try { backend.parseToSolrContentHandler(inputStream, extractionRequest, handler, neutral); + } catch (UnsupportedOperationException uoe) { + // For backends that don't support parseToSolrContentHandler + log.warn("skip extracting text due to {}.", uoe.getMessage()); + throw new SolrException( + SolrException.ErrorCode.BAD_REQUEST, + "The requested operation is not supported by backend '" + backend.name() + "'."); } catch (Exception e) { if (ignoreTikaException) { if (log.isWarnEnabled()) diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/TikaServerExtractionBackend.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/TikaServerExtractionBackend.java index c7b0adaf0f5..3a39caf57e7 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/TikaServerExtractionBackend.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/TikaServerExtractionBackend.java @@ -16,7 +16,6 @@ */ package org.apache.solr.handler.extraction; -import java.io.IOException; import java.io.InputStream; import java.net.URI; import java.net.http.HttpClient; @@ -26,6 +25,7 @@ import java.time.Duration; import java.util.Arrays; import java.util.Set; +import org.apache.solr.common.SolrException; import org.noggit.JSONParser; /** @@ -96,13 +96,15 @@ public ExtractionResult extract(InputStream inputStream, ExtractionRequest reque } b.PUT(HttpRequest.BodyPublishers.ofInputStream(() -> inputStream)); + // TODO: Consider getting the InputStream instead HttpResponse resp = httpClient.send(b.build(), HttpResponse.BodyHandlers.ofString(StandardCharsets.UTF_8)); int code = resp.statusCode(); if (code < 200 || 
code >= 300) { - String body = resp.body(); - String preview = body == null ? "" : body.substring(0, Math.min(body.length(), 512)); - throw new IOException("TikaServer " + url + " returned status " + code + " body: " + preview); + // TODO: Parse error message from response? + throw new SolrException( + SolrException.ErrorCode.getErrorCode(code), + "TikaServer " + url + " returned status " + code); } String body = resp.body(); return parseCombinedJson(body, md); From f29751447a4c1a1544c1976e5c3a995956ba6e03 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=C3=B8ydahl?= Date: Tue, 23 Sep 2025 21:58:59 +0200 Subject: [PATCH 15/47] Properly skip test if Docker not available --- ...xtractingRequestHandlerTikaServerTest.java | 27 ++++--------------- 1 file changed, 5 insertions(+), 22 deletions(-) diff --git a/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTikaServerTest.java b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTikaServerTest.java index 14cc89d0cc6..a73081ec042 100644 --- a/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTikaServerTest.java +++ b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTikaServerTest.java @@ -22,6 +22,7 @@ import org.apache.lucene.tests.util.QuickPatchThreadsFilter; import org.apache.solr.SolrIgnoredThreadsFilter; import org.junit.AfterClass; +import org.junit.Assume; import org.junit.BeforeClass; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -70,28 +71,10 @@ public static void beforeClassTika() throws Exception { ExtractingRequestHandlerTestAbstract.beforeClass(); } catch (Throwable t) { // Best-effort cleanup to avoid leaking resources if class initialization fails - try { - System.clearProperty("solr.test.tikaserver.url"); - System.clearProperty("solr.test.extraction.backend"); - } catch (Throwable ignored) { - } - try { - // Ensure any 
partially initialized core and clients are released - org.apache.solr.SolrTestCaseJ4.deleteCore(); - } catch (Throwable ignored) { - } - if (tika != null) { - try { - tika.stop(); - } catch (Throwable ignored) { - } - try { - tika.close(); - } catch (Throwable ignored) { - } - tika = null; - } - throw t; + System.clearProperty("solr.test.tikaserver.url"); + System.clearProperty("solr.test.extraction.backend"); + // Skip tests if Docker/Testcontainers are not available in the environment + Assume.assumeNoException("Docker/Testcontainers not available; skipping test", t); } } From b1840eebdbbae0548f7ac6a423bbcccd94837f2a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=C3=B8ydahl?= Date: Tue, 23 Sep 2025 22:25:02 +0200 Subject: [PATCH 16/47] Fix precommit --- .../solr/handler/extraction/ExtractingDocumentLoader.java | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java index 770076cd548..ef68427099e 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java @@ -173,14 +173,17 @@ public void load( backend.parseToSolrContentHandler(inputStream, extractionRequest, handler, neutral); } catch (UnsupportedOperationException uoe) { // For backends that don't support parseToSolrContentHandler - log.warn("skip extracting text due to {}.", uoe.getMessage()); + if (log.isWarnEnabled()) { + log.warn("skip extracting text since tika backend does not yet support this option"); + } throw new SolrException( SolrException.ErrorCode.BAD_REQUEST, "The requested operation is not supported by backend '" + backend.name() + "'."); } catch (Exception e) { if (ignoreTikaException) { - if (log.isWarnEnabled()) + if 
(log.isWarnEnabled()) { log.warn("skip extracting text due to {}.", e.getLocalizedMessage(), e); + } // Index a document with literals only (no extracted content/metadata) addDoc(handler); return; From 6ec9ddabd815ee82450c827f1a6548b9dbb539bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=C3=B8ydahl?= Date: Wed, 24 Sep 2025 00:21:51 +0200 Subject: [PATCH 17/47] Review feedback. ID -> NAME --- .../extraction/DummyExtractionBackend.java | 4 ++-- .../extraction/ExtractingRequestHandler.java | 2 +- .../extraction/ExtractionBackendFactory.java | 20 +++++++------------ .../LocalTikaExtractionBackend.java | 4 ++-- .../TikaServerExtractionBackend.java | 4 ++-- .../ExtractingRequestHandlerTestAbstract.java | 2 +- 6 files changed, 15 insertions(+), 21 deletions(-) diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/DummyExtractionBackend.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/DummyExtractionBackend.java index 745216eb31a..e85844ff46f 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/DummyExtractionBackend.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/DummyExtractionBackend.java @@ -20,11 +20,11 @@ /** Dummy backend that emits predictable test data without actually parsing input content. 
*/ public class DummyExtractionBackend implements ExtractionBackend { - public static final String ID = "dummy"; + public static final String NAME = "dummy"; @Override public String name() { - return ID; + return NAME; } @Override diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java index 224ee54f0ac..09e2dddb0e0 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java @@ -73,7 +73,7 @@ public void inform(SolrCore core) { String backendName = (String) initArgs.get(ExtractingParams.EXTRACTION_BACKEND); defaultBackendName = (backendName == null || backendName.trim().isEmpty()) - ? LocalTikaExtractionBackend.ID + ? LocalTikaExtractionBackend.NAME : backendName; } catch (Exception e) { diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackendFactory.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackendFactory.java index 38033d8b935..abe3ab726f0 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackendFactory.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackendFactory.java @@ -58,23 +58,17 @@ public ExtractionBackend getBackend(String name) { } private String normalize(String name) { - if (name == null || name.trim().isEmpty()) return LocalTikaExtractionBackend.ID; + if (name == null || name.trim().isEmpty()) return LocalTikaExtractionBackend.NAME; return name.trim().toLowerCase(Locale.ROOT); } /** Creates a new backend instance for the given normalized name. 
*/ protected ExtractionBackend create(String normalizedName) throws Exception { - switch (normalizedName) { - case LocalTikaExtractionBackend.ID: - return new LocalTikaExtractionBackend(core, tikaConfigLoc, parseContextConfig); - case DummyExtractionBackend.ID: - return new DummyExtractionBackend(); - case TikaServerExtractionBackend.ID: - return new TikaServerExtractionBackend( - tikaServerUrl != null ? tikaServerUrl : "http://localhost:9998"); - default: - // Fallback to local for unknown names - return new LocalTikaExtractionBackend(core, tikaConfigLoc, parseContextConfig); - } + return switch (normalizedName) { + case DummyExtractionBackend.NAME -> new DummyExtractionBackend(); + case TikaServerExtractionBackend.NAME -> new TikaServerExtractionBackend( + tikaServerUrl != null ? tikaServerUrl : "http://localhost:9998"); + default -> new LocalTikaExtractionBackend(core, tikaConfigLoc, parseContextConfig); + }; } } diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/LocalTikaExtractionBackend.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/LocalTikaExtractionBackend.java index d39011cf5a2..28470ff7024 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/LocalTikaExtractionBackend.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/LocalTikaExtractionBackend.java @@ -94,11 +94,11 @@ public LocalTikaExtractionBackend( this.autoDetectParser = new AutoDetectParser(cfg); } - public static final String ID = "local"; + public static final String NAME = "local"; @Override public String name() { - return ID; + return NAME; } private Parser selectParser(ExtractionRequest request) { diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/TikaServerExtractionBackend.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/TikaServerExtractionBackend.java index 3a39caf57e7..196d9397cf4 100644 --- 
a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/TikaServerExtractionBackend.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/TikaServerExtractionBackend.java @@ -54,11 +54,11 @@ public TikaServerExtractionBackend(String baseUrl) { this.httpClient = httpClient; } - public static final String ID = "tikaserver"; + public static final String NAME = "tikaserver"; @Override public String name() { - return ID; + return NAME; } @Override diff --git a/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTestAbstract.java b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTestAbstract.java index b9ed368e1ed..349dae50ce6 100644 --- a/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTestAbstract.java +++ b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTestAbstract.java @@ -1068,7 +1068,7 @@ public void testDummyBackendExtractOnly() throws Exception { loadLocal( "extraction/version_control.txt", ExtractingParams.EXTRACTION_BACKEND, - DummyExtractionBackend.ID, + DummyExtractionBackend.NAME, ExtractingParams.EXTRACT_ONLY, "true", ExtractingParams.EXTRACT_FORMAT, From 902355d7cc0300f82228cb7588e007ff2fe343d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=C3=B8ydahl?= Date: Wed, 24 Sep 2025 00:29:47 +0200 Subject: [PATCH 18/47] Review feedback. 
Simplify metadata add code --- .../handler/extraction/ExtractionBackend.java | 19 +++++++------------ .../extraction/ExtractionMetadata.java | 6 ++++++ 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackend.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackend.java index fd5c5409113..9647d0f843b 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackend.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackend.java @@ -51,18 +51,13 @@ void parseToSolrContentHandler( /** Build ExtractionMetadata from the request context */ default ExtractionMetadata buildMetadataFromRequest(ExtractionRequest request) { ExtractionMetadata md = new ExtractionMetadata(); - if (request.resourceName != null) - md.add(TikaMetadataKeys.RESOURCE_NAME_KEY, request.resourceName); - if (request.contentType != null) md.add(HttpHeaders.CONTENT_TYPE, request.contentType); - if (request.streamName != null) - md.add(ExtractingMetadataConstants.STREAM_NAME, request.streamName); - if (request.streamSourceInfo != null) - md.add(ExtractingMetadataConstants.STREAM_SOURCE_INFO, request.streamSourceInfo); - if (request.streamSize != null) - md.add(ExtractingMetadataConstants.STREAM_SIZE, String.valueOf(request.streamSize)); - if (request.contentType != null) - md.add(ExtractingMetadataConstants.STREAM_CONTENT_TYPE, request.contentType); - if (request.charset != null) md.add(HttpHeaders.CONTENT_ENCODING, request.charset); + md.addIfNotNull(TikaMetadataKeys.RESOURCE_NAME_KEY, request.resourceName); + md.addIfNotNull(HttpHeaders.CONTENT_TYPE, request.contentType); + md.addIfNotNull(ExtractingMetadataConstants.STREAM_NAME, request.streamName); + md.addIfNotNull(ExtractingMetadataConstants.STREAM_SOURCE_INFO, request.streamSourceInfo); + md.addIfNotNull(ExtractingMetadataConstants.STREAM_SIZE, 
String.valueOf(request.streamSize)); + md.addIfNotNull(ExtractingMetadataConstants.STREAM_CONTENT_TYPE, request.contentType); + md.addIfNotNull(HttpHeaders.CONTENT_ENCODING, request.charset); return md; } diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionMetadata.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionMetadata.java index 6229089d502..c400bc90fb1 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionMetadata.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionMetadata.java @@ -31,6 +31,12 @@ public void add(String name, String value) { map.computeIfAbsent(name, k -> new ArrayList<>()).add(value); } + public void addIfNotNull(String resourceNameKey, String resourceName) { + if (resourceName != null) { + add(resourceNameKey, resourceName); + } + } + public String[] getValues(String name) { List vals = map.get(name); if (vals == null) return new String[0]; From 1cfcce9f7474d7e69bdd2ac4ea364893140e1c6a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=C3=B8ydahl?= Date: Wed, 24 Sep 2025 00:37:10 +0200 Subject: [PATCH 19/47] Error handling for factory --- .../handler/extraction/ExtractionBackendFactory.java | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackendFactory.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackendFactory.java index abe3ab726f0..7ee0c163152 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackendFactory.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackendFactory.java @@ -19,6 +19,7 @@ import java.util.Locale; import java.util.Map; import java.util.concurrent.ConcurrentHashMap; +import org.apache.solr.common.SolrException; import org.apache.solr.core.SolrCore; 
/** @@ -52,7 +53,10 @@ public ExtractionBackend getBackend(String name) { try { return create(k); } catch (Exception e) { - throw new RuntimeException("Failed to create extraction backend '" + k + "'", e); + throw new SolrException( + SolrException.ErrorCode.SERVER_ERROR, + "Failed to create extraction backend '" + k + "'", + e); } }); } @@ -68,7 +72,10 @@ protected ExtractionBackend create(String normalizedName) throws Exception { case DummyExtractionBackend.NAME -> new DummyExtractionBackend(); case TikaServerExtractionBackend.NAME -> new TikaServerExtractionBackend( tikaServerUrl != null ? tikaServerUrl : "http://localhost:9998"); - default -> new LocalTikaExtractionBackend(core, tikaConfigLoc, parseContextConfig); + case LocalTikaExtractionBackend.NAME -> new LocalTikaExtractionBackend( + core, tikaConfigLoc, parseContextConfig); + default -> throw new SolrException( + SolrException.ErrorCode.BAD_REQUEST, "Unknown extraction backend: " + normalizedName); }; } } From b769c06c35bfa679953867a957aa43aca6c466b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=C3=B8ydahl?= Date: Wed, 24 Sep 2025 01:18:32 +0200 Subject: [PATCH 20/47] More documentation --- .../pages/indexing-with-tika.adoc | 52 ++++++++++++++++--- 1 file changed, 46 insertions(+), 6 deletions(-) diff --git a/solr/solr-ref-guide/modules/indexing-guide/pages/indexing-with-tika.adoc b/solr/solr-ref-guide/modules/indexing-guide/pages/indexing-with-tika.adoc index 183af23e30b..c7f67b9968b 100644 --- a/solr/solr-ref-guide/modules/indexing-guide/pages/indexing-with-tika.adoc +++ b/solr/solr-ref-guide/modules/indexing-guide/pages/indexing-with-tika.adoc @@ -51,7 +51,41 @@ The next step after any update handler is the xref:configuration-guide:update-re == Tika Server -TODO: Add documentation about Tika Server backend. +The `tikaserver` backend lets Solr delegate content extraction to an external Apache Tika Server process instead of running Tika parsers inside the Solr JVM. 
This can improve operational isolation (crashes or heavy parsing won’t impact Solr), simplify dependency management, and allow you to scale Tika independently of Solr. + +Example handler configuration: + +[source,xml] +---- + + + + tikaserver + + + http://localhost:9998 + +---- + +=== Starting Tika Server with Docker + +The quickest way to run Tika Server for development is using Docker. The examples below expose Tika on port 9998 on localhost, matching the default value when `tikaserver.url` is not explicitly set. + +[,bash] +---- +docker run --rm -p 9998:9998 apache/tika:3.2.3.0-full +---- + +NOTE: If Solr runs in Docker too, ensure both containers share a network and use the Tika container name as the host in `tikaserver.url`. + +=== Limitations +Currently, the `tikaserver` option lacks some features and will return HTTP 400 in these cases: + +- `capture` and `captureAttr`: Selecting specific XHTML elements/attributes during indexing requires Solr’s SAX ContentHandler and is not supported by the `tikaserver` backend. +- `xpath`: Server-side XPath filtering of the XHTML is not supported. +- `passwordsFile` and `resource.password` for the indexing path: these options trigger the legacy SAX path in Solr and are not currently supported. + +Metadata produced by Tika Server can differ slightly from local Tika, particularly in key names and the presence/absence of certain fields. Adjust your `fmap.*` mappings accordingly. == Module @@ -61,7 +95,7 @@ The "techproducts" example included with Solr is pre-configured to have Solr Cel If you are not using the example, you will want to pay attention to the section <> below. -=== Solr Cell Performance Implications +=== Solr Cell Performance Implications (local mode) Rich document formats are frequently not well documented, and even in cases where there is documentation for the format, not everyone who creates documents will follow the specifications faithfully. 
@@ -76,7 +110,8 @@ the request handler is running in the same JVM that Solr uses for other operatio Indexing can also consume all available Solr resources, particularly with large PDFs, presentations, or other files that have a lot of rich media embedded in them. -For these reasons, Solr Cell is not recommended for use in a production system. +For these reasons, Solr Cell with `local` backend is not recommended for use in a production system. Prefer the +`tikaserver` backend, which is more robust and isolates failures from Solr itself. It is a best practice to use Solr Cell as a proof-of-concept tool during development and then run Tika as an external process that sends the extracted documents to Solr (via xref:deployment-guide:solrj.adoc[]) for indexing. @@ -181,7 +216,7 @@ These parameters can be set for each indexing request (as request parameters), o |=== + Choose the backend to use for extraction. The options are `local` or `tikaserver`. -The `local` backend uses Tika libraries included with Solr to do the extraction, and is the default in Solr 9. +The `local` backend uses Tika libraries included with Solr to do the extraction, and is the default in Solr 9.x. The `tikaserver` backend uses an external Tika server process to do the extraction. **The `local` backend is deprecated and will be removed in a future release.** + @@ -195,9 +230,9 @@ Example: In `solrconfig.xml`: `tikaserver`. |=== + Specifies the URL of the Tika server to use when the `extraction.backend` parameter is set to `tikaserver`. -This parameter is required when using the `tikaserver` backend. +This parameter is optional when using the `tikaserver` backend; it defaults to `http://localhost:9998` if not specified. + -Example: In `solrconfig.xml`: `http://my.tika.server`. +Example: In `solrconfig.xml`: `http://localhost:9998`. `capture`:: + @@ -500,6 +535,8 @@ So you can use the other URPs without worrying about unexpected field additions.
=== Parser-Specific Properties +NOTE: This setting applies to `local` backend only. + Parsers used by Tika may have specific properties to govern how data is extracted. These can be passed through Solr for special parsing situations. @@ -521,6 +558,8 @@ Consult the Tika Java API documentation for configuration parameters that can be === Indexing Encrypted Documents +NOTE: The `tikaserver` backend does not currently support indexing encrypted documents. + The ExtractingRequestHandler will decrypt encrypted files and index their content if you supply a password in either `resource.password` in the request, or in a `passwordsFile` file. In the case of `passwordsFile`, the file supplied must be formatted so there is one line per rule. @@ -658,6 +697,7 @@ public class SolrCellRequestDemo { req.setParam(ExtractingParams.EXTRACT_ONLY, "true"); NamedList result = client.request(req); System.out.println("Result: " + result); + } } ---- From 83296a9822b15184d7adb6067e3b4a9e85f6284c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=C3=B8ydahl?= Date: Thu, 25 Sep 2025 16:30:04 +0200 Subject: [PATCH 21/47] * Refactor some logic back to ExtractingDocumentLoader * Add back-compat option for metadata * Fix true SAX streaming parser for Tika XML response * Simplify ExtractionBackend interface --- .../extraction/DummyExtractionBackend.java | 32 +-- .../extraction/ExtractingDocumentLoader.java | 108 +++++++-- .../handler/extraction/ExtractingParams.java | 3 + .../handler/extraction/ExtractionBackend.java | 20 +- .../extraction/ExtractionMetadata.java | 13 ++ .../LocalTikaExtractionBackend.java | 67 +----- .../TikaServerExtractionBackend.java | 213 ++++-------------- .../extraction/TikaServerXmlParser.java | 126 +++++++++++ .../extraction/XmlSanitizingReader.java | 168 ++++++++++++++ .../TikaServerExtractionBackendTest.java | 29 +-- 10 files changed, 480 insertions(+), 299 deletions(-) create mode 100644 
solr/modules/extraction/src/java/org/apache/solr/handler/extraction/TikaServerXmlParser.java create mode 100644 solr/modules/extraction/src/java/org/apache/solr/handler/extraction/XmlSanitizingReader.java diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/DummyExtractionBackend.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/DummyExtractionBackend.java index e85844ff46f..9bdad267147 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/DummyExtractionBackend.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/DummyExtractionBackend.java @@ -17,10 +17,12 @@ package org.apache.solr.handler.extraction; import java.io.InputStream; +import org.xml.sax.ContentHandler; /** Dummy backend that emits predictable test data without actually parsing input content. */ public class DummyExtractionBackend implements ExtractionBackend { public static final String NAME = "dummy"; + private final String text = "This is dummy extracted content"; @Override public String name() { @@ -37,32 +39,20 @@ public ExtractionResult extract(InputStream inputStream, ExtractionRequest reque if (request.resourceName != null) { metadata.add("resourcename", request.resourceName); } - String text = "This is dummy extracted content"; return new ExtractionResult(text, metadata); } @Override - public ExtractionResult extractOnly( - InputStream inputStream, ExtractionRequest request, String xpathExpr) { - if (xpathExpr != null) { - throw new UnsupportedOperationException("XPath not supported by dummy backend"); - } - return extract(inputStream, request); - } - - @Override - public void parseToSolrContentHandler( + public void extractWithSaxHandler( InputStream inputStream, ExtractionRequest request, - SolrContentHandler handler, - ExtractionMetadata outMetadata) { - // Fill metadata - ExtractionResult r = extract(inputStream, request); - for (String name : r.getMetadata().names()) { - String[] 
vals = r.getMetadata().getValues(name); - if (vals != null) for (String v : vals) outMetadata.add(name, v); - } - // Append content - handler.appendToContent(r.getContent()); + ExtractionMetadata md, + ContentHandler saxContentHandler) + throws Exception { + + ExtractionResult res = extract(inputStream, request); + md.putAll(res.getMetadata().asMap()); + // Append the content to the SAX handler + saxContentHandler.characters(res.getContent().toCharArray(), 0, res.getContent().length()); } } diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java index ef68427099e..d968583ee0b 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java @@ -51,6 +51,7 @@ public class ExtractingDocumentLoader extends ContentStreamLoader { final SolrParams params; final UpdateRequestProcessor processor; final boolean ignoreTikaException; + final boolean backCompat; private final AddUpdateCommand templateAdd; @@ -65,10 +66,12 @@ public ExtractingDocumentLoader( this.params = req.getParams(); this.core = req.getCore(); this.processor = processor; + this.backCompat = params.getBool(ExtractingParams.BACK_COMPATIBILITY, true); templateAdd = new AddUpdateCommand(req); templateAdd.overwrite = params.getBool(UpdateParams.OVERWRITE, true); templateAdd.commitWithin = params.getInt(UpdateParams.COMMIT_WITHIN, -1); + templateAdd.overwrite = params.getBool(UpdateParams.OVERWRITE, true); this.factory = factory; this.backend = backend; @@ -139,13 +142,48 @@ public void load( if (extractOnly) { try { - ExtractionResult result = backend.extractOnly(inputStream, extractionRequest, xpathExpr); + ExtractionMetadata md = backend.buildMetadataFromRequest(extractionRequest); + String content; + if 
(ExtractingDocumentLoader.TEXT_FORMAT.equals(extractionRequest.extractFormat) + || xpathExpr != null) { + org.apache.tika.sax.ToTextContentHandler textHandler = + new org.apache.tika.sax.ToTextContentHandler(); + org.xml.sax.ContentHandler ch = textHandler; + if (xpathExpr != null) { + org.apache.tika.sax.xpath.XPathParser xparser = + new org.apache.tika.sax.xpath.XPathParser( + "xhtml", org.apache.tika.sax.XHTMLContentHandler.XHTML); + org.apache.tika.sax.xpath.Matcher matcher = xparser.parse(xpathExpr); + ch = new org.apache.tika.sax.xpath.MatchingContentHandler(textHandler, matcher); + } + backend.extractWithSaxHandler(inputStream, extractionRequest, md, ch); + content = textHandler.toString(); + } else { // XML format + org.apache.tika.sax.ToXMLContentHandler toXml = + new org.apache.tika.sax.ToXMLContentHandler(); + org.xml.sax.ContentHandler ch = toXml; + if (xpathExpr != null) { + org.apache.tika.sax.xpath.XPathParser xparser = + new org.apache.tika.sax.xpath.XPathParser( + "xhtml", org.apache.tika.sax.XHTMLContentHandler.XHTML); + org.apache.tika.sax.xpath.Matcher matcher = xparser.parse(xpathExpr); + ch = new org.apache.tika.sax.xpath.MatchingContentHandler(toXml, matcher); + } + backend.extractWithSaxHandler(inputStream, extractionRequest, md, ch); + content = toXml.toString(); + if (!content.startsWith("\n" + content; + } + } + + appendBackCompatTikaMetadata(md); + // Write content - rsp.add(stream.getName(), result.getContent()); + rsp.add(stream.getName(), content); // Write metadata NamedList metadataNL = new NamedList<>(); - for (String name : result.getMetadata().names()) { - metadataNL.add(name, result.getMetadata().getValues(name)); + for (String name : md.names()) { + metadataNL.add(name, md.getValues(name)); } rsp.add(stream.getName() + "_metadata", metadataNL); } catch (UnsupportedOperationException uoe) { @@ -166,11 +204,11 @@ public void load( if (needLegacySax) { // Indexing with capture/xpath/etc: delegate SAX parse to backend - 
ExtractionMetadata neutral = new ExtractionMetadata(); + ExtractionMetadata metadata = backend.buildMetadataFromRequest(extractionRequest); SolrContentHandler handler = - factory.createSolrContentHandler(neutral, params, req.getSchema()); + factory.createSolrContentHandler(metadata, params, req.getSchema()); try { - backend.parseToSolrContentHandler(inputStream, extractionRequest, handler, neutral); + backend.extractWithSaxHandler(inputStream, extractionRequest, metadata, handler); } catch (UnsupportedOperationException uoe) { // For backends that don't support parseToSolrContentHandler if (log.isWarnEnabled()) { @@ -183,13 +221,13 @@ public void load( if (ignoreTikaException) { if (log.isWarnEnabled()) { log.warn("skip extracting text due to {}.", e.getLocalizedMessage(), e); + return; } - // Index a document with literals only (no extracted content/metadata) - addDoc(handler); - return; } throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e); } + appendBackCompatTikaMetadata(handler.metadata); + addDoc(handler); return; } @@ -202,16 +240,15 @@ public void load( if (ignoreTikaException) { if (log.isWarnEnabled()) log.warn("skip extracting text due to {}.", e.getLocalizedMessage(), e); - // Index a document with literals only (no extracted content/metadata) - SolrContentHandler handler = - factory.createSolrContentHandler(new ExtractionMetadata(), params, req.getSchema()); - addDoc(handler); return; } throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e); } ExtractionMetadata metadata = result.getMetadata(); + + appendBackCompatTikaMetadata(metadata); + String content = result.getContent(); SolrContentHandler handler = @@ -220,4 +257,47 @@ public void load( addDoc(handler); } } + + private void appendBackCompatTikaMetadata(ExtractionMetadata md) { + if (!backCompat) { + return; + } + + if (md.get("dc:title") != null) { + md.addValues("title", md.getValues("dc:title")); + } + if (md.get("dc:creator") != null) { + md.addValues("author", 
md.getValues("dc:creator")); + } + if (md.get("dc:description") != null) { + md.addValues("description", md.getValues("dc:description")); + } + if (md.get("dc:subject") != null) { + md.addValues("subject", md.getValues("dc:subject")); + } + if (md.get("dc:language") != null) { + md.addValues("language", md.getValues("dc:language")); + } + if (md.get("dc:publisher") != null) { + md.addValues("publisher", md.getValues("dc:publisher")); + } + if (md.get("dcterms:created") != null) { + md.addValues("created", md.getValues("dcterms:created")); + } + if (md.get("dcterms:modified") != null) { + md.addValues("modified", md.getValues("dcterms:modified")); + } + if (md.get("meta:author") != null) { + md.addValues("Author", md.getValues("meta:author")); + } + if (md.get("meta:creation-date") != null) { + md.addValues("Creation-Date", md.getValues("meta:creation-date")); + } + if (md.get("meta:save-date") != null) { + md.addValues("Last-Save-Date", md.getValues("meta:save-date")); + } + if (md.get("meta:keyword") != null) { + md.addValues("Keywords", md.getValues("meta:keyword")); + } + } } diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingParams.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingParams.java index 840af280243..40bca51256d 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingParams.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingParams.java @@ -139,4 +139,7 @@ public interface ExtractingParams { /** Backend selection parameter and */ public static final String EXTRACTION_BACKEND = "extraction.backend"; + + /** Fix metadata to match Tika 1.x */ + public static final String BACK_COMPATIBILITY = "backCompatibility"; } diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackend.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackend.java 
index 9647d0f843b..6c8962f7d60 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackend.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackend.java @@ -19,6 +19,7 @@ import java.io.InputStream; import org.apache.tika.metadata.HttpHeaders; import org.apache.tika.metadata.TikaMetadataKeys; +import org.xml.sax.ContentHandler; /** Strategy interface for content extraction backends. */ public interface ExtractionBackend { @@ -29,23 +30,14 @@ public interface ExtractionBackend { ExtractionResult extract(InputStream inputStream, ExtractionRequest request) throws Exception; /** - * Perform extractOnly operation. If extractFormat equals ExtractingDocumentLoader.TEXT_FORMAT, - * return plain text. If XML, return XML body as string. Implementations may support optional - * xpathExpr; if unsupported and xpathExpr is not null, they should throw - * UnsupportedOperationException. + * Perform extraction of text from input stream with SAX handler. Sax handler can be + * SolrContentHandler, ToTextContentHandler, ToXMLContentHandler, MatchingContentHandler etc */ - ExtractionResult extractOnly(InputStream inputStream, ExtractionRequest request, String xpathExpr) - throws Exception; - - /** - * Parse the content and stream SAX events into the provided SolrContentHandler, while also - * filling outMetadata with extracted metadata. 
- */ - void parseToSolrContentHandler( + void extractWithSaxHandler( InputStream inputStream, ExtractionRequest request, - SolrContentHandler handler, - ExtractionMetadata outMetadata) + ExtractionMetadata md, + ContentHandler saxContentHandler) throws Exception; /** Build ExtractionMetadata from the request context */ diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionMetadata.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionMetadata.java index c400bc90fb1..67592432fa0 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionMetadata.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionMetadata.java @@ -31,12 +31,21 @@ public void add(String name, String value) { map.computeIfAbsent(name, k -> new ArrayList<>()).add(value); } + public void addValues(String name, String[] values) { + if (name == null || values == null || values.length == 0) return; + map.computeIfAbsent(name, k -> new ArrayList<>()).addAll(List.of(values)); + } + public void addIfNotNull(String resourceNameKey, String resourceName) { if (resourceName != null) { add(resourceNameKey, resourceName); } } + public void putAll(Map> map) { + this.map.putAll(map); + } + public String[] getValues(String name) { List vals = map.get(name); if (vals == null) return new String[0]; @@ -57,6 +66,10 @@ public void remove(String name) { map.remove(name); } + public Map> asMap() { + return map; + } + @Override public String toString() { StringBuilder sb = new StringBuilder("ExtractionMetadata{"); diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/LocalTikaExtractionBackend.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/LocalTikaExtractionBackend.java index 28470ff7024..9f762bfa375 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/LocalTikaExtractionBackend.java +++ 
b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/LocalTikaExtractionBackend.java @@ -30,6 +30,7 @@ import org.apache.tika.parser.PasswordProvider; import org.apache.tika.parser.html.HtmlMapper; import org.apache.tika.sax.BodyContentHandler; +import org.xml.sax.ContentHandler; /** * Extraction backend using local in-process Apache Tika. This encapsulates the previous direct @@ -134,7 +135,7 @@ private ParseContext buildContext(Parser parser, ExtractionRequest request) { return context; } - private static ExtractionMetadata copyToNeutral(Metadata md) { + private static ExtractionMetadata tikaMetadataToExtractionMetadata(Metadata md) { ExtractionMetadata out = new ExtractionMetadata(); for (String name : md.names()) { String[] vals = md.getValues(name); @@ -150,74 +151,30 @@ public ExtractionResult extract(InputStream inputStream, ExtractionRequest reque if (parser == null) { throw new IllegalArgumentException("No Tika parser for stream type: " + request.streamType); } - Metadata md = buildMetadata(request); ParseContext context = buildContext(parser, request); + Metadata md = buildMetadata(request); BodyContentHandler textHandler = new BodyContentHandler(-1); parser.parse(inputStream, textHandler, md, context); - return new ExtractionResult(textHandler.toString(), copyToNeutral(md)); - } - - @Override - public ExtractionResult extractOnly( - InputStream inputStream, ExtractionRequest request, String xpathExpr) throws Exception { - Parser parser = selectParser(request); - if (parser == null) { - throw new IllegalArgumentException("No Tika parser for stream type: " + request.streamType); - } - Metadata md = buildMetadata(request); - ParseContext context = buildContext(parser, request); - - String content; - if (ExtractingDocumentLoader.TEXT_FORMAT.equals(request.extractFormat) || xpathExpr != null) { - org.apache.tika.sax.ToTextContentHandler textHandler = - new org.apache.tika.sax.ToTextContentHandler(); - org.xml.sax.ContentHandler ch = 
textHandler; - if (xpathExpr != null) { - org.apache.tika.sax.xpath.XPathParser xparser = - new org.apache.tika.sax.xpath.XPathParser( - "xhtml", org.apache.tika.sax.XHTMLContentHandler.XHTML); - org.apache.tika.sax.xpath.Matcher matcher = xparser.parse(xpathExpr); - ch = new org.apache.tika.sax.xpath.MatchingContentHandler(textHandler, matcher); - } - parser.parse(inputStream, ch, md, context); - content = textHandler.toString(); - } else { // XML format - org.apache.tika.sax.ToXMLContentHandler toXml = new org.apache.tika.sax.ToXMLContentHandler(); - org.xml.sax.ContentHandler ch = toXml; - if (xpathExpr != null) { - org.apache.tika.sax.xpath.XPathParser xparser = - new org.apache.tika.sax.xpath.XPathParser( - "xhtml", org.apache.tika.sax.XHTMLContentHandler.XHTML); - org.apache.tika.sax.xpath.Matcher matcher = xparser.parse(xpathExpr); - ch = new org.apache.tika.sax.xpath.MatchingContentHandler(toXml, matcher); - } - parser.parse(inputStream, ch, md, context); - content = toXml.toString(); - if (!content.startsWith("\n" + content; - } - } - return new ExtractionResult(content, copyToNeutral(md)); + return new ExtractionResult(textHandler.toString(), tikaMetadataToExtractionMetadata(md)); } @Override - public void parseToSolrContentHandler( + public void extractWithSaxHandler( InputStream inputStream, ExtractionRequest request, - SolrContentHandler handler, - ExtractionMetadata outMetadata) + ExtractionMetadata md, + ContentHandler saxContentHandler) throws Exception { Parser parser = selectParser(request); if (parser == null) { throw new IllegalArgumentException("No Tika parser for stream type: " + request.streamType); } - Metadata md = buildMetadata(request); ParseContext context = buildContext(parser, request); - parser.parse(inputStream, handler, md, context); - // populate outMetadata - for (String name : md.names()) { - String[] vals = md.getValues(name); - if (vals != null) for (String v : vals) outMetadata.add(name, v); + Metadata tikaMetadata = 
buildMetadata(request); + parser.parse(inputStream, saxContentHandler, tikaMetadata, context); + for (String name : tikaMetadata.names()) { + String[] vals = tikaMetadata.getValues(name); + if (vals != null) for (String v : vals) md.add(name, v); } } } diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/TikaServerExtractionBackend.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/TikaServerExtractionBackend.java index 196d9397cf4..401e4268969 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/TikaServerExtractionBackend.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/TikaServerExtractionBackend.java @@ -16,17 +16,16 @@ */ package org.apache.solr.handler.extraction; +import java.io.IOException; import java.io.InputStream; import java.net.URI; import java.net.http.HttpClient; import java.net.http.HttpRequest; import java.net.http.HttpResponse; -import java.nio.charset.StandardCharsets; import java.time.Duration; -import java.util.Arrays; -import java.util.Set; import org.apache.solr.common.SolrException; -import org.noggit.JSONParser; +import org.apache.tika.sax.BodyContentHandler; +import org.xml.sax.ContentHandler; /** * Extraction backend that delegates parsing to a remote Apache Tika Server. 
@@ -39,6 +38,7 @@ public class TikaServerExtractionBackend implements ExtractionBackend { private final HttpClient httpClient; private final String baseUrl; // e.g., http://localhost:9998 private final Duration timeout = Duration.ofSeconds(30); + private final TikaServerXmlParser tikaServerXmlParser = new TikaServerXmlParser(); public TikaServerExtractionBackend(String baseUrl) { this(HttpClient.newBuilder().connectTimeout(Duration.ofSeconds(10)).build(), baseUrl); @@ -64,14 +64,40 @@ public String name() { @Override public ExtractionResult extract(InputStream inputStream, ExtractionRequest request) throws Exception { - String url = - baseUrl - + "/tika/" - + (Set.of("html", "xml").contains(request.extractFormat) ? "html" : "text"); + try (InputStream tikaResponse = callTikaServer(inputStream, request)) { + ExtractionMetadata md = buildMetadataFromRequest(request); + BodyContentHandler textHandler = new BodyContentHandler(-1); + tikaServerXmlParser.parse(tikaResponse, textHandler, md); + return new ExtractionResult(textHandler.toString(), md); + } + } + + @Override + public void extractWithSaxHandler( + InputStream inputStream, + ExtractionRequest request, + ExtractionMetadata md, + ContentHandler saxContentHandler) + throws Exception { + try (InputStream tikaResponse = callTikaServer(inputStream, request)) { + tikaServerXmlParser.parse(tikaResponse, saxContentHandler, md); + } + } + + private static String firstNonNull(String a, String b) { + return a != null ? a : b; + } + + /** + * Call the Tika Server /tika endpoint to extract text and metadata. 
+ * + * @return InputStream of the response body, which is XML format + */ + private InputStream callTikaServer(InputStream inputStream, ExtractionRequest request) + throws IOException, InterruptedException { + String url = baseUrl + "/tika"; HttpRequest.Builder b = - HttpRequest.newBuilder(URI.create(url)) - .timeout(timeout) - .header("Accept", "application/json"); + HttpRequest.newBuilder(URI.create(url)).timeout(timeout).header("Accept", "text/xml"); String contentType = firstNonNull(request.streamType, request.contentType); if (contentType != null) { b.header("Content-Type", contentType); @@ -96,173 +122,14 @@ public ExtractionResult extract(InputStream inputStream, ExtractionRequest reque } b.PUT(HttpRequest.BodyPublishers.ofInputStream(() -> inputStream)); - // TODO: Consider getting the InputStream instead - HttpResponse resp = - httpClient.send(b.build(), HttpResponse.BodyHandlers.ofString(StandardCharsets.UTF_8)); + HttpResponse resp = + httpClient.send(b.build(), HttpResponse.BodyHandlers.ofInputStream()); int code = resp.statusCode(); if (code < 200 || code >= 300) { - // TODO: Parse error message from response? 
throw new SolrException( SolrException.ErrorCode.getErrorCode(code), "TikaServer " + url + " returned status " + code); } - String body = resp.body(); - return parseCombinedJson(body, md); - } - - @Override - public ExtractionResult extractOnly( - InputStream inputStream, ExtractionRequest request, String xpathExpr) throws Exception { - if (xpathExpr != null) { - throw new UnsupportedOperationException( - "XPath filtering is not supported by TikaServer backend"); - } - return extract(inputStream, request); - } - - @Override - public void parseToSolrContentHandler( - InputStream inputStream, - ExtractionRequest request, - SolrContentHandler handler, - ExtractionMetadata outMetadata) { - throw new UnsupportedOperationException( - "Legacy SAX-based parsing is not supported by TikaServer backend"); - } - - private static String firstNonNull(String a, String b) { - return a != null ? a : b; - } - - // Reads key-values of the current object into md. Assumes the parser is positioned - // right after OBJECT_START of that object. 
- private static ExtractionMetadata parseMetadataObject(JSONParser p) throws java.io.IOException { - ExtractionMetadata md = new ExtractionMetadata(); - String currentKey; - while (true) { - int ev = p.nextEvent(); - if (ev == JSONParser.OBJECT_END || ev == JSONParser.EOF) { - break; - } - if (ev == JSONParser.STRING && p.wasKey()) { - currentKey = p.getString(); - ev = p.nextEvent(); - if (ev == JSONParser.STRING) { - md.add(currentKey, p.getString()); - } else if (ev == JSONParser.ARRAY_START) { - while (true) { - ev = p.nextEvent(); - if (ev == JSONParser.ARRAY_END) break; - if (ev == JSONParser.STRING) { - md.add(currentKey, p.getString()); - } else if (ev == JSONParser.LONG - || ev == JSONParser.NUMBER - || ev == JSONParser.BIGNUMBER) { - md.add(currentKey, p.getNumberChars().toString()); - } else if (ev == JSONParser.BOOLEAN) { - md.add(currentKey, String.valueOf(p.getBoolean())); - } else if (ev == JSONParser.NULL) { - // ignore nulls - } else { - // skip nested objects or unsupported types within arrays - } - } - } else if (ev == JSONParser.LONG || ev == JSONParser.NUMBER || ev == JSONParser.BIGNUMBER) { - md.add(currentKey, p.getNumberChars().toString()); - } else if (ev == JSONParser.BOOLEAN) { - md.add(currentKey, String.valueOf(p.getBoolean())); - } else if (ev == JSONParser.NULL) { - // ignore nulls - } else if (ev == JSONParser.OBJECT_START) { - // Unexpected nested object; skip it entirely - skipObject(p); - } else { - // skip unsupported value types - } - } - } - return md; - } - - private static void skipObject(JSONParser p) throws java.io.IOException { - int depth = 1; - while (depth > 0) { - int ev = p.nextEvent(); - if (ev == JSONParser.OBJECT_START) depth++; - else if (ev == JSONParser.OBJECT_END) depth--; - else if (ev == JSONParser.EOF) break; - } - } - - // Parses combined JSON from /tika/text with Accept: application/json and returns both content - // and metadata. Supports two shapes: - // 1) {"content": "...", "metadata": { ... 
}} - // 2) {"content": "...", } - private ExtractionResult parseCombinedJson(String json, ExtractionMetadata md) { - String content = ""; - if (json == null) return new ExtractionResult(content, md); - try { - JSONParser p = new JSONParser(json); - int ev = p.nextEvent(); - if (ev != JSONParser.OBJECT_START) { - return new ExtractionResult(content, md); - } - while (true) { - ev = p.nextEvent(); - if (ev == JSONParser.OBJECT_END || ev == JSONParser.EOF) break; - if (ev == JSONParser.STRING && p.wasKey()) { - String key = p.getString(); - ev = p.nextEvent(); - if ("X-TIKA:content".equals(key)) { - if (ev == JSONParser.STRING) { - content = p.getString(); - } else { - // Skip non-string content - if (ev == JSONParser.OBJECT_START) skipObject(p); - } - } else if ("metadata".equals(key)) { - if (ev == JSONParser.OBJECT_START) { - md = parseMetadataObject(p); - } else { - // unexpected shape; skip - if (ev == JSONParser.OBJECT_START) skipObject(p); - } - } else { - // Treat as flat metadata field - if (ev == JSONParser.STRING) { - md.add(key, p.getString()); - } else if (ev == JSONParser.ARRAY_START) { - while (true) { - ev = p.nextEvent(); - if (ev == JSONParser.ARRAY_END) break; - if (ev == JSONParser.STRING) md.add(key, p.getString()); - else if (ev == JSONParser.LONG - || ev == JSONParser.NUMBER - || ev == JSONParser.BIGNUMBER) md.add(key, p.getNumberChars().toString()); - else if (ev == JSONParser.BOOLEAN) md.add(key, String.valueOf(p.getBoolean())); - else if (ev == JSONParser.NULL) { - // ignore - } - } - } else if (ev == JSONParser.LONG - || ev == JSONParser.NUMBER - || ev == JSONParser.BIGNUMBER) { - md.add(key, p.getNumberChars().toString()); - } else if (ev == JSONParser.BOOLEAN) { - md.add(key, String.valueOf(p.getBoolean())); - } else if (ev == JSONParser.NULL) { - // ignore - } else if (ev == JSONParser.OBJECT_START) { - // skip nested object for unknown key - skipObject(p); - } - } - } - } - } catch (java.io.IOException ioe) { - // ignore, return what we 
have - } - Arrays.stream(md.names()).filter(k -> k.startsWith("X-TIKA:Parsed-")).forEach(md::remove); - return new ExtractionResult(content, md); + return resp.body(); } } diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/TikaServerXmlParser.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/TikaServerXmlParser.java new file mode 100644 index 00000000000..591fb3a1475 --- /dev/null +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/TikaServerXmlParser.java @@ -0,0 +1,126 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.solr.handler.extraction; + +import java.io.IOException; +import java.io.InputStream; +import javax.xml.parsers.SAXParser; +import javax.xml.parsers.SAXParserFactory; +import org.apache.solr.common.SolrException; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; + +public class TikaServerXmlParser { + private final SAXParser saxParser; + + public TikaServerXmlParser() { + SAXParserFactory factory = SAXParserFactory.newInstance(); + factory.setNamespaceAware(true); + try { + factory.setFeature("http://xml.org/sax/features/external-general-entities", false); + factory.setFeature("http://xml.org/sax/features/external-parameter-entities", false); + factory.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false); + } catch (Throwable ignore) { + // Some parsers may not support all features; ignore + } + try { + saxParser = factory.newSAXParser(); + } catch (Exception e) { + throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e); + } + } + + /** + * Parses response in XML format from Tika Server /tika endpoint. The result is that the metadata + * object is populated and the content handler is called with extracted text. 
+ */ + public void parse(InputStream inputStream, ContentHandler handler, ExtractionMetadata metadata) + throws IOException, SAXException { + DefaultHandler myHandler = new TikaXmlResponseSaxContentHandler(handler, metadata); + InputStream sanitizedStream = XmlSanitizingReader.sanitize(inputStream); + saxParser.parse(sanitizedStream, myHandler); + } + + /** Custom SAX handler that will extract meta tags from the tika xml and delegate */ + static class TikaXmlResponseSaxContentHandler extends DefaultHandler { + private final ContentHandler delegate; + private final ExtractionMetadata metadata; + private boolean inHead = false; + + public TikaXmlResponseSaxContentHandler(ContentHandler delegate, ExtractionMetadata metadata) { + this.delegate = delegate; + this.metadata = metadata; + } + + @Override + public void startDocument() throws SAXException { + if (delegate != null) delegate.startDocument(); + } + + @Override + public void endDocument() throws SAXException { + if (delegate != null) delegate.endDocument(); + } + + @Override + public void startElement( + String uri, String localName, String qName, org.xml.sax.Attributes attributes) + throws SAXException { + String ln = localName != null && !localName.isEmpty() ? localName : qName; + if ("head".equalsIgnoreCase(ln)) { + inHead = true; + } else if (inHead && "meta".equalsIgnoreCase(ln) && attributes != null) { + String name = attributes.getValue("name"); + String content = attributes.getValue("content"); + if (name != null && content != null) { + metadata.add(name, content); + } + } + if (delegate != null) delegate.startElement(uri, localName, qName, attributes); + } + + @Override + public void endElement(String uri, String localName, String qName) throws SAXException { + String ln = localName != null && !localName.isEmpty() ? 
localName : qName; + if ("head".equalsIgnoreCase(ln)) { + inHead = false; + } + if (delegate != null) delegate.endElement(uri, localName, qName); + } + + @Override + public void characters(char[] ch, int start, int length) throws SAXException { + if (delegate != null) delegate.characters(ch, start, length); + } + + @Override + public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException { + if (delegate != null) delegate.ignorableWhitespace(ch, start, length); + } + + @Override + public void startPrefixMapping(String prefix, String uri) throws SAXException { + if (delegate != null) delegate.startPrefixMapping(prefix, uri); + } + + @Override + public void endPrefixMapping(String prefix) throws SAXException { + if (delegate != null) delegate.endPrefixMapping(prefix); + } + } +} diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/XmlSanitizingReader.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/XmlSanitizingReader.java new file mode 100644 index 00000000000..39f6b79a4cd --- /dev/null +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/XmlSanitizingReader.java @@ -0,0 +1,168 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.handler.extraction; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.Reader; +import java.io.StringWriter; +import org.apache.commons.io.IOUtils; + +/** + * Make sure the XHTML input is valid XML. Pipe text through this reader before passing it to an XML + * parser + */ +final class XmlSanitizingReader extends java.io.Reader { + private final java.io.Reader in; + private final StringBuilder entityBuf = new StringBuilder(); + private boolean inEntity = false; // after reading '&' + + // For surrogate tracking to evaluate XML validity by code point + private int pendingHighSurrogate = -1; + + public XmlSanitizingReader(java.io.Reader in) { + this.in = in; + } + + @Override + public int read(char[] cbuf, int off, int len) throws java.io.IOException { + int written = 0; + while (written < len) { + int ci = in.read(); + if (ci == -1) break; + char ch = (char) ci; + + // Handle numeric entity stripping for � and � variants + if (!inEntity) { + if (ch == '&') { + inEntity = true; + entityBuf.setLength(0); + entityBuf.append(ch); + continue; // don't write yet + } + } else { + entityBuf.append(ch); + // stop conditions for entity buffering + if (ch == ';' || entityBuf.length() > 12) { // entities are short; cap length defensively + String ent = entityBuf.toString(); + boolean drop = isNullNumericEntity(ent); + inEntity = false; + if (!drop) { + // flush buffered entity to output + for (int i = 0; i < ent.length() && written < len; i++) { + cbuf[off + written++] = ent.charAt(i); + } + } + continue; + } + // Keep buffering alphanumerics and '#', 'x' + continue; + } + + // Filter invalid XML 1.0 characters by code point + if (Character.isHighSurrogate(ch)) { + pendingHighSurrogate = ch; + continue; // need next char to form code point + } + if 
(Character.isLowSurrogate(ch) && pendingHighSurrogate != -1) { + int cp = Character.toCodePoint((char) pendingHighSurrogate, ch); + pendingHighSurrogate = -1; + if (isAllowedXmlChar(cp)) { + // encode back as surrogate pair + cbuf[off + written++] = Character.highSurrogate(cp); + if (written < len) { + cbuf[off + written++] = Character.lowSurrogate(cp); + } else { + // If no space for low surrogate, keep it pending (edge, unlikely with reasonable len) + // Fallback: buffer low surrogate into a small one-char pushback by using a field + // For simplicity, write only if space available; otherwise, return and next read + // continues + // But to avoid corruption, store it + pushbackChar = Character.lowSurrogate(cp); + } + } + continue; + } else { + // previous high surrogate without low surrogate -> invalid; drop it + pendingHighSurrogate = -1; + } + + int cp = ch; + if (!Character.isSurrogate(ch) && isAllowedXmlChar(cp)) { + cbuf[off + written++] = ch; + } + } + return (written == 0) ? -1 : written; + } + + private Character pushbackChar = null; + + @Override + public boolean ready() throws java.io.IOException { + return in.ready(); + } + + @Override + public void close() throws java.io.IOException { + in.close(); + } + + private static boolean isNullNumericEntity(String ent) { + // Accept patterns like '�', '�', '�', '�' (case-insensitive) + if (ent == null) return false; + if (!ent.startsWith("&#") || !ent.endsWith(";")) return false; + String mid = ent.substring(2, ent.length() - 1); + if (mid.isEmpty()) return false; + if (mid.charAt(0) == 'x' || mid.charAt(0) == 'X') { + // hex + for (int i = 1; i < mid.length(); i++) { + char c = mid.charAt(i); + if (c != '0') return false; + } + return mid.length() > 1; // at least one zero after x + } else { + // decimal + for (int i = 0; i < mid.length(); i++) { + char c = mid.charAt(i); + if (c != '0') return false; + } + return true; // one or more zeros + } + } + + private static boolean isAllowedXmlChar(int cp) { + 
return cp == 0x9 + || cp == 0xA + || cp == 0xD + || (cp >= 0x20 && cp <= 0xD7FF) + || (cp >= 0xE000 && cp <= 0xFFFD) + || (cp >= 0x10000 && cp <= 0x10FFFF); + } + + public static InputStream sanitize(InputStream in) throws IOException { + try (Reader reader = new XmlSanitizingReader(new InputStreamReader(in)); + StringWriter writer = new StringWriter()) { + + IOUtils.copy(reader, writer); // copy all sanitized chars to writer + + byte[] bytes = writer.toString().getBytes("UTF-8"); + return new ByteArrayInputStream(bytes); + } + } +} diff --git a/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/TikaServerExtractionBackendTest.java b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/TikaServerExtractionBackendTest.java index 15f54707638..4d583b67769 100644 --- a/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/TikaServerExtractionBackendTest.java +++ b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/TikaServerExtractionBackendTest.java @@ -25,6 +25,7 @@ import org.apache.solr.SolrIgnoredThreadsFilter; import org.apache.solr.SolrTestCaseJ4; import org.apache.solr.common.util.ExecutorUtil; +import org.apache.tika.sax.ToXMLContentHandler; import org.junit.AfterClass; import org.junit.Assume; import org.junit.BeforeClass; @@ -137,15 +138,16 @@ public void testExtractTextAndMetadata() throws Exception { } @Test - public void testExtractOnlyXml() throws Exception { + public void testExtractWithSaxHandlerXml() throws Exception { Assume.assumeTrue("Tika server container not started", tika != null); TikaServerExtractionBackend backend = new TikaServerExtractionBackend(client, baseUrl); byte[] data = "Hello XML".getBytes(java.nio.charset.StandardCharsets.UTF_8); + ExtractionRequest request = newRequest("test.txt", "text/plain", "xml"); try (ByteArrayInputStream in = new ByteArrayInputStream(data)) { - ExtractionResult res = - backend.extractOnly(in, newRequest("test.txt", "text/plain", "xml"), null); - 
assertNotNull(res); - String c = res.getContent(); + ToXMLContentHandler xmlHandler = new ToXMLContentHandler(); + ExtractionMetadata md = backend.buildMetadataFromRequest(request); + backend.extractWithSaxHandler(in, request, md, xmlHandler); + String c = xmlHandler.toString(); assertNotNull(c); // Tika Server may return XHTML without XML declaration; be flexible assertTrue( @@ -155,21 +157,4 @@ public void testExtractOnlyXml() throws Exception { assertTrue(c.contains("Hello XML")); } } - - @Test - public void testParseToSolrContentHandlerUnsupported() throws Exception { - Assume.assumeTrue("Tika server container not started", tika != null); - TikaServerExtractionBackend backend = new TikaServerExtractionBackend(client, baseUrl); - byte[] data = "dummy".getBytes(java.nio.charset.StandardCharsets.UTF_8); - try (ByteArrayInputStream in = new ByteArrayInputStream(data)) { - expectThrows( - UnsupportedOperationException.class, - () -> - backend.parseToSolrContentHandler( - in, - newRequest("test.txt", "text/plain", "text"), - new SolrContentHandler(new ExtractionMetadata(), params(), null), - new ExtractionMetadata())); - } - } } From 14b556bf256756ebbcae3aa0211234d48c679e68 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=C3=B8ydahl?= Date: Thu, 25 Sep 2025 17:40:27 +0200 Subject: [PATCH 22/47] Fix forbiddenAPI --- .../extraction/XmlSanitizingReader.java | 39 +++++++++++++------ 1 file changed, 28 insertions(+), 11 deletions(-) diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/XmlSanitizingReader.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/XmlSanitizingReader.java index 39f6b79a4cd..9451972ee01 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/XmlSanitizingReader.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/XmlSanitizingReader.java @@ -16,13 +16,15 @@ */ package org.apache.solr.handler.extraction; -import java.io.ByteArrayInputStream; import 
java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; +import java.io.OutputStreamWriter; +import java.io.PipedInputStream; +import java.io.PipedOutputStream; import java.io.Reader; -import java.io.StringWriter; -import org.apache.commons.io.IOUtils; +import java.io.Writer; +import java.nio.charset.StandardCharsets; /** * Make sure the XHTML input is valid XML. Pipe text through this reader before passing it to an XML @@ -156,13 +158,28 @@ private static boolean isAllowedXmlChar(int cp) { } public static InputStream sanitize(InputStream in) throws IOException { - try (Reader reader = new XmlSanitizingReader(new InputStreamReader(in)); - StringWriter writer = new StringWriter()) { - - IOUtils.copy(reader, writer); // copy all sanitized chars to writer - - byte[] bytes = writer.toString().getBytes("UTF-8"); - return new ByteArrayInputStream(bytes); - } + PipedOutputStream out = new PipedOutputStream(); + PipedInputStream pipedIn = new PipedInputStream(out); + + Reader reader = new XmlSanitizingReader(new InputStreamReader(in, StandardCharsets.UTF_8)); + Writer writer = new OutputStreamWriter(out, StandardCharsets.UTF_8); + + Thread worker = + new Thread( + () -> { + try (reader; + writer) { + reader.transferTo(writer); + } catch (IOException e) { + try { + pipedIn.close(); + } catch (IOException ignored) { + } + } + }); + worker.setDaemon(true); + worker.start(); + + return pipedIn; } } From 45e7e4104fc27ee41794c18a77b98b1639af4874 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=C3=B8ydahl?= Date: Thu, 25 Sep 2025 22:27:48 +0200 Subject: [PATCH 23/47] better back-compat metadata logic --- .../extraction/ExtractingDocumentLoader.java | 58 ++++++++----------- 1 file changed, 23 insertions(+), 35 deletions(-) diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java index 
d968583ee0b..9b3eafa38e4 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java @@ -20,6 +20,7 @@ import java.io.InputStream; import java.lang.invoke.MethodHandles; import java.util.LinkedHashMap; +import java.util.Map; import java.util.regex.Pattern; import org.apache.solr.common.SolrException; import org.apache.solr.common.params.SolrParams; @@ -258,46 +259,33 @@ public void load( } } + private final Map fieldMappings = new LinkedHashMap<>(); + { + fieldMappings.put("dc:title", "title"); + fieldMappings.put("dc:creator", "author"); + fieldMappings.put("dc:description", "description"); + fieldMappings.put("dc:subject", "subject"); + fieldMappings.put("dc:language", "language"); + fieldMappings.put("dc:publisher", "publisher"); + fieldMappings.put("dcterms:created", "created"); + fieldMappings.put("dcterms:modified", "modified"); + fieldMappings.put("meta:author", "Author"); + fieldMappings.put("meta:creation-date", "Creation-Date"); + fieldMappings.put("meta:save-date", "Last-Save-Date"); + fieldMappings.put("meta:keyword", "Keywords"); + } + private void appendBackCompatTikaMetadata(ExtractionMetadata md) { if (!backCompat) { return; } - if (md.get("dc:title") != null) { - md.addValues("title", md.getValues("dc:title")); - } - if (md.get("dc:creator") != null) { - md.addValues("author", md.getValues("dc:creator")); - } - if (md.get("dc:description") != null) { - md.addValues("description", md.getValues("dc:description")); - } - if (md.get("dc:subject") != null) { - md.addValues("subject", md.getValues("dc:subject")); - } - if (md.get("dc:language") != null) { - md.addValues("language", md.getValues("dc:language")); - } - if (md.get("dc:publisher") != null) { - md.addValues("publisher", md.getValues("dc:publisher")); - } - if (md.get("dcterms:created") != null) { - md.addValues("created", 
md.getValues("dcterms:created")); - } - if (md.get("dcterms:modified") != null) { - md.addValues("modified", md.getValues("dcterms:modified")); - } - if (md.get("meta:author") != null) { - md.addValues("Author", md.getValues("meta:author")); - } - if (md.get("meta:creation-date") != null) { - md.addValues("Creation-Date", md.getValues("meta:creation-date")); - } - if (md.get("meta:save-date") != null) { - md.addValues("Last-Save-Date", md.getValues("meta:save-date")); - } - if (md.get("meta:keyword") != null) { - md.addValues("Keywords", md.getValues("meta:keyword")); + for (Map.Entry mapping : fieldMappings.entrySet()) { + String sourceField = mapping.getKey(); + String targetField = mapping.getValue(); + if (md.get(sourceField) != null && md.get(targetField) == null) { + md.addValues(targetField, md.getValues(sourceField)); + } } } } From 5ba93912f3356d5e4cc8c908d4845e3c2e9a1e72 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=C3=B8ydahl?= Date: Fri, 26 Sep 2025 01:04:59 +0200 Subject: [PATCH 24/47] More tests pass --- .../extraction/ExtractingDocumentLoader.java | 5 ++++- .../handler/extraction/ExtractionRequest.java | 8 +++++++- .../TikaServerExtractionBackend.java | 3 +++ .../extraction/XmlSanitizingReader.java | 3 ++- .../ExtractingRequestHandlerTestAbstract.java | 16 +++++++++++++++ .../LocalTikaExtractionBackendTest.java | 20 +++++++++++++------ .../TikaServerExtractionBackendTest.java | 15 +++++++++++--- 7 files changed, 58 insertions(+), 12 deletions(-) diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java index 9b3eafa38e4..ecfac595613 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java @@ -19,6 +19,7 @@ import java.io.IOException; import 
java.io.InputStream; import java.lang.invoke.MethodHandles; +import java.util.Collections; import java.util.LinkedHashMap; import java.util.Map; import java.util.regex.Pattern; @@ -129,7 +130,8 @@ public void load( stream.getSize(), params.get(ExtractingParams.RESOURCE_PASSWORD, null), pwMap, - extractFormat); + extractFormat, + Collections.emptyMap()); boolean captureAttr = params.getBool(ExtractingParams.CAPTURE_ATTRIBUTES, false); String[] captureElems = params.getParams(ExtractingParams.CAPTURE_ELEMENTS); @@ -260,6 +262,7 @@ public void load( } private final Map fieldMappings = new LinkedHashMap<>(); + { fieldMappings.put("dc:title", "title"); fieldMappings.put("dc:creator", "author"); diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionRequest.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionRequest.java index 010f6633472..bb23975afec 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionRequest.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionRequest.java @@ -16,6 +16,9 @@ */ package org.apache.solr.handler.extraction; +import java.util.HashMap; +import java.util.Map; + /** Immutable request info needed by extraction backends. 
*/ public class ExtractionRequest { public final String streamType; // explicit MIME type (optional) @@ -29,6 +32,7 @@ public class ExtractionRequest { public final java.util.LinkedHashMap passwordsMap; // optional passwords map public final String extractFormat; + public final Map tikaRequestHeaders = new HashMap<>(); public ExtractionRequest( String streamType, @@ -40,7 +44,8 @@ public ExtractionRequest( Long streamSize, String resourcePassword, java.util.LinkedHashMap passwordsMap, - String extractFormat) { + String extractFormat, + Map tikaRequestHeaders) { this.streamType = streamType; this.resourceName = resourceName; this.contentType = contentType; @@ -51,5 +56,6 @@ public ExtractionRequest( this.resourcePassword = resourcePassword; this.passwordsMap = passwordsMap; this.extractFormat = extractFormat; + this.tikaRequestHeaders.putAll(tikaRequestHeaders); } } diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/TikaServerExtractionBackend.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/TikaServerExtractionBackend.java index 401e4268969..e06fc89e213 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/TikaServerExtractionBackend.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/TikaServerExtractionBackend.java @@ -102,6 +102,9 @@ private InputStream callTikaServer(InputStream inputStream, ExtractionRequest re if (contentType != null) { b.header("Content-Type", contentType); } + if (!request.tikaRequestHeaders.isEmpty()) { + request.tikaRequestHeaders.forEach(b::header); + } ExtractionMetadata md = buildMetadataFromRequest(request); if (request.resourcePassword != null || request.passwordsMap != null) { RegexRulesPasswordProvider passwordProvider = new RegexRulesPasswordProvider(); diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/XmlSanitizingReader.java 
b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/XmlSanitizingReader.java index 9451972ee01..ee7ec8cda08 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/XmlSanitizingReader.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/XmlSanitizingReader.java @@ -176,7 +176,8 @@ public static InputStream sanitize(InputStream in) throws IOException { } catch (IOException ignored) { } } - }); + }, + "XmlSanitizingReaderWorker"); worker.setDaemon(true); worker.start(); diff --git a/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTestAbstract.java b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTestAbstract.java index 349dae50ce6..843aa4889d7 100644 --- a/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTestAbstract.java +++ b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTestAbstract.java @@ -458,6 +458,8 @@ public void testLiterals() throws Exception { "two", "fmap.X-Parsed-By", "ignored_parser", + "fmap.X-TIKA:Parsed-By", + "ignored_parser", "fmap.Last-Modified", "extractedDate"); // TODO: original author did not specify why an exception should be thrown... how to fix? 
@@ -488,6 +490,8 @@ public void testLiterals() throws Exception { "one", "fmap.X-Parsed-By", "ignored_parser", + "fmap.X-TIKA:Parsed-By", + "ignored_parser", "fmap.Last-Modified", "extractedDate"); assertU(commit()); @@ -594,6 +598,12 @@ public void testPlainTextSpecifyingMimeType() throws Exception { "extractedLanguage", "fmap.X-Parsed-By", "ignored_parser", + "fmap.X-TIKA:Parsed-By", + "ignored_parser", + "fmap.X-TIKA:detectedEncoding", + "ignored_parser", + "fmap.X-TIKA:encodingDetector", + "ignored_parser", "fmap.content", "extractedContent", ExtractingParams.STREAM_TYPE, @@ -628,6 +638,12 @@ public void testPlainTextSpecifyingResourceName() throws Exception { "extractedLanguage", "fmap.X-Parsed-By", "ignored_parser", + "fmap.X-TIKA:Parsed-By", + "ignored_parser", + "fmap.X-TIKA:detectedEncoding", + "ignored_parser", + "fmap.X-TIKA:encodingDetector", + "ignored_parser", "fmap.content", "extractedContent", ExtractingParams.RESOURCE_NAME, diff --git a/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/LocalTikaExtractionBackendTest.java b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/LocalTikaExtractionBackendTest.java index 4974f5a1903..825439cad83 100644 --- a/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/LocalTikaExtractionBackendTest.java +++ b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/LocalTikaExtractionBackendTest.java @@ -18,6 +18,8 @@ import java.io.InputStream; import java.nio.file.Files; +import java.util.Collections; +import java.util.Map; import org.apache.solr.SolrTestCaseJ4; import org.apache.tika.config.TikaConfig; import org.junit.BeforeClass; @@ -54,7 +56,8 @@ private ExtractionRequest newRequest( String streamSourceInfo, Long streamSize, String resourcePassword, - String returnType) { + String returnType, + Map tikaRequestHeaders) { return new ExtractionRequest( streamType, resourceName, @@ -65,7 +68,8 @@ private ExtractionRequest newRequest( streamSize, 
resourcePassword, null, - returnType); + returnType, + tikaRequestHeaders); } @Test @@ -83,7 +87,8 @@ public void testWrongStreamTypeThrows() throws Exception { null, null, null, - "text"); + "text", + Collections.emptyMap()); expectThrows(IllegalArgumentException.class, () -> backend.extract(in, req)); } @@ -99,7 +104,8 @@ public void testWrongStreamTypeThrows() throws Exception { null, null, null, - "text"); + "text", + Collections.emptyMap()); expectThrows(Exception.class, () -> backend.extract(in, req)); } } @@ -118,7 +124,8 @@ public void testPasswordProtectedDocxWithoutPasswordThrows() throws Exception { null, null, null, - "text"); + "text", + Collections.emptyMap()); expectThrows(Exception.class, () -> backend.extract(in, req)); } } @@ -137,7 +144,8 @@ public void testPasswordProtectedDocxWithPasswordSucceeds() throws Exception { null, null, "Word2010", - "text"); + "text", + Collections.emptyMap()); ExtractionResult res = backend.extract(in, req); assertNotNull(res); assertNotNull(res.getMetadata()); diff --git a/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/TikaServerExtractionBackendTest.java b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/TikaServerExtractionBackendTest.java index 4d583b67769..e28fcb6a832 100644 --- a/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/TikaServerExtractionBackendTest.java +++ b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/TikaServerExtractionBackendTest.java @@ -20,6 +20,8 @@ import com.carrotsearch.randomizedtesting.annotations.ThreadLeakFilters; import java.io.ByteArrayInputStream; import java.net.http.HttpClient; +import java.util.Collections; +import java.util.Map; import java.util.concurrent.ExecutorService; import org.apache.lucene.tests.util.QuickPatchThreadsFilter; import org.apache.solr.SolrIgnoredThreadsFilter; @@ -103,7 +105,10 @@ public static void stopTikaServer() { } private static ExtractionRequest newRequest( - String 
resourceName, String contentType, String extractFormat) { + String resourceName, + String contentType, + String extractFormat, + Map tikaRequestHeaders) { return new ExtractionRequest( contentType, // streamType resourceName, // resourceName @@ -114,8 +119,8 @@ private static ExtractionRequest newRequest( null, // size null, // resourcePassword null, // passwordsMap - extractFormat // extraction format xml or text - ); + extractFormat, // extraction format xml or text + tikaRequestHeaders); } @Test @@ -157,4 +162,8 @@ public void testExtractWithSaxHandlerXml() throws Exception { assertTrue(c.contains("Hello XML")); } } + + private ExtractionRequest newRequest(String file, String contentType, String xml) { + return newRequest(file, contentType, xml, Collections.emptyMap()); + } } From dbca2346076d41407b4870d3c9ccbc9c3d173565 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=C3=B8ydahl?= Date: Fri, 26 Sep 2025 01:24:49 +0200 Subject: [PATCH 25/47] Fix test testLiteralsOverride --- .../apache/solr/handler/extraction/ExtractingDocumentLoader.java | 1 + 1 file changed, 1 insertion(+) diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java index ecfac595613..941b086e298 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java @@ -276,6 +276,7 @@ public void load( fieldMappings.put("meta:creation-date", "Creation-Date"); fieldMappings.put("meta:save-date", "Last-Save-Date"); fieldMappings.put("meta:keyword", "Keywords"); + fieldMappings.put("pdf:docinfo:keywords", "Keywords"); } private void appendBackCompatTikaMetadata(ExtractionMetadata md) { From e6ee7066bccef6f20fc3df85ec340f23943fd666 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=C3=B8ydahl?= Date: Fri, 26 
Sep 2025 02:57:20 +0200 Subject: [PATCH 26/47] Rewrite tests to use h1 instead of div --- .../src/test-files/extraction/example.html | 4 ++-- .../src/test-files/extraction/simple.html | 2 +- .../ExtractingRequestHandlerTestAbstract.java | 22 +++++++++---------- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/solr/modules/extraction/src/test-files/extraction/example.html b/solr/modules/extraction/src/test-files/extraction/example.html index 5732f6214bc..2801c3c97d8 100644 --- a/solr/modules/extraction/src/test-files/extraction/example.html +++ b/solr/modules/extraction/src/test-files/extraction/example.html @@ -6,8 +6,8 @@

Here is some text

-
Here is some text in a div
-
This has a link.
+

a h1 tag

+

This has a link in a paragraph.

News
  • diff --git a/solr/modules/extraction/src/test-files/extraction/simple.html b/solr/modules/extraction/src/test-files/extraction/simple.html index 3c807fb1d98..3ec4d4e0d01 100644 --- a/solr/modules/extraction/src/test-files/extraction/simple.html +++ b/solr/modules/extraction/src/test-files/extraction/simple.html @@ -10,7 +10,7 @@ Here is some text

    distinct
    words

    -
    Here is some text in a div
    +

    Here is some text in a h1

    This has a link.