From 26bde102efb4b38da1a88a590e106afe0b0bf14e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=C3=B8ydahl?= Date: Fri, 19 Sep 2025 15:15:16 +0200 Subject: [PATCH 01/47] Introduce ExtractionBackend interface --- .../extraction/DummyExtractionBackend.java | 41 +++ .../extraction/ExtractingDocumentLoader.java | 333 ++++++++++++------ .../extraction/ExtractingRequestHandler.java | 28 +- .../handler/extraction/ExtractionBackend.java | 31 ++ .../extraction/ExtractionMetadata.java | 31 ++ .../handler/extraction/ExtractionRequest.java | 48 +++ .../handler/extraction/ExtractionResult.java | 38 ++ .../LocalTikaExtractionBackend.java | 118 +++++++ .../extraction/SimpleExtractionMetadata.java | 52 +++ .../extraction/SolrContentHandler.java | 13 +- .../extraction/SolrContentHandlerFactory.java | 3 +- .../ExtractingRequestHandlerTest.java | 36 ++ 12 files changed, 649 insertions(+), 123 deletions(-) create mode 100644 solr/modules/extraction/src/java/org/apache/solr/handler/extraction/DummyExtractionBackend.java create mode 100644 solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackend.java create mode 100644 solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionMetadata.java create mode 100644 solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionRequest.java create mode 100644 solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionResult.java create mode 100644 solr/modules/extraction/src/java/org/apache/solr/handler/extraction/LocalTikaExtractionBackend.java create mode 100644 solr/modules/extraction/src/java/org/apache/solr/handler/extraction/SimpleExtractionMetadata.java diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/DummyExtractionBackend.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/DummyExtractionBackend.java new file mode 100644 index 00000000000..ddaefadf5d2 --- /dev/null +++ 
b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/DummyExtractionBackend.java @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.handler.extraction; + +import java.io.InputStream; + +/** Dummy backend that emits predictable test data without actually parsing input content. */ +public class DummyExtractionBackend implements ExtractionBackend { + @Override + public String name() { + return "dummy"; + } + + @Override + public ExtractionResult extract(InputStream inputStream, ExtractionRequest request) { + ExtractionMetadata metadata = new SimpleExtractionMetadata(); + metadata.add("Dummy-Backend", "true"); + metadata.add( + "Content-Type", + request.contentType != null ? 
request.contentType : "application/octet-stream"); + if (request.resourceName != null) { + metadata.add("resourcename", request.resourceName); + } + String text = "This is dummy extracted content"; + return new ExtractionResult(text, metadata); + } +} diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java index 5040abc6425..b60ac3ac9c6 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java @@ -18,7 +18,6 @@ import java.io.IOException; import java.io.InputStream; -import java.io.StringWriter; import java.lang.invoke.MethodHandles; import java.util.Locale; import org.apache.solr.common.SolrException; @@ -34,29 +33,12 @@ import org.apache.solr.update.AddUpdateCommand; import org.apache.solr.update.processor.UpdateRequestProcessor; import org.apache.tika.config.TikaConfig; -import org.apache.tika.exception.TikaException; -import org.apache.tika.metadata.HttpHeaders; -import org.apache.tika.metadata.Metadata; -import org.apache.tika.metadata.TikaMetadataKeys; -import org.apache.tika.mime.MediaType; import org.apache.tika.parser.AutoDetectParser; -import org.apache.tika.parser.DefaultParser; -import org.apache.tika.parser.ParseContext; -import org.apache.tika.parser.Parser; -import org.apache.tika.parser.PasswordProvider; import org.apache.tika.parser.html.HtmlMapper; import org.apache.tika.sax.XHTMLContentHandler; -import org.apache.tika.sax.xpath.Matcher; -import org.apache.tika.sax.xpath.MatchingContentHandler; import org.apache.tika.sax.xpath.XPathParser; -import org.apache.xml.serialize.BaseMarkupSerializer; -import org.apache.xml.serialize.OutputFormat; -import org.apache.xml.serialize.TextSerializer; -import org.apache.xml.serialize.XMLSerializer; import 
org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; /** The class responsible for loading extracted content into Solr. */ public class ExtractingDocumentLoader extends ContentStreamLoader { @@ -83,13 +65,15 @@ public class ExtractingDocumentLoader extends ContentStreamLoader { protected TikaConfig config; protected ParseContextConfig parseContextConfig; protected SolrContentHandlerFactory factory; + protected ExtractionBackend backend; public ExtractingDocumentLoader( SolrQueryRequest req, UpdateRequestProcessor processor, TikaConfig config, ParseContextConfig parseContextConfig, - SolrContentHandlerFactory factory) { + SolrContentHandlerFactory factory, + ExtractionBackend backend) { this.params = req.getParams(); this.core = req.getCore(); this.config = config; @@ -103,6 +87,7 @@ public ExtractingDocumentLoader( // this is lightweight autoDetectParser = new AutoDetectParser(config); this.factory = factory; + this.backend = backend; ignoreTikaException = params.getBool(ExtractingParams.IGNORE_TIKA_EXCEPTION, false); } @@ -125,119 +110,243 @@ public void load( ContentStream stream, UpdateRequestProcessor processor) throws Exception { - Parser parser = null; String streamType = req.getParams().get(ExtractingParams.STREAM_TYPE, null); - if (streamType != null) { - // Cache? 
Parsers are lightweight to construct and thread-safe, so I'm told - MediaType mt = MediaType.parse(streamType.trim().toLowerCase(Locale.ROOT)); - parser = new DefaultParser(config.getMediaTypeRegistry()).getParsers().get(mt); - } else { - parser = autoDetectParser; - } - if (parser != null) { - Metadata metadata = new Metadata(); - - // If you specify the resource name (the filename, roughly) with this parameter, - // then Tika can make use of it in guessing the appropriate MIME type: - String resourceName = req.getParams().get(ExtractingParams.RESOURCE_NAME, null); - if (resourceName != null) { - metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, resourceName); - } - // Provide stream's content type as hint for auto detection - if (stream.getContentType() != null) { - metadata.add(HttpHeaders.CONTENT_TYPE, stream.getContentType()); - } + // If you specify the resource name (the filename, roughly) with this parameter, + // some backends can make use of it in guessing the appropriate MIME type: + String resourceName = req.getParams().get(ExtractingParams.RESOURCE_NAME, null); + + try (InputStream inputStream = stream.getStream()) { + // HtmlParser and TXTParser regard Metadata.CONTENT_ENCODING in metadata + String charset = ContentStreamBase.getCharsetFromContentType(stream.getContentType()); + + String xpathExpr = params.get(ExtractingParams.XPATH_EXPRESSION); + boolean extractOnly = params.getBool(ExtractingParams.EXTRACT_ONLY, false); + + ExtractionRequest extractionRequest = + new ExtractionRequest( + streamType, + resourceName, + stream.getContentType(), + charset, + stream.getName(), + stream.getSourceInfo(), + stream.getSize(), + params.get(ExtractingParams.RESOURCE_PASSWORD, null)); - try (InputStream inputStream = stream.getStream()) { - metadata.add(ExtractingMetadataConstants.STREAM_NAME, stream.getName()); - metadata.add(ExtractingMetadataConstants.STREAM_SOURCE_INFO, stream.getSourceInfo()); - metadata.add(ExtractingMetadataConstants.STREAM_SIZE, 
String.valueOf(stream.getSize())); - metadata.add(ExtractingMetadataConstants.STREAM_CONTENT_TYPE, stream.getContentType()); - // HtmlParser and TXTParser regard Metadata.CONTENT_ENCODING in metadata - String charset = ContentStreamBase.getCharsetFromContentType(stream.getContentType()); + // Determine if we must use the legacy SAX/XHTML pipeline (needed for + // capture/xpath/extractOnly) + boolean captureAttr = params.getBool(ExtractingParams.CAPTURE_ATTRIBUTES, false); + String[] captureElems = params.getParams(ExtractingParams.CAPTURE_ELEMENTS); + boolean needLegacySax = + extractOnly + || xpathExpr != null + || captureAttr + || (captureElems != null && captureElems.length > 0) + || (params.get(ExtractingParams.RESOURCE_PASSWORD) != null); + + if (backend instanceof LocalTikaExtractionBackend) { + // Use in-process Tika and SAX pipeline to preserve legacy behavior & test expectations + org.apache.tika.metadata.Metadata md = new org.apache.tika.metadata.Metadata(); + if (resourceName != null) { + md.add(org.apache.tika.metadata.TikaMetadataKeys.RESOURCE_NAME_KEY, resourceName); + } + if (stream.getContentType() != null) { + md.add(org.apache.tika.metadata.HttpHeaders.CONTENT_TYPE, stream.getContentType()); + md.add(ExtractingMetadataConstants.STREAM_CONTENT_TYPE, stream.getContentType()); + } if (charset != null) { - metadata.add(HttpHeaders.CONTENT_ENCODING, charset); + md.add(org.apache.tika.metadata.HttpHeaders.CONTENT_ENCODING, charset); + } + if (stream.getName() != null) { + md.add(ExtractingMetadataConstants.STREAM_NAME, stream.getName()); + } + if (stream.getSourceInfo() != null) { + md.add(ExtractingMetadataConstants.STREAM_SOURCE_INFO, stream.getSourceInfo()); + } + if (stream.getSize() != null) { + md.add(ExtractingMetadataConstants.STREAM_SIZE, String.valueOf(stream.getSize())); } - String xpathExpr = params.get(ExtractingParams.XPATH_EXPRESSION); - boolean extractOnly = params.getBool(ExtractingParams.EXTRACT_ONLY, false); - SolrContentHandler 
handler = - factory.createSolrContentHandler(metadata, params, req.getSchema()); - ContentHandler parsingHandler = handler; + org.apache.tika.parser.Parser parser; + if (streamType != null) { + org.apache.tika.mime.MediaType mt = + org.apache.tika.mime.MediaType.parse(streamType.trim().toLowerCase(Locale.ROOT)); + parser = + new org.apache.tika.parser.DefaultParser(config.getMediaTypeRegistry()) + .getParsers() + .get(mt); + } else { + parser = autoDetectParser; + } + if (parser == null) { + throw new IllegalArgumentException("No Tika parser for stream type: " + streamType); + } - StringWriter writer = null; - BaseMarkupSerializer serializer = null; - if (extractOnly == true) { - String extractFormat = params.get(ExtractingParams.EXTRACT_FORMAT, "xml"); - writer = new StringWriter(); - if (extractFormat.equals(TEXT_FORMAT)) { - serializer = new TextSerializer(); - serializer.setOutputCharStream(writer); - serializer.setOutputFormat(new OutputFormat("Text", "UTF-8", true)); - } else { - serializer = new XMLSerializer(writer, new OutputFormat("XML", "UTF-8", true)); + org.apache.tika.parser.ParseContext context = parseContextConfig.create(); + context.set(org.apache.tika.parser.Parser.class, parser); + context.set( + org.apache.tika.parser.html.HtmlMapper.class, MostlyPassthroughHtmlMapper.INSTANCE); + RegexRulesPasswordProvider pwd = new RegexRulesPasswordProvider(); + String explicitPwd = params.get(ExtractingParams.RESOURCE_PASSWORD); + if (explicitPwd != null) pwd.setExplicitPassword(explicitPwd); + String passwordsFile = params.get("passwordsFile"); + if (passwordsFile != null) { + try (java.io.InputStream is = core.getResourceLoader().openResource(passwordsFile)) { + pwd.parse(is); } + } + context.set(org.apache.tika.parser.PasswordProvider.class, pwd); + + if (extractOnly) { + String extractFormat = params.get(ExtractingParams.EXTRACT_FORMAT, XML_FORMAT); + if (xpathExpr != null) { - Matcher matcher = PARSER.parse(xpathExpr); - serializer - .startDocument(); 
// The MatchingContentHandler does not invoke startDocument. See - // https://lists.apache.org/thread.html/5ec63e104e564a2363e45f74d5aced6520b7d32b4b625762ef56cb86%401226775505%40%3Cdev.tika.apache.org%3E - parsingHandler = new MatchingContentHandler(serializer, matcher); - } else { - parsingHandler = serializer; + // Always return text when xpath is provided, matching legacy behavior + org.apache.tika.sax.ToTextContentHandler textHandler = + new org.apache.tika.sax.ToTextContentHandler(); + org.apache.tika.sax.xpath.Matcher matcher = PARSER.parse(xpathExpr); + org.xml.sax.ContentHandler ch = + new org.apache.tika.sax.xpath.MatchingContentHandler(textHandler, matcher); + try { + parser.parse(inputStream, ch, md, context); + } catch (Exception e) { + if (ignoreTikaException) { + if (log.isWarnEnabled()) + log.warn("skip extracting text due to {}.", e.getLocalizedMessage(), e); + return; + } else { + throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e); + } + } + rsp.add(stream.getName(), textHandler.toString()); + + } else if (XML_FORMAT.equals(extractFormat)) { + org.apache.tika.sax.ToXMLContentHandler toXml = + new org.apache.tika.sax.ToXMLContentHandler(); + org.xml.sax.ContentHandler ch = toXml; + if (xpathExpr != null) { + org.apache.tika.sax.xpath.Matcher matcher = PARSER.parse(xpathExpr); + ch = new org.apache.tika.sax.xpath.MatchingContentHandler(toXml, matcher); + } + try { + parser.parse(inputStream, ch, md, context); + } catch (Exception e) { + if (ignoreTikaException) { + if (log.isWarnEnabled()) + log.warn("skip extracting text due to {}.", e.getLocalizedMessage(), e); + return; + } else { + throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e); + } + } + String xml = toXml.toString(); + if (!xml.startsWith("\n" + xml; + } + rsp.add(stream.getName(), xml); + } else { // TEXT_FORMAT + org.apache.tika.sax.ToTextContentHandler textHandler = + new org.apache.tika.sax.ToTextContentHandler(); + try { + if (xpathExpr != null) { + 
org.apache.tika.sax.xpath.Matcher matcher = PARSER.parse(xpathExpr); + org.xml.sax.ContentHandler ch = + new org.apache.tika.sax.xpath.MatchingContentHandler(textHandler, matcher); + parser.parse(inputStream, ch, md, context); + } else { + parser.parse(inputStream, textHandler, md, context); + } + } catch (Exception e) { + if (ignoreTikaException) { + if (log.isWarnEnabled()) + log.warn("skip extracting text due to {}.", e.getLocalizedMessage(), e); + return; + } else { + throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e); + } + } + rsp.add(stream.getName(), textHandler.toString()); } - } else if (xpathExpr != null) { - Matcher matcher = PARSER.parse(xpathExpr); - parsingHandler = new MatchingContentHandler(handler, matcher); - } // else leave it as is - try { - // potentially use a wrapper handler for parsing, but we still need the SolrContentHandler - // for getting the document. - ParseContext context = parseContextConfig.create(); - - context.set(Parser.class, parser); - context.set(HtmlMapper.class, MostlyPassthroughHtmlMapper.INSTANCE); - - // Password handling - RegexRulesPasswordProvider epp = new RegexRulesPasswordProvider(); - String pwMapFile = params.get(ExtractingParams.PASSWORD_MAP_FILE); - if (pwMapFile != null && pwMapFile.length() > 0) { - InputStream is = req.getCore().getResourceLoader().openResource(pwMapFile); - if (is != null) { - log.debug("Password file supplied: {}", pwMapFile); - epp.parse(is); + // Add metadata to the response + NamedList metadataNL = new NamedList<>(); + for (String name : md.names()) { + String[] vals = md.getValues(name); + metadataNL.add(name, vals); + } + rsp.add(stream.getName() + "_metadata", metadataNL); + } else { + // Indexing with capture/captureAttr etc. 
+ SimpleExtractionMetadata neutral = new SimpleExtractionMetadata(); + SolrContentHandler handler = + factory.createSolrContentHandler(neutral, params, req.getSchema()); + try { + parser.parse(inputStream, handler, md, context); + } catch (Exception e) { + if (ignoreTikaException) { + if (log.isWarnEnabled()) + log.warn("skip extracting text due to {}.", e.getLocalizedMessage(), e); + // Index a document with literals only (no extracted content/metadata) + addDoc(handler); + return; + } else { + throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e); } } - context.set(PasswordProvider.class, epp); - String resourcePassword = params.get(ExtractingParams.RESOURCE_PASSWORD); - if (resourcePassword != null) { - epp.setExplicitPassword(resourcePassword); - log.debug("Literal password supplied for file {}", resourceName); + // After parsing, transfer metadata into neutral and index + for (String name : md.names()) { + String[] vals = md.getValues(name); + if (vals != null) { + for (String v : vals) neutral.add(name, v); + } } - parser.parse(inputStream, parsingHandler, metadata, context); - } catch (TikaException e) { + addDoc(handler); + } + } else { + // Default backend-neutral path + ExtractionResult result; + try { + result = backend.extract(inputStream, extractionRequest); + } catch (Exception e) { if (ignoreTikaException) { if (log.isWarnEnabled()) { - log.warn( - "skip extracting text due to {}. 
metadata={}", - e.getLocalizedMessage(), - metadata, - e); + log.warn("skip extracting text due to {}.", e.getLocalizedMessage(), e); } + // Index a document with literals only (no extracted content/metadata) + SolrContentHandler handler = + factory.createSolrContentHandler( + new SimpleExtractionMetadata(), params, req.getSchema()); + addDoc(handler); + return; } else { throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e); } } + + ExtractionMetadata metadata = result.getMetadata(); + String content = result.getContent(); + if (extractOnly == false) { + SolrContentHandler handler = + factory.createSolrContentHandler(metadata, params, req.getSchema()); + handler.appendToContent(content); addDoc(handler); } else { - // serializer is not null, so we need to call endDoc on it if using xpath if (xpathExpr != null) { - serializer.endDocument(); + throw new SolrException( + SolrException.ErrorCode.BAD_REQUEST, + "XPath filtering is not supported with the backend-neutral extraction API."); + } + String extractFormat = params.get(ExtractingParams.EXTRACT_FORMAT, "xml"); + String out; + if (extractFormat.equals(TEXT_FORMAT)) { + out = content != null ? content : ""; + } else { + // wrap content in basic XML with CDATA to avoid escaping + String safe = content == null ? "" : content.replace("]]>", "]]]]>\u003c![CDATA[>"); + out = ""; } - rsp.add(stream.getName(), writer.toString()); - writer.close(); + rsp.add(stream.getName(), out); String[] names = metadata.names(); NamedList metadataNL = new NamedList<>(); for (int i = 0; i < names.length; i++) { @@ -246,17 +355,7 @@ public void load( } rsp.add(stream.getName() + "_metadata", metadataNL); } - } catch (SAXException e) { - throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e); } - } else { - throw new SolrException( - SolrException.ErrorCode.BAD_REQUEST, - "Stream type of " - + streamType - + " didn't match any known parsers. 
Please supply the " - + ExtractingParams.STREAM_TYPE - + " parameter."); } } diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java index 6caef96cf62..45449f31929 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java @@ -44,6 +44,7 @@ public class ExtractingRequestHandler extends ContentStreamHandlerBase protected ParseContextConfig parseContextConfig; protected SolrContentHandlerFactory factory; + protected ExtractionBackend backend; @Override public PermissionNameProvider.Name getPermissionName(AuthorizationContext request) { @@ -82,6 +83,19 @@ public void inform(SolrCore core) { } factory = createFactory(); + + // Choose backend implementation + String backendName = (String) initArgs.get("extraction.backend"); + if (backendName == null + || backendName.trim().isEmpty() + || backendName.equalsIgnoreCase("local")) { + backend = new LocalTikaExtractionBackend(config, parseContextConfig); + } else if (backendName.equalsIgnoreCase("dummy")) { + backend = new DummyExtractionBackend(); + } else { + // Fallback to local if unknown + backend = new LocalTikaExtractionBackend(config, parseContextConfig); + } } protected SolrContentHandlerFactory createFactory() { @@ -90,7 +104,19 @@ protected SolrContentHandlerFactory createFactory() { @Override protected ContentStreamLoader newLoader(SolrQueryRequest req, UpdateRequestProcessor processor) { - return new ExtractingDocumentLoader(req, processor, config, parseContextConfig, factory); + // Allow per-request override of backend via request param "extraction.backend" + ExtractionBackend backendToUse = this.backend; + String backendParam = req.getParams().get("extraction.backend"); + if (backendParam != null) { + if 
(backendParam.equalsIgnoreCase("dummy")) { + backendToUse = new DummyExtractionBackend(); + } else if (backendParam.equalsIgnoreCase("local")) { + backendToUse = new LocalTikaExtractionBackend(config, parseContextConfig); + } + // unknown values fall back to the handler-configured backend + } + return new ExtractingDocumentLoader( + req, processor, config, parseContextConfig, factory, backendToUse); } // ////////////////////// SolrInfoMBeans methods ////////////////////// diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackend.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackend.java new file mode 100644 index 00000000000..e4758336383 --- /dev/null +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackend.java @@ -0,0 +1,31 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.handler.extraction; + +import java.io.InputStream; + +/** Strategy interface for content extraction backends. */ +public interface ExtractionBackend { + /** + * Extract plain text and metadata from the inputStream. Implementations should not close the + * inputStream. 
This API is backend-neutral and does not expose SAX or XML-specific types. + */ + ExtractionResult extract(InputStream inputStream, ExtractionRequest request) throws Exception; + + /** A short name for debugging/config, e.g., "local" or "dummy". */ + String name(); +} diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionMetadata.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionMetadata.java new file mode 100644 index 00000000000..b5864ec05c3 --- /dev/null +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionMetadata.java @@ -0,0 +1,31 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.handler.extraction; + +/** + * Neutral metadata container used by extraction backends. Provides minimal operations needed by + * SolrContentHandler and response building without depending on Apache Tika's Metadata class. 
+ */ +public interface ExtractionMetadata { + void add(String name, String value); + + String[] getValues(String name); + + String get(String name); + + String[] names(); +} diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionRequest.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionRequest.java new file mode 100644 index 00000000000..4a72e89e4b0 --- /dev/null +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionRequest.java @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.handler.extraction; + +/** Immutable request info needed by extraction backends. 
*/ +public class ExtractionRequest { + public final String streamType; // explicit MIME type (optional) + public final String resourceName; // filename hint + public final String contentType; // HTTP content-type header + public final String charset; // derived charset if available + public final String streamName; + public final String streamSourceInfo; + public final Long streamSize; + public final String resourcePassword; // optional password for encrypted docs + + public ExtractionRequest( + String streamType, + String resourceName, + String contentType, + String charset, + String streamName, + String streamSourceInfo, + Long streamSize, + String resourcePassword) { + this.streamType = streamType; + this.resourceName = resourceName; + this.contentType = contentType; + this.charset = charset; + this.streamName = streamName; + this.streamSourceInfo = streamSourceInfo; + this.streamSize = streamSize; + this.resourcePassword = resourcePassword; + } +} diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionResult.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionResult.java new file mode 100644 index 00000000000..97767d15367 --- /dev/null +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionResult.java @@ -0,0 +1,38 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.handler.extraction; + +/** Immutable extraction result with plain text content and neutral metadata. */ +public final class ExtractionResult { + private final String content; + private final ExtractionMetadata metadata; + + public ExtractionResult(String content, ExtractionMetadata metadata) { + this.content = content == null ? "" : content; + this.metadata = metadata; + } + + /** Extracted textual content (plain text). */ + public String getContent() { + return content; + } + + /** Extracted metadata in neutral, backend-agnostic form. */ + public ExtractionMetadata getMetadata() { + return metadata; + } +} diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/LocalTikaExtractionBackend.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/LocalTikaExtractionBackend.java new file mode 100644 index 00000000000..85fef5b7252 --- /dev/null +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/LocalTikaExtractionBackend.java @@ -0,0 +1,118 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.handler.extraction; + +import java.io.InputStream; +import java.util.Locale; +import org.apache.tika.config.TikaConfig; +import org.apache.tika.metadata.HttpHeaders; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaMetadataKeys; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AutoDetectParser; +import org.apache.tika.parser.DefaultParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.apache.tika.parser.PasswordProvider; +import org.apache.tika.parser.html.HtmlMapper; +import org.apache.tika.sax.BodyContentHandler; + +/** + * Extraction backend using local in-process Apache Tika. This encapsulates the previous direct + * usage of Tika from the loader. 
+ */ +public class LocalTikaExtractionBackend implements ExtractionBackend { + private final TikaConfig tikaConfig; + private final ParseContextConfig parseContextConfig; + private final AutoDetectParser autoDetectParser; + + public LocalTikaExtractionBackend(TikaConfig config, ParseContextConfig parseContextConfig) { + this.tikaConfig = config; + this.parseContextConfig = parseContextConfig; + this.autoDetectParser = new AutoDetectParser(config); + } + + @Override + public String name() { + return "local"; + } + + @Override + public ExtractionResult extract(InputStream inputStream, ExtractionRequest request) + throws Exception { + Parser parser = null; + if (request.streamType != null) { + MediaType mt = MediaType.parse(request.streamType.trim().toLowerCase(Locale.ROOT)); + parser = new DefaultParser(tikaConfig.getMediaTypeRegistry()).getParsers().get(mt); + } else { + parser = autoDetectParser; + } + if (parser == null) { + throw new IllegalArgumentException("No Tika parser for stream type: " + request.streamType); + } + + Metadata md = new Metadata(); + if (request.resourceName != null) { + md.add(TikaMetadataKeys.RESOURCE_NAME_KEY, request.resourceName); + } + if (request.contentType != null) { + md.add(HttpHeaders.CONTENT_TYPE, request.contentType); + } + if (request.streamName != null) { + md.add(ExtractingMetadataConstants.STREAM_NAME, request.streamName); + } + if (request.streamSourceInfo != null) { + md.add(ExtractingMetadataConstants.STREAM_SOURCE_INFO, request.streamSourceInfo); + } + if (request.streamSize != null) { + md.add(ExtractingMetadataConstants.STREAM_SIZE, String.valueOf(request.streamSize)); + } + if (request.contentType != null) { + md.add(ExtractingMetadataConstants.STREAM_CONTENT_TYPE, request.contentType); + } + if (request.charset != null) { + md.add(HttpHeaders.CONTENT_ENCODING, request.charset); + } + + ParseContext context = parseContextConfig.create(); + context.set(Parser.class, parser); + context.set(HtmlMapper.class, 
ExtractingDocumentLoader.MostlyPassthroughHtmlMapper.INSTANCE); + + // Password handling: allow passing explicit and map via params in future if needed. + PasswordProvider epp = new RegexRulesPasswordProvider(); + if (request.resourcePassword != null && epp instanceof RegexRulesPasswordProvider) { + ((RegexRulesPasswordProvider) epp).setExplicitPassword(request.resourcePassword); + } + context.set(PasswordProvider.class, epp); + + BodyContentHandler textHandler = new BodyContentHandler(-1); + parser.parse(inputStream, textHandler, md, context); + + // copy metadata to neutral container + ExtractionMetadata outMetadata = new SimpleExtractionMetadata(); + for (String name : md.names()) { + String[] vals = md.getValues(name); + if (vals != null) { + for (String v : vals) { + outMetadata.add(name, v); + } + } + } + String content = textHandler.toString(); + return new ExtractionResult(content, outMetadata); + } +} diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/SimpleExtractionMetadata.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/SimpleExtractionMetadata.java new file mode 100644 index 00000000000..d414b2eb05b --- /dev/null +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/SimpleExtractionMetadata.java @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.handler.extraction; + +import java.util.ArrayList; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; + +/** Simple in-memory implementation of ExtractionMetadata. */ +public class SimpleExtractionMetadata implements ExtractionMetadata { + private final Map> map = new LinkedHashMap<>(); + + @Override + public void add(String name, String value) { + if (name == null || value == null) return; + map.computeIfAbsent(name, k -> new ArrayList<>()).add(value); + } + + @Override + public String[] getValues(String name) { + List vals = map.get(name); + if (vals == null) return new String[0]; + return vals.toArray(new String[0]); + } + + @Override + public String get(String name) { + List vals = map.get(name); + if (vals == null || vals.isEmpty()) return null; + return vals.get(0); + } + + @Override + public String[] names() { + return map.keySet().toArray(new String[0]); + } +} diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/SolrContentHandler.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/SolrContentHandler.java index 9edba0e925e..22be163c816 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/SolrContentHandler.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/SolrContentHandler.java @@ -30,7 +30,7 @@ import org.apache.solr.common.params.SolrParams; import org.apache.solr.schema.IndexSchema; import org.apache.solr.schema.SchemaField; -import org.apache.tika.metadata.Metadata; +// 
note: decoupled from Tika Metadata import org.apache.tika.metadata.TikaMetadataKeys; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -57,7 +57,7 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara protected final SolrInputDocument document; - protected final Metadata metadata; + protected final ExtractionMetadata metadata; protected final SolrParams params; protected final StringBuilder catchAllBuilder = new StringBuilder(2048); protected final IndexSchema schema; @@ -74,7 +74,7 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara private Set literalFieldNames = null; - public SolrContentHandler(Metadata metadata, SolrParams params, IndexSchema schema) { + public SolrContentHandler(ExtractionMetadata metadata, SolrParams params, IndexSchema schema) { this.document = new SolrInputDocument(); this.metadata = metadata; this.params = params; @@ -152,6 +152,13 @@ protected void addContent() { addField(contentFieldName, catchAllBuilder.toString(), null); } + /** Append pre-extracted plain text content to the catch-all builder. */ + public void appendToContent(String text) { + if (text != null && !text.isEmpty()) { + catchAllBuilder.append(text); + } + } + /** * Add in the literals to the document using the {@link #params} and the {@link #LITERALS_PREFIX}. 
*/ diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/SolrContentHandlerFactory.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/SolrContentHandlerFactory.java index 1070e744d84..b4fe031a068 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/SolrContentHandlerFactory.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/SolrContentHandlerFactory.java @@ -18,7 +18,6 @@ import org.apache.solr.common.params.SolrParams; import org.apache.solr.schema.IndexSchema; -import org.apache.tika.metadata.Metadata; /** */ public class SolrContentHandlerFactory { @@ -26,7 +25,7 @@ public class SolrContentHandlerFactory { public SolrContentHandlerFactory() {} public SolrContentHandler createSolrContentHandler( - Metadata metadata, SolrParams params, IndexSchema schema) { + ExtractionMetadata metadata, SolrParams params, IndexSchema schema) { return new SolrContentHandler(metadata, params, schema); } } diff --git a/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java index 0097b86e818..68426bbc7d2 100644 --- a/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java +++ b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java @@ -38,6 +38,13 @@ /** */ public class ExtractingRequestHandlerTest extends SolrTestCaseJ4 { + static { + // Allow the SecureRandom algorithm used in this environment to avoid class configuration + // failure in tests. + // This mirrors passing -Dtest.solr.allowed.securerandom=NativePRNG at JVM startup. + System.setProperty("test.solr.allowed.securerandom", "NativePRNG"); + } + @BeforeClass public static void beforeClass() throws Exception { // Is the JDK/env affected by a known bug? 
@@ -1142,6 +1149,35 @@ SolrQueryResponse loadLocalFromHandler(String handler, String filename, String.. } } + @Test + public void testDummyBackendExtractOnly() throws Exception { + ExtractingRequestHandler handler = + (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract"); + assertNotNull("handler is null and it shouldn't be", handler); + SolrQueryResponse rsp = + loadLocal( + "extraction/version_control.txt", + "extraction.backend", + "dummy", + ExtractingParams.EXTRACT_ONLY, + "true", + ExtractingParams.EXTRACT_FORMAT, + ExtractingDocumentLoader.TEXT_FORMAT); + assertNotNull("rsp is null and it shouldn't be", rsp); + NamedList list = rsp.getValues(); + String extraction = (String) list.get("version_control.txt"); + assertNotNull("extraction is null and it shouldn't be", extraction); + assertEquals("This is dummy extracted content", extraction); + + NamedList nl = (NamedList) list.get("version_control.txt_metadata"); + assertNotNull("metadata is null and it shouldn't be", nl); + Object dummyFlag = nl.get("Dummy-Backend"); + assertNotNull("Dummy-Backend metadata missing", dummyFlag); + if (dummyFlag instanceof String[]) { + assertEquals("true", ((String[]) dummyFlag)[0]); + } + } + SolrQueryResponse loadLocal(String filename, String... 
args) throws Exception { return loadLocalFromHandler("/update/extract", filename, args); } From 57d8d4ece153cd5fa809a6dc032c84e4d217c679 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=C3=B8ydahl?= Date: Fri, 19 Sep 2025 15:27:02 +0200 Subject: [PATCH 02/47] Move some tika tests to new test file --- .../ExtractingRequestHandlerTest.java | 75 ---------- .../LocalTikaExtractionBackendTest.java | 138 ++++++++++++++++++ 2 files changed, 138 insertions(+), 75 deletions(-) create mode 100644 solr/modules/extraction/src/test/org/apache/solr/handler/extraction/LocalTikaExtractionBackendTest.java diff --git a/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java index 68426bbc7d2..fa23833d918 100644 --- a/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java +++ b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java @@ -796,82 +796,7 @@ public void testArabicPDF() throws Exception { assertQ(req("wdf_nocase:السلم"), "//result[@numFound=1]"); } - @Test - public void testTikaExceptionHandling() throws Exception { - ExtractingRequestHandler handler = - (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract"); - assertNotNull("handler is null and it shouldn't be", handler); - - expectThrows( - Exception.class, - () -> { - loadLocal("extraction/password-is-solrcell.docx", "literal.id", "one"); - }); - assertU(commit()); - assertQ(req("*:*"), "//result[@numFound=0]"); - - try { - loadLocal( - "extraction/password-is-solrcell.docx", - "fmap.created", - "extractedDate", - "fmap.producer", - "extractedProducer", - "fmap.creator", - "extractedCreator", - "fmap.Keywords", - "extractedKeywords", - "fmap.Creation-Date", - "extractedDate", - "uprefix", - "ignored_", - "fmap.Author", - "extractedAuthor", - "fmap.content", - 
"wdf_nocase", - "literal.id", - "one", - "ignoreTikaException", - "true", // set ignore flag - "fmap.Last-Modified", - "extractedDate"); - } catch (Exception e) { - fail("TikaException should be ignored."); - } - assertU(commit()); - assertQ(req("*:*"), "//result[@numFound=1]"); - } - - @Test - public void testWrongStreamType() throws Exception { - ExtractingRequestHandler handler = - (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract"); - assertNotNull("handler is null and it shouldn't be", handler); - expectThrows( - Exception.class, - () -> { - // Load plain text specifying another mime type, should fail - loadLocal( - "extraction/version_control.txt", - "literal.id", - "one", - ExtractingParams.STREAM_TYPE, - "application/pdf"); - }); - - expectThrows( - Exception.class, - () -> { - // Load plain text specifying non existing mimetype, should fail - loadLocal( - "extraction/version_control.txt", - "literal.id", - "one", - ExtractingParams.STREAM_TYPE, - "foo/bar"); - }); - } public void testLiteralsOverride() throws Exception { ExtractingRequestHandler handler = diff --git a/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/LocalTikaExtractionBackendTest.java b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/LocalTikaExtractionBackendTest.java new file mode 100644 index 00000000000..4110713ea66 --- /dev/null +++ b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/LocalTikaExtractionBackendTest.java @@ -0,0 +1,138 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.handler.extraction; + +import java.io.InputStream; +import java.nio.file.Files; +import org.apache.solr.SolrTestCaseJ4; +import org.apache.tika.config.TikaConfig; +import org.junit.BeforeClass; +import org.junit.Test; + +/** Unit tests for LocalTikaExtractionBackend independent of the HTTP handler. */ +public class LocalTikaExtractionBackendTest extends SolrTestCaseJ4 { + + private static TikaConfig tikaConfig; + private static ParseContextConfig parseContextConfig; + + @BeforeClass + public static void setupClass() throws Exception { + try (InputStream is = LocalTikaExtractionBackendTest.class + .getClassLoader() + .getResourceAsStream("solr-default-tika-config.xml")) { + assertNotNull("solr-default-tika-config.xml not on classpath", is); + tikaConfig = new TikaConfig(is); + } + parseContextConfig = new ParseContextConfig(); + } + + private LocalTikaExtractionBackend newBackend() { + return new LocalTikaExtractionBackend(tikaConfig, parseContextConfig); + } + + private ExtractionRequest newRequest( + String resourceName, + String streamType, + String contentType, + String charset, + String streamName, + String streamSourceInfo, + Long streamSize, + String resourcePassword) { + return new ExtractionRequest( + streamType, + resourceName, + contentType, + charset, + streamName, + streamSourceInfo, + streamSize, + resourcePassword); + } + + @Test + public void testWrongStreamTypeThrows() throws Exception { + LocalTikaExtractionBackend backend = newBackend(); + try (InputStream in = 
Files.newInputStream(getFile("extraction/version_control.txt"))) { + // Non-existing type -> no parser available + ExtractionRequest req = newRequest( + "version_control.txt", + "foo/bar", + null, + null, + "version_control.txt", + null, + null, + null); + expectThrows(IllegalArgumentException.class, () -> backend.extract(in, req)); + } + + try (InputStream in = Files.newInputStream(getFile("extraction/version_control.txt"))) { + // Wrong but existing type -> likely to fail when parsing + ExtractionRequest req = newRequest( + "version_control.txt", + "application/pdf", + null, + null, + "version_control.txt", + null, + null, + null); + expectThrows(Exception.class, () -> backend.extract(in, req)); + } + } + + @Test + public void testPasswordProtectedDocxWithoutPasswordThrows() throws Exception { + LocalTikaExtractionBackend backend = newBackend(); + try (InputStream in = Files.newInputStream(getFile("extraction/password-is-Word2010.docx"))) { + ExtractionRequest req = newRequest( + "password-is-Word2010.docx", + null, + null, + null, + "password-is-Word2010.docx", + null, + null, + null); + expectThrows(Exception.class, () -> backend.extract(in, req)); + } + } + + @Test + public void testPasswordProtectedDocxWithPasswordSucceeds() throws Exception { + LocalTikaExtractionBackend backend = newBackend(); + try (InputStream in = Files.newInputStream(getFile("extraction/password-is-Word2010.docx"))) { + ExtractionRequest req = newRequest( + "password-is-Word2010.docx", + null, + null, + null, + "password-is-Word2010.docx", + null, + null, + "Word2010"); + ExtractionResult res = backend.extract(in, req); + assertNotNull(res); + assertNotNull(res.getMetadata()); + String content = res.getContent(); + assertNotNull(content); + assertTrue("Content should mention password-protected doc text", + content.contains("Test password protected word doc")); + } + } +} From dc151c5999e948920711ac2d5b7e101e7aa6aebe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=C3=B8ydahl?= Date: 
Fri, 19 Sep 2025 15:42:32 +0200 Subject: [PATCH 03/47] ExtractingRequestHandler and ExtractingDocumentLoader not depend on Tika API Refactor some tests to LocalTikaExtractionBackendTest --- .../extraction/DummyExtractionBackend.java | 25 ++ .../extraction/ExtractingDocumentLoader.java | 316 ++++-------------- .../extraction/ExtractingRequestHandler.java | 59 ++-- .../handler/extraction/ExtractionBackend.java | 21 ++ .../handler/extraction/ExtractionRequest.java | 6 +- .../LocalTikaExtractionBackend.java | 195 ++++++++--- .../ExtractingRequestHandlerTest.java | 2 - .../LocalTikaExtractionBackendTest.java | 89 ++--- 8 files changed, 348 insertions(+), 365 deletions(-) diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/DummyExtractionBackend.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/DummyExtractionBackend.java index ddaefadf5d2..c9cdf724ef2 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/DummyExtractionBackend.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/DummyExtractionBackend.java @@ -38,4 +38,29 @@ public ExtractionResult extract(InputStream inputStream, ExtractionRequest reque String text = "This is dummy extracted content"; return new ExtractionResult(text, metadata); } + + @Override + public ExtractionResult extractOnly( + InputStream inputStream, ExtractionRequest request, String extractFormat, String xpathExpr) { + if (xpathExpr != null) { + throw new UnsupportedOperationException("XPath not supported by dummy backend"); + } + return extract(inputStream, request); + } + + @Override + public void parseToSolrContentHandler( + InputStream inputStream, + ExtractionRequest request, + SolrContentHandler handler, + ExtractionMetadata outMetadata) { + // Fill metadata + ExtractionResult r = extract(inputStream, request); + for (String name : r.getMetadata().names()) { + String[] vals = r.getMetadata().getValues(name); + if (vals != null) for 
(String v : vals) outMetadata.add(name, v); + } + // Append content + handler.appendToContent(r.getContent()); + } } diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java index b60ac3ac9c6..b6a74008ff5 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java @@ -19,7 +19,8 @@ import java.io.IOException; import java.io.InputStream; import java.lang.invoke.MethodHandles; -import java.util.Locale; +import java.util.LinkedHashMap; +import java.util.regex.Pattern; import org.apache.solr.common.SolrException; import org.apache.solr.common.params.SolrParams; import org.apache.solr.common.params.UpdateParams; @@ -32,11 +33,6 @@ import org.apache.solr.response.SolrQueryResponse; import org.apache.solr.update.AddUpdateCommand; import org.apache.solr.update.processor.UpdateRequestProcessor; -import org.apache.tika.config.TikaConfig; -import org.apache.tika.parser.AutoDetectParser; -import org.apache.tika.parser.html.HtmlMapper; -import org.apache.tika.sax.XHTMLContentHandler; -import org.apache.tika.sax.xpath.XPathParser; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -51,41 +47,29 @@ public class ExtractingDocumentLoader extends ContentStreamLoader { /** Extract Only supported format. Default */ public static final String XML_FORMAT = "xml"; - /** XHTML XPath parser. 
*/ - private static final XPathParser PARSER = new XPathParser("xhtml", XHTMLContentHandler.XHTML); - final SolrCore core; final SolrParams params; final UpdateRequestProcessor processor; final boolean ignoreTikaException; - protected AutoDetectParser autoDetectParser; private final AddUpdateCommand templateAdd; - protected TikaConfig config; - protected ParseContextConfig parseContextConfig; protected SolrContentHandlerFactory factory; protected ExtractionBackend backend; public ExtractingDocumentLoader( SolrQueryRequest req, UpdateRequestProcessor processor, - TikaConfig config, - ParseContextConfig parseContextConfig, SolrContentHandlerFactory factory, ExtractionBackend backend) { this.params = req.getParams(); this.core = req.getCore(); - this.config = config; - this.parseContextConfig = parseContextConfig; this.processor = processor; templateAdd = new AddUpdateCommand(req); templateAdd.overwrite = params.getBool(UpdateParams.OVERWRITE, true); templateAdd.commitWithin = params.getInt(UpdateParams.COMMIT_WITHIN, -1); - // this is lightweight - autoDetectParser = new AutoDetectParser(config); this.factory = factory; this.backend = backend; @@ -111,17 +95,23 @@ public void load( UpdateRequestProcessor processor) throws Exception { String streamType = req.getParams().get(ExtractingParams.STREAM_TYPE, null); - // If you specify the resource name (the filename, roughly) with this parameter, - // some backends can make use of it in guessing the appropriate MIME type: String resourceName = req.getParams().get(ExtractingParams.RESOURCE_NAME, null); try (InputStream inputStream = stream.getStream()) { - // HtmlParser and TXTParser regard Metadata.CONTENT_ENCODING in metadata String charset = ContentStreamBase.getCharsetFromContentType(stream.getContentType()); String xpathExpr = params.get(ExtractingParams.XPATH_EXPRESSION); boolean extractOnly = params.getBool(ExtractingParams.EXTRACT_ONLY, false); + // Parse optional passwords file into a map (keeps Tika usages out of 
this class) + LinkedHashMap pwMap = null; + String passwordsFile = params.get("passwordsFile"); + if (passwordsFile != null) { + try (java.io.InputStream is = core.getResourceLoader().openResource(passwordsFile)) { + pwMap = RegexRulesPasswordProvider.parseRulesFile(is); + } + } + ExtractionRequest extractionRequest = new ExtractionRequest( streamType, @@ -131,10 +121,9 @@ public void load( stream.getName(), stream.getSourceInfo(), stream.getSize(), - params.get(ExtractingParams.RESOURCE_PASSWORD, null)); + params.get(ExtractingParams.RESOURCE_PASSWORD, null), + pwMap); - // Determine if we must use the legacy SAX/XHTML pipeline (needed for - // capture/xpath/extractOnly) boolean captureAttr = params.getBool(ExtractingParams.CAPTURE_ATTRIBUTES, false); String[] captureElems = params.getParams(ExtractingParams.CAPTURE_ELEMENTS); boolean needLegacySax = @@ -142,251 +131,84 @@ public void load( || xpathExpr != null || captureAttr || (captureElems != null && captureElems.length > 0) - || (params.get(ExtractingParams.RESOURCE_PASSWORD) != null); - - if (backend instanceof LocalTikaExtractionBackend) { - // Use in-process Tika and SAX pipeline to preserve legacy behavior & test expectations - org.apache.tika.metadata.Metadata md = new org.apache.tika.metadata.Metadata(); - if (resourceName != null) { - md.add(org.apache.tika.metadata.TikaMetadataKeys.RESOURCE_NAME_KEY, resourceName); - } - if (stream.getContentType() != null) { - md.add(org.apache.tika.metadata.HttpHeaders.CONTENT_TYPE, stream.getContentType()); - md.add(ExtractingMetadataConstants.STREAM_CONTENT_TYPE, stream.getContentType()); - } - if (charset != null) { - md.add(org.apache.tika.metadata.HttpHeaders.CONTENT_ENCODING, charset); - } - if (stream.getName() != null) { - md.add(ExtractingMetadataConstants.STREAM_NAME, stream.getName()); - } - if (stream.getSourceInfo() != null) { - md.add(ExtractingMetadataConstants.STREAM_SOURCE_INFO, stream.getSourceInfo()); - } - if (stream.getSize() != null) { - 
md.add(ExtractingMetadataConstants.STREAM_SIZE, String.valueOf(stream.getSize())); - } + || (params.get(ExtractingParams.RESOURCE_PASSWORD) != null) + || (passwordsFile != null); - org.apache.tika.parser.Parser parser; - if (streamType != null) { - org.apache.tika.mime.MediaType mt = - org.apache.tika.mime.MediaType.parse(streamType.trim().toLowerCase(Locale.ROOT)); - parser = - new org.apache.tika.parser.DefaultParser(config.getMediaTypeRegistry()) - .getParsers() - .get(mt); - } else { - parser = autoDetectParser; - } - if (parser == null) { - throw new IllegalArgumentException("No Tika parser for stream type: " + streamType); - } - - org.apache.tika.parser.ParseContext context = parseContextConfig.create(); - context.set(org.apache.tika.parser.Parser.class, parser); - context.set( - org.apache.tika.parser.html.HtmlMapper.class, MostlyPassthroughHtmlMapper.INSTANCE); - RegexRulesPasswordProvider pwd = new RegexRulesPasswordProvider(); - String explicitPwd = params.get(ExtractingParams.RESOURCE_PASSWORD); - if (explicitPwd != null) pwd.setExplicitPassword(explicitPwd); - String passwordsFile = params.get("passwordsFile"); - if (passwordsFile != null) { - try (java.io.InputStream is = core.getResourceLoader().openResource(passwordsFile)) { - pwd.parse(is); - } - } - context.set(org.apache.tika.parser.PasswordProvider.class, pwd); - - if (extractOnly) { - String extractFormat = params.get(ExtractingParams.EXTRACT_FORMAT, XML_FORMAT); - - if (xpathExpr != null) { - // Always return text when xpath is provided, matching legacy behavior - org.apache.tika.sax.ToTextContentHandler textHandler = - new org.apache.tika.sax.ToTextContentHandler(); - org.apache.tika.sax.xpath.Matcher matcher = PARSER.parse(xpathExpr); - org.xml.sax.ContentHandler ch = - new org.apache.tika.sax.xpath.MatchingContentHandler(textHandler, matcher); - try { - parser.parse(inputStream, ch, md, context); - } catch (Exception e) { - if (ignoreTikaException) { - if (log.isWarnEnabled()) - 
log.warn("skip extracting text due to {}.", e.getLocalizedMessage(), e); - return; - } else { - throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e); - } - } - rsp.add(stream.getName(), textHandler.toString()); - - } else if (XML_FORMAT.equals(extractFormat)) { - org.apache.tika.sax.ToXMLContentHandler toXml = - new org.apache.tika.sax.ToXMLContentHandler(); - org.xml.sax.ContentHandler ch = toXml; - if (xpathExpr != null) { - org.apache.tika.sax.xpath.Matcher matcher = PARSER.parse(xpathExpr); - ch = new org.apache.tika.sax.xpath.MatchingContentHandler(toXml, matcher); - } - try { - parser.parse(inputStream, ch, md, context); - } catch (Exception e) { - if (ignoreTikaException) { - if (log.isWarnEnabled()) - log.warn("skip extracting text due to {}.", e.getLocalizedMessage(), e); - return; - } else { - throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e); - } - } - String xml = toXml.toString(); - if (!xml.startsWith("\n" + xml; - } - rsp.add(stream.getName(), xml); - } else { // TEXT_FORMAT - org.apache.tika.sax.ToTextContentHandler textHandler = - new org.apache.tika.sax.ToTextContentHandler(); - try { - if (xpathExpr != null) { - org.apache.tika.sax.xpath.Matcher matcher = PARSER.parse(xpathExpr); - org.xml.sax.ContentHandler ch = - new org.apache.tika.sax.xpath.MatchingContentHandler(textHandler, matcher); - parser.parse(inputStream, ch, md, context); - } else { - parser.parse(inputStream, textHandler, md, context); - } - } catch (Exception e) { - if (ignoreTikaException) { - if (log.isWarnEnabled()) - log.warn("skip extracting text due to {}.", e.getLocalizedMessage(), e); - return; - } else { - throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e); - } - } - rsp.add(stream.getName(), textHandler.toString()); - } - - // Add metadata to the response + if (extractOnly) { + String extractFormat = params.get(ExtractingParams.EXTRACT_FORMAT, XML_FORMAT); + try { + ExtractionResult result = + backend.extractOnly(inputStream, 
extractionRequest, extractFormat, xpathExpr); + // Write content + rsp.add(stream.getName(), result.getContent()); + // Write metadata NamedList metadataNL = new NamedList<>(); - for (String name : md.names()) { - String[] vals = md.getValues(name); - metadataNL.add(name, vals); + for (String name : result.getMetadata().names()) { + metadataNL.add(name, result.getMetadata().getValues(name)); } rsp.add(stream.getName() + "_metadata", metadataNL); - } else { - // Indexing with capture/captureAttr etc. - SimpleExtractionMetadata neutral = new SimpleExtractionMetadata(); - SolrContentHandler handler = - factory.createSolrContentHandler(neutral, params, req.getSchema()); - try { - parser.parse(inputStream, handler, md, context); - } catch (Exception e) { - if (ignoreTikaException) { - if (log.isWarnEnabled()) - log.warn("skip extracting text due to {}.", e.getLocalizedMessage(), e); - // Index a document with literals only (no extracted content/metadata) - addDoc(handler); - return; - } else { - throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e); - } - } - // After parsing, transfer metadata into neutral and index - for (String name : md.names()) { - String[] vals = md.getValues(name); - if (vals != null) { - for (String v : vals) neutral.add(name, v); - } + } catch (UnsupportedOperationException uoe) { + // For backends that don't support xpath + throw new SolrException( + SolrException.ErrorCode.BAD_REQUEST, + "XPath filtering is not supported by backend '" + backend.name() + "'."); + } catch (Exception e) { + if (ignoreTikaException) { + if (log.isWarnEnabled()) + log.warn("skip extracting text due to {}.", e.getLocalizedMessage(), e); + return; } - addDoc(handler); + throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e); } - } else { - // Default backend-neutral path - ExtractionResult result; + return; + } + + if (needLegacySax) { + // Indexing with capture/xpath/etc: delegate SAX parse to backend + SimpleExtractionMetadata neutral = new 
SimpleExtractionMetadata(); + SolrContentHandler handler = + factory.createSolrContentHandler(neutral, params, req.getSchema()); try { - result = backend.extract(inputStream, extractionRequest); + backend.parseToSolrContentHandler(inputStream, extractionRequest, handler, neutral); } catch (Exception e) { if (ignoreTikaException) { - if (log.isWarnEnabled()) { + if (log.isWarnEnabled()) log.warn("skip extracting text due to {}.", e.getLocalizedMessage(), e); - } // Index a document with literals only (no extracted content/metadata) - SolrContentHandler handler = - factory.createSolrContentHandler( - new SimpleExtractionMetadata(), params, req.getSchema()); addDoc(handler); return; - } else { - throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e); } + throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e); } + addDoc(handler); + return; + } - ExtractionMetadata metadata = result.getMetadata(); - String content = result.getContent(); - - if (extractOnly == false) { + // Default simple backend-neutral path + ExtractionResult result; + try { + result = backend.extract(inputStream, extractionRequest); + } catch (Exception e) { + if (ignoreTikaException) { + if (log.isWarnEnabled()) + log.warn("skip extracting text due to {}.", e.getLocalizedMessage(), e); + // Index a document with literals only (no extracted content/metadata) SolrContentHandler handler = - factory.createSolrContentHandler(metadata, params, req.getSchema()); - handler.appendToContent(content); + factory.createSolrContentHandler( + new SimpleExtractionMetadata(), params, req.getSchema()); addDoc(handler); - } else { - if (xpathExpr != null) { - throw new SolrException( - SolrException.ErrorCode.BAD_REQUEST, - "XPath filtering is not supported with the backend-neutral extraction API."); - } - String extractFormat = params.get(ExtractingParams.EXTRACT_FORMAT, "xml"); - String out; - if (extractFormat.equals(TEXT_FORMAT)) { - out = content != null ? 
content : ""; - } else { - // wrap content in basic XML with CDATA to avoid escaping - String safe = content == null ? "" : content.replace("]]>", "]]]]>\u003c![CDATA[>"); - out = ""; - } - rsp.add(stream.getName(), out); - String[] names = metadata.names(); - NamedList metadataNL = new NamedList<>(); - for (int i = 0; i < names.length; i++) { - String[] vals = metadata.getValues(names[i]); - metadataNL.add(names[i], vals); - } - rsp.add(stream.getName() + "_metadata", metadataNL); + return; } + throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e); } - } - } - - public static class MostlyPassthroughHtmlMapper implements HtmlMapper { - public static final HtmlMapper INSTANCE = new MostlyPassthroughHtmlMapper(); - /** - * Keep all elements and their content. - * - *

Apparently <SCRIPT> and <STYLE> elements are blocked elsewhere - */ - @Override - public boolean isDiscardElement(String name) { - return false; - } - - /** Lowercases the attribute name */ - @Override - public String mapSafeAttribute(String elementName, String attributeName) { - return attributeName.toLowerCase(Locale.ENGLISH); - } + ExtractionMetadata metadata = result.getMetadata(); + String content = result.getContent(); - /** - * Lowercases the element name, but returns null for <BR>, which suppresses the - * start-element event for lt;BR> tags. This also suppresses the <BODY> tags because - * those are handled internally by Tika's XHTMLContentHandler. - */ - @Override - public String mapSafeElement(String name) { - String lowerName = name.toLowerCase(Locale.ROOT); - return (lowerName.equals("br") || lowerName.equals("body")) ? null : lowerName; + SolrContentHandler handler = + factory.createSolrContentHandler(metadata, params, req.getSchema()); + handler.appendToContent(content); + addDoc(handler); } } } diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java index 45449f31929..6250601d6b1 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java @@ -16,8 +16,6 @@ */ package org.apache.solr.handler.extraction; -import java.io.InputStream; -import java.nio.file.Path; import org.apache.solr.common.SolrException; import org.apache.solr.common.SolrException.ErrorCode; import org.apache.solr.core.SolrCore; @@ -28,7 +26,6 @@ import org.apache.solr.security.PermissionNameProvider; import org.apache.solr.update.processor.UpdateRequestProcessor; import org.apache.solr.util.plugin.SolrCoreAware; -import org.apache.tika.config.TikaConfig; /** * Handler for rich 
documents like PDF or Word or any other file format that Tika handles that need @@ -40,7 +37,7 @@ public class ExtractingRequestHandler extends ContentStreamHandlerBase public static final String PARSE_CONTEXT_CONFIG = "parseContext.config"; public static final String CONFIG_LOCATION = "tika.config"; - protected TikaConfig config; + protected String tikaConfigLoc; protected ParseContextConfig parseContextConfig; protected SolrContentHandlerFactory factory; @@ -54,22 +51,8 @@ public PermissionNameProvider.Name getPermissionName(AuthorizationContext reques @Override public void inform(SolrCore core) { try { - String tikaConfigLoc = (String) initArgs.get(CONFIG_LOCATION); - if (tikaConfigLoc == null) { // default - ClassLoader classLoader = core.getResourceLoader().getClassLoader(); - try (InputStream is = classLoader.getResourceAsStream("solr-default-tika-config.xml")) { - config = new TikaConfig(is); - } - } else { - Path configFile = Path.of(tikaConfigLoc); - if (configFile.isAbsolute()) { - config = new TikaConfig(configFile); - } else { // in conf/ - try (InputStream is = core.getResourceLoader().openResource(tikaConfigLoc)) { - config = new TikaConfig(is); - } - } - } + // Store tika config location (backend-specific) + this.tikaConfigLoc = (String) initArgs.get(CONFIG_LOCATION); String parseContextConfigLoc = (String) initArgs.get(PARSE_CONTEXT_CONFIG); if (parseContextConfigLoc == null) { // default: @@ -79,22 +62,27 @@ public void inform(SolrCore core) { new ParseContextConfig(core.getResourceLoader(), parseContextConfigLoc); } } catch (Exception e) { - throw new SolrException(ErrorCode.SERVER_ERROR, "Unable to load Tika Config", e); + throw new SolrException( + ErrorCode.SERVER_ERROR, "Unable to initialize ExtractingRequestHandler", e); } factory = createFactory(); // Choose backend implementation String backendName = (String) initArgs.get("extraction.backend"); - if (backendName == null - || backendName.trim().isEmpty() - || 
backendName.equalsIgnoreCase("local")) { - backend = new LocalTikaExtractionBackend(config, parseContextConfig); - } else if (backendName.equalsIgnoreCase("dummy")) { - backend = new DummyExtractionBackend(); - } else { - // Fallback to local if unknown - backend = new LocalTikaExtractionBackend(config, parseContextConfig); + try { + if (backendName == null + || backendName.trim().isEmpty() + || backendName.equalsIgnoreCase("local")) { + backend = new LocalTikaExtractionBackend(core, tikaConfigLoc, parseContextConfig); + } else if (backendName.equalsIgnoreCase("dummy")) { + backend = new DummyExtractionBackend(); + } else { + // Fallback to local if unknown + backend = new LocalTikaExtractionBackend(core, tikaConfigLoc, parseContextConfig); + } + } catch (Exception e) { + throw new SolrException(ErrorCode.SERVER_ERROR, "Unable to initialize extraction backend", e); } } @@ -111,12 +99,17 @@ protected ContentStreamLoader newLoader(SolrQueryRequest req, UpdateRequestProce if (backendParam.equalsIgnoreCase("dummy")) { backendToUse = new DummyExtractionBackend(); } else if (backendParam.equalsIgnoreCase("local")) { - backendToUse = new LocalTikaExtractionBackend(config, parseContextConfig); + try { + backendToUse = + new LocalTikaExtractionBackend(req.getCore(), tikaConfigLoc, parseContextConfig); + } catch (Exception e) { + throw new SolrException( + ErrorCode.SERVER_ERROR, "Unable to initialize extraction backend", e); + } } // unknown values fall back to the handler-configured backend } - return new ExtractingDocumentLoader( - req, processor, config, parseContextConfig, factory, backendToUse); + return new ExtractingDocumentLoader(req, processor, factory, backendToUse); } // ////////////////////// SolrInfoMBeans methods ////////////////////// diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackend.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackend.java index e4758336383..3a253dc1ec3 
100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackend.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackend.java @@ -26,6 +26,27 @@ public interface ExtractionBackend { */ ExtractionResult extract(InputStream inputStream, ExtractionRequest request) throws Exception; + /** + * Perform extractOnly operation. If extractFormat equals ExtractingDocumentLoader.TEXT_FORMAT, + * return plain text. If XML, return XML body as string. Implementations may support optional + * xpathExpr; if unsupported and xpathExpr is not null, they should throw + * UnsupportedOperationException. + */ + ExtractionResult extractOnly( + InputStream inputStream, ExtractionRequest request, String extractFormat, String xpathExpr) + throws Exception; + + /** + * Parse the content and stream SAX events into the provided SolrContentHandler, while also + * filling outMetadata with extracted metadata. + */ + void parseToSolrContentHandler( + InputStream inputStream, + ExtractionRequest request, + SolrContentHandler handler, + ExtractionMetadata outMetadata) + throws Exception; + /** A short name for debugging/config, e.g., "local" or "dummy". 
*/ String name(); } diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionRequest.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionRequest.java index 4a72e89e4b0..f1af3029193 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionRequest.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionRequest.java @@ -26,6 +26,8 @@ public class ExtractionRequest { public final String streamSourceInfo; public final Long streamSize; public final String resourcePassword; // optional password for encrypted docs + public final java.util.LinkedHashMap + passwordsMap; // optional passwords map public ExtractionRequest( String streamType, @@ -35,7 +37,8 @@ public ExtractionRequest( String streamName, String streamSourceInfo, Long streamSize, - String resourcePassword) { + String resourcePassword, + java.util.LinkedHashMap passwordsMap) { this.streamType = streamType; this.resourceName = resourceName; this.contentType = contentType; @@ -44,5 +47,6 @@ public ExtractionRequest( this.streamSourceInfo = streamSourceInfo; this.streamSize = streamSize; this.resourcePassword = resourcePassword; + this.passwordsMap = passwordsMap; } } diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/LocalTikaExtractionBackend.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/LocalTikaExtractionBackend.java index 85fef5b7252..315a582ea2a 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/LocalTikaExtractionBackend.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/LocalTikaExtractionBackend.java @@ -17,7 +17,9 @@ package org.apache.solr.handler.extraction; import java.io.InputStream; +import java.nio.file.Path; import java.util.Locale; +import org.apache.solr.core.SolrCore; import org.apache.tika.config.TikaConfig; import 
org.apache.tika.metadata.HttpHeaders; import org.apache.tika.metadata.Metadata; @@ -40,79 +42,190 @@ public class LocalTikaExtractionBackend implements ExtractionBackend { private final ParseContextConfig parseContextConfig; private final AutoDetectParser autoDetectParser; + // Local HtmlMapper moved from ExtractingDocumentLoader + private static class MostlyPassthroughHtmlMapper implements HtmlMapper { + static final HtmlMapper INSTANCE = new MostlyPassthroughHtmlMapper(); + + @Override + public boolean isDiscardElement(String name) { + return false; + } + + @Override + public String mapSafeAttribute(String elementName, String attributeName) { + return attributeName.toLowerCase(java.util.Locale.ENGLISH); + } + + @Override + public String mapSafeElement(String name) { + String lowerName = name.toLowerCase(java.util.Locale.ROOT); + return (lowerName.equals("br") || lowerName.equals("body")) ? null : lowerName; + } + } + public LocalTikaExtractionBackend(TikaConfig config, ParseContextConfig parseContextConfig) { this.tikaConfig = config; this.parseContextConfig = parseContextConfig; this.autoDetectParser = new AutoDetectParser(config); } + /** + * Construct backend by loading TikaConfig based on handler/core configuration without exposing + * Tika types to the handler. 
+ */ + public LocalTikaExtractionBackend( + SolrCore core, String tikaConfigLoc, ParseContextConfig parseContextConfig) throws Exception { + TikaConfig cfg; + if (tikaConfigLoc == null) { // default + ClassLoader classLoader = core.getResourceLoader().getClassLoader(); + try (InputStream is = classLoader.getResourceAsStream("solr-default-tika-config.xml")) { + cfg = new TikaConfig(is); + } + } else { + Path configFile = Path.of(tikaConfigLoc); + if (configFile.isAbsolute()) { + cfg = new TikaConfig(configFile); + } else { // in conf/ + try (InputStream is = core.getResourceLoader().openResource(tikaConfigLoc)) { + cfg = new TikaConfig(is); + } + } + } + this.tikaConfig = cfg; + this.parseContextConfig = parseContextConfig; + this.autoDetectParser = new AutoDetectParser(cfg); + } + @Override public String name() { return "local"; } - @Override - public ExtractionResult extract(InputStream inputStream, ExtractionRequest request) - throws Exception { - Parser parser = null; + private Parser selectParser(ExtractionRequest request) { if (request.streamType != null) { MediaType mt = MediaType.parse(request.streamType.trim().toLowerCase(Locale.ROOT)); - parser = new DefaultParser(tikaConfig.getMediaTypeRegistry()).getParsers().get(mt); - } else { - parser = autoDetectParser; - } - if (parser == null) { - throw new IllegalArgumentException("No Tika parser for stream type: " + request.streamType); + return new DefaultParser(tikaConfig.getMediaTypeRegistry()).getParsers().get(mt); } + return autoDetectParser; + } + private Metadata buildMetadata(ExtractionRequest request) { Metadata md = new Metadata(); - if (request.resourceName != null) { + if (request.resourceName != null) md.add(TikaMetadataKeys.RESOURCE_NAME_KEY, request.resourceName); - } - if (request.contentType != null) { - md.add(HttpHeaders.CONTENT_TYPE, request.contentType); - } - if (request.streamName != null) { + if (request.contentType != null) md.add(HttpHeaders.CONTENT_TYPE, request.contentType); + if 
(request.streamName != null) md.add(ExtractingMetadataConstants.STREAM_NAME, request.streamName); - } - if (request.streamSourceInfo != null) { + if (request.streamSourceInfo != null) md.add(ExtractingMetadataConstants.STREAM_SOURCE_INFO, request.streamSourceInfo); - } - if (request.streamSize != null) { + if (request.streamSize != null) md.add(ExtractingMetadataConstants.STREAM_SIZE, String.valueOf(request.streamSize)); - } - if (request.contentType != null) { + if (request.contentType != null) md.add(ExtractingMetadataConstants.STREAM_CONTENT_TYPE, request.contentType); - } - if (request.charset != null) { - md.add(HttpHeaders.CONTENT_ENCODING, request.charset); - } + if (request.charset != null) md.add(HttpHeaders.CONTENT_ENCODING, request.charset); + return md; + } + private ParseContext buildContext(Parser parser, ExtractionRequest request) { ParseContext context = parseContextConfig.create(); context.set(Parser.class, parser); - context.set(HtmlMapper.class, ExtractingDocumentLoader.MostlyPassthroughHtmlMapper.INSTANCE); + context.set(HtmlMapper.class, MostlyPassthroughHtmlMapper.INSTANCE); + PasswordProvider pwd = new RegexRulesPasswordProvider(); + if (request.resourcePassword != null && pwd instanceof RegexRulesPasswordProvider) { + ((RegexRulesPasswordProvider) pwd).setExplicitPassword(request.resourcePassword); + } + if (request.passwordsMap != null && pwd instanceof RegexRulesPasswordProvider) { + ((RegexRulesPasswordProvider) pwd).setPasswordMap(request.passwordsMap); + } + context.set(PasswordProvider.class, pwd); + return context; + } - // Password handling: allow passing explicit and map via params in future if needed. 
- PasswordProvider epp = new RegexRulesPasswordProvider(); - if (request.resourcePassword != null && epp instanceof RegexRulesPasswordProvider) { - ((RegexRulesPasswordProvider) epp).setExplicitPassword(request.resourcePassword); + private static ExtractionMetadata copyToNeutral(Metadata md) { + ExtractionMetadata out = new SimpleExtractionMetadata(); + for (String name : md.names()) { + String[] vals = md.getValues(name); + if (vals != null) for (String v : vals) out.add(name, v); } - context.set(PasswordProvider.class, epp); + return out; + } + @Override + public ExtractionResult extract(InputStream inputStream, ExtractionRequest request) + throws Exception { + Parser parser = selectParser(request); + if (parser == null) { + throw new IllegalArgumentException("No Tika parser for stream type: " + request.streamType); + } + Metadata md = buildMetadata(request); + ParseContext context = buildContext(parser, request); BodyContentHandler textHandler = new BodyContentHandler(-1); parser.parse(inputStream, textHandler, md, context); + return new ExtractionResult(textHandler.toString(), copyToNeutral(md)); + } + + @Override + public ExtractionResult extractOnly( + InputStream inputStream, ExtractionRequest request, String extractFormat, String xpathExpr) + throws Exception { + Parser parser = selectParser(request); + if (parser == null) { + throw new IllegalArgumentException("No Tika parser for stream type: " + request.streamType); + } + Metadata md = buildMetadata(request); + ParseContext context = buildContext(parser, request); + + String content; + if (ExtractingDocumentLoader.TEXT_FORMAT.equals(extractFormat) || xpathExpr != null) { + org.apache.tika.sax.ToTextContentHandler textHandler = + new org.apache.tika.sax.ToTextContentHandler(); + org.xml.sax.ContentHandler ch = textHandler; + if (xpathExpr != null) { + org.apache.tika.sax.xpath.XPathParser xparser = + new org.apache.tika.sax.xpath.XPathParser( + "xhtml", org.apache.tika.sax.XHTMLContentHandler.XHTML); + 
org.apache.tika.sax.xpath.Matcher matcher = xparser.parse(xpathExpr); + ch = new org.apache.tika.sax.xpath.MatchingContentHandler(textHandler, matcher); + } + parser.parse(inputStream, ch, md, context); + content = textHandler.toString(); + } else { // XML format + org.apache.tika.sax.ToXMLContentHandler toXml = new org.apache.tika.sax.ToXMLContentHandler(); + org.xml.sax.ContentHandler ch = toXml; + if (xpathExpr != null) { + org.apache.tika.sax.xpath.XPathParser xparser = + new org.apache.tika.sax.xpath.XPathParser( + "xhtml", org.apache.tika.sax.XHTMLContentHandler.XHTML); + org.apache.tika.sax.xpath.Matcher matcher = xparser.parse(xpathExpr); + ch = new org.apache.tika.sax.xpath.MatchingContentHandler(toXml, matcher); + } + parser.parse(inputStream, ch, md, context); + content = toXml.toString(); + if (!content.startsWith("\n" + content; + } + } + return new ExtractionResult(content, copyToNeutral(md)); + } - // copy metadata to neutral container - ExtractionMetadata outMetadata = new SimpleExtractionMetadata(); + @Override + public void parseToSolrContentHandler( + InputStream inputStream, + ExtractionRequest request, + SolrContentHandler handler, + ExtractionMetadata outMetadata) + throws Exception { + Parser parser = selectParser(request); + if (parser == null) { + throw new IllegalArgumentException("No Tika parser for stream type: " + request.streamType); + } + Metadata md = buildMetadata(request); + ParseContext context = buildContext(parser, request); + parser.parse(inputStream, handler, md, context); + // populate outMetadata for (String name : md.names()) { String[] vals = md.getValues(name); - if (vals != null) { - for (String v : vals) { - outMetadata.add(name, v); - } - } + if (vals != null) for (String v : vals) outMetadata.add(name, v); } - String content = textHandler.toString(); - return new ExtractionResult(content, outMetadata); } } diff --git 
a/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java index fa23833d918..acff92e1071 100644 --- a/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java +++ b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java @@ -796,8 +796,6 @@ public void testArabicPDF() throws Exception { assertQ(req("wdf_nocase:السلم"), "//result[@numFound=1]"); } - - public void testLiteralsOverride() throws Exception { ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract"); diff --git a/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/LocalTikaExtractionBackendTest.java b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/LocalTikaExtractionBackendTest.java index 4110713ea66..df365f2bedf 100644 --- a/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/LocalTikaExtractionBackendTest.java +++ b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/LocalTikaExtractionBackendTest.java @@ -31,9 +31,10 @@ public class LocalTikaExtractionBackendTest extends SolrTestCaseJ4 { @BeforeClass public static void setupClass() throws Exception { - try (InputStream is = LocalTikaExtractionBackendTest.class - .getClassLoader() - .getResourceAsStream("solr-default-tika-config.xml")) { + try (InputStream is = + LocalTikaExtractionBackendTest.class + .getClassLoader() + .getResourceAsStream("solr-default-tika-config.xml")) { assertNotNull("solr-default-tika-config.xml not on classpath", is); tikaConfig = new TikaConfig(is); } @@ -61,7 +62,8 @@ private ExtractionRequest newRequest( streamName, streamSourceInfo, streamSize, - resourcePassword); + resourcePassword, + null); } @Test @@ -69,29 +71,31 @@ public void testWrongStreamTypeThrows() throws 
Exception { LocalTikaExtractionBackend backend = newBackend(); try (InputStream in = Files.newInputStream(getFile("extraction/version_control.txt"))) { // Non-existing type -> no parser available - ExtractionRequest req = newRequest( - "version_control.txt", - "foo/bar", - null, - null, - "version_control.txt", - null, - null, - null); + ExtractionRequest req = + newRequest( + "version_control.txt", + "foo/bar", + null, + null, + "version_control.txt", + null, + null, + null); expectThrows(IllegalArgumentException.class, () -> backend.extract(in, req)); } try (InputStream in = Files.newInputStream(getFile("extraction/version_control.txt"))) { // Wrong but existing type -> likely to fail when parsing - ExtractionRequest req = newRequest( - "version_control.txt", - "application/pdf", - null, - null, - "version_control.txt", - null, - null, - null); + ExtractionRequest req = + newRequest( + "version_control.txt", + "application/pdf", + null, + null, + "version_control.txt", + null, + null, + null); expectThrows(Exception.class, () -> backend.extract(in, req)); } } @@ -100,15 +104,16 @@ public void testWrongStreamTypeThrows() throws Exception { public void testPasswordProtectedDocxWithoutPasswordThrows() throws Exception { LocalTikaExtractionBackend backend = newBackend(); try (InputStream in = Files.newInputStream(getFile("extraction/password-is-Word2010.docx"))) { - ExtractionRequest req = newRequest( - "password-is-Word2010.docx", - null, - null, - null, - "password-is-Word2010.docx", - null, - null, - null); + ExtractionRequest req = + newRequest( + "password-is-Word2010.docx", + null, + null, + null, + "password-is-Word2010.docx", + null, + null, + null); expectThrows(Exception.class, () -> backend.extract(in, req)); } } @@ -117,21 +122,23 @@ public void testPasswordProtectedDocxWithoutPasswordThrows() throws Exception { public void testPasswordProtectedDocxWithPasswordSucceeds() throws Exception { LocalTikaExtractionBackend backend = newBackend(); try 
(InputStream in = Files.newInputStream(getFile("extraction/password-is-Word2010.docx"))) { - ExtractionRequest req = newRequest( - "password-is-Word2010.docx", - null, - null, - null, - "password-is-Word2010.docx", - null, - null, - "Word2010"); + ExtractionRequest req = + newRequest( + "password-is-Word2010.docx", + null, + null, + null, + "password-is-Word2010.docx", + null, + null, + "Word2010"); ExtractionResult res = backend.extract(in, req); assertNotNull(res); assertNotNull(res.getMetadata()); String content = res.getContent(); assertNotNull(content); - assertTrue("Content should mention password-protected doc text", + assertTrue( + "Content should mention password-protected doc text", content.contains("Test password protected word doc")); } } From 5a19251c1f39e8cfe2b3220df4dc875728bcc0e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=C3=B8ydahl?= Date: Fri, 19 Sep 2025 15:53:29 +0200 Subject: [PATCH 04/47] Use a factory to create the backend to keep it DRY --- .../extraction/ExtractingRequestHandler.java | 57 +++++---------- .../extraction/ExtractionBackendFactory.java | 72 +++++++++++++++++++ 2 files changed, 90 insertions(+), 39 deletions(-) create mode 100644 solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackendFactory.java diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java index 6250601d6b1..5f1b6f2be3f 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java @@ -41,7 +41,8 @@ public class ExtractingRequestHandler extends ContentStreamHandlerBase protected ParseContextConfig parseContextConfig; protected SolrContentHandlerFactory factory; - protected ExtractionBackend backend; + protected 
ExtractionBackendFactory backendFactory; + protected String defaultBackendName; @Override public PermissionNameProvider.Name getPermissionName(AuthorizationContext request) { @@ -61,55 +62,33 @@ public void inform(SolrCore core) { parseContextConfig = new ParseContextConfig(core.getResourceLoader(), parseContextConfigLoc); } - } catch (Exception e) { - throw new SolrException( - ErrorCode.SERVER_ERROR, "Unable to initialize ExtractingRequestHandler", e); - } - factory = createFactory(); + // Initialize backend factory once; backends are created lazily on demand + backendFactory = new ExtractionBackendFactory(core, tikaConfigLoc, parseContextConfig); + + // Choose default backend name (do not instantiate yet) + String backendName = (String) initArgs.get("extraction.backend"); + defaultBackendName = + (backendName == null || backendName.trim().isEmpty()) ? "local" : backendName; - // Choose backend implementation - String backendName = (String) initArgs.get("extraction.backend"); - try { - if (backendName == null - || backendName.trim().isEmpty() - || backendName.equalsIgnoreCase("local")) { - backend = new LocalTikaExtractionBackend(core, tikaConfigLoc, parseContextConfig); - } else if (backendName.equalsIgnoreCase("dummy")) { - backend = new DummyExtractionBackend(); - } else { - // Fallback to local if unknown - backend = new LocalTikaExtractionBackend(core, tikaConfigLoc, parseContextConfig); - } } catch (Exception e) { - throw new SolrException(ErrorCode.SERVER_ERROR, "Unable to initialize extraction backend", e); + throw new SolrException( + ErrorCode.SERVER_ERROR, "Unable to initialize ExtractingRequestHandler", e); } - } - protected SolrContentHandlerFactory createFactory() { - return new SolrContentHandlerFactory(); + factory = new SolrContentHandlerFactory(); } @Override protected ContentStreamLoader newLoader(SolrQueryRequest req, UpdateRequestProcessor processor) { // Allow per-request override of backend via request param "extraction.backend" - 
ExtractionBackend backendToUse = this.backend; String backendParam = req.getParams().get("extraction.backend"); - if (backendParam != null) { - if (backendParam.equalsIgnoreCase("dummy")) { - backendToUse = new DummyExtractionBackend(); - } else if (backendParam.equalsIgnoreCase("local")) { - try { - backendToUse = - new LocalTikaExtractionBackend(req.getCore(), tikaConfigLoc, parseContextConfig); - } catch (Exception e) { - throw new SolrException( - ErrorCode.SERVER_ERROR, "Unable to initialize extraction backend", e); - } - } - // unknown values fall back to the handler-configured backend - } - return new ExtractingDocumentLoader(req, processor, factory, backendToUse); + String nameToUse = + (backendParam != null && !backendParam.trim().isEmpty()) + ? backendParam + : defaultBackendName; + ExtractionBackend extractionBackend = backendFactory.getBackend(nameToUse); + return new ExtractingDocumentLoader(req, processor, factory, extractionBackend); } // ////////////////////// SolrInfoMBeans methods ////////////////////// diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackendFactory.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackendFactory.java new file mode 100644 index 00000000000..234a6064c62 --- /dev/null +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackendFactory.java @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.handler.extraction; + +import java.util.Locale; +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; +import org.apache.solr.core.SolrCore; + +/** + * Factory for ExtractionBackend instances. Lazily constructs backends by short name (e.g., "local", + * "dummy") and caches them for reuse. + */ +public class ExtractionBackendFactory { + private final SolrCore core; + private final String tikaConfigLoc; + private final ParseContextConfig parseContextConfig; + private final Map cache = new ConcurrentHashMap<>(); + + public ExtractionBackendFactory( + SolrCore core, String tikaConfigLoc, ParseContextConfig parseContextConfig) { + this.core = core; + this.tikaConfigLoc = tikaConfigLoc; + this.parseContextConfig = parseContextConfig; + } + + /** Returns a backend instance for the given name, creating it if necessary. */ + public ExtractionBackend getBackend(String name) { + String key = normalize(name); + return cache.computeIfAbsent( + key, + k -> { + try { + return create(k); + } catch (Exception e) { + throw new RuntimeException("Failed to create extraction backend '" + k + "'", e); + } + }); + } + + private String normalize(String name) { + if (name == null || name.trim().isEmpty()) return "local"; + return name.trim().toLowerCase(Locale.ROOT); + } + + /** Creates a new backend instance for the given normalized name. 
*/ + protected ExtractionBackend create(String normalizedName) throws Exception { + switch (normalizedName) { + case "local": + return new LocalTikaExtractionBackend(core, tikaConfigLoc, parseContextConfig); + case "dummy": + return new DummyExtractionBackend(); + default: + // Fallback to local for unknown names + return new LocalTikaExtractionBackend(core, tikaConfigLoc, parseContextConfig); + } + } +} From 35fef11f3c78b48d6965f0c05b6bc21fd9719e7c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=C3=B8ydahl?= Date: Fri, 19 Sep 2025 16:25:27 +0200 Subject: [PATCH 05/47] Add TikaServerExtractionBackend --- .../extraction/DummyExtractionBackend.java | 3 +- .../handler/extraction/ExtractingParams.java | 3 + .../extraction/ExtractingRequestHandler.java | 12 +- .../extraction/ExtractionBackendFactory.java | 12 +- .../LocalTikaExtractionBackend.java | 4 +- .../TikaServerExtractionBackend.java | 210 ++++++++++++++++++ .../ExtractingRequestHandlerTest.java | 4 +- .../TikaServerExtractionBackendTest.java | 199 +++++++++++++++++ 8 files changed, 434 insertions(+), 13 deletions(-) create mode 100644 solr/modules/extraction/src/java/org/apache/solr/handler/extraction/TikaServerExtractionBackend.java create mode 100644 solr/modules/extraction/src/test/org/apache/solr/handler/extraction/TikaServerExtractionBackendTest.java diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/DummyExtractionBackend.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/DummyExtractionBackend.java index c9cdf724ef2..4d3955b4b1b 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/DummyExtractionBackend.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/DummyExtractionBackend.java @@ -20,9 +20,10 @@ /** Dummy backend that emits predictable test data without actually parsing input content. 
*/ public class DummyExtractionBackend implements ExtractionBackend { + public static final String ID = "dummy"; @Override public String name() { - return "dummy"; + return ID; } @Override diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingParams.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingParams.java index a7d159678f1..840af280243 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingParams.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingParams.java @@ -136,4 +136,7 @@ public interface ExtractingParams { * .*=<defaultmypassword> at the end */ public static final String PASSWORD_MAP_FILE = "passwordsFile"; + + /** Backend selection parameter name. */ + public static final String EXTRACTION_BACKEND = "extraction.backend"; } diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java index 5f1b6f2be3f..ff4bddd0039 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java @@ -36,6 +36,7 @@ public class ExtractingRequestHandler extends ContentStreamHandlerBase public static final String PARSE_CONTEXT_CONFIG = "parseContext.config"; public static final String CONFIG_LOCATION = "tika.config"; + public static final String TIKASERVER_URL = "tikaserver.url"; protected String tikaConfigLoc; protected ParseContextConfig parseContextConfig; @@ -64,12 +65,13 @@ public void inform(SolrCore core) { } // Initialize backend factory once; backends are created lazily on demand - backendFactory = new ExtractionBackendFactory(core, tikaConfigLoc, parseContextConfig); + String tikaServerUrl = (String) initArgs.get(TIKASERVER_URL);
+ backendFactory = new ExtractionBackendFactory(core, tikaConfigLoc, parseContextConfig, tikaServerUrl); // Choose default backend name (do not instantiate yet) - String backendName = (String) initArgs.get("extraction.backend"); + String backendName = (String) initArgs.get(ExtractingParams.EXTRACTION_BACKEND); defaultBackendName = - (backendName == null || backendName.trim().isEmpty()) ? "local" : backendName; + (backendName == null || backendName.trim().isEmpty()) ? LocalTikaExtractionBackend.ID : backendName; } catch (Exception e) { throw new SolrException( @@ -81,8 +83,8 @@ public void inform(SolrCore core) { @Override protected ContentStreamLoader newLoader(SolrQueryRequest req, UpdateRequestProcessor processor) { - // Allow per-request override of backend via request param "extraction.backend" - String backendParam = req.getParams().get("extraction.backend"); + // Allow per-request override of backend via request param + String backendParam = req.getParams().get(ExtractingParams.EXTRACTION_BACKEND); String nameToUse = (backendParam != null && !backendParam.trim().isEmpty()) ? 
backendParam diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackendFactory.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackendFactory.java index 234a6064c62..558e5cd7f72 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackendFactory.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackendFactory.java @@ -29,13 +29,15 @@ public class ExtractionBackendFactory { private final SolrCore core; private final String tikaConfigLoc; private final ParseContextConfig parseContextConfig; + private final String tikaServerUrl; private final Map cache = new ConcurrentHashMap<>(); public ExtractionBackendFactory( - SolrCore core, String tikaConfigLoc, ParseContextConfig parseContextConfig) { + SolrCore core, String tikaConfigLoc, ParseContextConfig parseContextConfig, String tikaServerUrl) { this.core = core; this.tikaConfigLoc = tikaConfigLoc; this.parseContextConfig = parseContextConfig; + this.tikaServerUrl = tikaServerUrl; } /** Returns a backend instance for the given name, creating it if necessary. */ @@ -53,17 +55,19 @@ public ExtractionBackend getBackend(String name) { } private String normalize(String name) { - if (name == null || name.trim().isEmpty()) return "local"; + if (name == null || name.trim().isEmpty()) return LocalTikaExtractionBackend.ID; return name.trim().toLowerCase(Locale.ROOT); } /** Creates a new backend instance for the given normalized name. */ protected ExtractionBackend create(String normalizedName) throws Exception { switch (normalizedName) { - case "local": + case LocalTikaExtractionBackend.ID: return new LocalTikaExtractionBackend(core, tikaConfigLoc, parseContextConfig); - case "dummy": + case DummyExtractionBackend.ID: return new DummyExtractionBackend(); + case TikaServerExtractionBackend.ID: + return new TikaServerExtractionBackend(tikaServerUrl != null ? 
tikaServerUrl : "http://localhost:9998"); default: // Fallback to local for unknown names return new LocalTikaExtractionBackend(core, tikaConfigLoc, parseContextConfig); diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/LocalTikaExtractionBackend.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/LocalTikaExtractionBackend.java index 315a582ea2a..687f0e6cc1e 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/LocalTikaExtractionBackend.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/LocalTikaExtractionBackend.java @@ -96,9 +96,11 @@ public LocalTikaExtractionBackend( this.autoDetectParser = new AutoDetectParser(cfg); } + public static final String ID = "local"; + @Override public String name() { - return "local"; + return ID; } private Parser selectParser(ExtractionRequest request) { diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/TikaServerExtractionBackend.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/TikaServerExtractionBackend.java new file mode 100644 index 00000000000..33ac66e7d86 --- /dev/null +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/TikaServerExtractionBackend.java @@ -0,0 +1,210 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.handler.extraction; + +import java.io.IOException; +import java.io.InputStream; +import java.net.URI; +import java.net.http.HttpClient; +import java.net.http.HttpRequest; +import java.net.http.HttpResponse; +import java.nio.charset.StandardCharsets; +import java.time.Duration; +import org.noggit.JSONParser; + +/** + * Extraction backend that delegates parsing to a remote Apache Tika Server. + * + *

This backend uses Java 11 HttpClient to call Tika Server endpoints. It supports + * backend-neutral extract() and extractOnly() operations. Legacy SAX-based parsing + * is not supported and will throw UnsupportedOperationException. + */ +public class TikaServerExtractionBackend implements ExtractionBackend { + private final HttpClient httpClient; + private final String baseUrl; // e.g., http://localhost:9998 + private final Duration timeout = Duration.ofSeconds(30); + + public TikaServerExtractionBackend(String baseUrl) { + this(HttpClient.newBuilder().connectTimeout(Duration.ofSeconds(10)).build(), baseUrl); + } + + // Visible for tests + TikaServerExtractionBackend(HttpClient httpClient, String baseUrl) { + if (baseUrl.endsWith("/")) { + this.baseUrl = baseUrl.substring(0, baseUrl.length() - 1); + } else { + this.baseUrl = baseUrl; + } + this.httpClient = httpClient; + } + + public static final String ID = "tikaserver"; + + @Override + public String name() { + return ID; + } + + @Override + public ExtractionResult extract(InputStream inputStream, ExtractionRequest request) + throws Exception { + // 1) Extract text + String text = requestText(inputStream, request, false, null); + + // 2) Fetch metadata as JSON and convert to neutral metadata + ExtractionMetadata md = fetchMetadata(request); + + return new ExtractionResult(text, md); + } + + @Override + public ExtractionResult extractOnly( + InputStream inputStream, ExtractionRequest request, String extractFormat, String xpathExpr) + throws Exception { + if (xpathExpr != null) { + throw new UnsupportedOperationException("XPath filtering is not supported by TikaServer backend"); + } + boolean wantXml = !ExtractingDocumentLoader.TEXT_FORMAT.equalsIgnoreCase(extractFormat); + String content = requestText(inputStream, request, wantXml, xpathExpr); + ExtractionMetadata md = fetchMetadata(request); + return new ExtractionResult(content, md); + } + + @Override + public void parseToSolrContentHandler( + InputStream 
inputStream, + ExtractionRequest request, + SolrContentHandler handler, + ExtractionMetadata outMetadata) + throws Exception { + throw new UnsupportedOperationException( + "Legacy SAX-based parsing is not supported by TikaServer backend"); + } + + private String requestText( + InputStream inputStream, ExtractionRequest request, boolean wantXml, String xpath) + throws IOException, InterruptedException { + String url = baseUrl + "/tika"; + HttpRequest.Builder b = HttpRequest.newBuilder(URI.create(url)).timeout(timeout).POST(HttpRequest.BodyPublishers.ofInputStream(() -> inputStream)); + // Content-Type + String contentType = firstNonNull(request.streamType, request.contentType); + if (contentType != null) { + b.header("Content-Type", contentType); + } + // Filename hint + if (request.resourceName != null) { + b.header("Content-Disposition", "attachment; filename=\"" + request.resourceName + "\""); + } + // Response type + b.header("Accept", wantXml ? "application/xml" : "text/plain"); + + HttpResponse resp = httpClient.send(b.build(), HttpResponse.BodyHandlers.ofByteArray()); + int code = resp.statusCode(); + if (code < 200 || code >= 300) { + throw new IOException("TikaServer /tika returned status " + code); + } + return new String(resp.body(), StandardCharsets.UTF_8); + } + + private ExtractionMetadata fetchMetadata(ExtractionRequest request) + throws IOException, InterruptedException { + // Call /meta to get metadata. Ask JSON form; Tika Server returns application/json map. 
+ String url = baseUrl + "/meta"; + HttpRequest.Builder b = HttpRequest.newBuilder(URI.create(url)).timeout(timeout).POST(HttpRequest.BodyPublishers.noBody()); + String contentType = firstNonNull(request.streamType, request.contentType); + if (contentType != null) { + b.header("Content-Type", contentType); + } + if (request.resourceName != null) { + b.header("Content-Disposition", "attachment; filename=\"" + request.resourceName + "\""); + } + b.header("Accept", "application/json"); + + HttpResponse resp = httpClient.send(b.build(), HttpResponse.BodyHandlers.ofString(StandardCharsets.UTF_8)); + int code = resp.statusCode(); + if (code < 200 || code >= 300) { + throw new IOException("TikaServer /meta returned status " + code); + } + return parseJsonToMetadata(resp.body()); + } + + private static String firstNonNull(String a, String b) { + return a != null ? a : b; + } + + // Parse Tika Server metadata JSON using Noggit JSONParser. Supports values as strings, + // arrays of strings, and basic scalars (numbers/booleans) which are coerced to String. 
+ private static ExtractionMetadata parseJsonToMetadata(String json) { + SimpleExtractionMetadata md = new SimpleExtractionMetadata(); + if (json == null) return md; + try { + JSONParser p = new JSONParser(json); + int ev = p.nextEvent(); + if (ev != JSONParser.OBJECT_START) { + return md; + } + String currentKey = null; + while (true) { + ev = p.nextEvent(); + if (ev == JSONParser.OBJECT_END || ev == JSONParser.EOF) { + break; + } + if (ev == JSONParser.STRING && p.wasKey()) { + currentKey = p.getString(); + // Next event is the value for this key + ev = p.nextEvent(); + if (ev == JSONParser.STRING) { + md.add(currentKey, p.getString()); + } else if (ev == JSONParser.ARRAY_START) { + // Read array elements + while (true) { + ev = p.nextEvent(); + if (ev == JSONParser.ARRAY_END) break; + if (ev == JSONParser.STRING) { + md.add(currentKey, p.getString()); + } else if (ev == JSONParser.LONG + || ev == JSONParser.NUMBER + || ev == JSONParser.BIGNUMBER) { + md.add(currentKey, p.getNumberChars().toString()); + } else if (ev == JSONParser.BOOLEAN) { + md.add(currentKey, String.valueOf(p.getBoolean())); + } else if (ev == JSONParser.NULL) { + // ignore nulls + } else { + // skip nested objects or unsupported types within arrays + } + } + } else if (ev == JSONParser.LONG + || ev == JSONParser.NUMBER + || ev == JSONParser.BIGNUMBER) { + md.add(currentKey, p.getNumberChars().toString()); + } else if (ev == JSONParser.BOOLEAN) { + md.add(currentKey, String.valueOf(p.getBoolean())); + } else if (ev == JSONParser.NULL) { + // ignore nulls + } else { + // skip nested objects or unsupported value types + } + } + } + } catch (java.io.IOException ioe) { + // Fall back to empty metadata on parsing error + return md; + } + return md; + } +} diff --git a/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java index 
acff92e1071..c7098665027 100644 --- a/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java +++ b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java @@ -1080,8 +1080,8 @@ public void testDummyBackendExtractOnly() throws Exception { SolrQueryResponse rsp = loadLocal( "extraction/version_control.txt", - "extraction.backend", - "dummy", + ExtractingParams.EXTRACTION_BACKEND, + DummyExtractionBackend.ID, ExtractingParams.EXTRACT_ONLY, "true", ExtractingParams.EXTRACT_FORMAT, diff --git a/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/TikaServerExtractionBackendTest.java b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/TikaServerExtractionBackendTest.java new file mode 100644 index 00000000000..19846da1142 --- /dev/null +++ b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/TikaServerExtractionBackendTest.java @@ -0,0 +1,199 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.solr.handler.extraction; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.net.CookieHandler; +import java.net.ProxySelector; +import java.net.URI; +import java.net.http.HttpClient; +import java.net.http.HttpHeaders; +import java.net.http.HttpRequest; +import java.net.http.HttpResponse; +import java.security.SecureRandom; +import java.time.Duration; +import java.util.Optional; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.Executor; +import javax.net.ssl.SSLContext; +import javax.net.ssl.SSLParameters; +import org.apache.solr.SolrTestCaseJ4; +import org.junit.Test; + +/** Unit tests for TikaServerExtractionBackend using a mocked HttpClient (no networking). */ +public class TikaServerExtractionBackendTest extends SolrTestCaseJ4 { + + static { + // Allow the SecureRandom algorithm used in this environment to avoid class configuration + // failure in tests. + System.setProperty("test.solr.allowed.securerandom", "NativePRNG"); + } + + private static class FakeHttpClient extends HttpClient { + @Override + public Optional cookieHandler() { return Optional.empty(); } + + @Override + public Optional connectTimeout() { return Optional.of(Duration.ofSeconds(5)); } + + @Override + public Redirect followRedirects() { return Redirect.NEVER; } + + @Override + public Optional proxy() { return Optional.empty(); } + + @Override + public SSLContext sslContext() { try { return SSLContext.getDefault(); } catch (Exception e) { throw new RuntimeException(e);} } + + @Override + public SSLParameters sslParameters() { return new SSLParameters(); } + + @Override + public Optional executor() { return Optional.empty(); } + + @Override + public Optional authenticator() { return Optional.empty(); } + + @Override + public Version version() { return Version.HTTP_1_1; } + + @Override + public HttpResponse send(HttpRequest request, HttpResponse.BodyHandler responseBodyHandler) + 
throws IOException, InterruptedException { + return respond(request, responseBodyHandler); + } + + @Override + public CompletableFuture> sendAsync(HttpRequest request, HttpResponse.BodyHandler responseBodyHandler) { + return CompletableFuture.completedFuture(respond(request, responseBodyHandler)); + } + + @Override + public CompletableFuture> sendAsync(HttpRequest request, HttpResponse.BodyHandler responseBodyHandler, HttpResponse.PushPromiseHandler pushPromiseHandler) { + return CompletableFuture.completedFuture(respond(request, responseBodyHandler)); + } + + private HttpResponse respond(HttpRequest request, HttpResponse.BodyHandler handler) { + try { + URI uri = request.uri(); + String path = uri.getPath(); + byte[] body; + String ct; + int sc = 200; + if ("/tika".equals(path)) { + String accept = request.headers().firstValue("Accept").orElse("text/plain"); + if ("application/xml".equalsIgnoreCase(accept)) { + String xml = "XML OUT"; + body = xml.getBytes(java.nio.charset.StandardCharsets.UTF_8); + ct = "application/xml"; + } else { + body = "TEXT OUT".getBytes(java.nio.charset.StandardCharsets.UTF_8); + ct = "text/plain"; + } + } else if ("/meta".equals(path)) { + String json = + "{\"Content-Type\":[\"text/plain\"],\"resourcename\":[\"test.txt\"],\"X-Parsed-By\":[\"SomeParser\"]}"; + body = json.getBytes(java.nio.charset.StandardCharsets.UTF_8); + ct = "application/json"; + } else { + body = "Not Found".getBytes(java.nio.charset.StandardCharsets.UTF_8); + sc = 404; + ct = "text/plain"; + } + final int status = sc; + final String contentType = ct; + // Decide expected body type based on endpoint (mimics our backend usage) + final Object bodyObj = + "/meta".equals(path) + ? 
new String(body, java.nio.charset.StandardCharsets.UTF_8) + : body; // /tika returns bytes + return new HttpResponse<>() { + @Override public int statusCode() { return status; } + @Override public HttpRequest request() { return request; } + @Override public Optional> previousResponse() { return Optional.empty(); } + @Override public HttpHeaders headers() { return HttpHeaders.of(java.util.Map.of("Content-Type", java.util.List.of(contentType)), (k,v)->true); } + @Override public T body() { @SuppressWarnings("unchecked") T t = (T) bodyObj; return t; } + @Override public Optional sslSession() { return Optional.empty(); } + @Override public URI uri() { return uri; } + @Override public Version version() { return Version.HTTP_1_1; } + }; + } catch (Exception e) { + throw new RuntimeException(e); + } + } + } + + private static ExtractionRequest newRequest(String resourceName, String contentType) { + return new ExtractionRequest( + contentType, // streamType + resourceName, // resourceName + contentType, // contentType + null, // charset + resourceName, // streamName + null, // sourceInfo + null, // size + null, // resourcePassword + null // passwordsMap + ); + } + + @Test + public void testExtractTextAndMetadata() throws Exception { + TikaServerExtractionBackend backend = new TikaServerExtractionBackend(new FakeHttpClient(), "http://example"); + byte[] data = "dummy".getBytes(java.nio.charset.StandardCharsets.UTF_8); + try (ByteArrayInputStream in = new ByteArrayInputStream(data)) { + ExtractionResult res = backend.extract(in, newRequest("test.txt", "text/plain")); + assertNotNull(res); + assertEquals("TEXT OUT", res.getContent()); + assertNotNull(res.getMetadata()); + assertArrayEquals(new String[] {"text/plain"}, res.getMetadata().getValues("Content-Type")); + assertArrayEquals(new String[] {"test.txt"}, res.getMetadata().getValues("resourcename")); + } + } + + @Test + public void testExtractOnlyXml() throws Exception { + TikaServerExtractionBackend backend = new 
TikaServerExtractionBackend(new FakeHttpClient(), "http://example"); + byte[] data = "dummy".getBytes(java.nio.charset.StandardCharsets.UTF_8); + try (ByteArrayInputStream in = new ByteArrayInputStream(data)) { + ExtractionResult res = + backend.extractOnly( + in, newRequest("test.txt", "text/plain"), ExtractingDocumentLoader.XML_FORMAT, null); + assertNotNull(res); + assertTrue(res.getContent().contains(" + backend.parseToSolrContentHandler( + in, + newRequest("test.txt", "text/plain"), + new SolrContentHandler(new SimpleExtractionMetadata(), params(), null), + new SimpleExtractionMetadata())); + } + } +} From 196dcdc1ca7c8d435f1f4d8f30a0df5e46cf6466 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=C3=B8ydahl?= Date: Fri, 19 Sep 2025 16:58:50 +0200 Subject: [PATCH 06/47] Change testing to use TestContainers --- gradle/libs.versions.toml | 2 + solr/licenses/docker-java-LICENSE-ASL.txt | 176 +++++++++++++++ solr/licenses/docker-java-NOTICE.txt | 7 + solr/licenses/docker-java-api-3.4.0.jar.sha1 | 1 + .../docker-java-transport-3.4.0.jar.sha1 | 1 + ...cker-java-transport-zerodep-3.4.0.jar.sha1 | 1 + solr/licenses/duct-tape-1.0.8.jar.sha1 | 1 + solr/licenses/duct-tape-LICENSE-MIT.txt | 19 ++ solr/licenses/jna-5.13.0.jar.sha1 | 1 + solr/licenses/testcontainers-1.20.4.jar.sha1 | 1 + solr/licenses/testcontainers-LICENSE-MIT.txt | 19 ++ solr/modules/extraction/build.gradle | 5 +- solr/modules/extraction/gradle.lockfile | 9 +- .../extraction/DummyExtractionBackend.java | 1 + .../extraction/ExtractingRequestHandler.java | 7 +- .../extraction/ExtractionBackendFactory.java | 8 +- .../TikaServerExtractionBackend.java | 47 ++-- .../TikaServerExtractionBackendTest.java | 205 ++++++++---------- 18 files changed, 374 insertions(+), 137 deletions(-) create mode 100644 solr/licenses/docker-java-LICENSE-ASL.txt create mode 100644 solr/licenses/docker-java-NOTICE.txt create mode 100644 solr/licenses/docker-java-api-3.4.0.jar.sha1 create mode 100644 
solr/licenses/docker-java-transport-3.4.0.jar.sha1 create mode 100644 solr/licenses/docker-java-transport-zerodep-3.4.0.jar.sha1 create mode 100644 solr/licenses/duct-tape-1.0.8.jar.sha1 create mode 100644 solr/licenses/duct-tape-LICENSE-MIT.txt create mode 100644 solr/licenses/jna-5.13.0.jar.sha1 create mode 100644 solr/licenses/testcontainers-1.20.4.jar.sha1 create mode 100644 solr/licenses/testcontainers-LICENSE-MIT.txt diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index ea14d91ce6b..396befbab3c 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -194,6 +194,7 @@ squareup-okhttp3-okhttp = "4.12.0" stephenc-jcip = "1.0-1" swagger3 = "2.2.22" tdunning-tdigest = "3.3" +testcontainers = "1.20.4" thetaphi-forbiddenapis = "3.9" thisptr-jacksonjq = "0.0.13" threeten-bp = "1.6.8" @@ -512,6 +513,7 @@ stephenc-jcip-annotations = { module = "com.github.stephenc.jcip:jcip-annotation swagger3-annotations-jakarta = { module = "io.swagger.core.v3:swagger-annotations-jakarta", version.ref = "swagger3" } swagger3-jaxrs2-jakarta = { module = "io.swagger.core.v3:swagger-jaxrs2-jakarta", version.ref = "swagger3" } tdunning-tdigest = { module = "com.tdunning:t-digest", version.ref = "tdunning-tdigest" } +testcontainers = { module = "org.testcontainers:testcontainers", version.ref = "testcontainers" } thisptr-jacksonjq = { module = "net.thisptr:jackson-jq", version.ref = "thisptr-jacksonjq" } threeten-bp = { module = "org.threeten:threetenbp", version.ref = "threeten-bp" } xerces-impl = { module = "xerces:xercesImpl", version.ref = "xerces" } diff --git a/solr/licenses/docker-java-LICENSE-ASL.txt b/solr/licenses/docker-java-LICENSE-ASL.txt new file mode 100644 index 00000000000..492933f08c2 --- /dev/null +++ b/solr/licenses/docker-java-LICENSE-ASL.txt @@ -0,0 +1,176 @@ +Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. 
+ +"License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + +"Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + +"Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + +"You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + +"Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + +"Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + +"Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + +"Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ +"Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + +"Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + +2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + +4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the 
following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + +5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + +6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + +8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ +END OF TERMS AND CONDITIONS diff --git a/solr/licenses/docker-java-NOTICE.txt b/solr/licenses/docker-java-NOTICE.txt new file mode 100644 index 00000000000..49a9e022cce --- /dev/null +++ b/solr/licenses/docker-java-NOTICE.txt @@ -0,0 +1,7 @@ +This product includes software developed by the docker-java project. + +Copyright (c) 2013-2025, docker-java project contributors + +Project: https://github.com/docker-java/docker-java + +Licensed under the Apache License, Version 2.0. diff --git a/solr/licenses/docker-java-api-3.4.0.jar.sha1 b/solr/licenses/docker-java-api-3.4.0.jar.sha1 new file mode 100644 index 00000000000..bf5ca0d6db4 --- /dev/null +++ b/solr/licenses/docker-java-api-3.4.0.jar.sha1 @@ -0,0 +1 @@ +9ef23dcc93693f15e69b64632be096c38e31bc44 diff --git a/solr/licenses/docker-java-transport-3.4.0.jar.sha1 b/solr/licenses/docker-java-transport-3.4.0.jar.sha1 new file mode 100644 index 00000000000..c1232d24a6b --- /dev/null +++ b/solr/licenses/docker-java-transport-3.4.0.jar.sha1 @@ -0,0 +1 @@ +c058705684d782effc4b2edfdef1a87544ba4af8 diff --git a/solr/licenses/docker-java-transport-zerodep-3.4.0.jar.sha1 b/solr/licenses/docker-java-transport-zerodep-3.4.0.jar.sha1 new file mode 100644 index 00000000000..b658f8f0810 --- /dev/null +++ b/solr/licenses/docker-java-transport-zerodep-3.4.0.jar.sha1 @@ -0,0 +1 @@ +c4ce6d8695cfdb0027872f99cc20f8f679f8a969 diff --git a/solr/licenses/duct-tape-1.0.8.jar.sha1 b/solr/licenses/duct-tape-1.0.8.jar.sha1 new file mode 100644 index 00000000000..8ccb86d64ea --- /dev/null +++ b/solr/licenses/duct-tape-1.0.8.jar.sha1 @@ -0,0 +1 @@ +92edc22a9ab2f3e17c9bf700aaee377d50e8b530 diff --git a/solr/licenses/duct-tape-LICENSE-MIT.txt b/solr/licenses/duct-tape-LICENSE-MIT.txt new file mode 100644 index 00000000000..9cf106272ac --- /dev/null +++ b/solr/licenses/duct-tape-LICENSE-MIT.txt @@ -0,0 +1,19 @@ +MIT License + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation 
files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/solr/licenses/jna-5.13.0.jar.sha1 b/solr/licenses/jna-5.13.0.jar.sha1 new file mode 100644 index 00000000000..93b456b9293 --- /dev/null +++ b/solr/licenses/jna-5.13.0.jar.sha1 @@ -0,0 +1 @@ +1200e7ebeedbe0d10062093f32925a912020e747 diff --git a/solr/licenses/testcontainers-1.20.4.jar.sha1 b/solr/licenses/testcontainers-1.20.4.jar.sha1 new file mode 100644 index 00000000000..29746a98e88 --- /dev/null +++ b/solr/licenses/testcontainers-1.20.4.jar.sha1 @@ -0,0 +1 @@ +ee2fe3afc9fa6cb2e6a43233998f3633f761692f diff --git a/solr/licenses/testcontainers-LICENSE-MIT.txt b/solr/licenses/testcontainers-LICENSE-MIT.txt new file mode 100644 index 00000000000..9cf106272ac --- /dev/null +++ b/solr/licenses/testcontainers-LICENSE-MIT.txt @@ -0,0 +1,19 @@ +MIT License + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or 
sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/solr/modules/extraction/build.gradle b/solr/modules/extraction/build.gradle index da6ebaccd68..0cbb4c0174f 100644 --- a/solr/modules/extraction/build.gradle +++ b/solr/modules/extraction/build.gradle @@ -35,11 +35,10 @@ dependencies { exclude group: 'org.quartz-scheduler', module: 'quartz' exclude group: 'xml-apis', module: 'xml-apis' }) - implementation (libs.xerces.impl, { - exclude group: 'xml-apis', module: 'xml-apis' - }) testImplementation project(':solr:test-framework') testImplementation libs.apache.lucene.testframework testImplementation libs.junit.junit + testImplementation libs.testcontainers + testImplementation libs.carrotsearch.randomizedtesting.runner } diff --git a/solr/modules/extraction/gradle.lockfile b/solr/modules/extraction/gradle.lockfile index 458aae19c39..5e498280731 100644 --- a/solr/modules/extraction/gradle.lockfile +++ b/solr/modules/extraction/gradle.lockfile @@ -15,6 +15,9 @@ com.fasterxml.jackson.module:jackson-module-jakarta-xmlbind-annotations:2.20.0=j com.fasterxml.jackson:jackson-bom:2.20.0=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,solrPlatformLibs,testCompileClasspath,testRuntimeClasspath 
com.fasterxml.woodstox:woodstox-core:7.0.0=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,solrPlatformLibs,testCompileClasspath,testRuntimeClasspath com.github.ben-manes.caffeine:caffeine:3.2.2=annotationProcessor,errorprone,jarValidation,runtimeClasspath,runtimeLibs,solrPlatformLibs,testAnnotationProcessor,testRuntimeClasspath +com.github.docker-java:docker-java-api:3.4.0=jarValidation,testCompileClasspath,testRuntimeClasspath +com.github.docker-java:docker-java-transport-zerodep:3.4.0=jarValidation,testCompileClasspath,testRuntimeClasspath +com.github.docker-java:docker-java-transport:3.4.0=jarValidation,testCompileClasspath,testRuntimeClasspath com.github.jai-imageio:jai-imageio-core:1.4.0=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,testCompileClasspath,testRuntimeClasspath com.github.junrar:junrar:7.5.3=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,testCompileClasspath,testRuntimeClasspath com.github.kevinstern:software-and-algorithms:1.0=annotationProcessor,errorprone,testAnnotationProcessor @@ -99,7 +102,8 @@ javax.inject:javax.inject:1=annotationProcessor,errorprone,testAnnotationProcess javax.measure:unit-api:1.0=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,testCompileClasspath,testRuntimeClasspath joda-time:joda-time:2.14.0=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,testCompileClasspath,testRuntimeClasspath junit:junit:4.13.2=jarValidation,testCompileClasspath,testRuntimeClasspath -net.java.dev.jna:jna:5.12.1=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,testCompileClasspath,testRuntimeClasspath +net.java.dev.jna:jna:5.12.1=compileClasspath,runtimeClasspath,runtimeLibs +net.java.dev.jna:jna:5.13.0=jarValidation,testCompileClasspath,testRuntimeClasspath net.sf.ehcache:ehcache-core:2.6.2=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,testCompileClasspath,testRuntimeClasspath 
org.antlr:antlr4-runtime:4.13.2=jarValidation,runtimeClasspath,runtimeLibs,solrPlatformLibs,testRuntimeClasspath org.apache.commons:commons-collections4:4.5.0=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,testCompileClasspath,testRuntimeClasspath @@ -215,6 +219,7 @@ org.hamcrest:hamcrest:3.0=jarValidation,testCompileClasspath,testRuntimeClasspat org.itadaki:bzip2:0.9.1=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,testCompileClasspath,testRuntimeClasspath org.javassist:javassist:3.30.2-GA=jarValidation,runtimeClasspath,runtimeLibs,solrPlatformLibs,testRuntimeClasspath org.jdom:jdom2:2.0.6.1=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,testCompileClasspath,testRuntimeClasspath +org.jetbrains:annotations:26.0.2=jarValidation,testCompileClasspath,testRuntimeClasspath org.jspecify:jspecify:1.0.0=annotationProcessor,compileClasspath,errorprone,jarValidation,runtimeClasspath,runtimeLibs,solrPlatformLibs,testAnnotationProcessor,testCompileClasspath,testRuntimeClasspath org.junit.jupiter:junit-jupiter-api:5.6.2=jarValidation,testRuntimeClasspath org.junit.platform:junit-platform-commons:1.6.2=jarValidation,testRuntimeClasspath @@ -226,6 +231,7 @@ org.ow2.asm:asm-commons:9.8=jarValidation,runtimeClasspath,runtimeLibs,solrPlatf org.ow2.asm:asm-tree:9.8=jarValidation,runtimeClasspath,runtimeLibs,solrPlatformLibs,testRuntimeClasspath org.ow2.asm:asm:9.8=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,solrPlatformLibs,testCompileClasspath,testRuntimeClasspath org.pcollections:pcollections:4.0.1=annotationProcessor,errorprone,testAnnotationProcessor +org.rnorth.duct-tape:duct-tape:1.0.8=jarValidation,testCompileClasspath,testRuntimeClasspath org.semver4j:semver4j:6.0.0=jarValidation,runtimeClasspath,runtimeLibs,solrPlatformLibs,testRuntimeClasspath org.slf4j:jcl-over-slf4j:2.0.17=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,solrPlatformLibs,testCompileClasspath,testRuntimeClasspath 
org.slf4j:jul-to-slf4j:2.0.17=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,solrPlatformLibs,testCompileClasspath,testRuntimeClasspath @@ -234,6 +240,7 @@ org.tallison.xmp:xmpcore-shaded:6.1.10=compileClasspath,jarValidation,runtimeCla org.tallison:isoparser:1.9.41.7=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,testCompileClasspath,testRuntimeClasspath org.tallison:jmatio:1.5=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,testCompileClasspath,testRuntimeClasspath org.tallison:metadata-extractor:2.17.1.0=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,testCompileClasspath,testRuntimeClasspath +org.testcontainers:testcontainers:1.20.4=jarValidation,testCompileClasspath,testRuntimeClasspath org.tukaani:xz:1.9=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,testCompileClasspath,testRuntimeClasspath org.xerial.snappy:snappy-java:1.1.10.8=jarValidation,runtimeClasspath,runtimeLibs,solrPlatformLibs,testRuntimeClasspath xerces:xercesImpl:2.12.2=compileClasspath,jarValidation,runtimeClasspath,runtimeLibs,testCompileClasspath,testRuntimeClasspath diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/DummyExtractionBackend.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/DummyExtractionBackend.java index 4d3955b4b1b..864bba00fdd 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/DummyExtractionBackend.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/DummyExtractionBackend.java @@ -21,6 +21,7 @@ /** Dummy backend that emits predictable test data without actually parsing input content. 
*/ public class DummyExtractionBackend implements ExtractionBackend { public static final String ID = "dummy"; + @Override public String name() { return ID; diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java index ff4bddd0039..224ee54f0ac 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java @@ -66,12 +66,15 @@ public void inform(SolrCore core) { // Initialize backend factory once; backends are created lazily on demand String tikaServerUrl = (String) initArgs.get(TIKASERVER_URL); - backendFactory = new ExtractionBackendFactory(core, tikaConfigLoc, parseContextConfig, tikaServerUrl); + backendFactory = + new ExtractionBackendFactory(core, tikaConfigLoc, parseContextConfig, tikaServerUrl); // Choose default backend name (do not instantiate yet) String backendName = (String) initArgs.get(ExtractingParams.EXTRACTION_BACKEND); defaultBackendName = - (backendName == null || backendName.trim().isEmpty()) ? LocalTikaExtractionBackend.ID : backendName; + (backendName == null || backendName.trim().isEmpty()) + ? 
LocalTikaExtractionBackend.ID + : backendName; } catch (Exception e) { throw new SolrException( diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackendFactory.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackendFactory.java index 558e5cd7f72..38033d8b935 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackendFactory.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackendFactory.java @@ -33,7 +33,10 @@ public class ExtractionBackendFactory { private final Map cache = new ConcurrentHashMap<>(); public ExtractionBackendFactory( - SolrCore core, String tikaConfigLoc, ParseContextConfig parseContextConfig, String tikaServerUrl) { + SolrCore core, + String tikaConfigLoc, + ParseContextConfig parseContextConfig, + String tikaServerUrl) { this.core = core; this.tikaConfigLoc = tikaConfigLoc; this.parseContextConfig = parseContextConfig; @@ -67,7 +70,8 @@ protected ExtractionBackend create(String normalizedName) throws Exception { case DummyExtractionBackend.ID: return new DummyExtractionBackend(); case TikaServerExtractionBackend.ID: - return new TikaServerExtractionBackend(tikaServerUrl != null ? tikaServerUrl : "http://localhost:9998"); + return new TikaServerExtractionBackend( + tikaServerUrl != null ? 
tikaServerUrl : "http://localhost:9998"); default: // Fallback to local for unknown names return new LocalTikaExtractionBackend(core, tikaConfigLoc, parseContextConfig); diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/TikaServerExtractionBackend.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/TikaServerExtractionBackend.java index 33ac66e7d86..c37cd1ba76c 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/TikaServerExtractionBackend.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/TikaServerExtractionBackend.java @@ -30,8 +30,8 @@ * Extraction backend that delegates parsing to a remote Apache Tika Server. * *

This backend uses Java 11 HttpClient to call Tika Server endpoints. It supports - * backend-neutral extract() and extractOnly() operations. Legacy SAX-based parsing - * is not supported and will throw UnsupportedOperationException. + * backend-neutral extract() and extractOnly() operations. Legacy SAX-based parsing is not supported + * and will throw UnsupportedOperationException. */ public class TikaServerExtractionBackend implements ExtractionBackend { private final HttpClient httpClient; @@ -62,11 +62,14 @@ public String name() { @Override public ExtractionResult extract(InputStream inputStream, ExtractionRequest request) throws Exception { + // Buffer the input so we can send it to multiple Tika Server endpoints + byte[] data = inputStream.readAllBytes(); + // 1) Extract text - String text = requestText(inputStream, request, false, null); + String text = requestText(data, request, false, null); // 2) Fetch metadata as JSON and convert to neutral metadata - ExtractionMetadata md = fetchMetadata(request); + ExtractionMetadata md = fetchMetadata(data, request); return new ExtractionResult(text, md); } @@ -76,11 +79,15 @@ public ExtractionResult extractOnly( InputStream inputStream, ExtractionRequest request, String extractFormat, String xpathExpr) throws Exception { if (xpathExpr != null) { - throw new UnsupportedOperationException("XPath filtering is not supported by TikaServer backend"); + throw new UnsupportedOperationException( + "XPath filtering is not supported by TikaServer backend"); } + // Buffer the input so we can send it to multiple Tika Server endpoints + byte[] data = inputStream.readAllBytes(); + boolean wantXml = !ExtractingDocumentLoader.TEXT_FORMAT.equalsIgnoreCase(extractFormat); - String content = requestText(inputStream, request, wantXml, xpathExpr); - ExtractionMetadata md = fetchMetadata(request); + String content = requestText(data, request, wantXml, xpathExpr); + ExtractionMetadata md = fetchMetadata(data, request); return new 
ExtractionResult(content, md); } @@ -95,11 +102,14 @@ public void parseToSolrContentHandler( "Legacy SAX-based parsing is not supported by TikaServer backend"); } - private String requestText( - InputStream inputStream, ExtractionRequest request, boolean wantXml, String xpath) + private String requestText(byte[] data, ExtractionRequest request, boolean wantXml, String xpath) throws IOException, InterruptedException { - String url = baseUrl + "/tika"; - HttpRequest.Builder b = HttpRequest.newBuilder(URI.create(url)).timeout(timeout).POST(HttpRequest.BodyPublishers.ofInputStream(() -> inputStream)); + String path = wantXml ? "/tika/xhtml" : "/tika/text"; + String url = baseUrl + path; + HttpRequest.Builder b = + HttpRequest.newBuilder(URI.create(url)) + .timeout(timeout) + .PUT(HttpRequest.BodyPublishers.ofByteArray(data)); // Content-Type String contentType = firstNonNull(request.streamType, request.contentType); if (contentType != null) { @@ -109,8 +119,7 @@ private String requestText( if (request.resourceName != null) { b.header("Content-Disposition", "attachment; filename=\"" + request.resourceName + "\""); } - // Response type - b.header("Accept", wantXml ? "application/xml" : "text/plain"); + // Do not set Accept, let server choose default representation for the endpoint HttpResponse resp = httpClient.send(b.build(), HttpResponse.BodyHandlers.ofByteArray()); int code = resp.statusCode(); @@ -120,11 +129,14 @@ private String requestText( return new String(resp.body(), StandardCharsets.UTF_8); } - private ExtractionMetadata fetchMetadata(ExtractionRequest request) + private ExtractionMetadata fetchMetadata(byte[] data, ExtractionRequest request) throws IOException, InterruptedException { - // Call /meta to get metadata. Ask JSON form; Tika Server returns application/json map. + // Call /meta to get metadata for the provided content. Ask JSON form. 
String url = baseUrl + "/meta"; - HttpRequest.Builder b = HttpRequest.newBuilder(URI.create(url)).timeout(timeout).POST(HttpRequest.BodyPublishers.noBody()); + HttpRequest.Builder b = + HttpRequest.newBuilder(URI.create(url)) + .timeout(timeout) + .PUT(HttpRequest.BodyPublishers.ofByteArray(data)); String contentType = firstNonNull(request.streamType, request.contentType); if (contentType != null) { b.header("Content-Type", contentType); @@ -134,7 +146,8 @@ private ExtractionMetadata fetchMetadata(ExtractionRequest request) } b.header("Accept", "application/json"); - HttpResponse resp = httpClient.send(b.build(), HttpResponse.BodyHandlers.ofString(StandardCharsets.UTF_8)); + HttpResponse resp = + httpClient.send(b.build(), HttpResponse.BodyHandlers.ofString(StandardCharsets.UTF_8)); int code = resp.statusCode(); if (code < 200 || code >= 300) { throw new IOException("TikaServer /meta returned status " + code); diff --git a/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/TikaServerExtractionBackendTest.java b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/TikaServerExtractionBackendTest.java index 19846da1142..5c51ccba38f 100644 --- a/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/TikaServerExtractionBackendTest.java +++ b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/TikaServerExtractionBackendTest.java @@ -16,127 +16,95 @@ */ package org.apache.solr.handler.extraction; +import com.carrotsearch.randomizedtesting.ThreadFilter; +import com.carrotsearch.randomizedtesting.annotations.ThreadLeakFilters; import java.io.ByteArrayInputStream; -import java.io.IOException; -import java.io.InputStream; -import java.net.CookieHandler; -import java.net.ProxySelector; -import java.net.URI; import java.net.http.HttpClient; -import java.net.http.HttpHeaders; -import java.net.http.HttpRequest; -import java.net.http.HttpResponse; -import java.security.SecureRandom; -import java.time.Duration; -import 
java.util.Optional; -import java.util.concurrent.CompletableFuture; -import java.util.concurrent.Executor; -import javax.net.ssl.SSLContext; -import javax.net.ssl.SSLParameters; +import java.util.concurrent.ExecutorService; +import org.apache.lucene.tests.util.QuickPatchThreadsFilter; +import org.apache.solr.SolrIgnoredThreadsFilter; import org.apache.solr.SolrTestCaseJ4; +import org.apache.solr.common.util.ExecutorUtil; +import org.junit.AfterClass; +import org.junit.Assume; +import org.junit.BeforeClass; import org.junit.Test; +import org.testcontainers.containers.GenericContainer; -/** Unit tests for TikaServerExtractionBackend using a mocked HttpClient (no networking). */ +/** + * Integration tests for TikaServerExtractionBackend using a real Tika Server via Testcontainers. + */ +@ThreadLeakFilters( + defaultFilters = true, + filters = { + SolrIgnoredThreadsFilter.class, + QuickPatchThreadsFilter.class, + TikaServerExtractionBackendTest.TestcontainersThreadsFilter.class + }) public class TikaServerExtractionBackendTest extends SolrTestCaseJ4 { + // Ignore known non-daemon threads spawned by Testcontainers and Java HttpClient in this test + public static class TestcontainersThreadsFilter implements ThreadFilter { + @Override + public boolean reject(Thread t) { + if (t == null || t.getName() == null) return false; + String n = t.getName(); + return n.startsWith("testcontainers-ryuk") + || n.startsWith("testcontainers-wait-") + || n.startsWith("HttpClient-") + || n.startsWith("HttpClient-TestContainers"); + } + } + static { // Allow the SecureRandom algorithm used in this environment to avoid class configuration // failure in tests. 
System.setProperty("test.solr.allowed.securerandom", "NativePRNG"); } - private static class FakeHttpClient extends HttpClient { - @Override - public Optional cookieHandler() { return Optional.empty(); } - - @Override - public Optional connectTimeout() { return Optional.of(Duration.ofSeconds(5)); } - - @Override - public Redirect followRedirects() { return Redirect.NEVER; } - - @Override - public Optional proxy() { return Optional.empty(); } - - @Override - public SSLContext sslContext() { try { return SSLContext.getDefault(); } catch (Exception e) { throw new RuntimeException(e);} } - - @Override - public SSLParameters sslParameters() { return new SSLParameters(); } - - @Override - public Optional executor() { return Optional.empty(); } - - @Override - public Optional authenticator() { return Optional.empty(); } - - @Override - public Version version() { return Version.HTTP_1_1; } - - @Override - public HttpResponse send(HttpRequest request, HttpResponse.BodyHandler responseBodyHandler) - throws IOException, InterruptedException { - return respond(request, responseBodyHandler); - } - - @Override - public CompletableFuture> sendAsync(HttpRequest request, HttpResponse.BodyHandler responseBodyHandler) { - return CompletableFuture.completedFuture(respond(request, responseBodyHandler)); + private static GenericContainer tika; + private static String baseUrl; + private static ExecutorService httpExec; + private static HttpClient client; + + @BeforeClass + public static void startTikaServer() { + try { + httpExec = + ExecutorUtil.newMDCAwareFixedThreadPool( + 2, + r -> { + Thread t = new Thread(r, "HttpClient-TestContainers"); + t.setDaemon(true); + return t; + }); + client = HttpClient.newBuilder().executor(httpExec).build(); + tika = new GenericContainer<>("apache/tika:3.2.3.0-full").withExposedPorts(9998); + tika.start(); + baseUrl = "http://" + tika.getHost() + ":" + tika.getMappedPort(9998); + } catch (Throwable t) { + // Skip tests if Docker/Testcontainers are not 
available in the environment + Assume.assumeNoException("Docker/Testcontainers not available; skipping TikaServer tests", t); } + } - @Override - public CompletableFuture> sendAsync(HttpRequest request, HttpResponse.BodyHandler responseBodyHandler, HttpResponse.PushPromiseHandler pushPromiseHandler) { - return CompletableFuture.completedFuture(respond(request, responseBodyHandler)); + @AfterClass + public static void stopTikaServer() { + if (tika != null) { + try { + tika.stop(); + } catch (Throwable ignore) { + } + tika = null; } - - private HttpResponse respond(HttpRequest request, HttpResponse.BodyHandler handler) { + if (httpExec != null) { try { - URI uri = request.uri(); - String path = uri.getPath(); - byte[] body; - String ct; - int sc = 200; - if ("/tika".equals(path)) { - String accept = request.headers().firstValue("Accept").orElse("text/plain"); - if ("application/xml".equalsIgnoreCase(accept)) { - String xml = "XML OUT"; - body = xml.getBytes(java.nio.charset.StandardCharsets.UTF_8); - ct = "application/xml"; - } else { - body = "TEXT OUT".getBytes(java.nio.charset.StandardCharsets.UTF_8); - ct = "text/plain"; - } - } else if ("/meta".equals(path)) { - String json = - "{\"Content-Type\":[\"text/plain\"],\"resourcename\":[\"test.txt\"],\"X-Parsed-By\":[\"SomeParser\"]}"; - body = json.getBytes(java.nio.charset.StandardCharsets.UTF_8); - ct = "application/json"; - } else { - body = "Not Found".getBytes(java.nio.charset.StandardCharsets.UTF_8); - sc = 404; - ct = "text/plain"; - } - final int status = sc; - final String contentType = ct; - // Decide expected body type based on endpoint (mimics our backend usage) - final Object bodyObj = - "/meta".equals(path) - ? 
new String(body, java.nio.charset.StandardCharsets.UTF_8) - : body; // /tika returns bytes - return new HttpResponse<>() { - @Override public int statusCode() { return status; } - @Override public HttpRequest request() { return request; } - @Override public Optional> previousResponse() { return Optional.empty(); } - @Override public HttpHeaders headers() { return HttpHeaders.of(java.util.Map.of("Content-Type", java.util.List.of(contentType)), (k,v)->true); } - @Override public T body() { @SuppressWarnings("unchecked") T t = (T) bodyObj; return t; } - @Override public Optional sslSession() { return Optional.empty(); } - @Override public URI uri() { return uri; } - @Override public Version version() { return Version.HTTP_1_1; } - }; - } catch (Exception e) { - throw new RuntimeException(e); + httpExec.shutdownNow(); + } catch (Throwable ignore) { } + httpExec = null; } + client = null; } private static ExtractionRequest newRequest(String resourceName, String contentType) { @@ -155,35 +123,48 @@ private static ExtractionRequest newRequest(String resourceName, String contentT @Test public void testExtractTextAndMetadata() throws Exception { - TikaServerExtractionBackend backend = new TikaServerExtractionBackend(new FakeHttpClient(), "http://example"); - byte[] data = "dummy".getBytes(java.nio.charset.StandardCharsets.UTF_8); + Assume.assumeTrue("Tika server container not started", tika != null); + TikaServerExtractionBackend backend = new TikaServerExtractionBackend(client, baseUrl); + byte[] data = "Hello TestContainers".getBytes(java.nio.charset.StandardCharsets.UTF_8); try (ByteArrayInputStream in = new ByteArrayInputStream(data)) { ExtractionResult res = backend.extract(in, newRequest("test.txt", "text/plain")); assertNotNull(res); - assertEquals("TEXT OUT", res.getContent()); + assertNotNull(res.getContent()); + assertTrue(res.getContent().contains("Hello TestContainers")); assertNotNull(res.getMetadata()); - assertArrayEquals(new String[] {"text/plain"}, 
res.getMetadata().getValues("Content-Type")); - assertArrayEquals(new String[] {"test.txt"}, res.getMetadata().getValues("resourcename")); + String[] cts = res.getMetadata().getValues("Content-Type"); + assertNotNull(cts); + assertTrue(cts.length >= 1); + // Tika may append charset; be flexible + assertTrue(cts[0].startsWith("text/plain")); } } @Test public void testExtractOnlyXml() throws Exception { - TikaServerExtractionBackend backend = new TikaServerExtractionBackend(new FakeHttpClient(), "http://example"); - byte[] data = "dummy".getBytes(java.nio.charset.StandardCharsets.UTF_8); + Assume.assumeTrue("Tika server container not started", tika != null); + TikaServerExtractionBackend backend = new TikaServerExtractionBackend(client, baseUrl); + byte[] data = "Hello XML".getBytes(java.nio.charset.StandardCharsets.UTF_8); try (ByteArrayInputStream in = new ByteArrayInputStream(data)) { ExtractionResult res = backend.extractOnly( in, newRequest("test.txt", "text/plain"), ExtractingDocumentLoader.XML_FORMAT, null); assertNotNull(res); - assertTrue(res.getContent().contains(" Date: Fri, 19 Sep 2025 17:57:19 +0200 Subject: [PATCH 07/47] Draft docs --- .../pages/indexing-with-tika.adoc | 33 +++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/solr/solr-ref-guide/modules/indexing-guide/pages/indexing-with-tika.adoc b/solr/solr-ref-guide/modules/indexing-guide/pages/indexing-with-tika.adoc index b0cdb7eba30..183af23e30b 100644 --- a/solr/solr-ref-guide/modules/indexing-guide/pages/indexing-with-tika.adoc +++ b/solr/solr-ref-guide/modules/indexing-guide/pages/indexing-with-tika.adoc @@ -18,9 +18,9 @@ If the documents you need to index are in a binary format, such as Word, Excel, PDFs, etc., Solr includes a request handler which uses http://tika.apache.org/[Apache Tika] to extract text for indexing to Solr. 
-Solr uses code from the Tika project to provide a framework for incorporating many different file-format parsers such as http://pdfbox.apache.org/[Apache PDFBox] and http://poi.apache.org/index.html[Apache POI] into Solr itself. +There are two backends for this module. The `local` backend uses code from the Tika project to provide a framework for incorporating many different file-format parsers such as http://pdfbox.apache.org/[Apache PDFBox] and http://poi.apache.org/index.html[Apache POI] into Solr itself. The `tikaserver` backend uses an external Tika server process to do the extraction. -Working with this framework, Solr's `ExtractingRequestHandler` uses Tika internally to support uploading binary files +Working with this framework, Solr's `ExtractingRequestHandler` uses Tika to support uploading binary files for data extraction and indexing. Downloading Tika is not required to use Solr Cell. @@ -49,6 +49,9 @@ By default it maps to the same name but several parameters control how this is d * When Solr Cell finishes creating the internal `SolrInputDocument`, the rest of the indexing stack takes over. The next step after any update handler is the xref:configuration-guide:update-request-processors.adoc[Update Request Processor] chain. +== Tika Server + +TODO: Add documentation about Tika Server backend. == Module @@ -170,6 +173,32 @@ The following parameters are accepted by the `ExtractingRequestHandler`. These parameters can be set for each indexing request (as request parameters), or they can be set for all requests to the request handler by defining them in <>. +`extraction.backend`:: ++ +[%autowidth,frame=none] +|=== +|Optional |Default: local +|=== ++ +Choose the backend to use for extraction. The options are `local` or `tikaserver`. +The `local` backend uses Tika libraries included with Solr to do the extraction, and is the default in Solr 9. +The `tikaserver` backend uses an external Tika server process to do the extraction. 
+**The `local` backend is deprecated and will be removed in a future release.** ++ +Example: In `solrconfig.xml`: `tikaserver`. + +`tikaserver.url`:: ++ +[%autowidth,frame=none] +|=== +|Optional |Default: none +|=== ++ +Specifies the URL of the Tika server to use when the `extraction.backend` parameter is set to `tikaserver`. +This parameter is required when using the `tikaserver` backend. ++ +Example: In `solrconfig.xml`: `http://my.tika.server`. + `capture`:: + [%autowidth,frame=none] From a3794cee910514be522f9661b41386a10479c9e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=C3=B8ydahl?= Date: Fri, 19 Sep 2025 18:58:50 +0200 Subject: [PATCH 08/47] Use json response from Tika Cleanup TestContainer Refactor ExtractionMetadata Add returnType to ExtractionRequest Remove static initializers --- .../extraction/DummyExtractionBackend.java | 4 +- .../extraction/ExtractingDocumentLoader.java | 14 +- .../handler/extraction/ExtractionBackend.java | 3 +- .../extraction/ExtractionMetadata.java | 66 ++++- .../handler/extraction/ExtractionRequest.java | 5 +- .../LocalTikaExtractionBackend.java | 7 +- .../extraction/SimpleExtractionMetadata.java | 52 ---- .../TikaServerExtractionBackend.java | 257 ++++++++++-------- .../solr/collection1/conf/solrconfig.xml | 2 + .../ExtractingRequestHandlerTest.java | 72 ++++- .../LocalTikaExtractionBackendTest.java | 18 +- .../TikaServerExtractionBackendTest.java | 23 +- 12 files changed, 304 insertions(+), 219 deletions(-) delete mode 100644 solr/modules/extraction/src/java/org/apache/solr/handler/extraction/SimpleExtractionMetadata.java diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/DummyExtractionBackend.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/DummyExtractionBackend.java index 864bba00fdd..33ae55c63c8 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/DummyExtractionBackend.java +++ 
b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/DummyExtractionBackend.java @@ -29,7 +29,7 @@ public String name() { @Override public ExtractionResult extract(InputStream inputStream, ExtractionRequest request) { - ExtractionMetadata metadata = new SimpleExtractionMetadata(); + ExtractionMetadata metadata = new ExtractionMetadata(); metadata.add("Dummy-Backend", "true"); metadata.add( "Content-Type", @@ -43,7 +43,7 @@ public ExtractionResult extract(InputStream inputStream, ExtractionRequest reque @Override public ExtractionResult extractOnly( - InputStream inputStream, ExtractionRequest request, String extractFormat, String xpathExpr) { + InputStream inputStream, ExtractionRequest request, String xpathExpr) { if (xpathExpr != null) { throw new UnsupportedOperationException("XPath not supported by dummy backend"); } diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java index b6a74008ff5..2214059e2f9 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java @@ -102,6 +102,8 @@ public void load( String xpathExpr = params.get(ExtractingParams.XPATH_EXPRESSION); boolean extractOnly = params.getBool(ExtractingParams.EXTRACT_ONLY, false); + String extractFormat = + params.get(ExtractingParams.EXTRACT_FORMAT, extractOnly ? 
XML_FORMAT : TEXT_FORMAT); // Parse optional passwords file into a map (keeps Tika usages out of this class) LinkedHashMap pwMap = null; @@ -122,7 +124,8 @@ public void load( stream.getSourceInfo(), stream.getSize(), params.get(ExtractingParams.RESOURCE_PASSWORD, null), - pwMap); + pwMap, + extractFormat); boolean captureAttr = params.getBool(ExtractingParams.CAPTURE_ATTRIBUTES, false); String[] captureElems = params.getParams(ExtractingParams.CAPTURE_ELEMENTS); @@ -135,10 +138,8 @@ public void load( || (passwordsFile != null); if (extractOnly) { - String extractFormat = params.get(ExtractingParams.EXTRACT_FORMAT, XML_FORMAT); try { - ExtractionResult result = - backend.extractOnly(inputStream, extractionRequest, extractFormat, xpathExpr); + ExtractionResult result = backend.extractOnly(inputStream, extractionRequest, xpathExpr); // Write content rsp.add(stream.getName(), result.getContent()); // Write metadata @@ -165,7 +166,7 @@ public void load( if (needLegacySax) { // Indexing with capture/xpath/etc: delegate SAX parse to backend - SimpleExtractionMetadata neutral = new SimpleExtractionMetadata(); + ExtractionMetadata neutral = new ExtractionMetadata(); SolrContentHandler handler = factory.createSolrContentHandler(neutral, params, req.getSchema()); try { @@ -194,8 +195,7 @@ public void load( log.warn("skip extracting text due to {}.", e.getLocalizedMessage(), e); // Index a document with literals only (no extracted content/metadata) SolrContentHandler handler = - factory.createSolrContentHandler( - new SimpleExtractionMetadata(), params, req.getSchema()); + factory.createSolrContentHandler(new ExtractionMetadata(), params, req.getSchema()); addDoc(handler); return; } diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackend.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackend.java index 3a253dc1ec3..715c73636e2 100644 --- 
a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackend.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackend.java @@ -32,8 +32,7 @@ public interface ExtractionBackend { * xpathExpr; if unsupported and xpathExpr is not null, they should throw * UnsupportedOperationException. */ - ExtractionResult extractOnly( - InputStream inputStream, ExtractionRequest request, String extractFormat, String xpathExpr) + ExtractionResult extractOnly(InputStream inputStream, ExtractionRequest request, String xpathExpr) throws Exception; /** diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionMetadata.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionMetadata.java index b5864ec05c3..6229089d502 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionMetadata.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionMetadata.java @@ -16,16 +16,64 @@ */ package org.apache.solr.handler.extraction; -/** - * Neutral metadata container used by extraction backends. Provides minimal operations needed by - * SolrContentHandler and response building without depending on Apache Tika's Metadata class. 
- */ -public interface ExtractionMetadata { - void add(String name, String value); +import java.util.ArrayList; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Objects; + +/** Simple metadata bean */ +public class ExtractionMetadata { + private final Map> map = new LinkedHashMap<>(); + + public void add(String name, String value) { + if (name == null || value == null) return; + map.computeIfAbsent(name, k -> new ArrayList<>()).add(value); + } + + public String[] getValues(String name) { + List vals = map.get(name); + if (vals == null) return new String[0]; + return vals.toArray(new String[0]); + } + + public String get(String name) { + List vals = map.get(name); + if (vals == null || vals.isEmpty()) return null; + return vals.get(0); + } + + public String[] names() { + return map.keySet().toArray(new String[0]); + } + + public void remove(String name) { + map.remove(name); + } - String[] getValues(String name); + @Override + public String toString() { + StringBuilder sb = new StringBuilder("ExtractionMetadata{"); + boolean first = true; + for (Map.Entry> e : map.entrySet()) { + if (!first) sb.append(", "); + first = false; + sb.append(e.getKey()).append('=').append(e.getValue()); + } + sb.append('}'); + return sb.toString(); + } - String get(String name); + @Override + public boolean equals(Object obj) { + if (this == obj) return true; + if (!(obj instanceof ExtractionMetadata)) return false; + ExtractionMetadata that = (ExtractionMetadata) obj; + return Objects.equals(this.map, that.map); + } - String[] names(); + @Override + public int hashCode() { + return Objects.hash(map); + } } diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionRequest.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionRequest.java index f1af3029193..010f6633472 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionRequest.java +++ 
b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionRequest.java @@ -28,6 +28,7 @@ public class ExtractionRequest { public final String resourcePassword; // optional password for encrypted docs public final java.util.LinkedHashMap passwordsMap; // optional passwords map + public final String extractFormat; public ExtractionRequest( String streamType, @@ -38,7 +39,8 @@ public ExtractionRequest( String streamSourceInfo, Long streamSize, String resourcePassword, - java.util.LinkedHashMap passwordsMap) { + java.util.LinkedHashMap passwordsMap, + String extractFormat) { this.streamType = streamType; this.resourceName = resourceName; this.contentType = contentType; @@ -48,5 +50,6 @@ public ExtractionRequest( this.streamSize = streamSize; this.resourcePassword = resourcePassword; this.passwordsMap = passwordsMap; + this.extractFormat = extractFormat; } } diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/LocalTikaExtractionBackend.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/LocalTikaExtractionBackend.java index 687f0e6cc1e..e91716e1652 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/LocalTikaExtractionBackend.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/LocalTikaExtractionBackend.java @@ -144,7 +144,7 @@ private ParseContext buildContext(Parser parser, ExtractionRequest request) { } private static ExtractionMetadata copyToNeutral(Metadata md) { - ExtractionMetadata out = new SimpleExtractionMetadata(); + ExtractionMetadata out = new ExtractionMetadata(); for (String name : md.names()) { String[] vals = md.getValues(name); if (vals != null) for (String v : vals) out.add(name, v); @@ -168,8 +168,7 @@ public ExtractionResult extract(InputStream inputStream, ExtractionRequest reque @Override public ExtractionResult extractOnly( - InputStream inputStream, ExtractionRequest request, String extractFormat, String xpathExpr) - 
throws Exception { + InputStream inputStream, ExtractionRequest request, String xpathExpr) throws Exception { Parser parser = selectParser(request); if (parser == null) { throw new IllegalArgumentException("No Tika parser for stream type: " + request.streamType); @@ -178,7 +177,7 @@ public ExtractionResult extractOnly( ParseContext context = buildContext(parser, request); String content; - if (ExtractingDocumentLoader.TEXT_FORMAT.equals(extractFormat) || xpathExpr != null) { + if (ExtractingDocumentLoader.TEXT_FORMAT.equals(request.extractFormat) || xpathExpr != null) { org.apache.tika.sax.ToTextContentHandler textHandler = new org.apache.tika.sax.ToTextContentHandler(); org.xml.sax.ContentHandler ch = textHandler; diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/SimpleExtractionMetadata.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/SimpleExtractionMetadata.java deleted file mode 100644 index d414b2eb05b..00000000000 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/SimpleExtractionMetadata.java +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.solr.handler.extraction; - -import java.util.ArrayList; -import java.util.LinkedHashMap; -import java.util.List; -import java.util.Map; - -/** Simple in-memory implementation of ExtractionMetadata. */ -public class SimpleExtractionMetadata implements ExtractionMetadata { - private final Map> map = new LinkedHashMap<>(); - - @Override - public void add(String name, String value) { - if (name == null || value == null) return; - map.computeIfAbsent(name, k -> new ArrayList<>()).add(value); - } - - @Override - public String[] getValues(String name) { - List vals = map.get(name); - if (vals == null) return new String[0]; - return vals.toArray(new String[0]); - } - - @Override - public String get(String name) { - List vals = map.get(name); - if (vals == null || vals.isEmpty()) return null; - return vals.get(0); - } - - @Override - public String[] names() { - return map.keySet().toArray(new String[0]); - } -} diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/TikaServerExtractionBackend.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/TikaServerExtractionBackend.java index c37cd1ba76c..ba12680ce7b 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/TikaServerExtractionBackend.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/TikaServerExtractionBackend.java @@ -24,6 +24,8 @@ import java.net.http.HttpResponse; import java.nio.charset.StandardCharsets; import java.time.Duration; +import java.util.Arrays; +import java.util.Set; import org.noggit.JSONParser; /** @@ -62,33 +64,43 @@ public String name() { @Override public ExtractionResult extract(InputStream inputStream, ExtractionRequest request) throws Exception { - // Buffer the input so we can send it to multiple Tika Server endpoints - byte[] data = inputStream.readAllBytes(); - - // 1) Extract text - String text = requestText(data, request, false, null); - - // 2) Fetch metadata as 
JSON and convert to neutral metadata - ExtractionMetadata md = fetchMetadata(data, request); + String url = + baseUrl + + "/tika/" + + (Set.of("html", "xml").contains(request.extractFormat) ? "html" : "text"); + HttpRequest.Builder b = + HttpRequest.newBuilder(URI.create(url)) + .timeout(timeout) + .header("Accept", "application/json"); + String contentType = firstNonNull(request.streamType, request.contentType); + if (contentType != null) { + b.header("Content-Type", contentType); + } + if (request.resourceName != null) { + b.header("Content-Disposition", "attachment; filename=\"" + request.resourceName + "\""); + } + b.PUT(HttpRequest.BodyPublishers.ofInputStream(() -> inputStream)); - return new ExtractionResult(text, md); + HttpResponse resp = + httpClient.send(b.build(), HttpResponse.BodyHandlers.ofString(StandardCharsets.UTF_8)); + int code = resp.statusCode(); + if (code < 200 || code >= 300) { + String body = resp.body(); + String preview = body == null ? "" : body.substring(0, Math.min(body.length(), 512)); + throw new IOException("TikaServer " + url + " returned status " + code + " body: " + preview); + } + String body = resp.body(); + return parseCombinedJson(body); } @Override public ExtractionResult extractOnly( - InputStream inputStream, ExtractionRequest request, String extractFormat, String xpathExpr) - throws Exception { + InputStream inputStream, ExtractionRequest request, String xpathExpr) throws Exception { if (xpathExpr != null) { throw new UnsupportedOperationException( "XPath filtering is not supported by TikaServer backend"); } - // Buffer the input so we can send it to multiple Tika Server endpoints - byte[] data = inputStream.readAllBytes(); - - boolean wantXml = !ExtractingDocumentLoader.TEXT_FORMAT.equalsIgnoreCase(extractFormat); - String content = requestText(data, request, wantXml, xpathExpr); - ExtractionMetadata md = fetchMetadata(data, request); - return new ExtractionResult(content, md); + return extract(inputStream, request); } 
@Override @@ -96,128 +108,145 @@ public void parseToSolrContentHandler( InputStream inputStream, ExtractionRequest request, SolrContentHandler handler, - ExtractionMetadata outMetadata) - throws Exception { + ExtractionMetadata outMetadata) { throw new UnsupportedOperationException( "Legacy SAX-based parsing is not supported by TikaServer backend"); } - private String requestText(byte[] data, ExtractionRequest request, boolean wantXml, String xpath) - throws IOException, InterruptedException { - String path = wantXml ? "/tika/xhtml" : "/tika/text"; - String url = baseUrl + path; - HttpRequest.Builder b = - HttpRequest.newBuilder(URI.create(url)) - .timeout(timeout) - .PUT(HttpRequest.BodyPublishers.ofByteArray(data)); - // Content-Type - String contentType = firstNonNull(request.streamType, request.contentType); - if (contentType != null) { - b.header("Content-Type", contentType); - } - // Filename hint - if (request.resourceName != null) { - b.header("Content-Disposition", "attachment; filename=\"" + request.resourceName + "\""); - } - // Do not set Accept, let server choose default representation for the endpoint - - HttpResponse resp = httpClient.send(b.build(), HttpResponse.BodyHandlers.ofByteArray()); - int code = resp.statusCode(); - if (code < 200 || code >= 300) { - throw new IOException("TikaServer /tika returned status " + code); - } - return new String(resp.body(), StandardCharsets.UTF_8); + private static String firstNonNull(String a, String b) { + return a != null ? a : b; } - private ExtractionMetadata fetchMetadata(byte[] data, ExtractionRequest request) - throws IOException, InterruptedException { - // Call /meta to get metadata for the provided content. Ask JSON form. 
- String url = baseUrl + "/meta"; - HttpRequest.Builder b = - HttpRequest.newBuilder(URI.create(url)) - .timeout(timeout) - .PUT(HttpRequest.BodyPublishers.ofByteArray(data)); - String contentType = firstNonNull(request.streamType, request.contentType); - if (contentType != null) { - b.header("Content-Type", contentType); - } - if (request.resourceName != null) { - b.header("Content-Disposition", "attachment; filename=\"" + request.resourceName + "\""); - } - b.header("Accept", "application/json"); - - HttpResponse resp = - httpClient.send(b.build(), HttpResponse.BodyHandlers.ofString(StandardCharsets.UTF_8)); - int code = resp.statusCode(); - if (code < 200 || code >= 300) { - throw new IOException("TikaServer /meta returned status " + code); + // Reads key-values of the current object into md. Assumes the parser is positioned + // right after OBJECT_START of that object. + private static ExtractionMetadata parseMetadataObject(JSONParser p) throws java.io.IOException { + ExtractionMetadata md = new ExtractionMetadata(); + String currentKey; + while (true) { + int ev = p.nextEvent(); + if (ev == JSONParser.OBJECT_END || ev == JSONParser.EOF) { + break; + } + if (ev == JSONParser.STRING && p.wasKey()) { + currentKey = p.getString(); + ev = p.nextEvent(); + if (ev == JSONParser.STRING) { + md.add(currentKey, p.getString()); + } else if (ev == JSONParser.ARRAY_START) { + while (true) { + ev = p.nextEvent(); + if (ev == JSONParser.ARRAY_END) break; + if (ev == JSONParser.STRING) { + md.add(currentKey, p.getString()); + } else if (ev == JSONParser.LONG + || ev == JSONParser.NUMBER + || ev == JSONParser.BIGNUMBER) { + md.add(currentKey, p.getNumberChars().toString()); + } else if (ev == JSONParser.BOOLEAN) { + md.add(currentKey, String.valueOf(p.getBoolean())); + } else if (ev == JSONParser.NULL) { + // ignore nulls + } else { + // skip nested objects or unsupported types within arrays + } + } + } else if (ev == JSONParser.LONG || ev == JSONParser.NUMBER || ev == 
JSONParser.BIGNUMBER) { + md.add(currentKey, p.getNumberChars().toString()); + } else if (ev == JSONParser.BOOLEAN) { + md.add(currentKey, String.valueOf(p.getBoolean())); + } else if (ev == JSONParser.NULL) { + // ignore nulls + } else if (ev == JSONParser.OBJECT_START) { + // Unexpected nested object; skip it entirely + skipObject(p); + } else { + // skip unsupported value types + } + } } - return parseJsonToMetadata(resp.body()); + return md; } - private static String firstNonNull(String a, String b) { - return a != null ? a : b; + private static void skipObject(JSONParser p) throws java.io.IOException { + int depth = 1; + while (depth > 0) { + int ev = p.nextEvent(); + if (ev == JSONParser.OBJECT_START) depth++; + else if (ev == JSONParser.OBJECT_END) depth--; + else if (ev == JSONParser.EOF) break; + } } - // Parse Tika Server metadata JSON using Noggit JSONParser. Supports values as strings, - // arrays of strings, and basic scalars (numbers/booleans) which are coerced to String. - private static ExtractionMetadata parseJsonToMetadata(String json) { - SimpleExtractionMetadata md = new SimpleExtractionMetadata(); - if (json == null) return md; + // Parses combined JSON from /tika/text with Accept: application/json and returns both content + // and metadata. Supports two shapes: + // 1) {"content": "...", "metadata": { ... 
}} + // 2) {"content": "...", } + private static ExtractionResult parseCombinedJson(String json) { + String content = ""; + ExtractionMetadata md = new ExtractionMetadata(); + if (json == null) return new ExtractionResult(content, md); try { JSONParser p = new JSONParser(json); int ev = p.nextEvent(); if (ev != JSONParser.OBJECT_START) { - return md; + return new ExtractionResult(content, md); } - String currentKey = null; while (true) { ev = p.nextEvent(); - if (ev == JSONParser.OBJECT_END || ev == JSONParser.EOF) { - break; - } + if (ev == JSONParser.OBJECT_END || ev == JSONParser.EOF) break; if (ev == JSONParser.STRING && p.wasKey()) { - currentKey = p.getString(); - // Next event is the value for this key + String key = p.getString(); ev = p.nextEvent(); - if (ev == JSONParser.STRING) { - md.add(currentKey, p.getString()); - } else if (ev == JSONParser.ARRAY_START) { - // Read array elements - while (true) { - ev = p.nextEvent(); - if (ev == JSONParser.ARRAY_END) break; - if (ev == JSONParser.STRING) { - md.add(currentKey, p.getString()); - } else if (ev == JSONParser.LONG - || ev == JSONParser.NUMBER - || ev == JSONParser.BIGNUMBER) { - md.add(currentKey, p.getNumberChars().toString()); - } else if (ev == JSONParser.BOOLEAN) { - md.add(currentKey, String.valueOf(p.getBoolean())); - } else if (ev == JSONParser.NULL) { - // ignore nulls - } else { - // skip nested objects or unsupported types within arrays - } + if ("X-TIKA:content".equals(key)) { + if (ev == JSONParser.STRING) { + content = p.getString(); + } else { + // Skip non-string content + if (ev == JSONParser.OBJECT_START) skipObject(p); + } + } else if ("metadata".equals(key)) { + if (ev == JSONParser.OBJECT_START) { + md = parseMetadataObject(p); + } else { + // unexpected shape; skip + if (ev == JSONParser.OBJECT_START) skipObject(p); } - } else if (ev == JSONParser.LONG - || ev == JSONParser.NUMBER - || ev == JSONParser.BIGNUMBER) { - md.add(currentKey, p.getNumberChars().toString()); - } else if 
(ev == JSONParser.BOOLEAN) { - md.add(currentKey, String.valueOf(p.getBoolean())); - } else if (ev == JSONParser.NULL) { - // ignore nulls } else { - // skip nested objects or unsupported value types + // Treat as flat metadata field + if (ev == JSONParser.STRING) { + md.add(key, p.getString()); + } else if (ev == JSONParser.ARRAY_START) { + while (true) { + ev = p.nextEvent(); + if (ev == JSONParser.ARRAY_END) break; + if (ev == JSONParser.STRING) md.add(key, p.getString()); + else if (ev == JSONParser.LONG + || ev == JSONParser.NUMBER + || ev == JSONParser.BIGNUMBER) md.add(key, p.getNumberChars().toString()); + else if (ev == JSONParser.BOOLEAN) md.add(key, String.valueOf(p.getBoolean())); + else if (ev == JSONParser.NULL) { + // ignore + } + } + } else if (ev == JSONParser.LONG + || ev == JSONParser.NUMBER + || ev == JSONParser.BIGNUMBER) { + md.add(key, p.getNumberChars().toString()); + } else if (ev == JSONParser.BOOLEAN) { + md.add(key, String.valueOf(p.getBoolean())); + } else if (ev == JSONParser.NULL) { + // ignore + } else if (ev == JSONParser.OBJECT_START) { + // skip nested object for unknown key + skipObject(p); + } } } } } catch (java.io.IOException ioe) { - // Fall back to empty metadata on parsing error - return md; + // ignore, return what we have } - return md; + Arrays.stream(md.names()).filter(k -> k.startsWith("X-TIKA:Parsed-")).forEach(md::remove); + return new ExtractionResult(content, md); } } diff --git a/solr/modules/extraction/src/test-files/extraction/solr/collection1/conf/solrconfig.xml b/solr/modules/extraction/src/test-files/extraction/solr/collection1/conf/solrconfig.xml index 2c52f4591e8..f8a227b8cf9 100644 --- a/solr/modules/extraction/src/test-files/extraction/solr/collection1/conf/solrconfig.xml +++ b/solr/modules/extraction/src/test-files/extraction/solr/collection1/conf/solrconfig.xml @@ -152,6 +152,8 @@ parseContext.xml + ${solr.test.extraction.backend:local} + ${solr.test.tikaserver.url:} diff --git 
a/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java index c7098665027..1983f4e34e2 100644 --- a/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java +++ b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java @@ -16,10 +16,15 @@ */ package org.apache.solr.handler.extraction; +import com.carrotsearch.randomizedtesting.ThreadFilter; +import com.carrotsearch.randomizedtesting.annotations.ThreadLeakFilters; +import java.lang.invoke.MethodHandles; import java.util.ArrayList; import java.util.List; import java.util.Locale; import java.util.TimeZone; +import org.apache.lucene.tests.util.QuickPatchThreadsFilter; +import org.apache.solr.SolrIgnoredThreadsFilter; import org.apache.solr.SolrTestCaseJ4; import org.apache.solr.common.SolrException; import org.apache.solr.common.util.ContentStream; @@ -31,20 +36,44 @@ import org.apache.solr.response.SolrQueryResponse; import org.apache.solr.update.AddUpdateCommand; import org.apache.solr.update.processor.BufferingRequestProcessor; +import org.junit.AfterClass; import org.junit.Before; import org.junit.BeforeClass; import org.junit.Test; - -/** */ +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.testcontainers.containers.GenericContainer; +import org.testcontainers.containers.wait.strategy.Wait; + +/** Generic tests, randomized between local and tikaserver backends */ +@ThreadLeakFilters( + defaultFilters = true, + filters = { + SolrIgnoredThreadsFilter.class, + QuickPatchThreadsFilter.class, + ExtractingRequestHandlerTest.TestcontainersThreadsFilter.class + }) public class ExtractingRequestHandlerTest extends SolrTestCaseJ4 { - - static { - // Allow the SecureRandom algorithm used in this environment to avoid class configuration - // failure in tests. 
- // This mirrors passing -Dtest.solr.allowed.securerandom=NativePRNG at JVM startup. - System.setProperty("test.solr.allowed.securerandom", "NativePRNG"); + // Ignore known non-daemon threads spawned by Testcontainers and Java HttpClient in this test + @SuppressWarnings("NewClassNamingConvention") + public static class TestcontainersThreadsFilter implements ThreadFilter { + @Override + public boolean reject(Thread t) { + if (t == null || t.getName() == null) return false; + String n = t.getName(); + return n.startsWith("testcontainers-ryuk") + || n.startsWith("testcontainers-wait-") + || n.startsWith("HttpClient-") + || n.startsWith("HttpClient-TestContainers"); + } } + private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + + private static GenericContainer tika; + private static boolean useTikaServer; + + @SuppressWarnings("resource") @BeforeClass public static void beforeClass() throws Exception { // Is the JDK/env affected by a known bug? @@ -59,9 +88,36 @@ public static void beforeClass() throws Exception { false); } + useTikaServer = random().nextBoolean(); + if (useTikaServer) { + String baseUrl; + tika = + new GenericContainer<>("apache/tika:3.2.3.0-full") + .withExposedPorts(9998) + .waitingFor(Wait.forListeningPort()); + tika.start(); + baseUrl = "http://" + tika.getHost() + ":" + tika.getMappedPort(9998); + System.setProperty("solr.test.tikaserver.url", baseUrl); + System.setProperty("solr.test.extraction.backend", "tikaserver"); + log.info("Using extraction backend 'tikaserver'. 
Tika server running on {}", baseUrl); + } else { + log.info("Using extraction backend 'local'"); + } + initCore("solrconfig.xml", "schema.xml", getFile("extraction/solr")); } + @AfterClass + public static void afterClass() throws Exception { + System.clearProperty("solr.test.tikaserver.url"); + System.clearProperty("solr.test.extraction.backend"); + if (useTikaServer && tika != null) { + tika.stop(); + tika.close(); + tika = null; + } + } + @Override @Before public void setUp() throws Exception { diff --git a/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/LocalTikaExtractionBackendTest.java b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/LocalTikaExtractionBackendTest.java index df365f2bedf..4974f5a1903 100644 --- a/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/LocalTikaExtractionBackendTest.java +++ b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/LocalTikaExtractionBackendTest.java @@ -53,7 +53,8 @@ private ExtractionRequest newRequest( String streamName, String streamSourceInfo, Long streamSize, - String resourcePassword) { + String resourcePassword, + String returnType) { return new ExtractionRequest( streamType, resourceName, @@ -63,7 +64,8 @@ private ExtractionRequest newRequest( streamSourceInfo, streamSize, resourcePassword, - null); + null, + returnType); } @Test @@ -80,7 +82,8 @@ public void testWrongStreamTypeThrows() throws Exception { "version_control.txt", null, null, - null); + null, + "text"); expectThrows(IllegalArgumentException.class, () -> backend.extract(in, req)); } @@ -95,7 +98,8 @@ public void testWrongStreamTypeThrows() throws Exception { "version_control.txt", null, null, - null); + null, + "text"); expectThrows(Exception.class, () -> backend.extract(in, req)); } } @@ -113,7 +117,8 @@ public void testPasswordProtectedDocxWithoutPasswordThrows() throws Exception { "password-is-Word2010.docx", null, null, - null); + null, + "text"); 
expectThrows(Exception.class, () -> backend.extract(in, req)); } } @@ -131,7 +136,8 @@ public void testPasswordProtectedDocxWithPasswordSucceeds() throws Exception { "password-is-Word2010.docx", null, null, - "Word2010"); + "Word2010", + "text"); ExtractionResult res = backend.extract(in, req); assertNotNull(res); assertNotNull(res.getMetadata()); diff --git a/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/TikaServerExtractionBackendTest.java b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/TikaServerExtractionBackendTest.java index 5c51ccba38f..15f54707638 100644 --- a/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/TikaServerExtractionBackendTest.java +++ b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/TikaServerExtractionBackendTest.java @@ -56,12 +56,6 @@ public boolean reject(Thread t) { } } - static { - // Allow the SecureRandom algorithm used in this environment to avoid class configuration - // failure in tests. 
- System.setProperty("test.solr.allowed.securerandom", "NativePRNG"); - } - private static GenericContainer tika; private static String baseUrl; private static ExecutorService httpExec; @@ -107,7 +101,8 @@ public static void stopTikaServer() { client = null; } - private static ExtractionRequest newRequest(String resourceName, String contentType) { + private static ExtractionRequest newRequest( + String resourceName, String contentType, String extractFormat) { return new ExtractionRequest( contentType, // streamType resourceName, // resourceName @@ -117,7 +112,8 @@ private static ExtractionRequest newRequest(String resourceName, String contentT null, // sourceInfo null, // size null, // resourcePassword - null // passwordsMap + null, // passwordsMap + extractFormat // extraction format xml or text ); } @@ -127,7 +123,7 @@ public void testExtractTextAndMetadata() throws Exception { TikaServerExtractionBackend backend = new TikaServerExtractionBackend(client, baseUrl); byte[] data = "Hello TestContainers".getBytes(java.nio.charset.StandardCharsets.UTF_8); try (ByteArrayInputStream in = new ByteArrayInputStream(data)) { - ExtractionResult res = backend.extract(in, newRequest("test.txt", "text/plain")); + ExtractionResult res = backend.extract(in, newRequest("test.txt", "text/plain", "text")); assertNotNull(res); assertNotNull(res.getContent()); assertTrue(res.getContent().contains("Hello TestContainers")); @@ -147,8 +143,7 @@ public void testExtractOnlyXml() throws Exception { byte[] data = "Hello XML".getBytes(java.nio.charset.StandardCharsets.UTF_8); try (ByteArrayInputStream in = new ByteArrayInputStream(data)) { ExtractionResult res = - backend.extractOnly( - in, newRequest("test.txt", "text/plain"), ExtractingDocumentLoader.XML_FORMAT, null); + backend.extractOnly(in, newRequest("test.txt", "text/plain", "xml"), null); assertNotNull(res); String c = res.getContent(); assertNotNull(c); @@ -172,9 +167,9 @@ public void testParseToSolrContentHandlerUnsupported() 
throws Exception { () -> backend.parseToSolrContentHandler( in, - newRequest("test.txt", "text/plain"), - new SolrContentHandler(new SimpleExtractionMetadata(), params(), null), - new SimpleExtractionMetadata())); + newRequest("test.txt", "text/plain", "text"), + new SolrContentHandler(new ExtractionMetadata(), params(), null), + new ExtractionMetadata())); } } } From cf971699209f3761a44c404aae8de6970aae4c59 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=C3=B8ydahl?= Date: Sun, 21 Sep 2025 01:44:14 +0200 Subject: [PATCH 09/47] Allow testcontainers to read config --- gradle/testing/randomization/policies/solr-tests.policy | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/gradle/testing/randomization/policies/solr-tests.policy b/gradle/testing/randomization/policies/solr-tests.policy index 2d3246c6d9b..7eb635db831 100644 --- a/gradle/testing/randomization/policies/solr-tests.policy +++ b/gradle/testing/randomization/policies/solr-tests.policy @@ -31,6 +31,9 @@ grant { permission java.io.FilePermission "${java.io.tmpdir}", "read,write"; permission java.io.FilePermission "${java.io.tmpdir}${/}-", "read,write,delete"; + // Allow Testcontainers to read user-level configuration + permission java.io.FilePermission "${user.home}${/}.testcontainers.properties", "read"; + permission java.io.FilePermission "${tests.linedocsfile}", "read"; // DirectoryFactoryTest messes with these (wtf?) 
permission java.io.FilePermission "/tmp/inst1/conf/solrcore.properties", "read"; @@ -130,11 +133,11 @@ grant { permission javax.management.MBeanServerPermission "findMBeanServer"; permission javax.management.MBeanServerPermission "releaseMBeanServer"; permission javax.management.MBeanTrustPermission "register"; - + // needed by crossdc permission javax.security.auth.AuthPermission "getLoginConfiguration"; permission javax.security.auth.AuthPermission "setLoginConfiguration"; - + // needed by benchmark permission java.security.SecurityPermission "insertProvider"; @@ -206,7 +209,7 @@ grant { // additional permissions based on system properties set by /bin/solr // NOTE: if the property is not set, the permission entry is ignored. -grant { +grant { permission java.io.FilePermission "${solr.jetty.keystore}", "read,write,delete,readlink"; permission java.io.FilePermission "${solr.jetty.keystore}${/}-", "read,write,delete,readlink"; From 87cb45c3f9b9912db7c8a48b5b576cdf1644621f Mon Sep 17 00:00:00 2001 From: Eric Pugh Date: Mon, 22 Sep 2025 10:27:45 -0400 Subject: [PATCH 10/47] Disable JSM Java Security Manager and Testcontainers do not play nicely together. 
We prefer Testcontainers, so disable JSM --- solr/modules/extraction/build.gradle | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/solr/modules/extraction/build.gradle b/solr/modules/extraction/build.gradle index 0cbb4c0174f..66f3c2d0c00 100644 --- a/solr/modules/extraction/build.gradle +++ b/solr/modules/extraction/build.gradle @@ -19,6 +19,11 @@ apply plugin: 'java-library' description = 'Solr Integration with Tika for extracting content from binary file formats such as Microsoft Word and Adobe PDF' +ext { + // Disable security manager for extraction module tests + useSecurityManager = false +} + dependencies { implementation platform(project(':platform')) implementation project(':solr:core') From 7ebed82375d6e4b29f3128a7a0acb3e96e10fd16 Mon Sep 17 00:00:00 2001 From: Eric Pugh Date: Mon, 22 Sep 2025 10:28:02 -0400 Subject: [PATCH 11/47] IntelliJ prompted me.. and I couldn't resist. --- .../ExtractingRequestHandlerTest.java | 33 ++++++++----------- 1 file changed, 14 insertions(+), 19 deletions(-) diff --git a/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java index 1983f4e34e2..2bd099ebe78 100644 --- a/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java +++ b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java @@ -108,7 +108,7 @@ public static void beforeClass() throws Exception { } @AfterClass - public static void afterClass() throws Exception { + public static void afterClass() { System.clearProperty("solr.test.tikaserver.url"); System.clearProperty("solr.test.extraction.backend"); if (useTikaServer && tika != null) { @@ -404,20 +404,18 @@ public void testDefaultField() throws Exception { ignoreException("unknown field 'meta'"); // TODO: should this exception be happening? 
expectThrows( SolrException.class, - () -> { - loadLocal( - "extraction/simple.html", - "literal.id", - "simple2", - "lowernames", - "true", - "captureAttr", - "true", - // "fmap.content_type", "abcxyz", - "commit", - "true" // test immediate commit - ); - }); + () -> loadLocal( + "extraction/simple.html", + "literal.id", + "simple2", + "lowernames", + "true", + "captureAttr", + "true", + // "fmap.content_type", "abcxyz", + "commit", + "true" // test immediate commit + )); } finally { resetExceptionIgnores(); } @@ -1115,16 +1113,13 @@ public void testPasswordProtected() throws Exception { SolrQueryResponse loadLocalFromHandler(String handler, String filename, String... args) throws Exception { - LocalSolrQueryRequest req = (LocalSolrQueryRequest) req(args); - try { + try (LocalSolrQueryRequest req = (LocalSolrQueryRequest) req(args)) { // TODO: stop using locally defined streams once stream.file and // stream.body work everywhere List cs = new ArrayList<>(); cs.add(new ContentStreamBase.FileStream(getFile(filename))); req.setContentStreams(cs); return h.queryAndResponse(handler, req); - } finally { - req.close(); } } From f25631ded66d0bfcd18736de5984c14443fd8829 Mon Sep 17 00:00:00 2001 From: Eric Pugh Date: Mon, 22 Sep 2025 10:46:29 -0400 Subject: [PATCH 12/47] lint --- .../ExtractingRequestHandlerTest.java | 25 ++++++++++--------- 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java index 2bd099ebe78..947860337f8 100644 --- a/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java +++ b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java @@ -404,18 +404,19 @@ public void testDefaultField() throws Exception { ignoreException("unknown field 'meta'"); // TODO: 
should this exception be happening? expectThrows( SolrException.class, - () -> loadLocal( - "extraction/simple.html", - "literal.id", - "simple2", - "lowernames", - "true", - "captureAttr", - "true", - // "fmap.content_type", "abcxyz", - "commit", - "true" // test immediate commit - )); + () -> + loadLocal( + "extraction/simple.html", + "literal.id", + "simple2", + "lowernames", + "true", + "captureAttr", + "true", + // "fmap.content_type", "abcxyz", + "commit", + "true" // test immediate commit + )); } finally { resetExceptionIgnores(); } From 5aa381f072283f524da8f0b92c29599e1a01dedd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=C3=B8ydahl?= Date: Tue, 23 Sep 2025 19:16:40 +0200 Subject: [PATCH 13/47] Split test in two sub classes Add common metadata Adjust some tests with dc:title instead of title Support passwords in TikaServer backend --- .../extraction/DummyExtractionBackend.java | 2 +- .../handler/extraction/ExtractionBackend.java | 20 +++ .../LocalTikaExtractionBackend.java | 19 +-- .../RegexRulesPasswordProvider.java | 11 ++ .../TikaServerExtractionBackend.java | 20 ++- .../ExtractingRequestHandlerLocalTest.java | 19 +++ ...ExtractingRequestHandlerTestAbstract.java} | 81 ++---------- ...xtractingRequestHandlerTikaServerTest.java | 117 ++++++++++++++++++ 8 files changed, 198 insertions(+), 91 deletions(-) create mode 100644 solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerLocalTest.java rename solr/modules/extraction/src/test/org/apache/solr/handler/extraction/{ExtractingRequestHandlerTest.java => ExtractingRequestHandlerTestAbstract.java} (92%) create mode 100644 solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTikaServerTest.java diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/DummyExtractionBackend.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/DummyExtractionBackend.java index 33ae55c63c8..745216eb31a 
100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/DummyExtractionBackend.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/DummyExtractionBackend.java @@ -29,7 +29,7 @@ public String name() { @Override public ExtractionResult extract(InputStream inputStream, ExtractionRequest request) { - ExtractionMetadata metadata = new ExtractionMetadata(); + ExtractionMetadata metadata = buildMetadataFromRequest(request); metadata.add("Dummy-Backend", "true"); metadata.add( "Content-Type", diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackend.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackend.java index 715c73636e2..fd5c5409113 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackend.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackend.java @@ -17,6 +17,8 @@ package org.apache.solr.handler.extraction; import java.io.InputStream; +import org.apache.tika.metadata.HttpHeaders; +import org.apache.tika.metadata.TikaMetadataKeys; /** Strategy interface for content extraction backends. 
*/ public interface ExtractionBackend { @@ -46,6 +48,24 @@ void parseToSolrContentHandler( ExtractionMetadata outMetadata) throws Exception; + /** Build ExtractionMetadata from the request context */ + default ExtractionMetadata buildMetadataFromRequest(ExtractionRequest request) { + ExtractionMetadata md = new ExtractionMetadata(); + if (request.resourceName != null) + md.add(TikaMetadataKeys.RESOURCE_NAME_KEY, request.resourceName); + if (request.contentType != null) md.add(HttpHeaders.CONTENT_TYPE, request.contentType); + if (request.streamName != null) + md.add(ExtractingMetadataConstants.STREAM_NAME, request.streamName); + if (request.streamSourceInfo != null) + md.add(ExtractingMetadataConstants.STREAM_SOURCE_INFO, request.streamSourceInfo); + if (request.streamSize != null) + md.add(ExtractingMetadataConstants.STREAM_SIZE, String.valueOf(request.streamSize)); + if (request.contentType != null) + md.add(ExtractingMetadataConstants.STREAM_CONTENT_TYPE, request.contentType); + if (request.charset != null) md.add(HttpHeaders.CONTENT_ENCODING, request.charset); + return md; + } + /** A short name for debugging/config, e.g., "local" or "dummy". 
*/ String name(); } diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/LocalTikaExtractionBackend.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/LocalTikaExtractionBackend.java index e91716e1652..d39011cf5a2 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/LocalTikaExtractionBackend.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/LocalTikaExtractionBackend.java @@ -21,9 +21,7 @@ import java.util.Locale; import org.apache.solr.core.SolrCore; import org.apache.tika.config.TikaConfig; -import org.apache.tika.metadata.HttpHeaders; import org.apache.tika.metadata.Metadata; -import org.apache.tika.metadata.TikaMetadataKeys; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.DefaultParser; @@ -112,19 +110,12 @@ private Parser selectParser(ExtractionRequest request) { } private Metadata buildMetadata(ExtractionRequest request) { + ExtractionMetadata extractionMetadata = buildMetadataFromRequest(request); Metadata md = new Metadata(); - if (request.resourceName != null) - md.add(TikaMetadataKeys.RESOURCE_NAME_KEY, request.resourceName); - if (request.contentType != null) md.add(HttpHeaders.CONTENT_TYPE, request.contentType); - if (request.streamName != null) - md.add(ExtractingMetadataConstants.STREAM_NAME, request.streamName); - if (request.streamSourceInfo != null) - md.add(ExtractingMetadataConstants.STREAM_SOURCE_INFO, request.streamSourceInfo); - if (request.streamSize != null) - md.add(ExtractingMetadataConstants.STREAM_SIZE, String.valueOf(request.streamSize)); - if (request.contentType != null) - md.add(ExtractingMetadataConstants.STREAM_CONTENT_TYPE, request.contentType); - if (request.charset != null) md.add(HttpHeaders.CONTENT_ENCODING, request.charset); + for (String name : extractionMetadata.names()) { + String[] vals = extractionMetadata.getValues(name); + if (vals != null) 
for (String v : vals) md.add(name, v); + } return md; } diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/RegexRulesPasswordProvider.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/RegexRulesPasswordProvider.java index 84b4e94171c..8e7f876da83 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/RegexRulesPasswordProvider.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/RegexRulesPasswordProvider.java @@ -55,6 +55,17 @@ public String getPassword(Metadata meta) { return null; } + public String getPassword(ExtractionMetadata extractionMetadata) { + if (getExplicitPassword() != null) { + return getExplicitPassword(); + } + + if (passwordMap.size() > 0) + return lookupPasswordFromMap(extractionMetadata.get(TikaMetadataKeys.RESOURCE_NAME_KEY)); + + return null; + } + private String lookupPasswordFromMap(String fileName) { if (fileName != null && fileName.length() > 0) { for (Entry e : passwordMap.entrySet()) { diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/TikaServerExtractionBackend.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/TikaServerExtractionBackend.java index ba12680ce7b..c7b0adaf0f5 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/TikaServerExtractionBackend.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/TikaServerExtractionBackend.java @@ -76,6 +76,21 @@ public ExtractionResult extract(InputStream inputStream, ExtractionRequest reque if (contentType != null) { b.header("Content-Type", contentType); } + ExtractionMetadata md = buildMetadataFromRequest(request); + if (request.resourcePassword != null || request.passwordsMap != null) { + RegexRulesPasswordProvider passwordProvider = new RegexRulesPasswordProvider(); + if (request.resourcePassword != null) { + passwordProvider.setExplicitPassword(request.resourcePassword); + 
} + if (request.passwordsMap != null) { + passwordProvider.setPasswordMap(request.passwordsMap); + } + + String pwd = passwordProvider.getPassword(md); + if (pwd != null) { + b.header("Password", pwd); + } + } if (request.resourceName != null) { b.header("Content-Disposition", "attachment; filename=\"" + request.resourceName + "\""); } @@ -90,7 +105,7 @@ public ExtractionResult extract(InputStream inputStream, ExtractionRequest reque throw new IOException("TikaServer " + url + " returned status " + code + " body: " + preview); } String body = resp.body(); - return parseCombinedJson(body); + return parseCombinedJson(body, md); } @Override @@ -181,9 +196,8 @@ private static void skipObject(JSONParser p) throws java.io.IOException { // and metadata. Supports two shapes: // 1) {"content": "...", "metadata": { ... }} // 2) {"content": "...", } - private static ExtractionResult parseCombinedJson(String json) { + private ExtractionResult parseCombinedJson(String json, ExtractionMetadata md) { String content = ""; - ExtractionMetadata md = new ExtractionMetadata(); if (json == null) return new ExtractionResult(content, md); try { JSONParser p = new JSONParser(json); diff --git a/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerLocalTest.java b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerLocalTest.java new file mode 100644 index 00000000000..64dc90c1b50 --- /dev/null +++ b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerLocalTest.java @@ -0,0 +1,19 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.handler.extraction; + +public class ExtractingRequestHandlerLocalTest extends ExtractingRequestHandlerTestAbstract {} diff --git a/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTestAbstract.java similarity index 92% rename from solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java rename to solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTestAbstract.java index 947860337f8..b9ed368e1ed 100644 --- a/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java +++ b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTestAbstract.java @@ -16,108 +16,39 @@ */ package org.apache.solr.handler.extraction; -import com.carrotsearch.randomizedtesting.ThreadFilter; -import com.carrotsearch.randomizedtesting.annotations.ThreadLeakFilters; import java.lang.invoke.MethodHandles; import java.util.ArrayList; import java.util.List; import java.util.Locale; import java.util.TimeZone; -import org.apache.lucene.tests.util.QuickPatchThreadsFilter; -import org.apache.solr.SolrIgnoredThreadsFilter; import org.apache.solr.SolrTestCaseJ4; import org.apache.solr.common.SolrException; import org.apache.solr.common.util.ContentStream; import org.apache.solr.common.util.ContentStreamBase; -import org.apache.solr.common.util.EnvUtils; 
import org.apache.solr.common.util.NamedList; import org.apache.solr.request.LocalSolrQueryRequest; import org.apache.solr.request.SolrQueryRequest; import org.apache.solr.response.SolrQueryResponse; import org.apache.solr.update.AddUpdateCommand; import org.apache.solr.update.processor.BufferingRequestProcessor; -import org.junit.AfterClass; import org.junit.Before; import org.junit.BeforeClass; import org.junit.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.testcontainers.containers.GenericContainer; -import org.testcontainers.containers.wait.strategy.Wait; - -/** Generic tests, randomized between local and tikaserver backends */ -@ThreadLeakFilters( - defaultFilters = true, - filters = { - SolrIgnoredThreadsFilter.class, - QuickPatchThreadsFilter.class, - ExtractingRequestHandlerTest.TestcontainersThreadsFilter.class - }) -public class ExtractingRequestHandlerTest extends SolrTestCaseJ4 { - // Ignore known non-daemon threads spawned by Testcontainers and Java HttpClient in this test - @SuppressWarnings("NewClassNamingConvention") - public static class TestcontainersThreadsFilter implements ThreadFilter { - @Override - public boolean reject(Thread t) { - if (t == null || t.getName() == null) return false; - String n = t.getName(); - return n.startsWith("testcontainers-ryuk") - || n.startsWith("testcontainers-wait-") - || n.startsWith("HttpClient-") - || n.startsWith("HttpClient-TestContainers"); - } - } +public abstract class ExtractingRequestHandlerTestAbstract extends SolrTestCaseJ4 { private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); - private static GenericContainer tika; - private static boolean useTikaServer; - @SuppressWarnings("resource") @BeforeClass public static void beforeClass() throws Exception { // Is the JDK/env affected by a known bug? 
final String tzDisplayName = TimeZone.getDefault().getDisplayName(false, TimeZone.SHORT, Locale.US); - if (!tzDisplayName.matches("[A-Za-z]{3,}([+-]\\d\\d(:\\d\\d)?)?")) { - assertTrue( - "Is some other JVM affected? Or bad regex? TzDisplayName: " + tzDisplayName, - EnvUtils.getProperty("java.version").startsWith("11")); - assumeTrue( - "SOLR-12759 JDK 11 (1st release) and Tika 1.x can result in extracting dates in a bad format.", - false); - } - - useTikaServer = random().nextBoolean(); - if (useTikaServer) { - String baseUrl; - tika = - new GenericContainer<>("apache/tika:3.2.3.0-full") - .withExposedPorts(9998) - .waitingFor(Wait.forListeningPort()); - tika.start(); - baseUrl = "http://" + tika.getHost() + ":" + tika.getMappedPort(9998); - System.setProperty("solr.test.tikaserver.url", baseUrl); - System.setProperty("solr.test.extraction.backend", "tikaserver"); - log.info("Using extraction backend 'tikaserver'. Tika server running on {}", baseUrl); - } else { - log.info("Using extraction backend 'local'"); - } - initCore("solrconfig.xml", "schema.xml", getFile("extraction/solr")); } - @AfterClass - public static void afterClass() { - System.clearProperty("solr.test.tikaserver.url"); - System.clearProperty("solr.test.extraction.backend"); - if (useTikaServer && tika != null) { - tika.stop(); - tika.close(); - tika = null; - } - } - @Override @Before public void setUp() throws Exception { @@ -754,9 +685,12 @@ public void testExtractOnly() throws Exception { NamedList nl = (NamedList) list.get("solr-word.pdf_metadata"); assertNotNull("nl is null and it shouldn't be", nl); - Object title = nl.get("title"); + // TODO: Tika server v3.x has normalized metadata and do not return the 'title' key. Consider + // backcompat mode mapping dc:title to title??? 
+ Object title = nl.get("dc:title"); assertNotNull("title is null and it shouldn't be", title); - assertTrue(extraction.contains(") list.get("solr-word.pdf_metadata"); assertNotNull("nl is null and it shouldn't be", nl); - title = nl.get("title"); + // TODO: See above + title = nl.get("dc:title"); assertNotNull("title is null and it shouldn't be", title); } diff --git a/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTikaServerTest.java b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTikaServerTest.java new file mode 100644 index 00000000000..14cc89d0cc6 --- /dev/null +++ b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTikaServerTest.java @@ -0,0 +1,117 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.solr.handler.extraction; + +import com.carrotsearch.randomizedtesting.ThreadFilter; +import com.carrotsearch.randomizedtesting.annotations.ThreadLeakFilters; +import java.lang.invoke.MethodHandles; +import org.apache.lucene.tests.util.QuickPatchThreadsFilter; +import org.apache.solr.SolrIgnoredThreadsFilter; +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.testcontainers.containers.GenericContainer; +import org.testcontainers.containers.wait.strategy.Wait; + +/** Generic tests, randomized between local and tikaserver backends */ +@ThreadLeakFilters( + defaultFilters = true, + filters = { + SolrIgnoredThreadsFilter.class, + QuickPatchThreadsFilter.class, + ExtractingRequestHandlerTikaServerTest.TestcontainersThreadsFilter.class + }) +public class ExtractingRequestHandlerTikaServerTest extends ExtractingRequestHandlerTestAbstract { + private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + private static GenericContainer tika; + + // Ignore known non-daemon threads spawned by Testcontainers and Java HttpClient in this test + @SuppressWarnings("NewClassNamingConvention") + public static class TestcontainersThreadsFilter implements ThreadFilter { + @Override + public boolean reject(Thread t) { + if (t == null || t.getName() == null) return false; + String n = t.getName(); + return n.startsWith("testcontainers-ryuk") + || n.startsWith("testcontainers-wait-") + || n.startsWith("HttpClient-") + || n.startsWith("HttpClient-TestContainers"); + } + } + + @BeforeClass + public static void beforeClassTika() throws Exception { + String baseUrl = null; + tika = + new GenericContainer<>("apache/tika:3.2.3.0-full") + .withExposedPorts(9998) + .waitingFor(Wait.forListeningPort()); + try { + tika.start(); + baseUrl = "http://" + tika.getHost() + ":" + tika.getMappedPort(9998); + System.setProperty("solr.test.tikaserver.url", baseUrl); 
+ System.setProperty("solr.test.extraction.backend", "tikaserver"); + log.info("Using extraction backend 'tikaserver'. Tika server running on {}", baseUrl); + ExtractingRequestHandlerTestAbstract.beforeClass(); + } catch (Throwable t) { + // Best-effort cleanup to avoid leaking resources if class initialization fails + try { + System.clearProperty("solr.test.tikaserver.url"); + System.clearProperty("solr.test.extraction.backend"); + } catch (Throwable ignored) { + } + try { + // Ensure any partially initialized core and clients are released + org.apache.solr.SolrTestCaseJ4.deleteCore(); + } catch (Throwable ignored) { + } + if (tika != null) { + try { + tika.stop(); + } catch (Throwable ignored) { + } + try { + tika.close(); + } catch (Throwable ignored) { + } + tika = null; + } + throw t; + } + } + + @AfterClass + public static void afterClassTika() throws Exception { + // TODO: There are still thread leaks after these tests, probably due to failing tests + deleteCore(); + // Stop and dispose of the Tika container if it was started + if (tika != null) { + try { + tika.stop(); + } finally { + try { + tika.close(); + } catch (Throwable ignore2) { + } + tika = null; + } + } + System.clearProperty("solr.test.tikaserver.url"); + System.clearProperty("solr.test.extraction.backend"); + } +} From ef7850d0cb0c95ef1f6d5ec358be8eba4a5478cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=C3=B8ydahl?= Date: Tue, 23 Sep 2025 21:32:02 +0200 Subject: [PATCH 14/47] Some error handling --- .../handler/extraction/ExtractingDocumentLoader.java | 6 ++++++ .../extraction/TikaServerExtractionBackend.java | 10 ++++++---- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java index 2214059e2f9..770076cd548 100644 --- 
a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java @@ -171,6 +171,12 @@ public void load( factory.createSolrContentHandler(neutral, params, req.getSchema()); try { backend.parseToSolrContentHandler(inputStream, extractionRequest, handler, neutral); + } catch (UnsupportedOperationException uoe) { + // For backends that don't support parseToSolrContentHandler + log.warn("skip extracting text due to {}.", uoe.getMessage()); + throw new SolrException( + SolrException.ErrorCode.BAD_REQUEST, + "The requested operation is not supported by backend '" + backend.name() + "'."); } catch (Exception e) { if (ignoreTikaException) { if (log.isWarnEnabled()) diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/TikaServerExtractionBackend.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/TikaServerExtractionBackend.java index c7b0adaf0f5..3a39caf57e7 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/TikaServerExtractionBackend.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/TikaServerExtractionBackend.java @@ -16,7 +16,6 @@ */ package org.apache.solr.handler.extraction; -import java.io.IOException; import java.io.InputStream; import java.net.URI; import java.net.http.HttpClient; @@ -26,6 +25,7 @@ import java.time.Duration; import java.util.Arrays; import java.util.Set; +import org.apache.solr.common.SolrException; import org.noggit.JSONParser; /** @@ -96,13 +96,15 @@ public ExtractionResult extract(InputStream inputStream, ExtractionRequest reque } b.PUT(HttpRequest.BodyPublishers.ofInputStream(() -> inputStream)); + // TODO: Consider getting the InputStream instead HttpResponse resp = httpClient.send(b.build(), HttpResponse.BodyHandlers.ofString(StandardCharsets.UTF_8)); int code = resp.statusCode(); if (code < 200 || 
code >= 300) { - String body = resp.body(); - String preview = body == null ? "" : body.substring(0, Math.min(body.length(), 512)); - throw new IOException("TikaServer " + url + " returned status " + code + " body: " + preview); + // TODO: Parse error message from response? + throw new SolrException( + SolrException.ErrorCode.getErrorCode(code), + "TikaServer " + url + " returned status " + code); } String body = resp.body(); return parseCombinedJson(body, md); From f29751447a4c1a1544c1976e5c3a995956ba6e03 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=C3=B8ydahl?= Date: Tue, 23 Sep 2025 21:58:59 +0200 Subject: [PATCH 15/47] Properly skip test if Docker not available --- ...xtractingRequestHandlerTikaServerTest.java | 27 ++++--------------- 1 file changed, 5 insertions(+), 22 deletions(-) diff --git a/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTikaServerTest.java b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTikaServerTest.java index 14cc89d0cc6..a73081ec042 100644 --- a/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTikaServerTest.java +++ b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTikaServerTest.java @@ -22,6 +22,7 @@ import org.apache.lucene.tests.util.QuickPatchThreadsFilter; import org.apache.solr.SolrIgnoredThreadsFilter; import org.junit.AfterClass; +import org.junit.Assume; import org.junit.BeforeClass; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -70,28 +71,10 @@ public static void beforeClassTika() throws Exception { ExtractingRequestHandlerTestAbstract.beforeClass(); } catch (Throwable t) { // Best-effort cleanup to avoid leaking resources if class initialization fails - try { - System.clearProperty("solr.test.tikaserver.url"); - System.clearProperty("solr.test.extraction.backend"); - } catch (Throwable ignored) { - } - try { - // Ensure any 
partially initialized core and clients are released - org.apache.solr.SolrTestCaseJ4.deleteCore(); - } catch (Throwable ignored) { - } - if (tika != null) { - try { - tika.stop(); - } catch (Throwable ignored) { - } - try { - tika.close(); - } catch (Throwable ignored) { - } - tika = null; - } - throw t; + System.clearProperty("solr.test.tikaserver.url"); + System.clearProperty("solr.test.extraction.backend"); + // Skip tests if Docker/Testcontainers are not available in the environment + Assume.assumeNoException("Docker/Testcontainers not available; skipping test", t); } } From b1840eebdbbae0548f7ac6a423bbcccd94837f2a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=C3=B8ydahl?= Date: Tue, 23 Sep 2025 22:25:02 +0200 Subject: [PATCH 16/47] Fix precommit --- .../solr/handler/extraction/ExtractingDocumentLoader.java | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java index 770076cd548..ef68427099e 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java @@ -173,14 +173,17 @@ public void load( backend.parseToSolrContentHandler(inputStream, extractionRequest, handler, neutral); } catch (UnsupportedOperationException uoe) { // For backends that don't support parseToSolrContentHandler - log.warn("skip extracting text due to {}.", uoe.getMessage()); + if (log.isWarnEnabled()) { + log.warn("skip extracting text since tika backend does not yet support this option"); + } throw new SolrException( SolrException.ErrorCode.BAD_REQUEST, "The requested operation is not supported by backend '" + backend.name() + "'."); } catch (Exception e) { if (ignoreTikaException) { - if (log.isWarnEnabled()) + if 
(log.isWarnEnabled()) { log.warn("skip extracting text due to {}.", e.getLocalizedMessage(), e); + } // Index a document with literals only (no extracted content/metadata) addDoc(handler); return; From 6ec9ddabd815ee82450c827f1a6548b9dbb539bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=C3=B8ydahl?= Date: Wed, 24 Sep 2025 00:21:51 +0200 Subject: [PATCH 17/47] Review feedback. ID -> NAME --- .../extraction/DummyExtractionBackend.java | 4 ++-- .../extraction/ExtractingRequestHandler.java | 2 +- .../extraction/ExtractionBackendFactory.java | 20 +++++++------------ .../LocalTikaExtractionBackend.java | 4 ++-- .../TikaServerExtractionBackend.java | 4 ++-- .../ExtractingRequestHandlerTestAbstract.java | 2 +- 6 files changed, 15 insertions(+), 21 deletions(-) diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/DummyExtractionBackend.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/DummyExtractionBackend.java index 745216eb31a..e85844ff46f 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/DummyExtractionBackend.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/DummyExtractionBackend.java @@ -20,11 +20,11 @@ /** Dummy backend that emits predictable test data without actually parsing input content. 
*/ public class DummyExtractionBackend implements ExtractionBackend { - public static final String ID = "dummy"; + public static final String NAME = "dummy"; @Override public String name() { - return ID; + return NAME; } @Override diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java index 224ee54f0ac..09e2dddb0e0 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java @@ -73,7 +73,7 @@ public void inform(SolrCore core) { String backendName = (String) initArgs.get(ExtractingParams.EXTRACTION_BACKEND); defaultBackendName = (backendName == null || backendName.trim().isEmpty()) - ? LocalTikaExtractionBackend.ID + ? LocalTikaExtractionBackend.NAME : backendName; } catch (Exception e) { diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackendFactory.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackendFactory.java index 38033d8b935..abe3ab726f0 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackendFactory.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackendFactory.java @@ -58,23 +58,17 @@ public ExtractionBackend getBackend(String name) { } private String normalize(String name) { - if (name == null || name.trim().isEmpty()) return LocalTikaExtractionBackend.ID; + if (name == null || name.trim().isEmpty()) return LocalTikaExtractionBackend.NAME; return name.trim().toLowerCase(Locale.ROOT); } /** Creates a new backend instance for the given normalized name. 
*/ protected ExtractionBackend create(String normalizedName) throws Exception { - switch (normalizedName) { - case LocalTikaExtractionBackend.ID: - return new LocalTikaExtractionBackend(core, tikaConfigLoc, parseContextConfig); - case DummyExtractionBackend.ID: - return new DummyExtractionBackend(); - case TikaServerExtractionBackend.ID: - return new TikaServerExtractionBackend( - tikaServerUrl != null ? tikaServerUrl : "http://localhost:9998"); - default: - // Fallback to local for unknown names - return new LocalTikaExtractionBackend(core, tikaConfigLoc, parseContextConfig); - } + return switch (normalizedName) { + case DummyExtractionBackend.NAME -> new DummyExtractionBackend(); + case TikaServerExtractionBackend.NAME -> new TikaServerExtractionBackend( + tikaServerUrl != null ? tikaServerUrl : "http://localhost:9998"); + default -> new LocalTikaExtractionBackend(core, tikaConfigLoc, parseContextConfig); + }; } } diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/LocalTikaExtractionBackend.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/LocalTikaExtractionBackend.java index d39011cf5a2..28470ff7024 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/LocalTikaExtractionBackend.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/LocalTikaExtractionBackend.java @@ -94,11 +94,11 @@ public LocalTikaExtractionBackend( this.autoDetectParser = new AutoDetectParser(cfg); } - public static final String ID = "local"; + public static final String NAME = "local"; @Override public String name() { - return ID; + return NAME; } private Parser selectParser(ExtractionRequest request) { diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/TikaServerExtractionBackend.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/TikaServerExtractionBackend.java index 3a39caf57e7..196d9397cf4 100644 --- 
a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/TikaServerExtractionBackend.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/TikaServerExtractionBackend.java @@ -54,11 +54,11 @@ public TikaServerExtractionBackend(String baseUrl) { this.httpClient = httpClient; } - public static final String ID = "tikaserver"; + public static final String NAME = "tikaserver"; @Override public String name() { - return ID; + return NAME; } @Override diff --git a/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTestAbstract.java b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTestAbstract.java index b9ed368e1ed..349dae50ce6 100644 --- a/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTestAbstract.java +++ b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTestAbstract.java @@ -1068,7 +1068,7 @@ public void testDummyBackendExtractOnly() throws Exception { loadLocal( "extraction/version_control.txt", ExtractingParams.EXTRACTION_BACKEND, - DummyExtractionBackend.ID, + DummyExtractionBackend.NAME, ExtractingParams.EXTRACT_ONLY, "true", ExtractingParams.EXTRACT_FORMAT, From 902355d7cc0300f82228cb7588e007ff2fe343d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=C3=B8ydahl?= Date: Wed, 24 Sep 2025 00:29:47 +0200 Subject: [PATCH 18/47] Review feedback. 
Simplify metadata add code --- .../handler/extraction/ExtractionBackend.java | 19 +++++++------------ .../extraction/ExtractionMetadata.java | 6 ++++++ 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackend.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackend.java index fd5c5409113..9647d0f843b 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackend.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackend.java @@ -51,18 +51,13 @@ void parseToSolrContentHandler( /** Build ExtractionMetadata from the request context */ default ExtractionMetadata buildMetadataFromRequest(ExtractionRequest request) { ExtractionMetadata md = new ExtractionMetadata(); - if (request.resourceName != null) - md.add(TikaMetadataKeys.RESOURCE_NAME_KEY, request.resourceName); - if (request.contentType != null) md.add(HttpHeaders.CONTENT_TYPE, request.contentType); - if (request.streamName != null) - md.add(ExtractingMetadataConstants.STREAM_NAME, request.streamName); - if (request.streamSourceInfo != null) - md.add(ExtractingMetadataConstants.STREAM_SOURCE_INFO, request.streamSourceInfo); - if (request.streamSize != null) - md.add(ExtractingMetadataConstants.STREAM_SIZE, String.valueOf(request.streamSize)); - if (request.contentType != null) - md.add(ExtractingMetadataConstants.STREAM_CONTENT_TYPE, request.contentType); - if (request.charset != null) md.add(HttpHeaders.CONTENT_ENCODING, request.charset); + md.addIfNotNull(TikaMetadataKeys.RESOURCE_NAME_KEY, request.resourceName); + md.addIfNotNull(HttpHeaders.CONTENT_TYPE, request.contentType); + md.addIfNotNull(ExtractingMetadataConstants.STREAM_NAME, request.streamName); + md.addIfNotNull(ExtractingMetadataConstants.STREAM_SOURCE_INFO, request.streamSourceInfo); + md.addIfNotNull(ExtractingMetadataConstants.STREAM_SIZE, 
String.valueOf(request.streamSize)); + md.addIfNotNull(ExtractingMetadataConstants.STREAM_CONTENT_TYPE, request.contentType); + md.addIfNotNull(HttpHeaders.CONTENT_ENCODING, request.charset); return md; } diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionMetadata.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionMetadata.java index 6229089d502..c400bc90fb1 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionMetadata.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionMetadata.java @@ -31,6 +31,12 @@ public void add(String name, String value) { map.computeIfAbsent(name, k -> new ArrayList<>()).add(value); } + public void addIfNotNull(String resourceNameKey, String resourceName) { + if (resourceName != null) { + add(resourceNameKey, resourceName); + } + } + public String[] getValues(String name) { List vals = map.get(name); if (vals == null) return new String[0]; From 1cfcce9f7474d7e69bdd2ac4ea364893140e1c6a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=C3=B8ydahl?= Date: Wed, 24 Sep 2025 00:37:10 +0200 Subject: [PATCH 19/47] Error handling for factory --- .../handler/extraction/ExtractionBackendFactory.java | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackendFactory.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackendFactory.java index abe3ab726f0..7ee0c163152 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackendFactory.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackendFactory.java @@ -19,6 +19,7 @@ import java.util.Locale; import java.util.Map; import java.util.concurrent.ConcurrentHashMap; +import org.apache.solr.common.SolrException; import org.apache.solr.core.SolrCore; 
/** @@ -52,7 +53,10 @@ public ExtractionBackend getBackend(String name) { try { return create(k); } catch (Exception e) { - throw new RuntimeException("Failed to create extraction backend '" + k + "'", e); + throw new SolrException( + SolrException.ErrorCode.SERVER_ERROR, + "Failed to create extraction backend '" + k + "'", + e); } }); } @@ -68,7 +72,10 @@ protected ExtractionBackend create(String normalizedName) throws Exception { case DummyExtractionBackend.NAME -> new DummyExtractionBackend(); case TikaServerExtractionBackend.NAME -> new TikaServerExtractionBackend( tikaServerUrl != null ? tikaServerUrl : "http://localhost:9998"); - default -> new LocalTikaExtractionBackend(core, tikaConfigLoc, parseContextConfig); + case LocalTikaExtractionBackend.NAME -> new LocalTikaExtractionBackend( + core, tikaConfigLoc, parseContextConfig); + default -> throw new SolrException( + SolrException.ErrorCode.BAD_REQUEST, "Unknown extraction backend: " + normalizedName); }; } } From b769c06c35bfa679953867a957aa43aca6c466b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=C3=B8ydahl?= Date: Wed, 24 Sep 2025 01:18:32 +0200 Subject: [PATCH 20/47] More documentation --- .../pages/indexing-with-tika.adoc | 52 ++++++++++++++++--- 1 file changed, 46 insertions(+), 6 deletions(-) diff --git a/solr/solr-ref-guide/modules/indexing-guide/pages/indexing-with-tika.adoc b/solr/solr-ref-guide/modules/indexing-guide/pages/indexing-with-tika.adoc index 183af23e30b..c7f67b9968b 100644 --- a/solr/solr-ref-guide/modules/indexing-guide/pages/indexing-with-tika.adoc +++ b/solr/solr-ref-guide/modules/indexing-guide/pages/indexing-with-tika.adoc @@ -51,7 +51,41 @@ The next step after any update handler is the xref:configuration-guide:update-re == Tika Server -TODO: Add documentation about Tika Server backend. +The `tikaserver` backend lets Solr delegate content extraction to an external Apache Tika Server process instead of running Tika parsers inside the Solr JVM. 
This can improve operational isolation (crashes or heavy parsing won’t impact Solr), simplify dependency management, and allow you to scale Tika independently of Solr. + +Example handler configuration: + +[source,xml] +---- + + + + tikaserver + + + http://localhost:9998 + +---- + +=== Starting Tika Server with Docker + +The quickest way to run Tika Server for development is using Docker. The examples below expose Tika on port 9998 on localhost, matching the default value when `tikaserver.url` is not explicitly set. + +[,bash] +---- +docker run --rm -p 9998:9998 apache/tika:3.2.3.0-full +---- + +NOTE: If Solr runs in Docker too, ensure both containers share a network and use the Tika container name as the host in `tikaserver.url`. + +=== Limitations +Currently, the `tikaserver` option lacks some features and will return HTTP 400 in these cases: + +- `capture` and `captureAttr`: Selecting specific XHTML elements/attributes during indexing requires Solr’s SAX ContentHandler and is not supported by the `tikaserver` backend. +- `xpath`: Server-side XPath filtering of the XHTML is not supported. +- `passwordsFile` and `resource.password` for the indexing path: these options trigger the legacy SAX path in Solr and are not currently supported. + +Metadata produced by Tika Server can differ slightly from local Tika, particularly in key names and the presence/absence of certain fields. Adjust your `fmap.*` mappings accordingly. == Module @@ -61,7 +95,7 @@ The "techproducts" example included with Solr is pre-configured to have Solr Cel If you are not using the example, you will want to pay attention to the section <> below. -=== Solr Cell Performance Implications +=== Solr Cell Performance Implications (local mode) Rich document formats are frequently not well documented, and even in cases where there is documentation for the format, not everyone who creates documents will follow the specifications faithfully. 
@@ -76,7 +110,8 @@ the request handler is running in the same JVM that Solr uses for other operatio Indexing can also consume all available Solr resources, particularly with large PDFs, presentations, or other files that have a lot of rich media embedded in them. -For these reasons, Solr Cell is not recommended for use in a production system. +For these reasons, Solr Cell with `local` backend is not recommended for use in a production system. Prefer the +`tikaserver` backend, which is more robust and isolates failures from Solr itself. It is a best practice to use Solr Cell as a proof-of-concept tool during development and then run Tika as an external process that sends the extracted documents to Solr (via xref:deployment-guide:solrj.adoc[]) for indexing. @@ -181,7 +216,7 @@ These parameters can be set for each indexing request (as request parameters), o |=== + Choose the backend to use for extraction. The options are `local` or `tikaserver`. -The `local` backend uses Tika libraries included with Solr to do the extraction, and is the default in Solr 9. +The `local` backend uses Tika libraries included with Solr to do the extraction, and is the default in Solr 9.x. The `tikaserver` backend uses an external Tika server process to do the extraction. **The `local` backend is deprecated and will be removed in a future release.** + @@ -195,9 +230,9 @@ Example: In `solrconfig.xml`: `tikaserver`. |=== + Specifies the URL of the Tika server to use when the `extraction.backend` parameter is set to `tikaserver`. -This parameter is required when using the `tikaserver` backend. +This parameter is optional when using the `tikaserver` backend; it defaults to `http://localhost:9998` if not specified. + -Example: In `solrconfig.xml`: `http://my.tika.server`. +Example: In `solrconfig.xml`: `http://localhost:9998`. `capture`:: + @@ -500,6 +535,8 @@ So you can use the other URPs without worrying about unexpected field additions.
=== Parser-Specific Properties +NOTE: This setting applies to `local` backend only. + Parsers used by Tika may have specific properties to govern how data is extracted. These can be passed through Solr for special parsing situations. @@ -521,6 +558,8 @@ Consult the Tika Java API documentation for configuration parameters that can be === Indexing Encrypted Documents +NOTE: The `tikaserver` backend does not currently support indexing encrypted documents. + The ExtractingRequestHandler will decrypt encrypted files and index their content if you supply a password in either `resource.password` in the request, or in a `passwordsFile` file. In the case of `passwordsFile`, the file supplied must be formatted so there is one line per rule. @@ -658,6 +697,7 @@ public class SolrCellRequestDemo { req.setParam(ExtractingParams.EXTRACT_ONLY, "true"); NamedList result = client.request(req); System.out.println("Result: " + result); + } } ---- From 83296a9822b15184d7adb6067e3b4a9e85f6284c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=C3=B8ydahl?= Date: Thu, 25 Sep 2025 16:30:04 +0200 Subject: [PATCH 21/47] * Refactor some logic back to ExtractingDocumentLoader * Add back-compat option for metadata * Fix true SAX streaming parser for Tika XML response * Simplify ExtractionBackend interface --- .../extraction/DummyExtractionBackend.java | 32 +-- .../extraction/ExtractingDocumentLoader.java | 108 +++++++-- .../handler/extraction/ExtractingParams.java | 3 + .../handler/extraction/ExtractionBackend.java | 20 +- .../extraction/ExtractionMetadata.java | 13 ++ .../LocalTikaExtractionBackend.java | 67 +----- .../TikaServerExtractionBackend.java | 213 ++++-------------- .../extraction/TikaServerXmlParser.java | 126 +++++++++++ .../extraction/XmlSanitizingReader.java | 168 ++++++++++++++ .../TikaServerExtractionBackendTest.java | 29 +-- 10 files changed, 480 insertions(+), 299 deletions(-) create mode 100644 
solr/modules/extraction/src/java/org/apache/solr/handler/extraction/TikaServerXmlParser.java create mode 100644 solr/modules/extraction/src/java/org/apache/solr/handler/extraction/XmlSanitizingReader.java diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/DummyExtractionBackend.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/DummyExtractionBackend.java index e85844ff46f..9bdad267147 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/DummyExtractionBackend.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/DummyExtractionBackend.java @@ -17,10 +17,12 @@ package org.apache.solr.handler.extraction; import java.io.InputStream; +import org.xml.sax.ContentHandler; /** Dummy backend that emits predictable test data without actually parsing input content. */ public class DummyExtractionBackend implements ExtractionBackend { public static final String NAME = "dummy"; + private final String text = "This is dummy extracted content"; @Override public String name() { @@ -37,32 +39,20 @@ public ExtractionResult extract(InputStream inputStream, ExtractionRequest reque if (request.resourceName != null) { metadata.add("resourcename", request.resourceName); } - String text = "This is dummy extracted content"; return new ExtractionResult(text, metadata); } @Override - public ExtractionResult extractOnly( - InputStream inputStream, ExtractionRequest request, String xpathExpr) { - if (xpathExpr != null) { - throw new UnsupportedOperationException("XPath not supported by dummy backend"); - } - return extract(inputStream, request); - } - - @Override - public void parseToSolrContentHandler( + public void extractWithSaxHandler( InputStream inputStream, ExtractionRequest request, - SolrContentHandler handler, - ExtractionMetadata outMetadata) { - // Fill metadata - ExtractionResult r = extract(inputStream, request); - for (String name : r.getMetadata().names()) { - String[] 
vals = r.getMetadata().getValues(name); - if (vals != null) for (String v : vals) outMetadata.add(name, v); - } - // Append content - handler.appendToContent(r.getContent()); + ExtractionMetadata md, + ContentHandler saxContentHandler) + throws Exception { + + ExtractionResult res = extract(inputStream, request); + md.putAll(res.getMetadata().asMap()); + // Append the content to the SAX handler + saxContentHandler.characters(res.getContent().toCharArray(), 0, res.getContent().length()); } } diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java index ef68427099e..d968583ee0b 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java @@ -51,6 +51,7 @@ public class ExtractingDocumentLoader extends ContentStreamLoader { final SolrParams params; final UpdateRequestProcessor processor; final boolean ignoreTikaException; + final boolean backCompat; private final AddUpdateCommand templateAdd; @@ -65,10 +66,12 @@ public ExtractingDocumentLoader( this.params = req.getParams(); this.core = req.getCore(); this.processor = processor; + this.backCompat = params.getBool(ExtractingParams.BACK_COMPATIBILITY, true); templateAdd = new AddUpdateCommand(req); templateAdd.overwrite = params.getBool(UpdateParams.OVERWRITE, true); templateAdd.commitWithin = params.getInt(UpdateParams.COMMIT_WITHIN, -1); + templateAdd.overwrite = params.getBool(UpdateParams.OVERWRITE, true); this.factory = factory; this.backend = backend; @@ -139,13 +142,48 @@ public void load( if (extractOnly) { try { - ExtractionResult result = backend.extractOnly(inputStream, extractionRequest, xpathExpr); + ExtractionMetadata md = backend.buildMetadataFromRequest(extractionRequest); + String content; + if 
(ExtractingDocumentLoader.TEXT_FORMAT.equals(extractionRequest.extractFormat) + || xpathExpr != null) { + org.apache.tika.sax.ToTextContentHandler textHandler = + new org.apache.tika.sax.ToTextContentHandler(); + org.xml.sax.ContentHandler ch = textHandler; + if (xpathExpr != null) { + org.apache.tika.sax.xpath.XPathParser xparser = + new org.apache.tika.sax.xpath.XPathParser( + "xhtml", org.apache.tika.sax.XHTMLContentHandler.XHTML); + org.apache.tika.sax.xpath.Matcher matcher = xparser.parse(xpathExpr); + ch = new org.apache.tika.sax.xpath.MatchingContentHandler(textHandler, matcher); + } + backend.extractWithSaxHandler(inputStream, extractionRequest, md, ch); + content = textHandler.toString(); + } else { // XML format + org.apache.tika.sax.ToXMLContentHandler toXml = + new org.apache.tika.sax.ToXMLContentHandler(); + org.xml.sax.ContentHandler ch = toXml; + if (xpathExpr != null) { + org.apache.tika.sax.xpath.XPathParser xparser = + new org.apache.tika.sax.xpath.XPathParser( + "xhtml", org.apache.tika.sax.XHTMLContentHandler.XHTML); + org.apache.tika.sax.xpath.Matcher matcher = xparser.parse(xpathExpr); + ch = new org.apache.tika.sax.xpath.MatchingContentHandler(toXml, matcher); + } + backend.extractWithSaxHandler(inputStream, extractionRequest, md, ch); + content = toXml.toString(); + if (!content.startsWith("\n" + content; + } + } + + appendBackCompatTikaMetadata(md); + // Write content - rsp.add(stream.getName(), result.getContent()); + rsp.add(stream.getName(), content); // Write metadata NamedList metadataNL = new NamedList<>(); - for (String name : result.getMetadata().names()) { - metadataNL.add(name, result.getMetadata().getValues(name)); + for (String name : md.names()) { + metadataNL.add(name, md.getValues(name)); } rsp.add(stream.getName() + "_metadata", metadataNL); } catch (UnsupportedOperationException uoe) { @@ -166,11 +204,11 @@ public void load( if (needLegacySax) { // Indexing with capture/xpath/etc: delegate SAX parse to backend - 
ExtractionMetadata neutral = new ExtractionMetadata(); + ExtractionMetadata metadata = backend.buildMetadataFromRequest(extractionRequest); SolrContentHandler handler = - factory.createSolrContentHandler(neutral, params, req.getSchema()); + factory.createSolrContentHandler(metadata, params, req.getSchema()); try { - backend.parseToSolrContentHandler(inputStream, extractionRequest, handler, neutral); + backend.extractWithSaxHandler(inputStream, extractionRequest, metadata, handler); } catch (UnsupportedOperationException uoe) { // For backends that don't support parseToSolrContentHandler if (log.isWarnEnabled()) { @@ -183,13 +221,13 @@ public void load( if (ignoreTikaException) { if (log.isWarnEnabled()) { log.warn("skip extracting text due to {}.", e.getLocalizedMessage(), e); + return; } - // Index a document with literals only (no extracted content/metadata) - addDoc(handler); - return; } throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e); } + appendBackCompatTikaMetadata(handler.metadata); + addDoc(handler); return; } @@ -202,16 +240,15 @@ public void load( if (ignoreTikaException) { if (log.isWarnEnabled()) log.warn("skip extracting text due to {}.", e.getLocalizedMessage(), e); - // Index a document with literals only (no extracted content/metadata) - SolrContentHandler handler = - factory.createSolrContentHandler(new ExtractionMetadata(), params, req.getSchema()); - addDoc(handler); return; } throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e); } ExtractionMetadata metadata = result.getMetadata(); + + appendBackCompatTikaMetadata(metadata); + String content = result.getContent(); SolrContentHandler handler = @@ -220,4 +257,47 @@ public void load( addDoc(handler); } } + + private void appendBackCompatTikaMetadata(ExtractionMetadata md) { + if (!backCompat) { + return; + } + + if (md.get("dc:title") != null) { + md.addValues("title", md.getValues("dc:title")); + } + if (md.get("dc:creator") != null) { + md.addValues("author", 
md.getValues("dc:creator")); + } + if (md.get("dc:description") != null) { + md.addValues("description", md.getValues("dc:description")); + } + if (md.get("dc:subject") != null) { + md.addValues("subject", md.getValues("dc:subject")); + } + if (md.get("dc:language") != null) { + md.addValues("language", md.getValues("dc:language")); + } + if (md.get("dc:publisher") != null) { + md.addValues("publisher", md.getValues("dc:publisher")); + } + if (md.get("dcterms:created") != null) { + md.addValues("created", md.getValues("dcterms:created")); + } + if (md.get("dcterms:modified") != null) { + md.addValues("modified", md.getValues("dcterms:modified")); + } + if (md.get("meta:author") != null) { + md.addValues("Author", md.getValues("meta:author")); + } + if (md.get("meta:creation-date") != null) { + md.addValues("Creation-Date", md.getValues("meta:creation-date")); + } + if (md.get("meta:save-date") != null) { + md.addValues("Last-Save-Date", md.getValues("meta:save-date")); + } + if (md.get("meta:keyword") != null) { + md.addValues("Keywords", md.getValues("meta:keyword")); + } + } } diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingParams.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingParams.java index 840af280243..40bca51256d 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingParams.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingParams.java @@ -139,4 +139,7 @@ public interface ExtractingParams { /** Backend selection parameter and */ public static final String EXTRACTION_BACKEND = "extraction.backend"; + + /** Fix metadata to match Tika 1.x */ + public static final String BACK_COMPATIBILITY = "backCompatibility"; } diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackend.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackend.java 
index 9647d0f843b..6c8962f7d60 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackend.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionBackend.java @@ -19,6 +19,7 @@ import java.io.InputStream; import org.apache.tika.metadata.HttpHeaders; import org.apache.tika.metadata.TikaMetadataKeys; +import org.xml.sax.ContentHandler; /** Strategy interface for content extraction backends. */ public interface ExtractionBackend { @@ -29,23 +30,14 @@ public interface ExtractionBackend { ExtractionResult extract(InputStream inputStream, ExtractionRequest request) throws Exception; /** - * Perform extractOnly operation. If extractFormat equals ExtractingDocumentLoader.TEXT_FORMAT, - * return plain text. If XML, return XML body as string. Implementations may support optional - * xpathExpr; if unsupported and xpathExpr is not null, they should throw - * UnsupportedOperationException. + * Perform extraction of text from input stream with SAX handler. Sax handler can be + * SolrContentHandler, ToTextContentHandler, ToXMLContentHandler, MatchingContentHandler etc */ - ExtractionResult extractOnly(InputStream inputStream, ExtractionRequest request, String xpathExpr) - throws Exception; - - /** - * Parse the content and stream SAX events into the provided SolrContentHandler, while also - * filling outMetadata with extracted metadata. 
- */ - void parseToSolrContentHandler( + void extractWithSaxHandler( InputStream inputStream, ExtractionRequest request, - SolrContentHandler handler, - ExtractionMetadata outMetadata) + ExtractionMetadata md, + ContentHandler saxContentHandler) throws Exception; /** Build ExtractionMetadata from the request context */ diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionMetadata.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionMetadata.java index c400bc90fb1..67592432fa0 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionMetadata.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionMetadata.java @@ -31,12 +31,21 @@ public void add(String name, String value) { map.computeIfAbsent(name, k -> new ArrayList<>()).add(value); } + public void addValues(String name, String[] values) { + if (name == null || values == null || values.length == 0) return; + map.computeIfAbsent(name, k -> new ArrayList<>()).addAll(List.of(values)); + } + public void addIfNotNull(String resourceNameKey, String resourceName) { if (resourceName != null) { add(resourceNameKey, resourceName); } } + public void putAll(Map> map) { + this.map.putAll(map); + } + public String[] getValues(String name) { List vals = map.get(name); if (vals == null) return new String[0]; @@ -57,6 +66,10 @@ public void remove(String name) { map.remove(name); } + public Map> asMap() { + return map; + } + @Override public String toString() { StringBuilder sb = new StringBuilder("ExtractionMetadata{"); diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/LocalTikaExtractionBackend.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/LocalTikaExtractionBackend.java index 28470ff7024..9f762bfa375 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/LocalTikaExtractionBackend.java +++ 
b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/LocalTikaExtractionBackend.java @@ -30,6 +30,7 @@ import org.apache.tika.parser.PasswordProvider; import org.apache.tika.parser.html.HtmlMapper; import org.apache.tika.sax.BodyContentHandler; +import org.xml.sax.ContentHandler; /** * Extraction backend using local in-process Apache Tika. This encapsulates the previous direct @@ -134,7 +135,7 @@ private ParseContext buildContext(Parser parser, ExtractionRequest request) { return context; } - private static ExtractionMetadata copyToNeutral(Metadata md) { + private static ExtractionMetadata tikaMetadataToExtractionMetadata(Metadata md) { ExtractionMetadata out = new ExtractionMetadata(); for (String name : md.names()) { String[] vals = md.getValues(name); @@ -150,74 +151,30 @@ public ExtractionResult extract(InputStream inputStream, ExtractionRequest reque if (parser == null) { throw new IllegalArgumentException("No Tika parser for stream type: " + request.streamType); } - Metadata md = buildMetadata(request); ParseContext context = buildContext(parser, request); + Metadata md = buildMetadata(request); BodyContentHandler textHandler = new BodyContentHandler(-1); parser.parse(inputStream, textHandler, md, context); - return new ExtractionResult(textHandler.toString(), copyToNeutral(md)); - } - - @Override - public ExtractionResult extractOnly( - InputStream inputStream, ExtractionRequest request, String xpathExpr) throws Exception { - Parser parser = selectParser(request); - if (parser == null) { - throw new IllegalArgumentException("No Tika parser for stream type: " + request.streamType); - } - Metadata md = buildMetadata(request); - ParseContext context = buildContext(parser, request); - - String content; - if (ExtractingDocumentLoader.TEXT_FORMAT.equals(request.extractFormat) || xpathExpr != null) { - org.apache.tika.sax.ToTextContentHandler textHandler = - new org.apache.tika.sax.ToTextContentHandler(); - org.xml.sax.ContentHandler ch = 
textHandler; - if (xpathExpr != null) { - org.apache.tika.sax.xpath.XPathParser xparser = - new org.apache.tika.sax.xpath.XPathParser( - "xhtml", org.apache.tika.sax.XHTMLContentHandler.XHTML); - org.apache.tika.sax.xpath.Matcher matcher = xparser.parse(xpathExpr); - ch = new org.apache.tika.sax.xpath.MatchingContentHandler(textHandler, matcher); - } - parser.parse(inputStream, ch, md, context); - content = textHandler.toString(); - } else { // XML format - org.apache.tika.sax.ToXMLContentHandler toXml = new org.apache.tika.sax.ToXMLContentHandler(); - org.xml.sax.ContentHandler ch = toXml; - if (xpathExpr != null) { - org.apache.tika.sax.xpath.XPathParser xparser = - new org.apache.tika.sax.xpath.XPathParser( - "xhtml", org.apache.tika.sax.XHTMLContentHandler.XHTML); - org.apache.tika.sax.xpath.Matcher matcher = xparser.parse(xpathExpr); - ch = new org.apache.tika.sax.xpath.MatchingContentHandler(toXml, matcher); - } - parser.parse(inputStream, ch, md, context); - content = toXml.toString(); - if (!content.startsWith("\n" + content; - } - } - return new ExtractionResult(content, copyToNeutral(md)); + return new ExtractionResult(textHandler.toString(), tikaMetadataToExtractionMetadata(md)); } @Override - public void parseToSolrContentHandler( + public void extractWithSaxHandler( InputStream inputStream, ExtractionRequest request, - SolrContentHandler handler, - ExtractionMetadata outMetadata) + ExtractionMetadata md, + ContentHandler saxContentHandler) throws Exception { Parser parser = selectParser(request); if (parser == null) { throw new IllegalArgumentException("No Tika parser for stream type: " + request.streamType); } - Metadata md = buildMetadata(request); ParseContext context = buildContext(parser, request); - parser.parse(inputStream, handler, md, context); - // populate outMetadata - for (String name : md.names()) { - String[] vals = md.getValues(name); - if (vals != null) for (String v : vals) outMetadata.add(name, v); + Metadata tikaMetadata = 
buildMetadata(request); + parser.parse(inputStream, saxContentHandler, tikaMetadata, context); + for (String name : tikaMetadata.names()) { + String[] vals = tikaMetadata.getValues(name); + if (vals != null) for (String v : vals) md.add(name, v); } } } diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/TikaServerExtractionBackend.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/TikaServerExtractionBackend.java index 196d9397cf4..401e4268969 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/TikaServerExtractionBackend.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/TikaServerExtractionBackend.java @@ -16,17 +16,16 @@ */ package org.apache.solr.handler.extraction; +import java.io.IOException; import java.io.InputStream; import java.net.URI; import java.net.http.HttpClient; import java.net.http.HttpRequest; import java.net.http.HttpResponse; -import java.nio.charset.StandardCharsets; import java.time.Duration; -import java.util.Arrays; -import java.util.Set; import org.apache.solr.common.SolrException; -import org.noggit.JSONParser; +import org.apache.tika.sax.BodyContentHandler; +import org.xml.sax.ContentHandler; /** * Extraction backend that delegates parsing to a remote Apache Tika Server. 
@@ -39,6 +38,7 @@ public class TikaServerExtractionBackend implements ExtractionBackend { private final HttpClient httpClient; private final String baseUrl; // e.g., http://localhost:9998 private final Duration timeout = Duration.ofSeconds(30); + private final TikaServerXmlParser tikaServerXmlParser = new TikaServerXmlParser(); public TikaServerExtractionBackend(String baseUrl) { this(HttpClient.newBuilder().connectTimeout(Duration.ofSeconds(10)).build(), baseUrl); @@ -64,14 +64,40 @@ public String name() { @Override public ExtractionResult extract(InputStream inputStream, ExtractionRequest request) throws Exception { - String url = - baseUrl - + "/tika/" - + (Set.of("html", "xml").contains(request.extractFormat) ? "html" : "text"); + try (InputStream tikaResponse = callTikaServer(inputStream, request)) { + ExtractionMetadata md = buildMetadataFromRequest(request); + BodyContentHandler textHandler = new BodyContentHandler(-1); + tikaServerXmlParser.parse(tikaResponse, textHandler, md); + return new ExtractionResult(textHandler.toString(), md); + } + } + + @Override + public void extractWithSaxHandler( + InputStream inputStream, + ExtractionRequest request, + ExtractionMetadata md, + ContentHandler saxContentHandler) + throws Exception { + try (InputStream tikaResponse = callTikaServer(inputStream, request)) { + tikaServerXmlParser.parse(tikaResponse, saxContentHandler, md); + } + } + + private static String firstNonNull(String a, String b) { + return a != null ? a : b; + } + + /** + * Call the Tika Server /tika endpoint to extract text and metadata. 
+ * + * @return InputStream of the response body, which is XML format + */ + private InputStream callTikaServer(InputStream inputStream, ExtractionRequest request) + throws IOException, InterruptedException { + String url = baseUrl + "/tika"; HttpRequest.Builder b = - HttpRequest.newBuilder(URI.create(url)) - .timeout(timeout) - .header("Accept", "application/json"); + HttpRequest.newBuilder(URI.create(url)).timeout(timeout).header("Accept", "text/xml"); String contentType = firstNonNull(request.streamType, request.contentType); if (contentType != null) { b.header("Content-Type", contentType); @@ -96,173 +122,14 @@ public ExtractionResult extract(InputStream inputStream, ExtractionRequest reque } b.PUT(HttpRequest.BodyPublishers.ofInputStream(() -> inputStream)); - // TODO: Consider getting the InputStream instead - HttpResponse resp = - httpClient.send(b.build(), HttpResponse.BodyHandlers.ofString(StandardCharsets.UTF_8)); + HttpResponse resp = + httpClient.send(b.build(), HttpResponse.BodyHandlers.ofInputStream()); int code = resp.statusCode(); if (code < 200 || code >= 300) { - // TODO: Parse error message from response? 
throw new SolrException( SolrException.ErrorCode.getErrorCode(code), "TikaServer " + url + " returned status " + code); } - String body = resp.body(); - return parseCombinedJson(body, md); - } - - @Override - public ExtractionResult extractOnly( - InputStream inputStream, ExtractionRequest request, String xpathExpr) throws Exception { - if (xpathExpr != null) { - throw new UnsupportedOperationException( - "XPath filtering is not supported by TikaServer backend"); - } - return extract(inputStream, request); - } - - @Override - public void parseToSolrContentHandler( - InputStream inputStream, - ExtractionRequest request, - SolrContentHandler handler, - ExtractionMetadata outMetadata) { - throw new UnsupportedOperationException( - "Legacy SAX-based parsing is not supported by TikaServer backend"); - } - - private static String firstNonNull(String a, String b) { - return a != null ? a : b; - } - - // Reads key-values of the current object into md. Assumes the parser is positioned - // right after OBJECT_START of that object. 
- private static ExtractionMetadata parseMetadataObject(JSONParser p) throws java.io.IOException { - ExtractionMetadata md = new ExtractionMetadata(); - String currentKey; - while (true) { - int ev = p.nextEvent(); - if (ev == JSONParser.OBJECT_END || ev == JSONParser.EOF) { - break; - } - if (ev == JSONParser.STRING && p.wasKey()) { - currentKey = p.getString(); - ev = p.nextEvent(); - if (ev == JSONParser.STRING) { - md.add(currentKey, p.getString()); - } else if (ev == JSONParser.ARRAY_START) { - while (true) { - ev = p.nextEvent(); - if (ev == JSONParser.ARRAY_END) break; - if (ev == JSONParser.STRING) { - md.add(currentKey, p.getString()); - } else if (ev == JSONParser.LONG - || ev == JSONParser.NUMBER - || ev == JSONParser.BIGNUMBER) { - md.add(currentKey, p.getNumberChars().toString()); - } else if (ev == JSONParser.BOOLEAN) { - md.add(currentKey, String.valueOf(p.getBoolean())); - } else if (ev == JSONParser.NULL) { - // ignore nulls - } else { - // skip nested objects or unsupported types within arrays - } - } - } else if (ev == JSONParser.LONG || ev == JSONParser.NUMBER || ev == JSONParser.BIGNUMBER) { - md.add(currentKey, p.getNumberChars().toString()); - } else if (ev == JSONParser.BOOLEAN) { - md.add(currentKey, String.valueOf(p.getBoolean())); - } else if (ev == JSONParser.NULL) { - // ignore nulls - } else if (ev == JSONParser.OBJECT_START) { - // Unexpected nested object; skip it entirely - skipObject(p); - } else { - // skip unsupported value types - } - } - } - return md; - } - - private static void skipObject(JSONParser p) throws java.io.IOException { - int depth = 1; - while (depth > 0) { - int ev = p.nextEvent(); - if (ev == JSONParser.OBJECT_START) depth++; - else if (ev == JSONParser.OBJECT_END) depth--; - else if (ev == JSONParser.EOF) break; - } - } - - // Parses combined JSON from /tika/text with Accept: application/json and returns both content - // and metadata. Supports two shapes: - // 1) {"content": "...", "metadata": { ... 
}} - // 2) {"content": "...", } - private ExtractionResult parseCombinedJson(String json, ExtractionMetadata md) { - String content = ""; - if (json == null) return new ExtractionResult(content, md); - try { - JSONParser p = new JSONParser(json); - int ev = p.nextEvent(); - if (ev != JSONParser.OBJECT_START) { - return new ExtractionResult(content, md); - } - while (true) { - ev = p.nextEvent(); - if (ev == JSONParser.OBJECT_END || ev == JSONParser.EOF) break; - if (ev == JSONParser.STRING && p.wasKey()) { - String key = p.getString(); - ev = p.nextEvent(); - if ("X-TIKA:content".equals(key)) { - if (ev == JSONParser.STRING) { - content = p.getString(); - } else { - // Skip non-string content - if (ev == JSONParser.OBJECT_START) skipObject(p); - } - } else if ("metadata".equals(key)) { - if (ev == JSONParser.OBJECT_START) { - md = parseMetadataObject(p); - } else { - // unexpected shape; skip - if (ev == JSONParser.OBJECT_START) skipObject(p); - } - } else { - // Treat as flat metadata field - if (ev == JSONParser.STRING) { - md.add(key, p.getString()); - } else if (ev == JSONParser.ARRAY_START) { - while (true) { - ev = p.nextEvent(); - if (ev == JSONParser.ARRAY_END) break; - if (ev == JSONParser.STRING) md.add(key, p.getString()); - else if (ev == JSONParser.LONG - || ev == JSONParser.NUMBER - || ev == JSONParser.BIGNUMBER) md.add(key, p.getNumberChars().toString()); - else if (ev == JSONParser.BOOLEAN) md.add(key, String.valueOf(p.getBoolean())); - else if (ev == JSONParser.NULL) { - // ignore - } - } - } else if (ev == JSONParser.LONG - || ev == JSONParser.NUMBER - || ev == JSONParser.BIGNUMBER) { - md.add(key, p.getNumberChars().toString()); - } else if (ev == JSONParser.BOOLEAN) { - md.add(key, String.valueOf(p.getBoolean())); - } else if (ev == JSONParser.NULL) { - // ignore - } else if (ev == JSONParser.OBJECT_START) { - // skip nested object for unknown key - skipObject(p); - } - } - } - } - } catch (java.io.IOException ioe) { - // ignore, return what we 
have - } - Arrays.stream(md.names()).filter(k -> k.startsWith("X-TIKA:Parsed-")).forEach(md::remove); - return new ExtractionResult(content, md); + return resp.body(); } } diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/TikaServerXmlParser.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/TikaServerXmlParser.java new file mode 100644 index 00000000000..591fb3a1475 --- /dev/null +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/TikaServerXmlParser.java @@ -0,0 +1,126 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.solr.handler.extraction; + +import java.io.IOException; +import java.io.InputStream; +import javax.xml.parsers.SAXParser; +import javax.xml.parsers.SAXParserFactory; +import org.apache.solr.common.SolrException; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; + +public class TikaServerXmlParser { + private final SAXParser saxParser; + + public TikaServerXmlParser() { + SAXParserFactory factory = SAXParserFactory.newInstance(); + factory.setNamespaceAware(true); + try { + factory.setFeature("http://xml.org/sax/features/external-general-entities", false); + factory.setFeature("http://xml.org/sax/features/external-parameter-entities", false); + factory.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false); + } catch (Throwable ignore) { + // Some parsers may not support all features; ignore + } + try { + saxParser = factory.newSAXParser(); + } catch (Exception e) { + throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e); + } + } + + /** + * Parses response in XML format from Tika Server /tika endpoint. The result is that the metadata + * object is populated and the content handler is called with extracted text. 
+ */ + public void parse(InputStream inputStream, ContentHandler handler, ExtractionMetadata metadata) + throws IOException, SAXException { + DefaultHandler myHandler = new TikaXmlResponseSaxContentHandler(handler, metadata); + InputStream sanitizedStream = XmlSanitizingReader.sanitize(inputStream); + saxParser.parse(sanitizedStream, myHandler); + } + + /** Custom SAX handler that will extract meta tags from the tika xml and delegate */ + static class TikaXmlResponseSaxContentHandler extends DefaultHandler { + private final ContentHandler delegate; + private final ExtractionMetadata metadata; + private boolean inHead = false; + + public TikaXmlResponseSaxContentHandler(ContentHandler delegate, ExtractionMetadata metadata) { + this.delegate = delegate; + this.metadata = metadata; + } + + @Override + public void startDocument() throws SAXException { + if (delegate != null) delegate.startDocument(); + } + + @Override + public void endDocument() throws SAXException { + if (delegate != null) delegate.endDocument(); + } + + @Override + public void startElement( + String uri, String localName, String qName, org.xml.sax.Attributes attributes) + throws SAXException { + String ln = localName != null && !localName.isEmpty() ? localName : qName; + if ("head".equalsIgnoreCase(ln)) { + inHead = true; + } else if (inHead && "meta".equalsIgnoreCase(ln) && attributes != null) { + String name = attributes.getValue("name"); + String content = attributes.getValue("content"); + if (name != null && content != null) { + metadata.add(name, content); + } + } + if (delegate != null) delegate.startElement(uri, localName, qName, attributes); + } + + @Override + public void endElement(String uri, String localName, String qName) throws SAXException { + String ln = localName != null && !localName.isEmpty() ? 
localName : qName; + if ("head".equalsIgnoreCase(ln)) { + inHead = false; + } + if (delegate != null) delegate.endElement(uri, localName, qName); + } + + @Override + public void characters(char[] ch, int start, int length) throws SAXException { + if (delegate != null) delegate.characters(ch, start, length); + } + + @Override + public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException { + if (delegate != null) delegate.ignorableWhitespace(ch, start, length); + } + + @Override + public void startPrefixMapping(String prefix, String uri) throws SAXException { + if (delegate != null) delegate.startPrefixMapping(prefix, uri); + } + + @Override + public void endPrefixMapping(String prefix) throws SAXException { + if (delegate != null) delegate.endPrefixMapping(prefix); + } + } +} diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/XmlSanitizingReader.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/XmlSanitizingReader.java new file mode 100644 index 00000000000..39f6b79a4cd --- /dev/null +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/XmlSanitizingReader.java @@ -0,0 +1,168 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.handler.extraction; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.Reader; +import java.io.StringWriter; +import org.apache.commons.io.IOUtils; + +/** + * Make sure the XHTML input is valid XML. Pipe text through this reader before passing it to an XML + * parser + */ +final class XmlSanitizingReader extends java.io.Reader { + private final java.io.Reader in; + private final StringBuilder entityBuf = new StringBuilder(); + private boolean inEntity = false; // after reading '&' + + // For surrogate tracking to evaluate XML validity by code point + private int pendingHighSurrogate = -1; + + public XmlSanitizingReader(java.io.Reader in) { + this.in = in; + } + + @Override + public int read(char[] cbuf, int off, int len) throws java.io.IOException { + int written = 0; + while (written < len) { + int ci = in.read(); + if (ci == -1) break; + char ch = (char) ci; + + // Handle numeric entity stripping for � and � variants + if (!inEntity) { + if (ch == '&') { + inEntity = true; + entityBuf.setLength(0); + entityBuf.append(ch); + continue; // don't write yet + } + } else { + entityBuf.append(ch); + // stop conditions for entity buffering + if (ch == ';' || entityBuf.length() > 12) { // entities are short; cap length defensively + String ent = entityBuf.toString(); + boolean drop = isNullNumericEntity(ent); + inEntity = false; + if (!drop) { + // flush buffered entity to output + for (int i = 0; i < ent.length() && written < len; i++) { + cbuf[off + written++] = ent.charAt(i); + } + } + continue; + } + // Keep buffering alphanumerics and '#', 'x' + continue; + } + + // Filter invalid XML 1.0 characters by code point + if (Character.isHighSurrogate(ch)) { + pendingHighSurrogate = ch; + continue; // need next char to form code point + } + if 
(Character.isLowSurrogate(ch) && pendingHighSurrogate != -1) { + int cp = Character.toCodePoint((char) pendingHighSurrogate, ch); + pendingHighSurrogate = -1; + if (isAllowedXmlChar(cp)) { + // encode back as surrogate pair + cbuf[off + written++] = Character.highSurrogate(cp); + if (written < len) { + cbuf[off + written++] = Character.lowSurrogate(cp); + } else { + // If no space for low surrogate, keep it pending (edge, unlikely with reasonable len) + // Fallback: buffer low surrogate into a small one-char pushback by using a field + // For simplicity, write only if space available; otherwise, return and next read + // continues + // But to avoid corruption, store it + pushbackChar = Character.lowSurrogate(cp); + } + } + continue; + } else { + // previous high surrogate without low surrogate -> invalid; drop it + pendingHighSurrogate = -1; + } + + int cp = ch; + if (!Character.isSurrogate(ch) && isAllowedXmlChar(cp)) { + cbuf[off + written++] = ch; + } + } + return (written == 0) ? -1 : written; + } + + private Character pushbackChar = null; + + @Override + public boolean ready() throws java.io.IOException { + return in.ready(); + } + + @Override + public void close() throws java.io.IOException { + in.close(); + } + + private static boolean isNullNumericEntity(String ent) { + // Accept patterns like '�', '�', '�', '�' (case-insensitive) + if (ent == null) return false; + if (!ent.startsWith("&#") || !ent.endsWith(";")) return false; + String mid = ent.substring(2, ent.length() - 1); + if (mid.isEmpty()) return false; + if (mid.charAt(0) == 'x' || mid.charAt(0) == 'X') { + // hex + for (int i = 1; i < mid.length(); i++) { + char c = mid.charAt(i); + if (c != '0') return false; + } + return mid.length() > 1; // at least one zero after x + } else { + // decimal + for (int i = 0; i < mid.length(); i++) { + char c = mid.charAt(i); + if (c != '0') return false; + } + return true; // one or more zeros + } + } + + private static boolean isAllowedXmlChar(int cp) { + 
return cp == 0x9 + || cp == 0xA + || cp == 0xD + || (cp >= 0x20 && cp <= 0xD7FF) + || (cp >= 0xE000 && cp <= 0xFFFD) + || (cp >= 0x10000 && cp <= 0x10FFFF); + } + + public static InputStream sanitize(InputStream in) throws IOException { + try (Reader reader = new XmlSanitizingReader(new InputStreamReader(in)); + StringWriter writer = new StringWriter()) { + + IOUtils.copy(reader, writer); // copy all sanitized chars to writer + + byte[] bytes = writer.toString().getBytes("UTF-8"); + return new ByteArrayInputStream(bytes); + } + } +} diff --git a/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/TikaServerExtractionBackendTest.java b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/TikaServerExtractionBackendTest.java index 15f54707638..4d583b67769 100644 --- a/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/TikaServerExtractionBackendTest.java +++ b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/TikaServerExtractionBackendTest.java @@ -25,6 +25,7 @@ import org.apache.solr.SolrIgnoredThreadsFilter; import org.apache.solr.SolrTestCaseJ4; import org.apache.solr.common.util.ExecutorUtil; +import org.apache.tika.sax.ToXMLContentHandler; import org.junit.AfterClass; import org.junit.Assume; import org.junit.BeforeClass; @@ -137,15 +138,16 @@ public void testExtractTextAndMetadata() throws Exception { } @Test - public void testExtractOnlyXml() throws Exception { + public void testExtractWithSaxHandlerXml() throws Exception { Assume.assumeTrue("Tika server container not started", tika != null); TikaServerExtractionBackend backend = new TikaServerExtractionBackend(client, baseUrl); byte[] data = "Hello XML".getBytes(java.nio.charset.StandardCharsets.UTF_8); + ExtractionRequest request = newRequest("test.txt", "text/plain", "xml"); try (ByteArrayInputStream in = new ByteArrayInputStream(data)) { - ExtractionResult res = - backend.extractOnly(in, newRequest("test.txt", "text/plain", "xml"), null); - 
assertNotNull(res); - String c = res.getContent(); + ToXMLContentHandler xmlHandler = new ToXMLContentHandler(); + ExtractionMetadata md = backend.buildMetadataFromRequest(request); + backend.extractWithSaxHandler(in, request, md, xmlHandler); + String c = xmlHandler.toString(); assertNotNull(c); // Tika Server may return XHTML without XML declaration; be flexible assertTrue( @@ -155,21 +157,4 @@ public void testExtractOnlyXml() throws Exception { assertTrue(c.contains("Hello XML")); } } - - @Test - public void testParseToSolrContentHandlerUnsupported() throws Exception { - Assume.assumeTrue("Tika server container not started", tika != null); - TikaServerExtractionBackend backend = new TikaServerExtractionBackend(client, baseUrl); - byte[] data = "dummy".getBytes(java.nio.charset.StandardCharsets.UTF_8); - try (ByteArrayInputStream in = new ByteArrayInputStream(data)) { - expectThrows( - UnsupportedOperationException.class, - () -> - backend.parseToSolrContentHandler( - in, - newRequest("test.txt", "text/plain", "text"), - new SolrContentHandler(new ExtractionMetadata(), params(), null), - new ExtractionMetadata())); - } - } } From 14b556bf256756ebbcae3aa0211234d48c679e68 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=C3=B8ydahl?= Date: Thu, 25 Sep 2025 17:40:27 +0200 Subject: [PATCH 22/47] Fix forbiddenAPI --- .../extraction/XmlSanitizingReader.java | 39 +++++++++++++------ 1 file changed, 28 insertions(+), 11 deletions(-) diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/XmlSanitizingReader.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/XmlSanitizingReader.java index 39f6b79a4cd..9451972ee01 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/XmlSanitizingReader.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/XmlSanitizingReader.java @@ -16,13 +16,15 @@ */ package org.apache.solr.handler.extraction; -import java.io.ByteArrayInputStream; import 
java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; +import java.io.OutputStreamWriter; +import java.io.PipedInputStream; +import java.io.PipedOutputStream; import java.io.Reader; -import java.io.StringWriter; -import org.apache.commons.io.IOUtils; +import java.io.Writer; +import java.nio.charset.StandardCharsets; /** * Make sure the XHTML input is valid XML. Pipe text through this reader before passing it to an XML @@ -156,13 +158,28 @@ private static boolean isAllowedXmlChar(int cp) { } public static InputStream sanitize(InputStream in) throws IOException { - try (Reader reader = new XmlSanitizingReader(new InputStreamReader(in)); - StringWriter writer = new StringWriter()) { - - IOUtils.copy(reader, writer); // copy all sanitized chars to writer - - byte[] bytes = writer.toString().getBytes("UTF-8"); - return new ByteArrayInputStream(bytes); - } + PipedOutputStream out = new PipedOutputStream(); + PipedInputStream pipedIn = new PipedInputStream(out); + + Reader reader = new XmlSanitizingReader(new InputStreamReader(in, StandardCharsets.UTF_8)); + Writer writer = new OutputStreamWriter(out, StandardCharsets.UTF_8); + + Thread worker = + new Thread( + () -> { + try (reader; + writer) { + reader.transferTo(writer); + } catch (IOException e) { + try { + pipedIn.close(); + } catch (IOException ignored) { + } + } + }); + worker.setDaemon(true); + worker.start(); + + return pipedIn; } } From 45e7e4104fc27ee41794c18a77b98b1639af4874 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=C3=B8ydahl?= Date: Thu, 25 Sep 2025 22:27:48 +0200 Subject: [PATCH 23/47] better back-compat metadata logic --- .../extraction/ExtractingDocumentLoader.java | 58 ++++++++----------- 1 file changed, 23 insertions(+), 35 deletions(-) diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java index 
d968583ee0b..9b3eafa38e4 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java @@ -20,6 +20,7 @@ import java.io.InputStream; import java.lang.invoke.MethodHandles; import java.util.LinkedHashMap; +import java.util.Map; import java.util.regex.Pattern; import org.apache.solr.common.SolrException; import org.apache.solr.common.params.SolrParams; @@ -258,46 +259,33 @@ public void load( } } + private final Map fieldMappings = new LinkedHashMap<>(); + { + fieldMappings.put("dc:title", "title"); + fieldMappings.put("dc:creator", "author"); + fieldMappings.put("dc:description", "description"); + fieldMappings.put("dc:subject", "subject"); + fieldMappings.put("dc:language", "language"); + fieldMappings.put("dc:publisher", "publisher"); + fieldMappings.put("dcterms:created", "created"); + fieldMappings.put("dcterms:modified", "modified"); + fieldMappings.put("meta:author", "Author"); + fieldMappings.put("meta:creation-date", "Creation-Date"); + fieldMappings.put("meta:save-date", "Last-Save-Date"); + fieldMappings.put("meta:keyword", "Keywords"); + } + private void appendBackCompatTikaMetadata(ExtractionMetadata md) { if (!backCompat) { return; } - if (md.get("dc:title") != null) { - md.addValues("title", md.getValues("dc:title")); - } - if (md.get("dc:creator") != null) { - md.addValues("author", md.getValues("dc:creator")); - } - if (md.get("dc:description") != null) { - md.addValues("description", md.getValues("dc:description")); - } - if (md.get("dc:subject") != null) { - md.addValues("subject", md.getValues("dc:subject")); - } - if (md.get("dc:language") != null) { - md.addValues("language", md.getValues("dc:language")); - } - if (md.get("dc:publisher") != null) { - md.addValues("publisher", md.getValues("dc:publisher")); - } - if (md.get("dcterms:created") != null) { - md.addValues("created", 
md.getValues("dcterms:created")); - } - if (md.get("dcterms:modified") != null) { - md.addValues("modified", md.getValues("dcterms:modified")); - } - if (md.get("meta:author") != null) { - md.addValues("Author", md.getValues("meta:author")); - } - if (md.get("meta:creation-date") != null) { - md.addValues("Creation-Date", md.getValues("meta:creation-date")); - } - if (md.get("meta:save-date") != null) { - md.addValues("Last-Save-Date", md.getValues("meta:save-date")); - } - if (md.get("meta:keyword") != null) { - md.addValues("Keywords", md.getValues("meta:keyword")); + for (Map.Entry mapping : fieldMappings.entrySet()) { + String sourceField = mapping.getKey(); + String targetField = mapping.getValue(); + if (md.get(sourceField) != null && md.get(targetField) == null) { + md.addValues(targetField, md.getValues(sourceField)); + } } } } From 5ba93912f3356d5e4cc8c908d4845e3c2e9a1e72 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=C3=B8ydahl?= Date: Fri, 26 Sep 2025 01:04:59 +0200 Subject: [PATCH 24/47] More tests pass --- .../extraction/ExtractingDocumentLoader.java | 5 ++++- .../handler/extraction/ExtractionRequest.java | 8 +++++++- .../TikaServerExtractionBackend.java | 3 +++ .../extraction/XmlSanitizingReader.java | 3 ++- .../ExtractingRequestHandlerTestAbstract.java | 16 +++++++++++++++ .../LocalTikaExtractionBackendTest.java | 20 +++++++++++++------ .../TikaServerExtractionBackendTest.java | 15 +++++++++++--- 7 files changed, 58 insertions(+), 12 deletions(-) diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java index 9b3eafa38e4..ecfac595613 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java @@ -19,6 +19,7 @@ import java.io.IOException; import 
java.io.InputStream; import java.lang.invoke.MethodHandles; +import java.util.Collections; import java.util.LinkedHashMap; import java.util.Map; import java.util.regex.Pattern; @@ -129,7 +130,8 @@ public void load( stream.getSize(), params.get(ExtractingParams.RESOURCE_PASSWORD, null), pwMap, - extractFormat); + extractFormat, + Collections.emptyMap()); boolean captureAttr = params.getBool(ExtractingParams.CAPTURE_ATTRIBUTES, false); String[] captureElems = params.getParams(ExtractingParams.CAPTURE_ELEMENTS); @@ -260,6 +262,7 @@ public void load( } private final Map fieldMappings = new LinkedHashMap<>(); + { fieldMappings.put("dc:title", "title"); fieldMappings.put("dc:creator", "author"); diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionRequest.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionRequest.java index 010f6633472..bb23975afec 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionRequest.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractionRequest.java @@ -16,6 +16,9 @@ */ package org.apache.solr.handler.extraction; +import java.util.HashMap; +import java.util.Map; + /** Immutable request info needed by extraction backends. 
*/ public class ExtractionRequest { public final String streamType; // explicit MIME type (optional) @@ -29,6 +32,7 @@ public class ExtractionRequest { public final java.util.LinkedHashMap passwordsMap; // optional passwords map public final String extractFormat; + public final Map tikaRequestHeaders = new HashMap<>(); public ExtractionRequest( String streamType, @@ -40,7 +44,8 @@ public ExtractionRequest( Long streamSize, String resourcePassword, java.util.LinkedHashMap passwordsMap, - String extractFormat) { + String extractFormat, + Map tikaRequestHeaders) { this.streamType = streamType; this.resourceName = resourceName; this.contentType = contentType; @@ -51,5 +56,6 @@ public ExtractionRequest( this.resourcePassword = resourcePassword; this.passwordsMap = passwordsMap; this.extractFormat = extractFormat; + this.tikaRequestHeaders.putAll(tikaRequestHeaders); } } diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/TikaServerExtractionBackend.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/TikaServerExtractionBackend.java index 401e4268969..e06fc89e213 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/TikaServerExtractionBackend.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/TikaServerExtractionBackend.java @@ -102,6 +102,9 @@ private InputStream callTikaServer(InputStream inputStream, ExtractionRequest re if (contentType != null) { b.header("Content-Type", contentType); } + if (!request.tikaRequestHeaders.isEmpty()) { + request.tikaRequestHeaders.forEach(b::header); + } ExtractionMetadata md = buildMetadataFromRequest(request); if (request.resourcePassword != null || request.passwordsMap != null) { RegexRulesPasswordProvider passwordProvider = new RegexRulesPasswordProvider(); diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/XmlSanitizingReader.java 
b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/XmlSanitizingReader.java index 9451972ee01..ee7ec8cda08 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/XmlSanitizingReader.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/XmlSanitizingReader.java @@ -176,7 +176,8 @@ public static InputStream sanitize(InputStream in) throws IOException { } catch (IOException ignored) { } } - }); + }, + "XmlSanitizingReaderWorker"); worker.setDaemon(true); worker.start(); diff --git a/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTestAbstract.java b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTestAbstract.java index 349dae50ce6..843aa4889d7 100644 --- a/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTestAbstract.java +++ b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTestAbstract.java @@ -458,6 +458,8 @@ public void testLiterals() throws Exception { "two", "fmap.X-Parsed-By", "ignored_parser", + "fmap.X-TIKA:Parsed-By", + "ignored_parser", "fmap.Last-Modified", "extractedDate"); // TODO: original author did not specify why an exception should be thrown... how to fix? 
@@ -488,6 +490,8 @@ public void testLiterals() throws Exception { "one", "fmap.X-Parsed-By", "ignored_parser", + "fmap.X-TIKA:Parsed-By", + "ignored_parser", "fmap.Last-Modified", "extractedDate"); assertU(commit()); @@ -594,6 +598,12 @@ public void testPlainTextSpecifyingMimeType() throws Exception { "extractedLanguage", "fmap.X-Parsed-By", "ignored_parser", + "fmap.X-TIKA:Parsed-By", + "ignored_parser", + "fmap.X-TIKA:detectedEncoding", + "ignored_parser", + "fmap.X-TIKA:encodingDetector", + "ignored_parser", "fmap.content", "extractedContent", ExtractingParams.STREAM_TYPE, @@ -628,6 +638,12 @@ public void testPlainTextSpecifyingResourceName() throws Exception { "extractedLanguage", "fmap.X-Parsed-By", "ignored_parser", + "fmap.X-TIKA:Parsed-By", + "ignored_parser", + "fmap.X-TIKA:detectedEncoding", + "ignored_parser", + "fmap.X-TIKA:encodingDetector", + "ignored_parser", "fmap.content", "extractedContent", ExtractingParams.RESOURCE_NAME, diff --git a/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/LocalTikaExtractionBackendTest.java b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/LocalTikaExtractionBackendTest.java index 4974f5a1903..825439cad83 100644 --- a/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/LocalTikaExtractionBackendTest.java +++ b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/LocalTikaExtractionBackendTest.java @@ -18,6 +18,8 @@ import java.io.InputStream; import java.nio.file.Files; +import java.util.Collections; +import java.util.Map; import org.apache.solr.SolrTestCaseJ4; import org.apache.tika.config.TikaConfig; import org.junit.BeforeClass; @@ -54,7 +56,8 @@ private ExtractionRequest newRequest( String streamSourceInfo, Long streamSize, String resourcePassword, - String returnType) { + String returnType, + Map tikaRequestHeaders) { return new ExtractionRequest( streamType, resourceName, @@ -65,7 +68,8 @@ private ExtractionRequest newRequest( streamSize, 
resourcePassword, null, - returnType); + returnType, + tikaRequestHeaders); } @Test @@ -83,7 +87,8 @@ public void testWrongStreamTypeThrows() throws Exception { null, null, null, - "text"); + "text", + Collections.emptyMap()); expectThrows(IllegalArgumentException.class, () -> backend.extract(in, req)); } @@ -99,7 +104,8 @@ public void testWrongStreamTypeThrows() throws Exception { null, null, null, - "text"); + "text", + Collections.emptyMap()); expectThrows(Exception.class, () -> backend.extract(in, req)); } } @@ -118,7 +124,8 @@ public void testPasswordProtectedDocxWithoutPasswordThrows() throws Exception { null, null, null, - "text"); + "text", + Collections.emptyMap()); expectThrows(Exception.class, () -> backend.extract(in, req)); } } @@ -137,7 +144,8 @@ public void testPasswordProtectedDocxWithPasswordSucceeds() throws Exception { null, null, "Word2010", - "text"); + "text", + Collections.emptyMap()); ExtractionResult res = backend.extract(in, req); assertNotNull(res); assertNotNull(res.getMetadata()); diff --git a/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/TikaServerExtractionBackendTest.java b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/TikaServerExtractionBackendTest.java index 4d583b67769..e28fcb6a832 100644 --- a/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/TikaServerExtractionBackendTest.java +++ b/solr/modules/extraction/src/test/org/apache/solr/handler/extraction/TikaServerExtractionBackendTest.java @@ -20,6 +20,8 @@ import com.carrotsearch.randomizedtesting.annotations.ThreadLeakFilters; import java.io.ByteArrayInputStream; import java.net.http.HttpClient; +import java.util.Collections; +import java.util.Map; import java.util.concurrent.ExecutorService; import org.apache.lucene.tests.util.QuickPatchThreadsFilter; import org.apache.solr.SolrIgnoredThreadsFilter; @@ -103,7 +105,10 @@ public static void stopTikaServer() { } private static ExtractionRequest newRequest( - String 
resourceName, String contentType, String extractFormat) { + String resourceName, + String contentType, + String extractFormat, + Map tikaRequestHeaders) { return new ExtractionRequest( contentType, // streamType resourceName, // resourceName @@ -114,8 +119,8 @@ private static ExtractionRequest newRequest( null, // size null, // resourcePassword null, // passwordsMap - extractFormat // extraction format xml or text - ); + extractFormat, // extraction format xml or text + tikaRequestHeaders); } @Test @@ -157,4 +162,8 @@ public void testExtractWithSaxHandlerXml() throws Exception { assertTrue(c.contains("Hello XML")); } } + + private ExtractionRequest newRequest(String file, String contentType, String xml) { + return newRequest(file, contentType, xml, Collections.emptyMap()); + } } From dbca2346076d41407b4870d3c9ccbc9c3d173565 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=C3=B8ydahl?= Date: Fri, 26 Sep 2025 01:24:49 +0200 Subject: [PATCH 25/47] Fix test testLiteralsOverride --- .../apache/solr/handler/extraction/ExtractingDocumentLoader.java | 1 + 1 file changed, 1 insertion(+) diff --git a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java index ecfac595613..941b086e298 100644 --- a/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java +++ b/solr/modules/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java @@ -276,6 +276,7 @@ public void load( fieldMappings.put("meta:creation-date", "Creation-Date"); fieldMappings.put("meta:save-date", "Last-Save-Date"); fieldMappings.put("meta:keyword", "Keywords"); + fieldMappings.put("pdf:docinfo:keywords", "Keywords"); } private void appendBackCompatTikaMetadata(ExtractionMetadata md) { From e6ee7066bccef6f20fc3df85ec340f23943fd666 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=C3=B8ydahl?= Date: Fri, 26 
Sep 2025 02:57:20 +0200 Subject: [PATCH 26/47] Rewrite tests to use h1 instead of div --- .../src/test-files/extraction/example.html | 4 ++-- .../src/test-files/extraction/simple.html | 2 +- .../ExtractingRequestHandlerTestAbstract.java | 22 +++++++++---------- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/solr/modules/extraction/src/test-files/extraction/example.html b/solr/modules/extraction/src/test-files/extraction/example.html index 5732f6214bc..2801c3c97d8 100644 --- a/solr/modules/extraction/src/test-files/extraction/example.html +++ b/solr/modules/extraction/src/test-files/extraction/example.html @@ -6,8 +6,8 @@

Here is some text

-
Here is some text in a div
-
This has a link.
+

a h1 tag

+

This has a link in a paragraph.

News
  • diff --git a/solr/modules/extraction/src/test-files/extraction/simple.html b/solr/modules/extraction/src/test-files/extraction/simple.html index 3c807fb1d98..3ec4d4e0d01 100644 --- a/solr/modules/extraction/src/test-files/extraction/simple.html +++ b/solr/modules/extraction/src/test-files/extraction/simple.html @@ -10,7 +10,7 @@ Here is some text

    distinct
    words

    -
    Here is some text in a div
    +

    Here is some text in a h1

    This has a link.