Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -291,10 +291,14 @@ Create `/tmp/tika-app-test/my-config.json`:
"emitterId": "fse"
}
},
"parse-context": {
"timeout-limits": {
"progressTimeoutMillis": 60000
}
},
"pipes": {
"parseMode": "RMETA",
"numClients": 2,
"timeoutMillis": 60000
"numClients": 2
},
"plugin-roots": "/tmp/tika-app-test/plugins"
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ The `/tika` endpoint no longer routes based on `Accept` headers. Use explicit pa
The following `TikaServerConfig` options have been removed:
* `taskTimeoutMillis` - Now configured via `pipes.timeoutMillis`
* `taskTimeoutMillis` - Now configured via `parse-context.timeout-limits.progressTimeoutMillis` (and optionally `totalTaskTimeoutMillis`); see xref:pipes/timeouts.adoc[Timeouts].
* `taskPulseMillis` - No longer needed
* `minimumTimeoutMillis` - No longer needed
Expand All @@ -125,9 +125,13 @@ All tika-server configurations must now include a `pipes` section and a `file-sy
}
}
},
"parse-context": {
"timeout-limits": {
"progressTimeoutMillis": 30000
}
},
"pipes": {
"numClients": 2,
"timeoutMillis": 30000
"numClients": 2
},
"plugin-roots": "path/to/plugins"
}
Expand Down
4 changes: 2 additions & 2 deletions docs/modules/ROOT/pages/migration-to-4x/migrating-to-4x.adoc
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ The converter currently supports:

NOTE: When you configure a parser with specific settings in JSON, the loader automatically
excludes it from SPI loading. The parser (e.g., `pdf-parser`) is not even instantiated in
`default-parser` if there's a definition for it in the tika-config.json. Explicit `_exclude`
`default-parser` if there's a definition for it in the tika-config.json. Explicit `exclude`
directives are only needed when you want to disable a parser entirely without providing
custom configuration.

Expand All @@ -103,7 +103,7 @@ custom configuration.

|Exclusions
|`<parser-exclude class="..."/>`
|`"_exclude": ["component-name"]` (only needed to disable a parser entirely)
|`"exclude": ["component-name"]` (only needed to disable a parser entirely)
|===

=== Limitations
Expand Down
4 changes: 0 additions & 4 deletions docs/modules/ROOT/pages/pipes/configuration.adoc
Original file line number Diff line number Diff line change
Expand Up @@ -77,10 +77,6 @@ See also xref:pipes/timeouts.adoc[Timeouts] for the full timeout model.
|`1000`
|Interval (ms) between heartbeats sent from the forked process. Must be significantly less than `socketTimeoutMs`.

|`startupTimeoutMillis`
|`240000`
|Maximum time (ms) to wait for a forked process to start up.

|`shutdownClientAfterMillis`
|`300000`
|Shut down an idle forked process after this many milliseconds of inactivity.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@
* },
* {
* "default-parser": {
* "_exclude": ["pdf-parser"]
* "exclude": ["pdf-parser"]
* }
* }
* ]
Expand Down Expand Up @@ -257,9 +257,9 @@ private static void checkForRedundantExclusions(List<Map<String, Object>> parser
for (Map<String, Object> parserEntry : parsersList) {
if (parserEntry.containsKey("default-parser")) {
Map<?, ?> config = (Map<?, ?>) parserEntry.get("default-parser");
if (config.containsKey("_exclude")) {
if (config.containsKey("exclude")) {
@SuppressWarnings("unchecked")
List<String> excludes = (List<String>) config.get("_exclude");
List<String> excludes = (List<String>) config.get("exclude");
excludedParsers.addAll(excludes);
}
}
Expand Down Expand Up @@ -364,7 +364,7 @@ private static Map<String, Object> convertParserElement(Element parserElement,
}

if (excludes != null && !excludes.isEmpty()) {
config.put("_exclude", excludes);
config.put("exclude", excludes);
}

Map<String, Object> result = new LinkedHashMap<>();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -89,8 +89,8 @@ public void testParserWithExcludes(@TempDir Path tempDir) throws Exception {

String json = new String(Files.readAllBytes(jsonPath), StandardCharsets.UTF_8);

// Verify exclude is at the correct level (with underscore prefix)
assertTrue(json.contains("\"_exclude\""), "Should have _exclude array");
// Verify exclude is at the correct level (no underscore prefix; SPI loader reads "exclude")
assertTrue(json.contains("\"exclude\""), "Should have exclude array");
assertFalse(json.contains("\"_decorate\""), "_decorate should not be used for parser excludes");
assertTrue(json.contains("\"jsoup-parser\""), "Should exclude jsoup-parser");
assertTrue(json.contains("\"pdf-parser\""), "Should exclude pdf-parser");
Expand Down Expand Up @@ -218,7 +218,7 @@ public void testRedundantExclusionWarning(@TempDir Path tempDir) throws Exceptio
String json = new String(Files.readAllBytes(jsonPath), StandardCharsets.UTF_8);

// Verify the JSON still contains the exclusions (we don't remove them, just inform)
assertTrue(json.contains("\"_exclude\""), "Should still have _exclude array");
assertTrue(json.contains("\"exclude\""), "Should still have exclude array");
assertTrue(json.contains("\"pdf-parser\""), "Should have pdf-parser configured");
assertTrue(json.contains("\"jsoup-parser\""), "Should have jsoup-parser configured");

Expand Down
2 changes: 0 additions & 2 deletions tika-app/src/test/resources/configs/config-template.json
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,6 @@
"queueSize": 10000,
"numEmitters": 1,
"emitIntermediateResults": false,
"startupTimeoutMillis": 240000,
"sleepOnStartupTimeoutMillis": 240000,
"shutdownClientAfterMillis": 300000,
"numClients": 4,
"maxFilesProcessedPerProcess": 10000,
Expand Down
2 changes: 1 addition & 1 deletion tika-app/src/test/resources/configs/tika-config2.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"parsers": [
{
"default-parser": {
"_exclude": ["executable-parser"],
"exclude": ["executable-parser"],
"_mime-exclude": ["image/jpeg", "application/pdf"]
}
},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"parsers": [
{
"default-parser": {
"_exclude": [
"exclude": [
"tesseract-ocr-parser"
]
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"parsers": [
{
"default-parser": {
"parser-excludes": ["pdf-parser"]
"exclude": ["pdf-parser"]
}
},
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"parsers": [
{
"default-parser": {
"_exclude": [
"exclude": [
"outlook-pst-parser",
"pst-mail-item-parser"
]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"parsers": [
{
"default-parser": {
"_exclude": [
"exclude": [
"outlook-pst-parser",
"pst-mail-item-parser"
]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,50 +43,35 @@ public PluginsWriter(SimpleAsyncConfig simpleAsyncConfig, Path pluginsConfig) {
}

void write(Path output) throws IOException {
Path baseInput = StringUtils.isBlank(simpleAsyncConfig.getInputDir())
? Paths.get(".").toAbsolutePath()
: Paths.get(simpleAsyncConfig.getInputDir());
Path baseOutput = StringUtils.isBlank(simpleAsyncConfig.getOutputDir())
? null
: Paths.get(simpleAsyncConfig.getOutputDir());
if (Files.isRegularFile(baseInput)) {
boolean userConfigProvided = !StringUtils.isBlank(simpleAsyncConfig.getTikaConfig());
boolean inputExplicit = !StringUtils.isBlank(simpleAsyncConfig.getInputDir());
boolean outputExplicit = !StringUtils.isBlank(simpleAsyncConfig.getOutputDir());

// Resolve baseInput. If -i is explicit, use it. If not and the user
// didn't supply --config, fall back to '.' so the template's
// FETCHER_BASE_PATH placeholder gets a sane default. If --config is
// supplied and -i isn't, baseInput stays null so we don't trample the
// user's own basePath values.
Path baseInput = null;
if (inputExplicit) {
baseInput = Paths.get(simpleAsyncConfig.getInputDir());
} else if (!userConfigProvided) {
baseInput = Paths.get(".").toAbsolutePath();
}
if (baseInput != null && Files.isRegularFile(baseInput)) {
baseInput = baseInput.toAbsolutePath().getParent();
if (baseInput == null) {
throw new IllegalArgumentException("File must be at least one directory below root");
}
}
Path baseOutput = outputExplicit
? Paths.get(simpleAsyncConfig.getOutputDir())
: null;
try {
ObjectMapper objectMapper = TikaObjectMapperFactory.getMapper();
ObjectNode root = (ObjectNode) objectMapper.readTree(
getClass().getResourceAsStream("/config-template.json"));

// Set fetcher basePath
ObjectNode fetchers = (ObjectNode) root.get("fetchers");
if (fetchers != null && fetchers.has("fsf")) {
ObjectNode fsf = (ObjectNode) fetchers.get("fsf");
if (fsf != null && fsf.has("file-system-fetcher")) {
ObjectNode fsFetcher = (ObjectNode) fsf.get("file-system-fetcher");
fsFetcher.put("basePath", baseInput.toAbsolutePath().toString());
}
}

// Set emitter basePath
ObjectNode emitters = (ObjectNode) root.get("emitters");
if (baseOutput != null && emitters != null && emitters.has("fse")) {
ObjectNode fse = (ObjectNode) emitters.get("fse");
if (fse != null && fse.has("file-system-emitter")) {
ObjectNode fsEmitter = (ObjectNode) fse.get("file-system-emitter");
fsEmitter.put("basePath", baseOutput.toAbsolutePath().toString());
}
}

// Set pipes-iterator basePath
ObjectNode pipesIterator = (ObjectNode) root.get("pipes-iterator");
if (pipesIterator != null && pipesIterator.has("file-system-pipes-iterator")) {
ObjectNode fsIterator = (ObjectNode) pipesIterator.get("file-system-pipes-iterator");
fsIterator.put("basePath", baseInput.toAbsolutePath().toString());
}

// Set plugin-roots
String pluginString;
if (!StringUtils.isBlank(simpleAsyncConfig.getPluginsDir())) {
Expand All @@ -100,18 +85,36 @@ void write(Path output) throws IOException {
}
root.put("plugin-roots", pluginString);

// If the user provided a -c config, merge their settings first.
// This brings in parsers, parse-context, metadata-filters, and
// optionally pipes config (e.g. forkedJvmArgs with log4j settings).
// Merge user's --config first so the CLI overrides below land on
// the final merged document. Doing this in the other order means
// mergeUserConfig's shallow replace silently wipes any patch we
// applied before the merge — exactly the bug behind TIKA-4739
// ("-i/-o don't override basePath as documented").
if (!StringUtils.isBlank(simpleAsyncConfig.getTikaConfig())) {
Path userConfigPath = Paths.get(simpleAsyncConfig.getTikaConfig());
JsonNode userRoot = objectMapper.readTree(userConfigPath.toFile());
mergeUserConfig(root, (ObjectNode) userRoot);
}

// Now apply CLI overrides on top of whatever pipes config exists.
// This lets the user have forkedJvmArgs in their config (e.g. log4j)
// while still controlling numClients and Xmx from the command line.
// Apply -i / -o on top of the merged document by component TYPE
// rather than hardcoded id ("fsf"/"fse"). This way users who
// renamed their filesystem fetcher/emitter still get the override,
// and non-filesystem fetchers/emitters (S3, GCS, etc.) are left
// untouched. baseInput/baseOutput are null when the user supplied
// --config without -i/-o, in which case their basePath values stay
// intact.
if (baseInput != null) {
patchFileSystemBasePath(root, "fetchers", "file-system-fetcher",
baseInput.toAbsolutePath().toString());
patchSingletonFileSystemBasePath(root, "pipes-iterator",
"file-system-pipes-iterator", baseInput.toAbsolutePath().toString());
}
if (baseOutput != null) {
patchFileSystemBasePath(root, "emitters", "file-system-emitter",
baseOutput.toAbsolutePath().toString());
}

// CLI overrides on the pipes section.
ObjectNode pipesNode = root.has("pipes")
? (ObjectNode) root.get("pipes")
: objectMapper.createObjectNode();
Expand Down Expand Up @@ -142,23 +145,14 @@ void write(Path output) throws IOException {
// For content-only mode, change the emitter file extension based on handler type
if (simpleAsyncConfig.isContentOnly()) {
String ext = getFileExtensionForHandlerType(simpleAsyncConfig.getHandlerType());
if (emitters != null && emitters.has("fse")) {
ObjectNode fse = (ObjectNode) emitters.get("fse");
if (fse != null && fse.has("file-system-emitter")) {
ObjectNode fsEmitter = (ObjectNode) fse.get("file-system-emitter");
fsEmitter.put("fileExtension", ext);
}
}
patchFileSystemField(root, "emitters", "file-system-emitter",
"fileExtension", ext);
}

// Override the emitter's onExists policy if set on the CLI (--on-exists)
if (!StringUtils.isBlank(simpleAsyncConfig.getOnExists())
&& emitters != null && emitters.has("fse")) {
ObjectNode fse = (ObjectNode) emitters.get("fse");
if (fse != null && fse.has("file-system-emitter")) {
ObjectNode fsEmitter = (ObjectNode) fse.get("file-system-emitter");
fsEmitter.put("onExists", simpleAsyncConfig.getOnExists());
}
if (!StringUtils.isBlank(simpleAsyncConfig.getOnExists())) {
patchFileSystemField(root, "emitters", "file-system-emitter",
"onExists", simpleAsyncConfig.getOnExists());
}

// Write timeout limits to parse-context if configured on CLI
Expand All @@ -178,6 +172,52 @@ void write(Path output) throws IOException {
}
}

/**
 * Convenience wrapper around {@code patchFileSystemField} that writes the
 * {@code basePath} field.
 *
 * @param root     the config document to modify in place
 * @param section  name of the id-keyed section to scan (the callers visible
 *                 here pass {@code fetchers} or {@code emitters})
 * @param typeName wrapper type key an entry must contain to be patched
 * @param basePath value stored under {@code basePath} on each matching entry
 */
private static void patchFileSystemBasePath(ObjectNode root, String section,
        String typeName, String basePath) {
    final String fieldName = "basePath";
    patchFileSystemField(root, section, typeName, fieldName, basePath);
}

/**
 * Sets a single field on every id-keyed entry in {@code section} whose
 * wrapper type matches {@code typeName}.
 *
 * <p>The method is a no-op when {@code section} is missing from
 * {@code root} or is not a JSON object. Entries that do not contain
 * {@code typeName} are left untouched. An entry whose {@code typeName}
 * value is present but is not a JSON object (for example an explicit
 * {@code null}) is skipped rather than cast, because {@code has()} reports
 * such keys as present and a blind cast would throw
 * {@link ClassCastException}.
 *
 * @param root     the config document to modify in place
 * @param section  name of the id-keyed section to scan
 * @param typeName wrapper type key an entry must contain to be patched
 * @param field    name of the field to set on the matching component
 * @param value    value to store under {@code field}
 */
private static void patchFileSystemField(ObjectNode root, String section,
        String typeName, String field, String value) {
    JsonNode sectionNode = root.get(section);
    if (sectionNode == null || !sectionNode.isObject()) {
        return;
    }
    Iterator<Map.Entry<String, JsonNode>> ids = sectionNode.fields();
    while (ids.hasNext()) {
        JsonNode typed = ids.next().getValue();
        if (!typed.isObject()) {
            continue;
        }
        // get() returns null when the key is absent and a NullNode (or other
        // non-object node) when the user wrote a non-object value; only a real
        // object can safely receive the field.
        JsonNode target = typed.get(typeName);
        if (target != null && target.isObject()) {
            ((ObjectNode) target).put(field, value);
        }
    }
}

/**
 * Sets {@code basePath} on a singleton section ({@code pipes-iterator})
 * whose wrapper type matches {@code typeName}.
 *
 * <p>The method is a no-op when {@code section} is missing from
 * {@code root}, is not a JSON object, or does not contain
 * {@code typeName}. It is also a no-op when the {@code typeName} value is
 * present but is not a JSON object (for example an explicit {@code null});
 * {@code has()} reports such keys as present, so the value is type-checked
 * before the cast to avoid a {@link ClassCastException}.
 *
 * @param root     the config document to modify in place
 * @param section  name of the singleton section to inspect
 * @param typeName wrapper type key the section must contain to be patched
 * @param basePath value stored under {@code basePath}
 */
private static void patchSingletonFileSystemBasePath(ObjectNode root, String section,
        String typeName, String basePath) {
    JsonNode sectionNode = root.get(section);
    if (sectionNode == null || !sectionNode.isObject()) {
        return;
    }
    // Absent key -> null; explicit JSON null or scalar -> non-object node.
    JsonNode target = sectionNode.get(typeName);
    if (target == null || !target.isObject()) {
        return;
    }
    ((ObjectNode) target).put("basePath", basePath);
}

/**
* Merges user config fields into the auto-generated root.
* All user fields override the auto-generated template values.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,6 @@
"queueSize": 10000,
"numEmitters": 1,
"emitIntermediateResults": false,
"startupTimeoutMillis": 240000,
"sleepOnStartupTimeoutMillis": 240000,
"shutdownClientAfterMillis": 300000,
"numClients": 2,
"maxFilesProcessedPerProcess": 10000,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,6 @@
"queueSize": 10000,
"numEmitters": 1,
"emitIntermediateResults": false,
"startupTimeoutMillis": 240000,
"sleepOnStartupTimeoutMillis": 240000,
"shutdownClientAfterMillis": 300000,
"numClients": 2,
"maxFilesProcessedPerProcess": 10000,
Expand Down
Loading
Loading