diff --git a/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java b/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java index b24b27e60d..d2b31530df 100644 --- a/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java +++ b/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java @@ -162,8 +162,8 @@ public enum PARSE_ERROR_TYPE { * @param p path to the common_tokens directory. If this is null, try to load from classPath * @throws IOException */ - public static void loadCommonTokens(Path p) throws IOException { - commonTokenCountManager = new CommonTokenCountManager(p); + public static void loadCommonTokens(Path p, String defaultLangCode) throws IOException { + commonTokenCountManager = new CommonTokenCountManager(p, defaultLangCode); } public AbstractProfiler(ArrayBlockingQueue fileQueue, @@ -541,16 +541,29 @@ void langid(Metadata metadata, Map data) { } List probabilities = langIder.getProbabilities(s); if (probabilities.size() > 0) { - data.put(Cols.LANG_ID_1, probabilities.get(0).getLocale().getLanguage()); + data.put(Cols.LANG_ID_1, getLangString(probabilities.get(0))); data.put(Cols.LANG_ID_PROB_1, Double.toString(probabilities.get(0).getProbability())); } if (probabilities.size() > 1) { - data.put(Cols.LANG_ID_2, probabilities.get(1).getLocale().getLanguage()); + data.put(Cols.LANG_ID_2, getLangString(probabilities.get(1))); data.put(Cols.LANG_ID_PROB_2, Double.toString(probabilities.get(1).getProbability())); } + } + private String getLangString(DetectedLanguage detectedLanguage) { + //So that we have mapping between lang id and common-tokens file names + String lang = detectedLanguage.getLocale().getLanguage(); + if ("zh".equals(lang)) { + if (detectedLanguage.getLocale().getRegion().isPresent()) { + lang += "-" + detectedLanguage.getLocale().getRegion().get().toLowerCase(Locale.US); + } else { + //hope for the best + lang += "-cn"; + } + } + return lang; } void getFileTypes(Metadata metadata, Map output) { diff --git a/tika-eval/src/main/java/org/apache/tika/eval/ExtractComparer.java b/tika-eval/src/main/java/org/apache/tika/eval/ExtractComparer.java index 65606d0ceb..7b006dfed8 100644 --- a/tika-eval/src/main/java/org/apache/tika/eval/ExtractComparer.java +++ b/tika-eval/src/main/java/org/apache/tika/eval/ExtractComparer.java @@ -83,6 +83,7 @@ public class ExtractComparer extends AbstractProfiler { .addOption("maxTokens", true, "maximum tokens to process, default=200000") .addOption("maxContentLength", true, "truncate content beyond this length for calculating 'contents' stats, default=1000000") .addOption("maxContentLengthForLangId", true, "truncate content beyond this length for language id, default=50000") + .addOption("defaultLangCode", true, "which language to use for common words if no 'common words' file exists for the langid result") ; } diff --git a/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java b/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java index 514778faca..d5f9af3c97 100644 --- a/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java +++ b/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java @@ -71,6 +71,7 @@ public class ExtractProfiler extends AbstractProfiler { .addOption("maxTokens", true, "maximum tokens to process, default=200000") .addOption("maxContentLength", true, "truncate content beyond this length for calculating 'contents' stats, default=1000000") .addOption("maxContentLengthForLangId", true, "truncate content beyond this length for language id, default=50000") + .addOption("defaultLangCode", true, "which language to use for common words if no 'common words' file exists for the langid result") ; diff --git a/tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumersBuilder.java b/tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumersBuilder.java index 55bb5236e5..a5a912ff29 100644 --- a/tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumersBuilder.java +++ b/tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumersBuilder.java @@ -70,9 +70,13 @@ public ConsumersManager build(Node node, Map runtimeAttributes, } Path commonTokens = getPath(localAttrs, "commonTokens"); + String defaultLangCode = localAttrs.get("defaultLangCode"); + if (defaultLangCode == null || "".equals(defaultLangCode)) { + defaultLangCode = "en"; + } //can be null, in which case will load from memory try { - AbstractProfiler.loadCommonTokens(commonTokens); + AbstractProfiler.loadCommonTokens(commonTokens, defaultLangCode); } catch (IOException e) { throw new RuntimeException(e); } diff --git a/tika-eval/src/main/java/org/apache/tika/eval/tokens/CommonTokenCountManager.java b/tika-eval/src/main/java/org/apache/tika/eval/tokens/CommonTokenCountManager.java index d4aaa65417..a03e241d4b 100644 --- a/tika-eval/src/main/java/org/apache/tika/eval/tokens/CommonTokenCountManager.java +++ b/tika-eval/src/main/java/org/apache/tika/eval/tokens/CommonTokenCountManager.java @@ -47,9 +47,10 @@ public class CommonTokenCountManager { //if we have no model or if no langid is passed in //make this configurable - String defaultLangCode = "en"; + private final String defaultLangCode; - public CommonTokenCountManager(Path commonTokensDir) throws IOException { + public CommonTokenCountManager(Path commonTokensDir, String defaultLangCode) throws IOException { + this.defaultLangCode = defaultLangCode; this.commonTokensDir = commonTokensDir; tryToLoad(defaultLangCode); //if you couldn't load it, make sure to add an empty diff --git a/tika-eval/src/main/resources/tika-eval-comparison-config.xml b/tika-eval/src/main/resources/tika-eval-comparison-config.xml index 887a3e74e5..1ddcda24cf 100644 --- a/tika-eval/src/main/resources/tika-eval-comparison-config.xml +++ b/tika-eval/src/main/resources/tika-eval-comparison-config.xml @@ -64,6 +64,8 @@ description="truncate content beyond this length for calculating 'contents' stats, default=1000000"/>