Skip to content

Commit

Permalink
TIKA-2325 -- allow configuration of default language code for "common…
Browse files Browse the repository at this point in the history
… words" metric
  • Loading branch information
tballison committed Apr 12, 2017
1 parent 2a2e631 commit f3db573
Show file tree
Hide file tree
Showing 10 changed files with 59 additions and 11 deletions.
21 changes: 17 additions & 4 deletions tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java
Original file line number Diff line number Diff line change
Expand Up @@ -162,8 +162,8 @@ public enum PARSE_ERROR_TYPE {
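* @param p path to the common_tokens directory. If this is null, try to load from the classpath
* @param defaultLangCode language code to use for common words if no 'common words' file exists for the langid result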
* @throws IOException
*/
public static void loadCommonTokens(Path p) throws IOException {
commonTokenCountManager = new CommonTokenCountManager(p);
public static void loadCommonTokens(Path p, String defaultLangCode) throws IOException {
commonTokenCountManager = new CommonTokenCountManager(p, defaultLangCode);
}

public AbstractProfiler(ArrayBlockingQueue<FileResource> fileQueue,
Expand Down Expand Up @@ -541,16 +541,29 @@ void langid(Metadata metadata, Map<Cols, String> data) {
}
List<DetectedLanguage> probabilities = langIder.getProbabilities(s);
if (probabilities.size() > 0) {
data.put(Cols.LANG_ID_1, probabilities.get(0).getLocale().getLanguage());
data.put(Cols.LANG_ID_1, getLangString(probabilities.get(0)));
data.put(Cols.LANG_ID_PROB_1,
Double.toString(probabilities.get(0).getProbability()));
}
if (probabilities.size() > 1) {
data.put(Cols.LANG_ID_2, probabilities.get(1).getLocale().getLanguage());
data.put(Cols.LANG_ID_2, getLangString(probabilities.get(1)));
data.put(Cols.LANG_ID_PROB_2,
Double.toString(probabilities.get(1).getProbability()));
}
}

/**
 * Builds the language code written to the language id columns.
 * Chinese ("zh") gets a lower-cased region suffix so that the code maps onto
 * the common-tokens file names; any other language code is returned unchanged.
 *
 * @param detectedLanguage language id result to convert
 * @return the language code, or "zh-" followed by the region for Chinese
 */
private String getLangString(DetectedLanguage detectedLanguage) {
    final String code = detectedLanguage.getLocale().getLanguage();
    if (!"zh".equals(code)) {
        return code;
    }
    //no region reported for Chinese: fall back to "cn" and hope for the best
    final String region = detectedLanguage.getLocale().getRegion().isPresent()
            ? detectedLanguage.getLocale().getRegion().get().toLowerCase(Locale.US)
            : "cn";
    return code + "-" + region;
}

void getFileTypes(Metadata metadata, Map<Cols, String> output) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ public class ExtractComparer extends AbstractProfiler {
.addOption("maxTokens", true, "maximum tokens to process, default=200000")
.addOption("maxContentLength", true, "truncate content beyond this length for calculating 'contents' stats, default=1000000")
.addOption("maxContentLengthForLangId", true, "truncate content beyond this length for language id, default=50000")
.addOption("defaultLangCode", true, "which language to use for common words if no 'common words' file exists for the langid result")
;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ public class ExtractProfiler extends AbstractProfiler {
.addOption("maxTokens", true, "maximum tokens to process, default=200000")
.addOption("maxContentLength", true, "truncate content beyond this length for calculating 'contents' stats, default=1000000")
.addOption("maxContentLengthForLangId", true, "truncate content beyond this length for language id, default=50000")
.addOption("defaultLangCode", true, "which language to use for common words if no 'common words' file exists for the langid result")

;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,9 +70,13 @@ public ConsumersManager build(Node node, Map<String, String> runtimeAttributes,
}

Path commonTokens = getPath(localAttrs, "commonTokens");
String defaultLangCode = localAttrs.get("defaultLangCode");
if (defaultLangCode == null || "".equals(defaultLangCode)) {
defaultLangCode = "en";
}
//commonTokens can be null, in which case the common tokens are loaded from the classpath
try {
AbstractProfiler.loadCommonTokens(commonTokens);
AbstractProfiler.loadCommonTokens(commonTokens, defaultLangCode);
} catch (IOException e) {
throw new RuntimeException(e);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,9 +47,10 @@ public class CommonTokenCountManager {

//used if we have no model or if no langid is passed in;
//configurable via the constructor
String defaultLangCode = "en";
private final String defaultLangCode;

public CommonTokenCountManager(Path commonTokensDir) throws IOException {
public CommonTokenCountManager(Path commonTokensDir, String defaultLangCode) throws IOException {
this.defaultLangCode = defaultLangCode;
this.commonTokensDir = commonTokensDir;
tryToLoad(defaultLangCode);
//if you couldn't load it, make sure to add an empty
Expand Down
2 changes: 2 additions & 0 deletions tika-eval/src/main/resources/tika-eval-comparison-config.xml
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,8 @@
description="truncate content beyond this length for calculating 'contents' stats, default=1000000"/>
<option opt="maxContentLengthForLangId" hasArg="true"
description="truncate content beyond this length for language id, default=50000"/>
<option opt="defaultLangCode" hasArg="true"
description="which language to use for common words if no 'common words' file exists for the langid result"/>


</commandline>
Expand Down
2 changes: 2 additions & 0 deletions tika-eval/src/main/resources/tika-eval-profiler-config.xml
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,8 @@
description="truncate content beyond this length for calculating 'contents' stats, default=1000000"/>
<option opt="maxContentLengthForLangId" hasArg="true"
description="truncate content beyond this length for language id, default=50000"/>
<option opt="defaultLangCode" hasArg="true"
description="which language to use for common words if no 'common words' file exists for the langid result"/>



Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ public void setUp() throws Exception {
new ExtractReader(ExtractReader.ALTER_METADATA_LIST.AS_IS,
IGNORE_LENGTH, IGNORE_LENGTH),
writer);
AbstractProfiler.loadCommonTokens(this.getResourceAsFile("/common_tokens").toPath());
AbstractProfiler.loadCommonTokens(this.getResourceAsFile("/common_tokens").toPath(), "en");
LanguageIDWrapper.loadBuiltInModels();
}

Expand Down Expand Up @@ -137,6 +137,30 @@ public void testBasicSpanish() throws Exception {

}

@Test
public void testChinese() throws Exception {
    //The language id result has to line up with the common words
    //file names. The fixture holds machine-translated Simplified Chinese
    //with known "common words" appended at the end.
    EvalFilePaths extractA = new EvalFilePaths(
            Paths.get("file13_attachANotB.doc.json"),
            getResourceAsFile("/test-dirs/extractsA/file13_attachANotB.doc.json").toPath());
    EvalFilePaths extractB = new EvalFilePaths(
            Paths.get("non-existent.json"),
            getResourceAsFile("/test-dirs/extractsB/non-existent.json").toPath());

    comparer.compareFiles(extractA, extractB);

    //only the first row of the "A" contents table is checked
    Map<Cols, String> firstRow =
            writer.getTable(ExtractComparer.CONTENTS_TABLE_A).get(0);

    assertEquals("122", firstRow.get(Cols.TOKEN_LENGTH_SUM));
    assertEquals("3", firstRow.get(Cols.NUM_COMMON_TOKENS));
    assertEquals("zh-cn", firstRow.get(Cols.COMMON_TOKENS_LANG));
}

@Test
public void testEmpty() throws Exception {
Expand Down Expand Up @@ -245,7 +269,7 @@ public void testAttachmentCounts() {
@Ignore
public void testDebug() throws Exception {
Path commonTokens = Paths.get(getResourceAsFile("/common_tokens_short.txt").toURI());
AbstractProfiler.loadCommonTokens(commonTokens);
AbstractProfiler.loadCommonTokens(commonTokens, "en");
EvalFilePaths fpsA = new EvalFilePaths(
Paths.get("file1.pdf.json"),
getResourceAsFile("/test-dirs/extractsA/file1.pdf.json").toPath()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,6 @@

public class TikaEvalCLITest extends TikaTest {
//TODO: these barely reach the minimal acceptable stage for unit tests
//but we have to start somewhere on the integration tests

private static Path extractsDir = Paths.get("src/test/resources/test-dirs");

Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
[{
"Content-Type":"text/plain",
"X-TIKA:content":"the quick brown fox fox fox jumped over the lazy lazy dog"
"_comment" : "simplified",
"X-TIKA:content":"调整每一个心脏和每个声音,投标每个护理提取;让大家一起欢乐,赞美老拿骚.调整每一个心脏和每个声音,投标每个护理提取;让大家一起欢乐,赞美老拿骚 狐狸狐狸狐狸 "
},
{
"Content-Type":"text/plain",
Expand Down

0 comments on commit f3db573

Please sign in to comment.