Skip to content

Commit

Permalink
TIKA-2323
Browse files Browse the repository at this point in the history
  • Loading branch information
tballison committed Apr 11, 2017
1 parent 3b33da2 commit 2a2e631
Show file tree
Hide file tree
Showing 3 changed files with 12 additions and 15 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -267,7 +267,7 @@ protected void writeProfileData(EvalFilePaths fps, int i, Metadata m,
data.put(Cols.ELAPSED_TIME_MILLIS,
getTime(m));

- String content = getContent(m, maxContentLength);
+ String content = getContent(m);
if (content == null || content.trim().length() == 0) {
data.put(Cols.HAS_CONTENT, FALSE);
} else {
Expand Down Expand Up @@ -458,15 +458,15 @@ void getExceptionStrings(Metadata metadata, Map<Cols, String> data) {
*/
protected static String getContent(Metadata metadata, int maxLength, Map<Cols, String> data) {
data.put(Cols.CONTENT_TRUNCATED_AT_MAX_LEN, "FALSE");
- String c = getContent(metadata, maxLength);
- if (c.length() > maxLength) {
+ String c = getContent(metadata);
+ if (maxLength > -1 && c.length() > maxLength) {
c = c.substring(0, maxLength);
data.put(Cols.CONTENT_TRUNCATED_AT_MAX_LEN, "TRUE");
}
return c;

}
- protected static String getContent(Metadata metadata, int maxLength) {
+ protected static String getContent(Metadata metadata) {
if (metadata == null) {
return "";
}
Expand All @@ -478,7 +478,7 @@ protected static String getContent(Metadata metadata, int maxLength) {
}

void unicodeBlocks(Metadata metadata, Map<Cols, String> data) {
- String content = getContent(metadata, maxContentLengthForLangId);
+ String content = getContent(metadata);
if (content.length() < 200) {
return;
}
Expand Down Expand Up @@ -531,7 +531,7 @@ public int compare(Pair<String, Integer> o1, Pair<String, Integer> o2) {
}

void langid(Metadata metadata, Map<Cols, String> data) {
- String content = getContent(metadata, maxContentLengthForLangId);
+ String content = getContent(metadata);
if (content.length() < 50) {
return;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -80,12 +80,9 @@ public class ExtractComparer extends AbstractProfiler {
.addOption("tablePrefixB", true, "EXPERT: optional prefix for table names for B")
.addOption("drop", true, "drop tables if they exist")
.addOption("maxFilesToAdd", true, "maximum number of files to add to the crawler")
.addOption("maxContentLength", true, "truncate content beyond this length for calculating 'contents' stats")
.addOption("maxContentLengthForLangId", true, "truncate content beyond this length for language id")
.addOption("maxTokens", true, "maximum tokens to process, default=200000")
.addOption("maxContentLength", true, "truncate content beyond this length for calculating 'contents' stats, default=1000000")
.addOption("maxContentLengthForLangId", true, "truncate content beyond this length for language id, default=50000")

;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -176,18 +176,18 @@ protected ExtractReader buildExtractReader(Map<String, String> localAttrs) {

FileResourceConsumer parameterizeProfiler(AbstractProfiler abstractProfiler) {

- int maxContentLength = PropsUtil.getInt(localAttrs.get("maxContentLength"), -1);
- if (maxContentLength > -1) {
+ int maxContentLength = PropsUtil.getInt(localAttrs.get("maxContentLength"), -2);
+ if (maxContentLength > -2) {
abstractProfiler.setMaxContentLength(maxContentLength);
}

- int maxContentLengthForLangId = PropsUtil.getInt(localAttrs.get("maxContentLengthForLangId"), -1);
- if (maxContentLengthForLangId > -1) {
+ int maxContentLengthForLangId = PropsUtil.getInt(localAttrs.get("maxContentLengthForLangId"), -2);
+ if (maxContentLengthForLangId > -2) {
abstractProfiler.setMaxContentLengthForLangId(maxContentLengthForLangId);
}

- int maxTokens = PropsUtil.getInt(localAttrs.get("maxTokens"), -1);
- if (maxTokens > -1) {
+ int maxTokens = PropsUtil.getInt(localAttrs.get("maxTokens"), -2);
+ if (maxTokens > -2) {
abstractProfiler.setMaxTokens(maxTokens);
}

Expand Down

0 comments on commit 2a2e631

Please sign in to comment.