From 3d944e0edf3283b55faab8201d9a636d76ff4bd4 Mon Sep 17 00:00:00 2001 From: Chris Beer Date: Sat, 12 Aug 2017 10:32:25 -0700 Subject: [PATCH 1/2] SOLR-11231 Guard against unset fields when performing language detection --- .../TikaLanguageIdentifierUpdateProcessor.java | 15 ++++++++++----- ...eIdentifierUpdateProcessorFactoryTestCase.java | 13 +++++++++++++ 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/solr/contrib/langid/src/java/org/apache/solr/update/processor/TikaLanguageIdentifierUpdateProcessor.java b/solr/contrib/langid/src/java/org/apache/solr/update/processor/TikaLanguageIdentifierUpdateProcessor.java index 836a3bf6795a..c02b176a03bc 100644 --- a/solr/contrib/langid/src/java/org/apache/solr/update/processor/TikaLanguageIdentifierUpdateProcessor.java +++ b/solr/contrib/langid/src/java/org/apache/solr/update/processor/TikaLanguageIdentifierUpdateProcessor.java @@ -112,13 +112,18 @@ protected String concatFields(SolrInputDocument doc) { private int getExpectedSize(SolrInputDocument doc, String[] fields) { int docSize = 0; for (String field : fields) { - Collection contents = doc.getFieldValues(field); - for (Object content : contents) { - if (content instanceof String) { - docSize += Math.min(((String) content).length(), maxFieldValueChars); + if (doc.containsKey(field)) { + Collection contents = doc.getFieldValues(field); + if (contents != null) { + for (Object content : contents) { + if (content instanceof String) { + docSize += Math.min(((String) content).length(), maxFieldValueChars); + } + } + + docSize = Math.min(docSize, maxTotalChars); } } - docSize = Math.min(docSize, maxTotalChars); } return docSize; } diff --git a/solr/contrib/langid/src/test/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessorFactoryTestCase.java b/solr/contrib/langid/src/test/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessorFactoryTestCase.java index c7381a918e73..b90f54a4d3f6 100644 --- a/solr/contrib/langid/src/test/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessorFactoryTestCase.java +++ b/solr/contrib/langid/src/test/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessorFactoryTestCase.java @@ -211,6 +211,19 @@ public void testDefaultFallbackEmptyString() throws Exception { assertEquals("", liProcessor.process(doc).getFieldValue("language")); } + @Test + public void testMissingFieldEmptyString() throws Exception { + SolrInputDocument doc; + ModifiableSolrParams parameters = new ModifiableSolrParams(); + parameters.add("langid.fl", "no_such_field"); + parameters.add("langid.langField", "language"); + parameters.add("langid.enforceSchema", "false"); + liProcessor = createLangIdProcessor(parameters); + + doc = new SolrInputDocument(); + assertEquals("", liProcessor.process(doc).getFieldValue("language")); + } + @Test public void testFallback() throws Exception { SolrInputDocument doc; From 5a0106b5fecdb3b9cbeb3cda7e641cffd05b1694 Mon Sep 17 00:00:00 2001 From: Chris Beer Date: Sat, 12 Aug 2017 10:36:14 -0700 Subject: [PATCH 2/2] Short-circuit expected size calculation when it exceeds the maximum length --- .../processor/TikaLanguageIdentifierUpdateProcessor.java | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/solr/contrib/langid/src/java/org/apache/solr/update/processor/TikaLanguageIdentifierUpdateProcessor.java b/solr/contrib/langid/src/java/org/apache/solr/update/processor/TikaLanguageIdentifierUpdateProcessor.java index c02b176a03bc..df0e5f7fa252 100644 --- a/solr/contrib/langid/src/java/org/apache/solr/update/processor/TikaLanguageIdentifierUpdateProcessor.java +++ b/solr/contrib/langid/src/java/org/apache/solr/update/processor/TikaLanguageIdentifierUpdateProcessor.java @@ -121,7 +121,10 @@ private int getExpectedSize(SolrInputDocument doc, String[] fields) { } } - docSize = Math.min(docSize, maxTotalChars); + if (docSize > maxTotalChars) { + docSize = maxTotalChars; + break; + } } } }