From 5f6df64f76139c282a4eac641feb09eab4bc08d6 Mon Sep 17 00:00:00 2001 From: Tomoko Uchida Date: Sat, 27 Jul 2019 17:16:31 +0900 Subject: [PATCH 1/5] LUCENE-8933: Validate JapaneseTokenizer user dictionary entry if the concatenated segment is same as its surface form. --- lucene/MIGRATE.txt | 12 ++++++++++++ .../lucene/analysis/ja/dict/UserDictionary.java | 8 ++++++++ .../analysis/ja/dict/UserDictionaryTest.java | 16 ++++++++++++++++ 3 files changed, 36 insertions(+) diff --git a/lucene/MIGRATE.txt b/lucene/MIGRATE.txt index 7cb7dd973551..b753088fe860 100644 --- a/lucene/MIGRATE.txt +++ b/lucene/MIGRATE.txt @@ -1,5 +1,17 @@ # Apache Lucene Migration Guide +## Kuromoji user dictionary now forbids illegal segmentation (LUCENE-8933) ## + +User dictionary now strictly validates if the (concatenated) segment is the same as the surface form. This change avoids +unexpected runtime exceptions or behaviours. +For example, those entries are not allowed at all and an exception is thrown when loading the dictionary file. + +# concatenated "日本経済新聞" does not match the surface form "日経新聞" +日経新聞,日本 経済 新聞,ニホン ケイザイ シンブン,カスタム名詞 + +# concatenated "日経新聞" does not match the surface form "日本経済新聞" +日本経済新聞,日経 新聞,ニッケイ シンブン,カスタム名詞 + ## TermsEnum is now fully abstract (LUCENE-8292) ## TermsEnum has been changed to be fully abstract, so non-abstract subclass must implement all it's methods. 
diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java index eaa5badd17a6..480a69d60608 100644 --- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java +++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java @@ -104,6 +104,8 @@ public int compare(String[] left, String[] right) { long ord = 0; for (String[] values : featureEntries) { + String surface = values[0].replaceAll(" ", ""); + String concatenatedSegment = values[1].replaceAll(" ", ""); String[] segmentation = values[1].replaceAll(" *", " ").split(" "); String[] readings = values[2].replaceAll(" *", " ").split(" "); String pos = values[3]; @@ -113,6 +115,12 @@ public int compare(String[] left, String[] right) { " - the number of segmentations (" + segmentation.length + ")" + " does not the match number of readings (" + readings.length + ")"); } + + if (!surface.equals(concatenatedSegment)) { + throw new RuntimeException("Illegal user dictionary entry " + values[0] + + " - the concatenated segmentation (" + concatenatedSegment + ")" + + " does not match the surface form (" + surface + ")"); + } int[] wordIdAndLength = new int[segmentation.length + 1]; // wordId offset, length, length.... 
wordIdAndLength[0] = wordId; diff --git a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/dict/UserDictionaryTest.java b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/dict/UserDictionaryTest.java index 88a366f87f51..08a21c85a59c 100644 --- a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/dict/UserDictionaryTest.java +++ b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/dict/UserDictionaryTest.java @@ -18,6 +18,7 @@ import java.io.IOException; +import java.io.StringReader; import org.apache.lucene.analysis.ja.TestJapaneseTokenizer; import org.apache.lucene.util.LuceneTestCase; @@ -77,4 +78,19 @@ public void testRead() throws IOException { UserDictionary dictionary = TestJapaneseTokenizer.readDict(); assertNotNull(dictionary); } + + @Test(expected = RuntimeException.class) + public void testReadInvalid1() throws IOException { + // the concatenated segment must be the same as the surface form + String invalidEntry = "日経新聞,日本 経済 新聞,ニホン ケイザイ シンブン,カスタム名詞"; + UserDictionary dictionary = UserDictionary.open(new StringReader(invalidEntry)); + } + + @Test(expected = RuntimeException.class) + public void testReadInvalid2() throws IOException { + // the concatenated segment must be the same as the surface form + String invalidEntry = "日本経済新聞,日経 新聞,ニッケイ シンブン,カスタム名詞"; + UserDictionary dictionary = UserDictionary.open(new StringReader(invalidEntry)); + } + } From f7cca9eef892ef3ebb26d9cb5722a3e8548af251 Mon Sep 17 00:00:00 2001 From: Tomoko Uchida Date: Sat, 27 Jul 2019 17:42:24 +0900 Subject: [PATCH 2/5] fix typo. --- lucene/MIGRATE.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lucene/MIGRATE.txt b/lucene/MIGRATE.txt index b753088fe860..0f37e6ee20f5 100644 --- a/lucene/MIGRATE.txt +++ b/lucene/MIGRATE.txt @@ -4,7 +4,7 @@ User dictionary now strictly validates if the (concatenated) segment is the same as the surface form. This change avoids unexpected runtime exceptions or behaviours. 
-For example, those entries are not allowed at all and an exception is thrown when loading the dictionary file. +For example, these entries are not allowed at all and an exception is thrown when loading the dictionary file. # concatenated "日本経済新聞" does not match the surface form "日経新聞" 日経新聞,日本 経済 新聞,ニホン ケイザイ シンブン,カスタム名詞 From 090bdc7030c057e301af9cc3a5392cab0b785d16 Mon Sep 17 00:00:00 2001 From: Tomoko Uchida Date: Sun, 28 Jul 2019 11:19:14 +0900 Subject: [PATCH 3/5] Use whitespace character class to remove all whitespace. --- .../org/apache/lucene/analysis/ja/dict/UserDictionary.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java index 480a69d60608..515c1d116636 100644 --- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java +++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java @@ -104,8 +104,8 @@ public int compare(String[] left, String[] right) { long ord = 0; for (String[] values : featureEntries) { - String surface = values[0].replaceAll(" ", ""); - String concatenatedSegment = values[1].replaceAll(" ", ""); + String surface = values[0].replaceAll("\\s", ""); + String concatenatedSegment = values[1].replaceAll("\\s", ""); String[] segmentation = values[1].replaceAll(" *", " ").split(" "); String[] readings = values[2].replaceAll(" *", " ").split(" "); String pos = values[3]; From 8e7e5a8474cd9db66b6c70531d655dd2ada0c2e8 Mon Sep 17 00:00:00 2001 From: Tomoko Uchida Date: Wed, 14 Aug 2019 11:52:54 +0900 Subject: [PATCH 4/5] lucene/MIGRATE.txt --- lucene/MIGRATE.txt | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/lucene/MIGRATE.txt b/lucene/MIGRATE.txt index 0f37e6ee20f5..5890e0d8bbd3 100644 --- a/lucene/MIGRATE.txt +++ b/lucene/MIGRATE.txt @@ -12,6 +12,21 @@ For 
example, these entries are not allowed at all and an exception is thrown whe # concatenated "日経新聞" does not match the surface form "日本経済新聞" 日本経済新聞,日経 新聞,ニッケイ シンブン,カスタム名詞 +## Analysis factories now have customizable symbolic names (LUCENE-8778) ## + +The SPI names for concrete subclasses of TokenizerFactory, TokenFilterFactory, and CharFilterFactory are no longer +derived from their class name. Instead, each factory must have a static "NAME" field like this: + + /** o.a.l.a.standard.StandardTokenizerFactory's SPI name */ + public static final String NAME = "standard"; + +A factory can be resolved/instantiated with its NAME by using methods such as TokenizerFactory#lookupClass(String) +or TokenizerFactory#forName(String, Map). + +If there are any user-defined factory classes that don't have a proper NAME field, an exception will be thrown +when (re)loading factories, e.g., when calling TokenizerFactory#reloadTokenizers(ClassLoader). + + ## TermsEnum is now fully abstract (LUCENE-8292) ## TermsEnum has been changed to be fully abstract, so non-abstract subclass must implement all it's methods. From df390db5a63575bb0bab1d7368328b028c3427ae Mon Sep 17 00:00:00 2001 From: Tomoko Uchida Date: Wed, 14 Aug 2019 12:02:46 +0900 Subject: [PATCH 5/5] Update changes --- lucene/CHANGES.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 1db0da0c5ac5..46b0bd95ca20 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -33,6 +33,8 @@ API Changes * LUCENE-8948: Change "name" argument in ICU factories to "form". Here, "form" is named after "Unicode Normalization Form". (Tomoko Uchida) +* LUCENE-8933: Validate JapaneseTokenizer user dictionary entry. (Tomoko Uchida) + Improvements * LUCENE-8757: When provided with an ExecutorService to run queries across