From 73e0e11807f2c62a7620e53648cb379b18a0d1ee Mon Sep 17 00:00:00 2001
From: Christophe Bismuth
Date: Wed, 21 Nov 2018 13:58:44 +0100
Subject: [PATCH 1/2] LUCENE-8548: Add Cyrillic word test

---
Reviewer notes (this text sits between the three-dash marker and the
diffstat, so it is not part of the commit message):

* Line structure restored; the series had been collapsed onto three lines.
  Indentation of context lines was rebuilt from the 2-space Java style.
* NOTE(review): the input below starts with a Cyrillic letter and continues
  in Latin. With the script-boundary logic of 2/2 it presumably splits at
  that boundary, which would not match the single-token expectation here.
  Confirm the intended expectation before merging.

 .../lucene/analysis/ko/TestKoreanAnalyzer.java | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanAnalyzer.java b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanAnalyzer.java
index fd574cede49b..34b6ffe825b9 100644
--- a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanAnalyzer.java
+++ b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanAnalyzer.java
@@ -106,4 +106,15 @@ public void testUserDict() throws IOException {
         new int[]{1, 1, 1}
     );
   }
-}
\ No newline at end of file
+
+  public void testCyrillicWord() throws IOException {
+    final Analyzer analyzer = new KoreanAnalyzer(TestKoreanTokenizer.readDict(),
+        KoreanTokenizer.DEFAULT_DECOMPOUND, KoreanPartOfSpeechStopFilter.DEFAULT_STOP_TAGS, false);
+    assertAnalyzesTo(analyzer, "мoscow",
+        new String[]{"мoscow"},
+        new int[]{0},
+        new int[]{6},
+        new int[]{1}
+    );
+  }
+}

From 4c79ca6271537bb9ca347c6128a3a5ad016e5d97 Mon Sep 17 00:00:00 2001
From: Christophe Bismuth
Date: Fri, 23 Nov 2018 17:16:05 +0100
Subject: [PATCH 2/2] LUCENE-8548: Break on script boundaries and track
 character classes

---
Reviewer notes (this text sits between the three-dash marker and the
diffstat, so it is not part of the commit message):

* Line structure restored; the series had been collapsed onto three lines.
  Indentation of context lines was rebuilt from the 2-space Java style.
* scriptId is now derived from firstCharacter. The previous revision passed
  characterId (a character-class id, not a code point) to
  Character.UnicodeScript.of().
* characterClasses is now written at index characterClassesCount and the
  counter is incremented afterwards, so slot 0 is used and the write stays
  within the size requested from ArrayUtil.grow(). The previous revision
  incremented first and then wrote one slot too far. The array is still not
  read by any hunk in this patch.
* The loop now extends the unknown word while the script is unchanged or the
  character is a non-spacing mark, and breaks otherwise. This matches the
  subject line and the UTR #24 comment; the previous revision had the
  condition negated, so it broke on non-spacing marks.
* Every fix replaces one added line with one added line, so hunk sizes and
  the diffstat are unchanged. The commit id and the post-image blob id in
  this mail still name the original revision.

 .../lucene/analysis/ko/KoreanTokenizer.java | 46 +++++++++++++++++--
 1 file changed, 42 insertions(+), 4 deletions(-)

diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanTokenizer.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanTokenizer.java
index ab3205f212e2..a01c43929c35 100644
--- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanTokenizer.java
+++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanTokenizer.java
@@ -102,7 +102,7 @@ public enum DecompoundMode {
    */
   public static final DecompoundMode DEFAULT_DECOMPOUND = DecompoundMode.DISCARD;
 
-  private static final boolean VERBOSE = false;
+  private static final boolean VERBOSE = Boolean.parseBoolean(System.getProperty("tests.verbose", "false"));
 
   // For safety:
   private static final int MAX_UNKNOWN_WORD_LENGTH = 1024;
@@ -194,6 +194,10 @@ public KoreanTokenizer(AttributeFactory factory, UserDictionary userDictionary,
     dictionaryMap.put(Type.KNOWN, dictionary);
     dictionaryMap.put(Type.UNKNOWN, unkDictionary);
     dictionaryMap.put(Type.USER, userDictionary);
+
+    if (VERBOSE) {
+      setGraphvizFormatter(new GraphvizFormatter(ConnectionCosts.getInstance()));
+    }
   }
 
   private GraphvizFormatter dotOut;
@@ -208,6 +212,12 @@ public void setGraphvizFormatter(GraphvizFormatter dotOut) {
   public void close() throws IOException {
     super.close();
     buffer.reset(input);
+    if (dotOut != null) {
+      final String dotString = dotOut.finish();
+      if (VERBOSE) {
+        System.out.println(dotString);
+      }
+    }
   }
 
   @Override
@@ -719,10 +729,13 @@ private void parse() throws IOException {
 
         // Find unknown match:
         final int characterId = characterDefinition.getCharacterClass(firstCharacter);
-        final boolean isPunct = isPunctuation(firstCharacter);
+        final int scriptId = Character.UnicodeScript.of(firstCharacter).ordinal();
+        byte[] characterClasses = new byte[0];
+        int characterClassesCount = 0;
 
         // NOTE: copied from UnknownDictionary.lookup:
         int unknownWordLength;
+        int previousScriptId = scriptId;
         if (!characterDefinition.isGroup(firstCharacter)) {
           unknownWordLength = 1;
         } else {
@@ -733,8 +746,22 @@ private void parse() throws IOException {
             if (ch == -1) {
               break;
             }
-            if (characterId == characterDefinition.getCharacterClass((char) ch) &&
-                isPunctuation((char) ch) == isPunct) {
+            characterClasses = ArrayUtil.grow(characterClasses, characterClassesCount + 1);
+            characterClasses[characterClassesCount++] = characterDefinition.getCharacterClass((char) ch);
+            int currentScriptId = Character.UnicodeScript.of(ch).ordinal();
+            if (isCommonOrInherited(currentScriptId)) {
+              currentScriptId = previousScriptId;
+            } else {
+              previousScriptId = currentScriptId;
+            }
+            /*
+             * From UTR #24: Implementations that determine the boundaries between
+             * characters of given scripts should never break between a non-spacing
+             * mark and its base character. Thus for boundary determinations and
+             * similar sorts of processing, a non-spacing mark — whatever its script
+             * value — should inherit the script value of its base character.
+             */
+            if (isSameScript(currentScriptId, scriptId) || Character.getType(ch) == Character.NON_SPACING_MARK) {
               unknownWordLength++;
             } else {
               break;
@@ -958,4 +985,15 @@ private static boolean isPunctuation(char ch) {
         return false;
     }
   }
+
+  private static boolean isCommonOrInherited(int script) {
+    return script == Character.UnicodeScript.COMMON.ordinal() ||
+        script == Character.UnicodeScript.INHERITED.ordinal();
+  }
+
+  private static boolean isSameScript(int scriptOne, int scriptTwo) {
+    return scriptOne == scriptTwo ||
+        isCommonOrInherited(scriptOne) ||
+        isCommonOrInherited(scriptTwo);
+  }
 }