From d98260508810070094da81f2c5239483b849b8e8 Mon Sep 17 00:00:00 2001 From: Andriy Rysin Date: Fri, 19 May 2017 15:22:37 -0400 Subject: [PATCH 1/3] LUCENE-7841: Normalize ghe with upturn --- .../lucene/analysis/uk/UkrainianMorfologikAnalyzer.java | 2 ++ .../apache/lucene/analysis/uk/TestUkrainianAnalyzer.java | 9 ++++++++- lucene/ivy-versions.properties | 2 +- 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/uk/UkrainianMorfologikAnalyzer.java b/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/uk/UkrainianMorfologikAnalyzer.java index 6955fe334fc1..cd502fd82916 100644 --- a/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/uk/UkrainianMorfologikAnalyzer.java +++ b/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/uk/UkrainianMorfologikAnalyzer.java @@ -116,6 +116,8 @@ protected Reader initReader(String fieldName, Reader reader) { // ignored characters builder.add("\u0301", ""); builder.add("\u00AD", ""); + builder.add("ґ", "г"); + builder.add("Ґ", "Г"); NormalizeCharMap normMap = builder.build(); reader = new MappingCharFilter(normMap, reader); diff --git a/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/uk/TestUkrainianAnalyzer.java b/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/uk/TestUkrainianAnalyzer.java index 15b247d5af70..e9a010212e63 100644 --- a/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/uk/TestUkrainianAnalyzer.java +++ b/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/uk/TestUkrainianAnalyzer.java @@ -52,10 +52,17 @@ public void testSpecialCharsTokenStream() throws Exception { public void testCapsTokenStream() throws Exception { Analyzer a = new UkrainianMorfologikAnalyzer(); assertAnalyzesTo(a, "Цих Чайковського і Ґете.", - new String[] { "Чайковське", "Чайковський", "Ґете" }); + new String[] { "Чайковське", "Чайковський", "Гете" }); a.close(); } + public void testCharNormalization() throws Exception { + Analyzer a = new UkrainianMorfologikAnalyzer(); + assertAnalyzesTo(a, "Ґюмрі та Гюмрі.", + new String[] { "Гюмрі", "Гюмрі" }); + a.close(); + } + public void testSampleSentence() throws Exception { Analyzer a = new UkrainianMorfologikAnalyzer(); assertAnalyzesTo(a, "Це — проект генерування словника з тегами частин мови для української мови.", diff --git a/lucene/ivy-versions.properties b/lucene/ivy-versions.properties index 7e0e7c72f587..ca5141e85d53 100644 --- a/lucene/ivy-versions.properties +++ b/lucene/ivy-versions.properties @@ -276,7 +276,7 @@ org.slf4j.version = 1.7.7 /org.tukaani/xz = 1.5 /rome/rome = 1.0 -ua.net.nlp.morfologik-ukrainian-search.version = 3.7.5 +ua.net.nlp.morfologik-ukrainian-search.version = 3.7.6 /ua.net.nlp/morfologik-ukrainian-search = ${ua.net.nlp.morfologik-ukrainian-search.version} /xerces/xercesImpl = 2.9.1 From 2659a3f6b2b44ad7618189ba9a1a76ba4a2ff5c8 Mon Sep 17 00:00:00 2001 From: Andriy Rysin Date: Tue, 23 May 2017 08:05:36 -0400 Subject: [PATCH 2/3] LUCENE-7841: update jar checksum --- lucene/licenses/morfologik-ukrainian-search-3.7.5.jar.sha1 | 1 - lucene/licenses/morfologik-ukrainian-search-3.7.6.jar.sha1 | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) delete mode 100644 lucene/licenses/morfologik-ukrainian-search-3.7.5.jar.sha1 create mode 100644 lucene/licenses/morfologik-ukrainian-search-3.7.6.jar.sha1 diff --git a/lucene/licenses/morfologik-ukrainian-search-3.7.5.jar.sha1 b/lucene/licenses/morfologik-ukrainian-search-3.7.5.jar.sha1 deleted file mode 100644 index 8794e71fbe9b..000000000000 --- a/lucene/licenses/morfologik-ukrainian-search-3.7.5.jar.sha1 +++ /dev/null @@ -1 +0,0 @@ -2b8c8fbd740164d220ca7d18605b8b2092e163e9 diff --git a/lucene/licenses/morfologik-ukrainian-search-3.7.6.jar.sha1 b/lucene/licenses/morfologik-ukrainian-search-3.7.6.jar.sha1 new file mode 100644 index 000000000000..6f0b86c82908 --- /dev/null +++ b/lucene/licenses/morfologik-ukrainian-search-3.7.6.jar.sha1 @@ -0,0 +1 @@ +8d2c4bf006f59227bcba8885b4602b3a8b5bd799 From e8a9275ac7a1b188d1c7f98322d9bf63b7ad19f7 Mon Sep 17 00:00:00 2001 From: Andriy Rysin Date: Tue, 23 May 2017 08:09:10 -0400 Subject: [PATCH 3/3] LUCENE-7841: update changes file --- lucene/CHANGES.txt | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 8010decd115b..889d9208c25a 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -5,6 +5,10 @@ http://s.apache.org/luceneversions ======================= Lucene 6.7.0 ======================= +Improvements + +* LUCENE-7841: Normalize ґ to г in Ukrainian analyzer. (Andriy Rysin via Dawid Weiss) + Other * LUCENE-7800: Remove code that potentially rethrows checked exceptions @@ -58,7 +62,6 @@ Bug Fixes * LUCENE-7833: ToParentBlockJoinQuery computed the min score instead of the max score with ScoreMode.MAX. (Adrien Grand) -Improvements * LUCENE-7782: OfflineSorter now passes the total number of items it will write to getWriter (Mike McCandless) @@ -76,6 +79,10 @@ Improvements * LUCENE-7811: Sorted set facets now use sparse storage when collecting hits, when appropriate. (Mike McCandless) +Improvements + +* LUCENE-7841: Normalize ґ to г in Ukrainian analyzer. (Andriy Rysin via Dawid Weiss) + Optimizations * LUCENE-7787: spatial-extras HeatmapFacetCounter will now short-circuit it's