From c0b35fd455b17fbbde03757cffaea72e475ab993 Mon Sep 17 00:00:00 2001 From: Andriy Rysin Date: Fri, 19 May 2017 15:22:37 -0400 Subject: [PATCH 1/3] LUCENE-7841: Normalize ghe with upturn --- .../lucene/analysis/uk/UkrainianMorfologikAnalyzer.java | 2 ++ .../apache/lucene/analysis/uk/TestUkrainianAnalyzer.java | 9 ++++++++- lucene/ivy-versions.properties | 2 +- 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/uk/UkrainianMorfologikAnalyzer.java b/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/uk/UkrainianMorfologikAnalyzer.java index 6955fe334fc1..cd502fd82916 100644 --- a/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/uk/UkrainianMorfologikAnalyzer.java +++ b/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/uk/UkrainianMorfologikAnalyzer.java @@ -116,6 +116,8 @@ protected Reader initReader(String fieldName, Reader reader) { // ignored characters builder.add("\u0301", ""); builder.add("\u00AD", ""); + builder.add("ґ", "г"); + builder.add("Ґ", "Г"); NormalizeCharMap normMap = builder.build(); reader = new MappingCharFilter(normMap, reader); diff --git a/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/uk/TestUkrainianAnalyzer.java b/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/uk/TestUkrainianAnalyzer.java index 15b247d5af70..e9a010212e63 100644 --- a/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/uk/TestUkrainianAnalyzer.java +++ b/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/uk/TestUkrainianAnalyzer.java @@ -52,10 +52,17 @@ public void testSpecialCharsTokenStream() throws Exception { public void testCapsTokenStream() throws Exception { Analyzer a = new UkrainianMorfologikAnalyzer(); assertAnalyzesTo(a, "Цих Чайковського і Ґете.", - new String[] { "Чайковське", "Чайковський", "Ґете" }); + new String[] { "Чайковське", "Чайковський", "Гете" }); a.close(); } + public void testCharNormalization() throws Exception { + Analyzer a = new UkrainianMorfologikAnalyzer(); + assertAnalyzesTo(a, "Ґюмрі та Гюмрі.", + new String[] { "Гюмрі", "Гюмрі" }); + a.close(); + } + public void testSampleSentence() throws Exception { Analyzer a = new UkrainianMorfologikAnalyzer(); assertAnalyzesTo(a, "Це — проект генерування словника з тегами частин мови для української мови.", diff --git a/lucene/ivy-versions.properties b/lucene/ivy-versions.properties index 3a1efbb665b2..a5a05c606de7 100644 --- a/lucene/ivy-versions.properties +++ b/lucene/ivy-versions.properties @@ -275,7 +275,7 @@ org.slf4j.version = 1.7.7 /org.tukaani/xz = 1.5 /rome/rome = 1.0 -ua.net.nlp.morfologik-ukrainian-search.version = 3.7.5 +ua.net.nlp.morfologik-ukrainian-search.version = 3.7.6 /ua.net.nlp/morfologik-ukrainian-search = ${ua.net.nlp.morfologik-ukrainian-search.version} /xerces/xercesImpl = 2.9.1 From 28e033a0be809b29f32b5b663bc58721ddf8ef1f Mon Sep 17 00:00:00 2001 From: Andriy Rysin Date: Tue, 23 May 2017 08:05:36 -0400 Subject: [PATCH 2/3] LUCENE-7841: update jar checksum --- lucene/licenses/morfologik-ukrainian-search-3.7.5.jar.sha1 | 1 - lucene/licenses/morfologik-ukrainian-search-3.7.6.jar.sha1 | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) delete mode 100644 lucene/licenses/morfologik-ukrainian-search-3.7.5.jar.sha1 create mode 100644 lucene/licenses/morfologik-ukrainian-search-3.7.6.jar.sha1 diff --git a/lucene/licenses/morfologik-ukrainian-search-3.7.5.jar.sha1 b/lucene/licenses/morfologik-ukrainian-search-3.7.5.jar.sha1 deleted file mode 100644 index 8794e71fbe9b..000000000000 --- a/lucene/licenses/morfologik-ukrainian-search-3.7.5.jar.sha1 +++ /dev/null @@ -1 +0,0 @@ -2b8c8fbd740164d220ca7d18605b8b2092e163e9 diff --git a/lucene/licenses/morfologik-ukrainian-search-3.7.6.jar.sha1 b/lucene/licenses/morfologik-ukrainian-search-3.7.6.jar.sha1 new file mode 100644 index 000000000000..6f0b86c82908 --- /dev/null +++ b/lucene/licenses/morfologik-ukrainian-search-3.7.6.jar.sha1 @@ -0,0 +1 @@ +8d2c4bf006f59227bcba8885b4602b3a8b5bd799 From 524bd94c4e4efe1bca30a532f27ef2ae32b1d07a Mon Sep 17 00:00:00 2001 From: Andriy Rysin Date: Tue, 23 May 2017 08:09:55 -0400 Subject: [PATCH 3/3] LUCENE-7841: update changes file --- lucene/CHANGES.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 5dd6bce2b7c1..41ed0287bf7f 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -68,6 +68,8 @@ Improvements * LUCENE-7811: Sorted set facets now use sparse storage when collecting hits, when appropriate. (Mike McCandless) +* LUCENE-7841: Normalize ґ to г in Ukrainian analyzer. (Andriy Rysin via Dawid Weiss) + Optimizations * LUCENE-7787: spatial-extras HeatmapFacetCounter will now short-circuit it's