From 167090a3215b264731401c117d2418c0bb417851 Mon Sep 17 00:00:00 2001 From: Andriy Rysin Date: Fri, 15 Dec 2023 16:44:41 -0500 Subject: [PATCH] stemmer update --- .../groovy/org/dict_uk/tools/Stemmer.groovy | 31 +++++++++++++++++-- .../org/dict_uk/tools/StemmerTest.groovy | 21 ++++++++----- 2 files changed, 42 insertions(+), 10 deletions(-) diff --git a/src/main/groovy/org/dict_uk/tools/Stemmer.groovy b/src/main/groovy/org/dict_uk/tools/Stemmer.groovy index 14a960f2..500f6bed 100755 --- a/src/main/groovy/org/dict_uk/tools/Stemmer.groovy +++ b/src/main/groovy/org/dict_uk/tools/Stemmer.groovy @@ -14,6 +14,8 @@ import groovy.transform.CompileStatic class Stemmer { + public static final boolean REMOVE_PREFIXES = false + Map> roots = [:].withDefault { [] as Set }.asSynchronized() Map> rootsPref = [:].withDefault { [] as Set }.asSynchronized() Map preStems = [:] @@ -49,6 +51,21 @@ class Stemmer { stemSet << stem } + if( ! REMOVE_PREFIXES ) { + def parts = words.split(/ +/) + if( parts.length > 1 ) { + words = parts[0] + def wordsWithPrefix = parts[1] + + wordsWithPrefix.split(/ /).findAll{ it }.each { w -> + if( w in preStems ) println "duplicate word: $w" + int idx = w.indexOf(stem) + def stem2 = w[0..idx] + stem + preStems[w] = stem2 + } + } + } + words.split(/ /).findAll{ it }.each { w -> if( w in preStems ) println "duplicate word: $w" preStems[w] = stem @@ -219,7 +236,10 @@ class Stemmer { // (Pattern.compile(/(мебл|магл)ьований/)): '$1', (Pattern.compile(/(бу|секре|компози|зекуц|бі|ди|ститу|моц)(ційний|торний)$/)): '$1т', (Pattern.compile(/((?