diff --git a/src/main/groovy/org/dict_uk/tools/Stemmer.groovy b/src/main/groovy/org/dict_uk/tools/Stemmer.groovy index 14a960f2..500f6bed 100755 --- a/src/main/groovy/org/dict_uk/tools/Stemmer.groovy +++ b/src/main/groovy/org/dict_uk/tools/Stemmer.groovy @@ -14,6 +14,8 @@ import groovy.transform.CompileStatic class Stemmer { + public static final boolean REMOVE_PREFIXES = false + Map> roots = [:].withDefault { [] as Set }.asSynchronized() Map> rootsPref = [:].withDefault { [] as Set }.asSynchronized() Map preStems = [:] @@ -49,6 +51,21 @@ class Stemmer { stemSet << stem } + if( ! REMOVE_PREFIXES ) { + def parts = words.split(/ +/) + if( parts.length > 1 ) { + words = parts[0] + def wordsWithPrefix = parts[1] + + wordsWithPrefix.split(/ /).findAll{ it }.each { w -> + if( w in preStems ) println "duplicate word: $w" + int idx = w.indexOf(stem) + def stem2 = w[0..idx] + stem + preStems[w] = stem2 + } + } + } + words.split(/ /).findAll{ it }.each { w -> if( w in preStems ) println "duplicate word: $w" preStems[w] = stem @@ -219,7 +236,10 @@ class Stemmer { // (Pattern.compile(/(мебл|магл)ьований/)): '$1', (Pattern.compile(/(бу|секре|компози|зекуц|бі|ди|ститу|моц)(ційний|торний)$/)): '$1т', (Pattern.compile(/((?