Skip to content

Commit

Permalink
generateReplacements task cleanup
Browse files Browse the repository at this point in the history
  • Loading branch information
arysin committed Jan 27, 2024
1 parent f0e18fe commit 3a0a22f
Show file tree
Hide file tree
Showing 2 changed files with 79 additions and 101 deletions.
126 changes: 25 additions & 101 deletions distr/morfologik-ukrainian/build.gradle
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
plugins {
id 'java'
// id 'groovy'
id 'groovy'
id 'eclipse'
id 'base'
id 'maven-publish'
Expand Down Expand Up @@ -214,48 +214,33 @@ task createOutRulesDir {
}


def headText =
'''# Simple replace table%s
# Format: word=suggestion1|suggestion2|suggestion3...
'''

import org.dict_uk.morfologik.Replacements

task createReplacementDict(dependsOn: createOutRulesDir) {
def srcDir="${inputDir}/../data/dict"
def outFile="$outRulesDir/replace.txt"
def allFiles = Arrays.asList(new File(srcDir).listFiles())
def srcFiles = allFiles.findAll{ it.name =~ /(twisters|invalid|subst).*\.lst/ }

inputs.files "$srcDir/twisters.lst", "$srcDir/invalid.lst", "$srcDir/invalid-compound.lst", "$srcDir/invalid-auto-replace.txt", "$srcDir/subst.lst", "$srcDir/invalid-composite.lst"
inputs.files srcFiles
outputs.file outFile


doLast {

def headText =
'''# Simple replace table
# Format: word=suggestion1|suggestion2|suggestion3...
'''
def outLines = []
inputs.files.each { File file ->
file.eachLine "UTF-8", {
if( it.startsWith('#') || ! it.contains(' #>') )
return

it = it.replaceFirst(/\s*# rv[^\s]+/, '')

if( file.name.contains('composite') ) {
it = it.replaceFirst(/^([а-яіїєґА-ЯІЇЄҐ'-]+).*? - ([а-яіїєґА-ЯІЇЄҐ'-]+).*#> *(.*)/, '$1-$2=$3')
outLines << it
}
else {
it = it.replace(' +cs=', '')
it = it.replaceFirst(/^([а-яіїєґА-ЯІЇЄҐ'-]+).*#> *(.*)(#ok:.*)?/, '$1=$2')
outLines << it
}
}
}

def outLines = Replacements.getReplacements(srcDir, srcFiles, { it.contains(' #>') && ! it.contains('ua_1992')})

new File(outFile).text = headText + outLines.join("\n") + '\n'
println "Wrote ${outLines.size()} replacements"
}

}



task createSoftReplacementDict(dependsOn: createOutRulesDir) {
def srcDir="${inputDir}/../data/dict"
def outFile="${outRulesDir}/replace_soft.txt"
Expand All @@ -267,16 +252,10 @@ task createSoftReplacementDict(dependsOn: createOutRulesDir) {


doLast {
def outLines = Replacements.getReplacements(srcDir, srcFiles, { it.contains(' #> ') && ! it.contains('ua_1992')})

def headText =
'''# Simple replace table for soft suggestions
# Format: word=suggestion1|suggestion2|suggestion3...
'''
def outLines = getReplacements(srcDir, srcFiles, { ! it.contains('ua_1992')})

println "Wrote ${outLines.size()} replacements"
new File(outFile).text = headText + outLines.join('\n') + "\n"
println "Wrote ${outLines.size()} soft replacements"
new File(outFile).text = String.format(headText, " for soft suggestions") + outLines.join('\n') + "\n"
}
}

Expand All @@ -293,43 +272,14 @@ task createNewSpellingReplacementDict(dependsOn: createOutRulesDir) {
inputs.files srcFiles
outputs.file outFile


doLast {
def outLines = Replacements.getReplacements(srcDir, srcFiles, { it.contains(' #> ') && it.contains('ua_1992') })

def headText =
'''# Simple replace table for 2019 spelling suggestions
# Format: word=suggestion1|suggestion2|suggestion3...
'''
def outLines = getReplacements(srcDir, srcFiles, {it.contains('ua_1992')})

println "Wrote ${outLines.size} replacements"
new File(outFile).text = headText + outLines.join('\n') + "\n"
println "Wrote ${outLines.size()} ua_2019 replacements"
new File(outFile).text = String.format(headText, " for 2019 spelling suggestions") + outLines.join('\n') + "\n"
}
}

List<String> getReplacements(String srcDir, List<File> files, Closure filter) {
def outLines = []

files.each{ srcFile ->
def rvLines = new File("$srcDir/$srcFile.name").readLines()
.findAll {
! it.startsWith('#') && it.contains(' #> ') && filter(it)
}.collect{
it = it.replace(' +cs=', '')
it = it.replaceFirst(/^([а-яіїєґА-ЯІЇЄҐ'-]+).*#> *(.*)(#ok:.*)?/, '$1=$2')
it = it.replaceFirst(/ *# rv_...(\|rv_...)* */, '')
}

outLines.addAll(rvLines)
}

java.text.Collator coll = java.text.Collator.getInstance(new Locale("uk", "UA"))
Collections.sort(outLines, coll)

outLines
}


task createRenamedReplacementDict(dependsOn: [processResources, createOutRulesDir]) {
def srcDir="${inputDir}/../data/dict"
Expand All @@ -340,37 +290,11 @@ task createRenamedReplacementDict(dependsOn: [processResources, createOutRulesDi
inputs.files srcFiles
outputs.file outFile


def headText =
'''# Simple replace table for soft suggestions
# Format: name=replacemet|(optional) explanation
'''
doLast {
def outLines = []

srcFiles.each{ File srcFile ->
def rvLines = new File("$srcDir/$srcFile.name").text
.split('\n')
.findAll {
! it.startsWith('#') && it.contains(' #>> ')
}.collect{
if( srcFile.name.contains('composite') ) {
it = it.replaceFirst(/ \/.* - /, '-')
}
// it = it.replace(' +cs=', '')
it = it.replaceFirst(/^([а-яіїєґА-ЯІЇЄҐ'-]+).*#>> *(.*)(#ok:.*)?/, '$1=$2')
it = it.replaceFirst(/ *# rv_...(\|rv_...)* */, '')
}

outLines.addAll(rvLines)
}

java.text.Collator coll = java.text.Collator.getInstance(new Locale("uk", "UA"))
Collections.sort(outLines, coll)

println "Wrote ${outLines.size()} replacements"
new File(outFile).text = headText + outLines.join('\n') + "\n"
def outLines = Replacements.getReplacements(srcDir, srcFiles, { it.contains(' #>> ')})

println "Wrote ${outLines.size()} rename replacements"
new File(outFile).text = String.format(headText, " for toponim renaming") + outLines.join('\n') + "\n"
}
}

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
package org.dict_uk.morfologik

import groovy.transform.CompileStatic

/**
 * Turns dictionary source lines that carry a "#>" / "#>>" replacement marker into
 * "word=suggestion1|suggestion2|..." lines.
 */
public class Replacements {
    /** Maximum number of '|'-separated suggestions kept on one output line. */
    private static final int MAX_REPLACEMENTS = 5

    /**
     * Collects replacement lines from the given source files.
     *
     * For every file, lines starting with '#' and lines rejected by {@code filter} are dropped.
     * Each remaining line is rewritten to "word=suggestions" (or "part1-part2=suggestions" when
     * the file name contains "composite"). When a line has more than {@link #MAX_REPLACEMENTS}
     * suggestions, the surplus ones are merged into the last slot, separated by "; ".
     *
     * @param srcDir directory that is combined with each file's name to locate the file to read
     * @param files  source files; only the name of each entry is used, resolved against {@code srcDir}
     * @param filter closure called with each non-comment line; the line is kept when the result is true
     * @return all produced lines, sorted with a uk_UA {@link java.text.Collator}
     * @throws IllegalArgumentException if a kept line does not yield a "word=suggestions" pair
     */
    @CompileStatic
    public static List<String> getReplacements(String srcDir, List<File> files, Closure filter) {
        List<String> outLines = []

        files.each { File srcFile ->
            int tooManyReplacementsCount = 0
            boolean composite = srcFile.name.contains('composite')

            // Read explicitly as UTF-8 instead of depending on implicit charset handling.
            List<String> rvLines = new File("$srcDir/$srcFile.name").readLines('UTF-8')
                .findAll { String line ->
                    ! line.startsWith('#') && filter(line)
                }.collect { String line ->
                    // drop the "# rv..." tag, if any
                    String entry = line.replaceFirst(/\s*# rv[^\s]+/, '')

                    if( composite ) {
                        entry = entry.replaceFirst(/^([а-яіїєґА-ЯІЇЄҐ'-]+).*? - ([а-яіїєґА-ЯІЇЄҐ'-]+).*#>>? *(.*)/, '$1-$2=$3')
                    }
                    else {
                        entry = entry.replace(' +cs=', '')
                        // NOTE(review): the greedy (.*) also swallows any "#ok:" tail, so the optional
                        // (#ok:.*)? group never captures it - confirm whether that tail should be stripped.
                        entry = entry.replaceFirst(/^([а-яіїєґА-ЯІЇЄҐ'-]+).*#>>? *(.*)(#ok:.*)?/, '$1=$2')
                    }

                    // Split on the first '=' only, so a '=' inside the suggestions is preserved.
                    String[] lineParts = entry.split('=', 2)
                    if( lineParts.length < 2 || lineParts[1].isEmpty() ) {
                        throw new IllegalArgumentException('Cannot extract replacement from line in '
                            + srcFile.name + ': ' + line)
                    }

                    List<String> suggestions = Arrays.asList(lineParts[1].split(/\|/))
                    if( suggestions.size() > MAX_REPLACEMENTS ) {
                        // keep the first MAX-1 suggestions, fold the rest into the last slot
                        int keep = MAX_REPLACEMENTS - 1
                        entry = lineParts[0] + '=' + suggestions.subList(0, keep).join('|') +
                            '|' + suggestions.subList(keep, suggestions.size()).join('; ')
                        tooManyReplacementsCount++
                    }
                    entry
                }

            outLines.addAll(rvLines)
            if( tooManyReplacementsCount ) {
                println "INFO: merged ${tooManyReplacementsCount} replacements to fit into ${MAX_REPLACEMENTS} for ${srcFile.name}"
            }
        }

        java.text.Collator coll = java.text.Collator.getInstance(new Locale("uk", "UA"))
        coll.setStrength(java.text.Collator.IDENTICAL)
        coll.setDecomposition(java.text.Collator.NO_DECOMPOSITION)
        Collections.sort(outLines, coll)

        outLines
    }

}

0 comments on commit 3a0a22f

Please sign in to comment.