-
Notifications
You must be signed in to change notification settings - Fork 70
/
build.gradle
163 lines (117 loc) · 3.69 KB
/
build.gradle
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
apply plugin: 'java'
apply plugin: 'groovy'

// Version of the languagetool-tools artifact used to build the dictionary.
String ltVersion = '4.3-SNAPSHOT'

// Language code; it forms the last segment of the resource package path.
String langCode = 'uk'
String packageDir = "org/apache/lucene/analysis/${langCode}"

// Package directory inside the first main resources source dir (input side)
// and inside the processed-resources output dir (generated files go here).
String resourceDir = "${sourceSets.main.resources.srcDirs[0]}/${packageDir}"
String outputDir = "${sourceSets.main.output.resourcesDir}/${packageDir}"

// Scratch directory for intermediate files, relative to the project directory.
String tmpDir = 'build/tmp'

// Location and name of the tagged source dictionary.
String inputDir = "${projectDir}/../../out"
String inputDictFile = 'dict_corp_lt.txt'

// The prepareDict task converts the dictionary from srcEncoding to dstEncoding.
String dstEncoding = 'cp1251'
String srcEncoding = 'utf-8'
// Artifact repositories, consulted in declaration order: the local Maven
// repository first, then Maven Central.
repositories {
    mavenLocal()
    mavenCentral()
}
// Extra dependency configuration holding the tools used to generate the
// dictionaries; it is separate from the project's compile dependencies.
configurations {
    maybeCreate('provided').description = 'Configuration for generating the dictionaries'
}
dependencies {
    // Dictionary-building tools; resolved through the custom 'provided'
    // configuration, which the createStemmerDict task uses as its classpath.
    provided "org.languagetool:languagetool-tools:${ltVersion}"

    // Libraries needed by this project's own sources.
    // 'implementation' replaces the 'compile' configuration, which is deprecated
    // and no longer available in recent Gradle releases. Both feed the main
    // source set's compile and runtime classpaths, so nothing changes for this
    // build, which does not publish a library.
    implementation group: 'org.codehaus.groovy', name: 'groovy-all', version: '3.0.1'
    implementation group: 'commons-cli', name: 'commons-cli', version: '1.4'
    implementation group: 'ch.qos.logback', name: 'logback-classic', version: '1.2.3'
}
// Adds extra source directories to the main source set: Java sources shared
// from two levels above this project, and this project's Groovy sources.
sourceSets {
    main {
        java.srcDir '../../src/main/java'
        groovy.srcDir 'src/main/groovy'
    }
}
// Converts the tagged source dictionary into the intermediate file consumed by
// createStemmerDict. The work is done by a shell pipeline:
//   sort/unique -> normalize with an inline Groovy script -> re-encode -> sort/unique.
task prepareDict(type: Exec, dependsOn: processResources) {
    def dictPath = "${inputDir}/$inputDictFile"
    def taggedPath = "${tmpDir}/all.tagged.tmp"

    inputs.file dictPath
    outputs.file taggedPath

    workingDir "$projectDir"

    doFirst {
        println "Preparing lucene dict from $inputDictFile"
        // Make sure the scratch directory exists before the shell redirects into it.
        new File("$workingDir/$tmpDir").mkdirs()
    }

    // Inline script run by `groovy -e`; it reads "token lemma tags" lines from stdin.
    // Lines whose tags contain ":inf:coll" are dropped. Consecutive lines with the
    // same token and lemma are merged into one output line whose tags are joined
    // with ';'. The first line of each group is printed with spaces turned into tabs.
    // The script text must not contain single quotes: it is wrapped in single
    // quotes on the shell command line below.
    def normalizerScript = '''
def prevKey=""
System.in.eachLine {
def (token, lemma, tags) = it.split()
def key = token + " " + lemma;
def ignore = tags.contains(":inf:coll")
// || token.contains("-") // hyphen may be useful if users use analyzers that do not break words on hyphen
if( ignore )
return
if( key != prevKey ) {
if( prevKey )
println ""
print it.replace((char)" ", (char)"\t")
prevKey = key
}
else {
print ";" + tags
}
}
println ""
'''

    // Pipeline stages, joined with " | " into a single shell command.
    def stages = [
        "cat ${dictPath}",
        "LC_ALL=POSIX sort -u",
        "groovy -e '${normalizerScript}'",
        "iconv -f ${srcEncoding} -t ${dstEncoding}",
        "LC_ALL=POSIX sort -u > ${taggedPath}"
    ]
    def shellCommand = stages.join(' | ')

    commandLine 'sh', '-c', shellCommand
}
// Builds the binary dictionary (ukrainian.dict) from the intermediate file
// produced by prepareDict, by running POSDictionaryBuilder from the
// languagetool-tools artifact resolved through the 'provided' configuration.
task createStemmerDict(type: JavaExec, dependsOn: prepareDict) {
    def outputDict = "${outputDir}/ukrainian.dict"

    // prepareDict's outputs are a file collection, so they are registered with
    // inputs.files; inputs.file is meant for a single file path.
    inputs.files tasks.prepareDict.outputs.files
    outputs.file outputDict

    // Relative paths in the arguments below are resolved against the project directory.
    workingDir "$projectDir"

    classpath = files(configurations.provided.files)
    main = 'org.languagetool.tools.POSDictionaryBuilder'

    args "-i", "${tmpDir}/all.tagged.tmp"           // intermediate file written by prepareDict
    args "-info", "${resourceDir}/ukrainian.info"   // info file kept next to the sources' resources
    args "-o", "${outputDict}"                      // resulting dictionary

    doFirst {
        // The builder writes straight into the resources output dir; create it if missing.
        new File("$outputDir").mkdirs()
    }
}
// Extracts a stop-word list from the tagged source dictionary and writes it,
// one word per line, to stopwords.txt in the resources output directory.
//
// A word form is a stop word when it contains no hyphen and either
//   - its tags match part|conj|prep and do not match :rare or :pers, or
//   - its tags match &pron and do not match :rare, or
//   - its lemma is one of 'бути', 'могти', 'можна'.
task stopwords {
    def srcDict = "${inputDir}/$inputDictFile"
    def outFile = "${outputDir}/stopwords.txt"

    inputs.file srcDict
    outputs.file outFile

    doFirst {
        // An insertion-ordered set keeps first-seen order while making the
        // duplicate check constant-time instead of a scan over a list.
        def stopWords = new LinkedHashSet<String>()

        // Read with the declared source encoding instead of the platform default.
        new File(srcDict).eachLine(srcEncoding) { line ->
            def fields = line.split()
            // Skip blank or malformed lines that lack the "word lemma tags" triple.
            if( fields.length < 3 )
                return

            def (word, lemma, tags) = fields

            if( word.contains('-') )
                return

            if( (tags =~ /part|conj|prep/ && ! (tags =~ /:rare|:pers/)) \
                    || (tags =~ /&pron/ && ! (tags =~ /:rare/)) \
                    || (lemma in ['бути','могти','можна'] ) ) {
                stopWords << word
            }
        }

        def target = new File(outFile)
        // The output directory may not exist yet when this task runs on its own.
        target.parentFile.mkdirs()
        // NOTE(review): written in srcEncoding (utf-8) so the result no longer depends
        // on the platform default charset - confirm this is what consumers expect.
        target.setText(stopWords.join("\n"), srcEncoding)

        // size() is called as a method: property access on a collection is
        // applied to its elements in Groovy, not to the collection itself.
        logger.lifecycle("Written ${stopWords.size()} stopwords into $outFile")
    }
}
// Runs org.dict_uk.tools.CreateStemmerMap on the main source set's runtime classpath.
task createStemmerMap(type: JavaExec) {
    main = 'org.dict_uk.tools.CreateStemmerMap'
    classpath = sourceSets.main.runtimeClasspath

    // workingDir = file("${projectDir}/../../data/affix")
}
// Aggregate task: has no actions of its own, it only triggers generation of
// the stemmer dictionary and the stop-word list.
task lucene {
    dependsOn createStemmerDict, stopwords
}
//classes.dependsOn(createStemmerDict, stopwords)
//compileJava.enabled = false