36 changes: 36 additions & 0 deletions gradle/generation/kuromoji.gradle
@@ -132,6 +132,42 @@ configure(project(":lucene:analysis:kuromoji")) {
}
}

task compileUnidic(type: Download) {
description "Recompile dictionaries from UniDic data from https://clrd.ninjal.ac.jp/unidic_archive"
group "generation"

dependsOn deleteDictionaryData
dependsOn sourceSets.main.runtimeClasspath

def dictionaryName = "unidic-cwj-3.1.1-full"
def dictionarySource = "https://clrd.ninjal.ac.jp/unidic_archive/cwj/3.1.1/${dictionaryName}.zip"
def dictionaryFile = file("${buildDir}/generate/${dictionaryName}.zip")
def unpackedDir = file("${buildDir}/generate/${dictionaryName}")

src dictionarySource
dest dictionaryFile
onlyIfModified true

doLast {
// Unpack the downloaded archive.
delete unpackedDir
ant.unzip(src: dictionaryFile, dest: unpackedDir) {
ant.cutdirsmapper(dirs: "1")
}

// Compile the dictionary
recompileDictionary(project, dictionaryName, {
args += [
"unidic",
unpackedDir,
targetDir,
"UTF-8",
false
]
})
}
}

regenerate.dependsOn compileMecab
}
}
2 changes: 2 additions & 0 deletions lucene/CHANGES.txt
@@ -102,6 +102,8 @@ New Features
Improvements
---------------------

* LUCENE-4056: Japanese Tokenizer (Kuromoji) can build a UniDic dictionary. (Jun Ohtani, Alexander Zagniotov)

* LUCENE-10416: Update Korean Dictionary to mecab-ko-dic-2.1.1-20180720 for Nori.
(Uihyun Kim)

DictionaryBuilder.java
@@ -25,7 +25,7 @@
* Tool to build dictionaries. Usage:
*
* <pre>
* java -cp [lucene classpath] org.apache.lucene.analysis.ja.util.DictionaryBuilder \
* java -cp [lucene classpath] org.apache.lucene.analysis.ja.dict.DictionaryBuilder \
* ${inputDir} ${outputDir} ${encoding} ${normalizeEntry}
* </pre>
*
@@ -66,7 +66,7 @@ public static void build(
.build(inputDir)
.write(outputDir);

new UnknownDictionaryBuilder(encoding).build(inputDir).write(outputDir);
new UnknownDictionaryBuilder(format, encoding).build(inputDir).write(outputDir);

ConnectionCostsBuilder.build(inputDir.resolve("matrix.def"))
.write(outputDir, DictionaryConstants.CONN_COSTS_HEADER, DictionaryConstants.VERSION);
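
For orientation, a minimal sketch of how the builder above might be driven for a UniDic dump, mirroring the arguments the compileUnidic gradle task passes ("unidic", the unpacked directory, the target directory, "UTF-8", false). Illustrative only, not part of the PR: it assumes the build(format, inputDir, outputDir, encoding, normalizeEntry) signature implied by this hunk, the DictionaryFormat enum nested in DictionaryBuilder, and the package shown in the updated usage line; the paths are placeholders.

import java.nio.file.Path;
import java.nio.file.Paths;

import org.apache.lucene.analysis.ja.dict.DictionaryBuilder;
import org.apache.lucene.analysis.ja.dict.DictionaryBuilder.DictionaryFormat;

public class BuildUniDicSketch {
  public static void main(String[] args) throws Exception {
    // Placeholder paths: the unpacked unidic-cwj-3.1.1-full CSVs and an output directory.
    Path inputDir = Paths.get("build/generate/unidic-cwj-3.1.1-full");
    Path outputDir = Paths.get("build/generate/dictionary-out");
    // "UTF-8" matches the gradle task; false disables entry normalization.
    DictionaryBuilder.build(DictionaryFormat.UNIDIC, inputDir, outputDir, "UTF-8", false);
  }
}
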
TokenInfoDictionaryBuilder.java
@@ -62,7 +62,7 @@ public TokenInfoDictionaryWriter build(Path dir) throws IOException {
}

private TokenInfoDictionaryWriter buildDictionary(List<Path> csvFiles) throws IOException {
TokenInfoDictionaryWriter dictionary = new TokenInfoDictionaryWriter(10 * 1024 * 1024);
TokenInfoDictionaryWriter dictionary = new TokenInfoDictionaryWriter(format, 10 * 1024 * 1024);
Charset cs = Charset.forName(encoding);
// all lines in the file
List<String[]> lines = new ArrayList<>(400000);
@@ -72,10 +72,7 @@ private TokenInfoDictionaryWriter buildDictionary(List<Path> csvFiles) throws IO
while ((line = reader.readLine()) != null) {
String[] entry = CSVUtil.parse(line);

if (entry.length < 13) {
throw new IllegalArgumentException(
"Entry in CSV is not valid (13 field values expected): " + line);
}
validateEntryLengthWithThrow(line, entry);

lines.add(formatEntry(entry));

@@ -130,6 +127,16 @@ private TokenInfoDictionaryWriter buildDictionary(List<Path> csvFiles) throws IO
return dictionary;
}

private void validateEntryLengthWithThrow(final String line, String[] entry) {
if (this.format == DictionaryBuilder.DictionaryFormat.IPADIC && entry.length < 13) {
throw new IllegalArgumentException(
"Entry in CSV is not valid (13 field values expected): " + line);
} else if (this.format == DictionaryBuilder.DictionaryFormat.UNIDIC && entry.length < 21) {
throw new IllegalArgumentException(
"Entry in CSV is not valid (21 field values expected): " + line);
}
}

/*
* IPADIC features
*
@@ -150,9 +157,10 @@ private TokenInfoDictionaryWriter buildDictionary(List<Path> csvFiles) throws IO
* 3 - word cost
* 4-9 - pos
* 10 - base form reading
* 11 - base form
* 11 - lexeme - not used
* 12 - surface form
* 13 - surface reading
* 14 - orthographic form
*/

private String[] formatEntry(String[] features) {
@@ -170,7 +178,7 @@ private String[] formatEntry(String[] features) {
features2[7] = features[7];
features2[8] = features[8];
features2[9] = features[9];
features2[10] = features[11];
features2[10] = features[14];

// If the surface reading is non-existent, use surface form for reading and pronunciation.
// This happens with punctuation in UniDic and there are possibly other cases as well
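
As a reading aid for the remapping above: formatEntry now fills internal slot 10 from source index 14 (the orthographic form) instead of index 11 (the unused lexeme). A hedged sketch of that mapping follows, with descriptive names only and no real dictionary data; the slot meanings come from the comments in this hunk, not from code outside it.

// Illustrative only; not part of the PR. Mirrors the mapping formatEntry applies to a UniDic row.
static String[] toInternalLayout(String[] unidicRow) {
  String[] internal = new String[13];
  // Slots 0-9 (surface, costs, POS fields, base form reading) carry over by index,
  // as in the feature comments above.
  System.arraycopy(unidicRow, 0, internal, 0, 10);
  // Slot 10 now takes the orthographic form at index 14; index 11 (lexeme) is not used.
  internal[10] = unidicRow[14];
  // Slots 11-12 (reading, pronunciation) are filled later in formatEntry, falling back
  // to the surface form when the surface reading is missing.
  return internal;
}
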
TokenInfoDictionaryEntryWriter.java
@@ -26,10 +26,17 @@

/** Writes system dictionary entries */
class TokenInfoDictionaryEntryWriter extends DictionaryEntryWriter {
private static final int ID_LIMIT = 8192;
private static final int IPADIC_ID_LIMIT = 8192;

TokenInfoDictionaryEntryWriter(int size) {
// E.g.: unidic-cwj-3.1.1-full: 15388
// E.g.: unidic-cwj-202302_full: 18859
private static final int UNIDIC_ID_LIMIT = 18859;

private final DictionaryBuilder.DictionaryFormat format;

TokenInfoDictionaryEntryWriter(DictionaryBuilder.DictionaryFormat format, int size) {
super(size);
this.format = format;
}

/**
@@ -47,6 +54,21 @@ class TokenInfoDictionaryEntryWriter extends DictionaryEntryWriter {
* 11 - reading
* 12 - pronunciation
* </pre>
*
* <p>unidic features
*
* <pre>
* 0 - surface
* 1 - left cost
* 2 - right cost
* 3 - word cost
* 4-9 - pos
* 10 - base form reading
* 11 - lexeme - not used
* 12 - surface form
* 13 - surface reading
* 14 - orthographic form
* </pre>
*/
@Override
protected int putEntry(String[] entry) {
@@ -114,31 +136,29 @@ protected int putEntry(String[] entry) {
flags |= TokenInfoMorphData.HAS_PRONUNCIATION;
}

if (leftId != rightId) {
throw new IllegalArgumentException("rightId != leftId: " + rightId + " " + leftId);
}
if (leftId >= ID_LIMIT) {
throw new IllegalArgumentException("leftId >= " + ID_LIMIT + ": " + leftId);
}
validateLeftRightIdsWithThrow(leftId, rightId);
// add pos mapping
int toFill = 1 + leftId - posDict.size();
for (int i = 0; i < toFill; i++) {
posDict.add(null);
}

String existing = posDict.get(leftId);
if (existing != null && existing.equals(fullPOSData) == false) {
// TODO: test me
throw new IllegalArgumentException("Multiple entries found for leftID=" + leftId);
}
posDict.set(leftId, fullPOSData);

buffer.putShort((short) (leftId << 3 | flags));
buffer.putShort(wordCost);

if ((flags & TokenInfoMorphData.HAS_BASEFORM) != 0) {
if (baseForm.length() >= 16) {
throw new IllegalArgumentException("Length of base form " + baseForm + " is >= 16");
if (this.format == DictionaryBuilder.DictionaryFormat.IPADIC && baseForm.length() >= 16) {
throw new IllegalArgumentException(
"IPADIC base form length " + baseForm.length() + " is >= 16");
}

// Added this check because building unidic-cwj-3.1.1-full produced base forms longer
// than 16 characters, which made the original check fail.
if (this.format == DictionaryBuilder.DictionaryFormat.UNIDIC && baseForm.length() >= 35) {
throw new IllegalArgumentException(
"UNIDIC base form length " + baseForm.length() + " is >= 35");
}
int shared = sharedPrefix(entry[0], baseForm);
int suffix = baseForm.length() - shared;
@@ -179,6 +199,20 @@ protected int putEntry(String[] entry) {
return buffer.position();
}

private void validateLeftRightIdsWithThrow(short leftId, short rightId) {
if (this.format == DictionaryBuilder.DictionaryFormat.IPADIC && leftId != rightId) {
throw new IllegalArgumentException("IpaDic rightId != leftId: " + rightId + " " + leftId);
}

if (this.format == DictionaryBuilder.DictionaryFormat.IPADIC && leftId >= IPADIC_ID_LIMIT) {
throw new IllegalArgumentException("IpaDic leftId >= " + IPADIC_ID_LIMIT + ": " + leftId);
}

if (this.format == DictionaryBuilder.DictionaryFormat.UNIDIC && leftId >= UNIDIC_ID_LIMIT) {
throw new IllegalArgumentException("UniDic leftId >= " + UNIDIC_ID_LIMIT + ": " + leftId);
}
}

private boolean isKatakana(String s) {
for (int i = 0; i < s.length(); i++) {
char ch = s.charAt(i);
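
A note on where the renamed IPADIC_ID_LIMIT of 8192 comes from: further up in putEntry, leftId and three flag bits are packed into a single 16-bit short (leftId << 3 | flags), which leaves 13 bits for the id, i.e. values below 2^13 = 8192. A minimal sketch of that packing, illustrative only and separate from the PR; the flag value is an arbitrary stand-in for the HAS_* bits.

// Illustrative only: mirrors buffer.putShort((short) (leftId << 3 | flags)) in putEntry above.
public class LeftIdPackingSketch {
  public static void main(String[] args) {
    short leftId = 1285;                          // any id below 8192 fits in 13 bits
    int flags = 0b101;                            // stand-in for a combination of the three flag bits
    short packed = (short) (leftId << 3 | flags);
    int unpackedId = (packed & 0xFFFF) >>> 3;     // recovers 1285
    int unpackedFlags = packed & 0b111;           // recovers the flag bits
    System.out.println(unpackedId + " " + unpackedFlags);
  }
}
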
TokenInfoDictionaryWriter.java
@@ -26,8 +26,8 @@ class TokenInfoDictionaryWriter
extends org.apache.lucene.analysis.morph.BinaryDictionaryWriter<TokenInfoDictionary> {
private FST<Long> fst;

TokenInfoDictionaryWriter(int size) {
super(TokenInfoDictionary.class, new TokenInfoDictionaryEntryWriter(size));
TokenInfoDictionaryWriter(DictionaryBuilder.DictionaryFormat format, int size) {
super(TokenInfoDictionary.class, new TokenInfoDictionaryEntryWriter(format, size));
}

public void setFST(FST<Long> fst) {
UnknownDictionaryBuilder.java
@@ -30,9 +30,11 @@
class UnknownDictionaryBuilder {
private static final String NGRAM_DICTIONARY_ENTRY = "NGRAM,5,5,-32768,記号,一般,*,*,*,*,*,*,*";

private final DictionaryBuilder.DictionaryFormat format;
private final String encoding;

UnknownDictionaryBuilder(String encoding) {
UnknownDictionaryBuilder(DictionaryBuilder.DictionaryFormat format, String encoding) {
this.format = format;
this.encoding = encoding;
}

@@ -49,7 +51,7 @@ private UnknownDictionaryWriter readDictionaryFile(Path path) throws IOException

private UnknownDictionaryWriter readDictionaryFile(Path path, String encoding)
throws IOException {
UnknownDictionaryWriter dictionary = new UnknownDictionaryWriter(5 * 1024 * 1024);
UnknownDictionaryWriter dictionary = new UnknownDictionaryWriter(format, 5 * 1024 * 1024);

List<String[]> lines = new ArrayList<>();
try (Reader reader = Files.newBufferedReader(path, Charset.forName(encoding));
@@ -60,11 +62,8 @@ private UnknownDictionaryWriter readDictionaryFile(Path path, String encoding)
String line;
while ((line = lineReader.readLine()) != null) {
// note: unk.def only has 10 fields, it simplifies the writer to just append empty reading
// and pronunciation,
// even though the unknown dictionary returns hardcoded null here.
final String[] parsed =
CSVUtil.parse(line + ",*,*"); // Probably we don't need to validate entry
lines.add(parsed);
// and pronunciation, even though the unknown dictionary returns hardcoded null here.
lines.add(parseCSVLine(line));
}
}

@@ -78,6 +77,14 @@ private UnknownDictionaryWriter readDictionaryFile(Path path, String encoding)
return dictionary;
}

private String[] parseCSVLine(final String line) {
if (this.format == DictionaryBuilder.DictionaryFormat.UNIDIC) {
return CSVUtil.parse(line + ",*,*,*"); // UniDic needs one more column
} else {
return CSVUtil.parse(line + ",*,*"); // Probably we don't need to validate entry
}
}

private void readCharacterDefinition(Path path, UnknownDictionaryWriter dictionary)
throws IOException {
try (Reader reader = Files.newBufferedReader(path, Charset.forName(encoding));
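
The per-format padding in parseCSVLine above only differs in column count: unk.def rows are shorter than the full feature layout, so placeholder columns are appended before the row reaches the writer, and a UniDic row needs one more than an IPADIC row. A minimal sketch of the effect, using a placeholder 10-column row rather than real unk.def data; the CSVUtil package is assumed to match the one used elsewhere in this builder.

// Illustrative only; not part of the PR.
import org.apache.lucene.analysis.ja.util.CSVUtil;  // package assumed

public class UnkDefPaddingSketch {
  public static void main(String[] args) {
    // A placeholder 10-column row standing in for an unk.def line (not real data).
    String line = String.join(",", java.util.Collections.nCopies(10, "*"));
    String[] ipadic = CSVUtil.parse(line + ",*,*");    // IPADIC: two trailing placeholders
    String[] unidic = CSVUtil.parse(line + ",*,*,*");  // UNIDIC: one more placeholder column
    System.out.println(ipadic.length + " vs " + unidic.length);  // 12 vs 13
  }
}
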
UnknownDictionaryWriter.java
@@ -29,8 +29,8 @@ class UnknownDictionaryWriter extends BinaryDictionaryWriter<UnknownDictionary>
CharacterDefinition.CLASS_COUNT,
CharacterDefinition::lookupCharacterClass);

public UnknownDictionaryWriter(int size) {
super(UnknownDictionary.class, new TokenInfoDictionaryEntryWriter(size));
public UnknownDictionaryWriter(DictionaryBuilder.DictionaryFormat format, int size) {
super(UnknownDictionary.class, new TokenInfoDictionaryEntryWriter(format, size));
}

@Override
TestUnknownDictionary.java
@@ -25,7 +25,8 @@ public class TestUnknownDictionary extends LuceneTestCase {

@Test
public void testPutCharacterCategory() {
UnknownDictionaryWriter unkDic = new UnknownDictionaryWriter(10 * 1024 * 1024);
UnknownDictionaryWriter unkDic =
new UnknownDictionaryWriter(DictionaryBuilder.DictionaryFormat.IPADIC, 10 * 1024 * 1024);

expectThrows(Exception.class, () -> unkDic.putCharacterCategory(0, "DUMMY_NAME"));

@@ -40,7 +41,8 @@

@Test
public void testPut() {
UnknownDictionaryWriter unkDic = new UnknownDictionaryWriter(10 * 1024 * 1024);
UnknownDictionaryWriter unkDic =
new UnknownDictionaryWriter(DictionaryBuilder.DictionaryFormat.IPADIC, 10 * 1024 * 1024);
expectThrows(
NumberFormatException.class,
() -> unkDic.put(CSVUtil.parse("KANJI,1285,11426,名詞,一般,*,*,*,*,*,*,*")));