36 changes: 36 additions & 0 deletions gradle/generation/kuromoji.gradle
@@ -132,6 +132,42 @@ configure(project(":lucene:analysis:kuromoji")) {
}
}

task compileUnidic(type: Download) {
description "Recompile dictionaries from UniDic data from https://clrd.ninjal.ac.jp/unidic_archive"
group "generation"

dependsOn deleteDictionaryData
dependsOn sourceSets.main.runtimeClasspath

def dictionaryName = "unidic-cwj-3.1.1-full"
def dictionarySource = "https://clrd.ninjal.ac.jp/unidic_archive/cwj/3.1.1/${dictionaryName}.zip"
def dictionaryFile = file("${buildDir}/generate/${dictionaryName}.zip")
def unpackedDir = file("${buildDir}/generate/${dictionaryName}")

src dictionarySource
dest dictionaryFile
onlyIfModified true

doLast {
// Unpack the downloaded archive.
delete unpackedDir
ant.unzip(src: dictionaryFile, dest: unpackedDir) {
ant.cutdirsmapper(dirs: "1")
}

// Compile the dictionary
recompileDictionary(project, dictionaryName, {
args += [
"unidic",
unpackedDir,
targetDir,
"UTF-8",
false
]
})
}
}

regenerate.dependsOn compileMecab
}
}
2 changes: 2 additions & 0 deletions lucene/CHANGES.txt
@@ -102,6 +102,8 @@ New Features
Improvements
---------------------

* LUCENE-4056: Japanese Tokenizer (Kuromoji) can build a UniDic dictionary. (Jun Ohtani, Alexander Zagniotov)

* LUCENE-10416: Update Korean Dictionary to mecab-ko-dic-2.1.1-20180720 for Nori.
(Uihyun Kim)

DictionaryBuilder.java
@@ -25,7 +25,7 @@
* Tool to build dictionaries. Usage:
*
* <pre>
* java -cp [lucene classpath] org.apache.lucene.analysis.ja.util.DictionaryBuilder \
* java -cp [lucene classpath] org.apache.lucene.analysis.ja.dict.DictionaryBuilder \
* ${inputDir} ${outputDir} ${encoding} ${normalizeEntry}
* </pre>
*
@@ -66,7 +66,7 @@ public static void build(
.build(inputDir)
.write(outputDir);

new UnknownDictionaryBuilder(encoding).build(inputDir).write(outputDir);
new UnknownDictionaryBuilder(format, encoding).build(inputDir).write(outputDir);

ConnectionCostsBuilder.build(inputDir.resolve("matrix.def"))
.write(outputDir, DictionaryConstants.CONN_COSTS_HEADER, DictionaryConstants.VERSION);
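
For orientation, a minimal sketch of how the builder above might be driven for a UniDic dump, mirroring the arguments the compileUnidic gradle task passes ("unidic", the unpacked directory, the target directory, "UTF-8", false). Illustrative only, not part of the PR: it assumes the build(format, inputDir, outputDir, encoding, normalizeEntry) signature implied by this hunk, the DictionaryFormat enum nested in DictionaryBuilder, and the package shown in the updated usage line; the paths are placeholders.

import java.nio.file.Path;
import java.nio.file.Paths;

import org.apache.lucene.analysis.ja.dict.DictionaryBuilder;
import org.apache.lucene.analysis.ja.dict.DictionaryBuilder.DictionaryFormat;

public class BuildUniDicSketch {
  public static void main(String[] args) throws Exception {
    // Placeholder paths: the unpacked unidic-cwj-3.1.1-full CSVs and an output directory.
    Path inputDir = Paths.get("build/generate/unidic-cwj-3.1.1-full");
    Path outputDir = Paths.get("build/generate/dictionary-out");
    // "UTF-8" matches the gradle task; false disables entry normalization.
    DictionaryBuilder.build(DictionaryFormat.UNIDIC, inputDir, outputDir, "UTF-8", false);
  }
}
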
TokenInfoDictionaryBuilder.java
@@ -62,7 +62,7 @@ public TokenInfoDictionaryWriter build(Path dir) throws IOException {
}

private TokenInfoDictionaryWriter buildDictionary(List<Path> csvFiles) throws IOException {
TokenInfoDictionaryWriter dictionary = new TokenInfoDictionaryWriter(10 * 1024 * 1024);
TokenInfoDictionaryWriter dictionary = new TokenInfoDictionaryWriter(format, 10 * 1024 * 1024);
Charset cs = Charset.forName(encoding);
// all lines in the file
List<String[]> lines = new ArrayList<>(400000);
@@ -72,10 +72,7 @@ private TokenInfoDictionaryWriter buildDictionary(List<Path> csvFiles) throws IO
while ((line = reader.readLine()) != null) {
String[] entry = CSVUtil.parse(line);

if (entry.length < 13) {
throw new IllegalArgumentException(
"Entry in CSV is not valid (13 field values expected): " + line);
}
validateEntryLengthWithThrow(line, entry);

lines.add(formatEntry(entry));

@@ -130,6 +127,16 @@ private TokenInfoDictionaryWriter buildDictionary(List<Path> csvFiles) throws IO
return dictionary;
}

private void validateEntryLengthWithThrow(final String line, String[] entry) {
if (this.format == DictionaryBuilder.DictionaryFormat.IPADIC && entry.length < 13) {
throw new IllegalArgumentException(
"Entry in CSV is not valid (13 field values expected): " + line);
} else if (this.format == DictionaryBuilder.DictionaryFormat.UNIDIC && entry.length < 21) {
throw new IllegalArgumentException(
"Entry in CSV is not valid (21 field values expected): " + line);
}
}

/*
* IPADIC features
*
@@ -150,9 +157,10 @@ private TokenInfoDictionaryWriter buildDictionary(List<Path> csvFiles) throws IO
* 3 - word cost
* 4-9 - pos
* 10 - base form reading
* 11 - base form
* 11 - lexeme - not used
* 12 - surface form
* 13 - surface reading
* 14 - orthographic form
*/

private String[] formatEntry(String[] features) {
@@ -170,7 +178,7 @@ private String[] formatEntry(String[] features) {
features2[7] = features[7];
features2[8] = features[8];
features2[9] = features[9];
features2[10] = features[11];
features2[10] = features[14];

// If the surface reading is non-existent, use surface form for reading and pronunciation.
// This happens with punctuation in UniDic and there are possibly other cases as well
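
As a reading aid for the remapping above: formatEntry now fills internal slot 10 from source index 14 (the orthographic form) instead of index 11 (the unused lexeme). A hedged sketch of that mapping follows, with descriptive names only and no real dictionary data; the slot meanings come from the comments in this hunk, not from code outside it.

// Illustrative only; not part of the PR. Mirrors the mapping formatEntry applies to a UniDic row.
static String[] toInternalLayout(String[] unidicRow) {
  String[] internal = new String[13];
  // Slots 0-9 (surface, costs, POS fields, base form reading) carry over by index,
  // as in the feature comments above.
  System.arraycopy(unidicRow, 0, internal, 0, 10);
  // Slot 10 now takes the orthographic form at index 14; index 11 (lexeme) is not used.
  internal[10] = unidicRow[14];
  // Slots 11-12 (reading, pronunciation) are filled later in formatEntry, falling back
  // to the surface form when the surface reading is missing.
  return internal;
}
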
TokenInfoDictionaryEntryWriter.java
@@ -26,10 +26,17 @@

/** Writes system dictionary entries */
class TokenInfoDictionaryEntryWriter extends DictionaryEntryWriter {
private static final int ID_LIMIT = 8192;
private static final int IPADIC_ID_LIMIT = 8192;

TokenInfoDictionaryEntryWriter(int size) {
// E.g.: unidic-cwj-3.1.1-full: 15388
// E.g.: unidic-cwj-202302_full: 18859
private static final int UNIDIC_ID_LIMIT = 18859;

private final DictionaryBuilder.DictionaryFormat format;

TokenInfoDictionaryEntryWriter(DictionaryBuilder.DictionaryFormat format, int size) {
super(size);
this.format = format;
}

/**
@@ -47,6 +54,21 @@ class TokenInfoDictionaryEntryWriter extends DictionaryEntryWriter {
* 11 - reading
* 12 - pronunciation
* </pre>
*
* <p>unidic features
*
* <pre>
* 0 - surface
* 1 - left cost
* 2 - right cost
* 3 - word cost
* 4-9 - pos
* 10 - base form reading
* 11 - lexeme - not used
* 12 - surface form
* 13 - surface reading
* 14 - orthographic form
* </pre>
*/
@Override
protected int putEntry(String[] entry) {
@@ -114,31 +136,29 @@ protected int putEntry(String[] entry) {
flags |= TokenInfoMorphData.HAS_PRONUNCIATION;
}

if (leftId != rightId) {
throw new IllegalArgumentException("rightId != leftId: " + rightId + " " + leftId);
}
if (leftId >= ID_LIMIT) {
throw new IllegalArgumentException("leftId >= " + ID_LIMIT + ": " + leftId);
}
validateLeftRightIdsWithThrow(leftId, rightId);
// add pos mapping
int toFill = 1 + leftId - posDict.size();
for (int i = 0; i < toFill; i++) {
posDict.add(null);
}

String existing = posDict.get(leftId);
if (existing != null && existing.equals(fullPOSData) == false) {
// TODO: test me
throw new IllegalArgumentException("Multiple entries found for leftID=" + leftId);
}
posDict.set(leftId, fullPOSData);

buffer.putShort((short) (leftId << 3 | flags));
buffer.putShort(wordCost);

if ((flags & TokenInfoMorphData.HAS_BASEFORM) != 0) {
if (baseForm.length() >= 16) {
throw new IllegalArgumentException("Length of base form " + baseForm + " is >= 16");
if (this.format == DictionaryBuilder.DictionaryFormat.IPADIC && baseForm.length() >= 16) {
throw new IllegalArgumentException(
"IPADIC base form length " + baseForm.length() + " is >= 16");
}

// Added this check because building unidic-cwj-3.1.1-full produced base forms longer
// than 16 characters, which made the original check fail.
if (this.format == DictionaryBuilder.DictionaryFormat.UNIDIC && baseForm.length() >= 35) {
throw new IllegalArgumentException(
"UNIDIC base form length " + baseForm.length() + " is >= 35");
}
int shared = sharedPrefix(entry[0], baseForm);
int suffix = baseForm.length() - shared;
@@ -179,6 +199,20 @@ protected int putEntry(String[] entry) {
return buffer.position();
}

private void validateLeftRightIdsWithThrow(short leftId, short rightId) {
if (this.format == DictionaryBuilder.DictionaryFormat.IPADIC && leftId != rightId) {
throw new IllegalArgumentException("IpaDic rightId != leftId: " + rightId + " " + leftId);
}

if (this.format == DictionaryBuilder.DictionaryFormat.IPADIC && leftId >= IPADIC_ID_LIMIT) {
throw new IllegalArgumentException("IpaDic leftId >= " + IPADIC_ID_LIMIT + ": " + leftId);
}

if (this.format == DictionaryBuilder.DictionaryFormat.UNIDIC && leftId >= UNIDIC_ID_LIMIT) {
throw new IllegalArgumentException("UniDic leftId >= " + UNIDIC_ID_LIMIT + ": " + leftId);
}
}

private boolean isKatakana(String s) {
for (int i = 0; i < s.length(); i++) {
char ch = s.charAt(i);
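
A note on where the renamed IPADIC_ID_LIMIT of 8192 comes from: further up in putEntry, leftId and three flag bits are packed into a single 16-bit short (leftId << 3 | flags), which leaves 13 bits for the id, i.e. values below 2^13 = 8192. A minimal sketch of that packing, illustrative only and separate from the PR; the flag value is an arbitrary stand-in for the HAS_* bits.

// Illustrative only: mirrors buffer.putShort((short) (leftId << 3 | flags)) in putEntry above.
public class LeftIdPackingSketch {
  public static void main(String[] args) {
    short leftId = 1285;                          // any id below 8192 fits in 13 bits
    int flags = 0b101;                            // stand-in for a combination of the three flag bits
    short packed = (short) (leftId << 3 | flags);
    int unpackedId = (packed & 0xFFFF) >>> 3;     // recovers 1285
    int unpackedFlags = packed & 0b111;           // recovers the flag bits
    System.out.println(unpackedId + " " + unpackedFlags);
  }
}
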
TokenInfoDictionaryWriter.java
@@ -26,8 +26,8 @@ class TokenInfoDictionaryWriter
extends org.apache.lucene.analysis.morph.BinaryDictionaryWriter<TokenInfoDictionary> {
private FST<Long> fst;

TokenInfoDictionaryWriter(int size) {
super(TokenInfoDictionary.class, new TokenInfoDictionaryEntryWriter(size));
TokenInfoDictionaryWriter(DictionaryBuilder.DictionaryFormat format, int size) {
super(TokenInfoDictionary.class, new TokenInfoDictionaryEntryWriter(format, size));
}

public void setFST(FST<Long> fst) {
UnknownDictionaryBuilder.java
@@ -30,9 +30,11 @@
class UnknownDictionaryBuilder {
private static final String NGRAM_DICTIONARY_ENTRY = "NGRAM,5,5,-32768,記号,一般,*,*,*,*,*,*,*";

private final DictionaryBuilder.DictionaryFormat format;
private final String encoding;

UnknownDictionaryBuilder(String encoding) {
UnknownDictionaryBuilder(DictionaryBuilder.DictionaryFormat format, String encoding) {
this.format = format;
this.encoding = encoding;
}

@@ -49,7 +51,7 @@ private UnknownDictionaryWriter readDictionaryFile(Path path) throws IOException

private UnknownDictionaryWriter readDictionaryFile(Path path, String encoding)
throws IOException {
UnknownDictionaryWriter dictionary = new UnknownDictionaryWriter(5 * 1024 * 1024);
UnknownDictionaryWriter dictionary = new UnknownDictionaryWriter(format, 5 * 1024 * 1024);

List<String[]> lines = new ArrayList<>();
try (Reader reader = Files.newBufferedReader(path, Charset.forName(encoding));
@@ -60,11 +62,8 @@ private UnknownDictionaryWriter readDictionaryFile(Path path, String encoding)
String line;
while ((line = lineReader.readLine()) != null) {
// note: unk.def only has 10 fields, it simplifies the writer to just append empty reading
// and pronunciation,
// even though the unknown dictionary returns hardcoded null here.
final String[] parsed =
CSVUtil.parse(line + ",*,*"); // Probably we don't need to validate entry
lines.add(parsed);
// and pronunciation, even though the unknown dictionary returns hardcoded null here.
lines.add(parseCSVLine(line));
}
}

@@ -78,6 +77,14 @@ private UnknownDictionaryWriter readDictionaryFile(Path path, String encoding)
return dictionary;
}

private String[] parseCSVLine(final String line) {
if (this.format == DictionaryBuilder.DictionaryFormat.UNIDIC) {
return CSVUtil.parse(line + ",*,*,*"); // UniDic needs one more column
} else {
return CSVUtil.parse(line + ",*,*"); // Probably we don't need to validate entry
}
}

private void readCharacterDefinition(Path path, UnknownDictionaryWriter dictionary)
throws IOException {
try (Reader reader = Files.newBufferedReader(path, Charset.forName(encoding));
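
The per-format padding in parseCSVLine above only differs in column count: unk.def rows are shorter than the full feature layout, so placeholder columns are appended before the row reaches the writer, and a UniDic row needs one more than an IPADIC row. A minimal sketch of the effect, using a placeholder 10-column row rather than real unk.def data; the CSVUtil package is assumed to match the one used elsewhere in this builder.

// Illustrative only; not part of the PR.
import org.apache.lucene.analysis.ja.util.CSVUtil;  // package assumed

public class UnkDefPaddingSketch {
  public static void main(String[] args) {
    // A placeholder 10-column row standing in for an unk.def line (not real data).
    String line = String.join(",", java.util.Collections.nCopies(10, "*"));
    String[] ipadic = CSVUtil.parse(line + ",*,*");    // IPADIC: two trailing placeholders
    String[] unidic = CSVUtil.parse(line + ",*,*,*");  // UNIDIC: one more placeholder column
    System.out.println(ipadic.length + " vs " + unidic.length);  // 12 vs 13
  }
}
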
UnknownDictionaryWriter.java
@@ -29,8 +29,8 @@ class UnknownDictionaryWriter extends BinaryDictionaryWriter<UnknownDictionary>
CharacterDefinition.CLASS_COUNT,
CharacterDefinition::lookupCharacterClass);

public UnknownDictionaryWriter(int size) {
super(UnknownDictionary.class, new TokenInfoDictionaryEntryWriter(size));
public UnknownDictionaryWriter(DictionaryBuilder.DictionaryFormat format, int size) {
super(UnknownDictionary.class, new TokenInfoDictionaryEntryWriter(format, size));
}

@Override
TestUnknownDictionary.java
@@ -25,7 +25,8 @@ public class TestUnknownDictionary extends LuceneTestCase {

@Test
public void testPutCharacterCategory() {
UnknownDictionaryWriter unkDic = new UnknownDictionaryWriter(10 * 1024 * 1024);
UnknownDictionaryWriter unkDic =
new UnknownDictionaryWriter(DictionaryBuilder.DictionaryFormat.IPADIC, 10 * 1024 * 1024);

expectThrows(Exception.class, () -> unkDic.putCharacterCategory(0, "DUMMY_NAME"));

@@ -40,7 +41,8 @@

@Test
public void testPut() {
UnknownDictionaryWriter unkDic = new UnknownDictionaryWriter(10 * 1024 * 1024);
UnknownDictionaryWriter unkDic =
new UnknownDictionaryWriter(DictionaryBuilder.DictionaryFormat.IPADIC, 10 * 1024 * 1024);
expectThrows(
NumberFormatException.class,
() -> unkDic.put(CSVUtil.parse("KANJI,1285,11426,名詞,一般,*,*,*,*,*,*,*")));