
Add new token filters for Japanese sutegana (捨て仮名) #12915

Merged Mar 18, 2024 (14 commits)
lucene/analysis/kuromoji/src/java/module-info.java (3 additions, 1 deletion)
@@ -40,5 +40,7 @@
org.apache.lucene.analysis.ja.JapaneseKatakanaStemFilterFactory,
org.apache.lucene.analysis.ja.JapaneseNumberFilterFactory,
org.apache.lucene.analysis.ja.JapanesePartOfSpeechStopFilterFactory,
-      org.apache.lucene.analysis.ja.JapaneseReadingFormFilterFactory;
+      org.apache.lucene.analysis.ja.JapaneseReadingFormFilterFactory,
+      org.apache.lucene.analysis.ja.JapaneseHiraganaUppercaseFilterFactory,
+      org.apache.lucene.analysis.ja.JapaneseKatakanaUppercaseFilterFactory;
}
@@ -0,0 +1,81 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.ja;
Member: Could you please add the standard Apache copyright header, if that's OK with you? Thanks! I think this will also make the GitHub Actions checks (./gradlew check) happy.

Contributor Author: I'm happy to, thanks!


import java.io.IOException;
import java.util.Map;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

/**
* A {@link TokenFilter} that normalizes small letters (捨て仮名) in hiragana into normal letters. For
* instance, "ちょっとまって" will be translated to "ちよつとまつて".
*
* <p>This filter is useful if you want to search against old style Japanese text such as patents,
* legal, contract policies, etc.
*/
public final class JapaneseHiraganaUppercaseFilter extends TokenFilter {
private static final Map<Character, Character> s2l;
Contributor: I think the parameter should be in all-uppercase as it's a constant?

Contributor: Also, s2l is a bit cryptic; maybe we could use LETTER_MAPPINGS or something.

Contributor Author: Thanks, let me do that.


static {
// supported characters are:
// ぁ ぃ ぅ ぇ ぉ っ ゃ ゅ ょ ゎ ゕ ゖ
s2l =
Map.ofEntries(
Map.entry('ぁ', 'あ'),
Map.entry('ぃ', 'い'),
Map.entry('ぅ', 'う'),
Map.entry('ぇ', 'え'),
Map.entry('ぉ', 'お'),
Map.entry('っ', 'つ'),
Map.entry('ゃ', 'や'),
Map.entry('ゅ', 'ゆ'),
Map.entry('ょ', 'よ'),
Map.entry('ゎ', 'わ'),
Map.entry('ゕ', 'か'),
Map.entry('ゖ', 'け'));
}

private final CharTermAttribute termAttr = addAttribute(CharTermAttribute.class);

public JapaneseHiraganaUppercaseFilter(TokenStream input) {
super(input);
}

@Override
public boolean incrementToken() throws IOException {
if (input.incrementToken()) {
String term = termAttr.toString();
char[] src = term.toCharArray();
dungba88 (Contributor), Dec 12, 2023: I think you can iterate through the term attribute directly. These methods require a copy of the term's chars, so they might be inefficient:

    for (int i = 0; i < termAttr.length(); i++) {
      char c = termAttr.charAt(i);
      // ...
    }

Contributor Author: Thanks, let me do that.

char[] result = new char[src.length];
for (int i = 0; i < src.length; i++) {
Character c = s2l.get(src[i]);
if (c != null) {
result[i] = c;
Contributor: It seems all small characters are just one position ahead of the normal characters, so you can use result[i] = src[i] + 1;, and you can use a Set instead of a Map: https://en.wikipedia.org/wiki/Hiragana_(Unicode_block)

daixque (Contributor Author), Dec 12, 2023:

"It seems all small characters are just 1 position ahead of the normal characters"

It's not correct. See for example.

Contributor: I see, that makes sense. Thank you.
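To make the point concrete, here is a quick standalone check (an illustration added for this write-up, not part of the PR): most small hiragana sit exactly one code point below their full-size counterpart, but ゕ and ゖ live at the end of the Unicode block, far from か and け, so the +1 trick fails for them.

```java
public class KanaOffsetCheck {
  public static void main(String[] args) {
    // Regular pairs: the small form is one code point below the full-size form.
    System.out.printf("ぁ=U+%04X あ=U+%04X%n", (int) 'ぁ', (int) 'あ'); // U+3041 / U+3042
    System.out.printf("っ=U+%04X つ=U+%04X%n", (int) 'っ', (int) 'つ'); // U+3063 / U+3064
    // Exceptions: ゕ (U+3095) and ゖ (U+3096) are nowhere near
    // か (U+304B) and け (U+3051), so src[i] + 1 would produce garbage.
    System.out.printf("ゕ=U+%04X か=U+%04X%n", (int) 'ゕ', (int) 'か');
    System.out.printf("ゖ=U+%04X け=U+%04X%n", (int) 'ゖ', (int) 'け');
  }
}
```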

} else {
result[i] = src[i];
}
}
String resultTerm = String.copyValueOf(result);
termAttr.setEmpty().append(resultTerm);
Member: You can avoid making a String here by appending the char[] result instead.

Contributor Author: I couldn't find an append method signature which accepts char[] (there is CharSequence instead).

Contributor: It seems you can modify the CharTermAttribute directly by accessing buffer(), which returns the internal char[] buffer:

    char[] buffer = termAttr.buffer();
    buffer[i] = LETTER_MAPPINGS.get(buffer[i]);

This would eliminate all of the copying. I don't know if we are supposed to do that (but the API allows it). Maybe @mikemccand could have some thoughts here.

Member: This is indeed the intended usage for high performance -- directly alter that underlying char[] buffer, asking the term att to grow if needed, and setting the length when you are done.
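As a self-contained sketch of that in-place pattern (added for illustration, not the PR's code): a plain over-allocated char[] plus a valid length stands in for CharTermAttribute's buffer() and length(), the mapping is abbreviated, and the class and method names are made up.

```java
import java.util.Map;

public class InPlaceKanaDemo {
  // Abbreviated mapping; the PR's full table covers 12 small hiragana.
  static final Map<Character, Character> LETTER_MAPPINGS =
      Map.of('ぁ', 'あ', 'っ', 'つ', 'ょ', 'よ');

  // The high-performance pattern: mutate the backing char[] directly,
  // touching only the first `length` chars, with no intermediate String.
  // Since small kana map 1:1 to full-size kana, the length never changes.
  static void uppercaseInPlace(char[] buffer, int length) {
    for (int i = 0; i < length; i++) {
      Character mapped = LETTER_MAPPINGS.get(buffer[i]);
      if (mapped != null) {
        buffer[i] = mapped;
      }
    }
  }

  public static void main(String[] args) {
    char[] buffer = new char[16]; // over-allocated, like the attribute's internal buffer
    String term = "ちょっと";
    term.getChars(0, term.length(), buffer, 0);
    uppercaseInPlace(buffer, term.length());
    System.out.println(new String(buffer, 0, term.length())); // ちよつと
  }
}
```

Note that reading past length() is exactly what produced the garbage trailing characters in the test failure quoted later in this thread.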

return true;
} else {
return false;
}
}
}
@@ -0,0 +1,49 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.ja;

import java.util.Map;
import org.apache.lucene.analysis.TokenFilterFactory;
import org.apache.lucene.analysis.TokenStream;

/**
* Factory for {@link JapaneseHiraganaUppercaseFilter}.
*
* @lucene.spi {@value #NAME}
*/
public class JapaneseHiraganaUppercaseFilterFactory extends TokenFilterFactory {

/** SPI name */
public static final String NAME = "japaneseHiraganaUppercase";

public JapaneseHiraganaUppercaseFilterFactory(Map<String, String> args) {
super(args);
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
}

/** Default ctor for compatibility with SPI */
public JapaneseHiraganaUppercaseFilterFactory() {
throw defaultCtorException();
}

@Override
public TokenStream create(TokenStream input) {
return new JapaneseHiraganaUppercaseFilter(input);
}
}
@@ -0,0 +1,99 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.ja;

import java.io.IOException;
import java.util.Map;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

/**
* A {@link TokenFilter} that normalizes small letters (捨て仮名) in katakana into normal letters. For
* instance, "ストップウォッチ" will be translated to "ストツプウオツチ".
*
* <p>This filter is useful if you want to search against old style Japanese text such as patents,
* legal, contract policies, etc.
*/
public final class JapaneseKatakanaUppercaseFilter extends TokenFilter {
Contributor: This seems to be mostly the same as the other filter, so maybe we can combine them? E.g. you can pass the mapping as a constructor parameter and provide two constant mappings.

daixque (Contributor Author), Dec 12, 2023: @dungba88 How should the constructor look? Like this?

    public JapaneseKanaUppercaseFilter(TokenStream input, boolean hiragana, boolean katakana)

Note that katakana has an exceptional character ㇷ゚, so the logic is slightly different from hiragana.

Contributor: You are right, maybe we can consolidate them with a base class as a follow-up. This LGTM.
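For reference (an illustration added here, not part of the PR): the small pu ㇷ゚ is small ㇷ (U+31F7) followed by a separate semi-voiced sound mark, so it occupies two chars and can never match a single-char map key; a String-level replace maps it to the single precomposed プ (U+30D7) before the per-char loop runs.

```java
public class SmallPuCheck {
  public static void main(String[] args) {
    String smallPu = "ㇷ゚"; // small ㇷ plus a trailing handakuten mark: two chars
    System.out.println(smallPu.length());  // 2
    System.out.println("プ".length());     // 1: precomposed full-size katakana pu
    // String-level replacement, as the filter does before its per-char mapping:
    System.out.println("ストㇷ゚ウォッチ".replace("ㇷ゚", "プ"));
  }
}
```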

private static final Map<Character, Character> s2l;

static {
// supported characters are:
// ァ ィ ゥ ェ ォ ヵ ㇰ ヶ ㇱ ㇲ ッ ㇳ ㇴ ㇵ ㇶ ㇷ ㇷ゚ ㇸ ㇹ ㇺ ャ ュ ョ ㇻ ㇼ ㇽ ㇾ ㇿ ヮ
s2l =
Map.ofEntries(
Map.entry('ァ', 'ア'),
Map.entry('ィ', 'イ'),
Map.entry('ゥ', 'ウ'),
Map.entry('ェ', 'エ'),
Map.entry('ォ', 'オ'),
Map.entry('ヵ', 'カ'),
Map.entry('ㇰ', 'ク'),
Map.entry('ヶ', 'ケ'),
Map.entry('ㇱ', 'シ'),
Map.entry('ㇲ', 'ス'),
Map.entry('ッ', 'ツ'),
Map.entry('ㇳ', 'ト'),
Map.entry('ㇴ', 'ヌ'),
Map.entry('ㇵ', 'ハ'),
Map.entry('ㇶ', 'ヒ'),
Map.entry('ㇷ', 'フ'),
Map.entry('ㇸ', 'ヘ'),
Map.entry('ㇹ', 'ホ'),
Map.entry('ㇺ', 'ム'),
Map.entry('ャ', 'ヤ'),
Map.entry('ュ', 'ユ'),
Map.entry('ョ', 'ヨ'),
Map.entry('ㇻ', 'ラ'),
Map.entry('ㇼ', 'リ'),
Map.entry('ㇽ', 'ル'),
Map.entry('ㇾ', 'レ'),
Map.entry('ㇿ', 'ロ'),
Map.entry('ヮ', 'ワ'));
}

private final CharTermAttribute termAttr = addAttribute(CharTermAttribute.class);

public JapaneseKatakanaUppercaseFilter(TokenStream input) {
super(input);
}

@Override
public boolean incrementToken() throws IOException {
if (input.incrementToken()) {
String term = termAttr.toString();
// Small letter "ㇷ゚" is not a single character, so it should be converted to "プ" at the String level
term = term.replace("ㇷ゚", "プ");
char[] src = term.toCharArray();
Member: You could instead call term.buffer() to access the source char[] and save creating a few temporary objects.

Contributor Author: Thanks, but it will affect the length of the result character array and break the tests, so let me keep the current implementation. Here is an example test failure:

    term 0 expected:<ちよつと[]> but was:<ちよつと[sTerm�������]>
    Expected :ちよつと
    Actual   :ちよつとsTerm�������

Contributor: buffer() returns the internal char[] of the CharTermAttribute, which may hold more chars than the actual term length. You need to use term.length() as well.

char[] result = new char[src.length];
for (int i = 0; i < src.length; i++) {
Character c = s2l.get(src[i]);
if (c != null) {
result[i] = c;
} else {
result[i] = src[i];
}
}
String resultTerm = String.copyValueOf(result);
termAttr.setEmpty().append(resultTerm);
return true;
} else {
return false;
}
}
}
@@ -0,0 +1,49 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.ja;

import java.util.Map;
import org.apache.lucene.analysis.TokenFilterFactory;
import org.apache.lucene.analysis.TokenStream;

/**
* Factory for {@link JapaneseKatakanaUppercaseFilter}.
*
* @lucene.spi {@value #NAME}
*/
public class JapaneseKatakanaUppercaseFilterFactory extends TokenFilterFactory {

/** SPI name */
public static final String NAME = "japaneseKatakanaUppercase";

public JapaneseKatakanaUppercaseFilterFactory(Map<String, String> args) {
super(args);
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
}

/** Default ctor for compatibility with SPI */
public JapaneseKatakanaUppercaseFilterFactory() {
throw defaultCtorException();
}

@Override
public TokenStream create(TokenStream input) {
return new JapaneseKatakanaUppercaseFilter(input);
}
}
@@ -19,3 +19,5 @@ org.apache.lucene.analysis.ja.JapaneseKatakanaStemFilterFactory
org.apache.lucene.analysis.ja.JapaneseNumberFilterFactory
org.apache.lucene.analysis.ja.JapanesePartOfSpeechStopFilterFactory
org.apache.lucene.analysis.ja.JapaneseReadingFormFilterFactory
org.apache.lucene.analysis.ja.JapaneseHiraganaUppercaseFilterFactory
org.apache.lucene.analysis.ja.JapaneseKatakanaUppercaseFilterFactory
@@ -0,0 +1,85 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.ja;

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.tests.analysis.MockTokenizer;

/** Tests for {@link JapaneseHiraganaUppercaseFilter} */
public class TestJapaneseHiraganaUppercaseFilter extends BaseTokenStreamTestCase {
private Analyzer keywordAnalyzer, japaneseAnalyzer;

@Override
public void setUp() throws Exception {
super.setUp();
keywordAnalyzer =
new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(
tokenizer, new JapaneseHiraganaUppercaseFilter(tokenizer));
}
};
japaneseAnalyzer =
new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer =
new JapaneseTokenizer(
newAttributeFactory(), null, false, JapaneseTokenizer.Mode.SEARCH);
return new TokenStreamComponents(
tokenizer, new JapaneseHiraganaUppercaseFilter(tokenizer));
}
};
}

@Override
public void tearDown() throws Exception {
keywordAnalyzer.close();
japaneseAnalyzer.close();
super.tearDown();
}

public void testKanaUppercase() throws IOException {
assertAnalyzesTo(keywordAnalyzer, "ぁぃぅぇぉっゃゅょゎゕゖ", new String[] {"あいうえおつやゆよわかけ"});
assertAnalyzesTo(keywordAnalyzer, "ちょっとまって", new String[] {"ちよつとまつて"});
}

public void testKanaUppercaseWithSurrogatePair() throws IOException {
// 𠀋 : \uD840\uDC0B
assertAnalyzesTo(
keywordAnalyzer,
"\uD840\uDC0Bちょっとまって ちょっと\uD840\uDC0Bまって ちょっとまって\uD840\uDC0B",
new String[] {"\uD840\uDC0Bちよつとまつて", "ちよつと\uD840\uDC0Bまつて", "ちよつとまつて\uD840\uDC0B"});
}

public void testKanaUppercaseWithJapaneseTokenizer() throws IOException {
assertAnalyzesTo(japaneseAnalyzer, "ちょっとまって", new String[] {"ちよつと", "まつ", "て"});
}

public void testRandomData() throws IOException {
checkRandomData(random(), keywordAnalyzer, 200 * RANDOM_MULTIPLIER);
}

public void testEmptyTerm() throws IOException {
assertAnalyzesTo(keywordAnalyzer, "", new String[] {});
}
}