-
Notifications
You must be signed in to change notification settings - Fork 962
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add new token filters for Japanese sutegana (捨て仮名) #12915
Changes from 8 commits
409a80b
7408013
c617ef8
5acfa34
6c72ff7
a15b138
50e9916
b5b29d8
2f4463d
508b485
c610053
6a16ad1
e79893b
01e7d2e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,81 @@ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one or more | ||
* contributor license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright ownership. | ||
* The ASF licenses this file to You under the Apache License, Version 2.0 | ||
* (the "License"); you may not use this file except in compliance with | ||
* the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
package org.apache.lucene.analysis.ja; | ||
|
||
import java.io.IOException; | ||
import java.util.Map; | ||
import org.apache.lucene.analysis.TokenFilter; | ||
import org.apache.lucene.analysis.TokenStream; | ||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; | ||
|
||
/** | ||
* A {@link TokenFilter} that normalizes small letters (捨て仮名) in hiragana into normal letters. For | ||
* instance, "ちょっとまって" will be translated to "ちよつとまつて". | ||
* | ||
* <p>This filter is useful if you want to search against old style Japanese text such as patents, | ||
* legal, contract policies, etc. | ||
*/ | ||
public final class JapaneseHiraganaUppercaseFilter extends TokenFilter { | ||
private static final Map<Character, Character> s2l; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think the parameter should be in all-uppercase as it's a constant? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Also s2l is a bit cryptic, maybe we could use LETTER_MAPPINGS or something There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks, let me do that. |
||
|
||
static { | ||
// supported characters are: | ||
// ぁ ぃ ぅ ぇ ぉ っ ゃ ゅ ょ ゎ ゕ ゖ | ||
s2l = | ||
Map.ofEntries( | ||
Map.entry('ぁ', 'あ'), | ||
Map.entry('ぃ', 'い'), | ||
Map.entry('ぅ', 'う'), | ||
Map.entry('ぇ', 'え'), | ||
Map.entry('ぉ', 'お'), | ||
Map.entry('っ', 'つ'), | ||
Map.entry('ゃ', 'や'), | ||
Map.entry('ゅ', 'ゆ'), | ||
Map.entry('ょ', 'よ'), | ||
Map.entry('ゎ', 'わ'), | ||
Map.entry('ゕ', 'か'), | ||
Map.entry('ゖ', 'け')); | ||
} | ||
|
||
private final CharTermAttribute termAttr = addAttribute(CharTermAttribute.class); | ||
|
||
public JapaneseHiraganaUppercaseFilter(TokenStream input) { | ||
super(input); | ||
} | ||
|
||
@Override | ||
public boolean incrementToken() throws IOException { | ||
if (input.incrementToken()) { | ||
String term = termAttr.toString(); | ||
char[] src = term.toCharArray(); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think you can iterate through the term attribute directly. These methods require byte-copy so might be inefficient
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks, let me do that. |
||
char[] result = new char[src.length]; | ||
for (int i = 0; i < src.length; i++) { | ||
Character c = s2l.get(src[i]); | ||
if (c != null) { | ||
result[i] = c; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It seems all small characters are just 1 position ahead of the normal characters, so you can use There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
It's not correct. See There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I see, that makes sense. Thank you |
||
} else { | ||
result[i] = src[i]; | ||
} | ||
} | ||
String resultTerm = String.copyValueOf(result); | ||
termAttr.setEmpty().append(resultTerm); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You can avoid making There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I couldn't find There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It seems you can modify the
This will eliminate all of the byte copy. I don't know if we are supposed to do that (but the API allow). Maybe @mikemccand could have some thought here. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
This is indeed the intended usage for high performance -- directly alter that underlying |
||
return true; | ||
} else { | ||
return false; | ||
} | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one or more | ||
* contributor license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright ownership. | ||
* The ASF licenses this file to You under the Apache License, Version 2.0 | ||
* (the "License"); you may not use this file except in compliance with | ||
* the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
package org.apache.lucene.analysis.ja; | ||
|
||
import java.util.Map; | ||
import org.apache.lucene.analysis.TokenFilterFactory; | ||
import org.apache.lucene.analysis.TokenStream; | ||
|
||
/** | ||
* Factory for {@link JapaneseHiraganaUppercaseFilter}. | ||
* | ||
* @lucene.spi {@value #NAME} | ||
*/ | ||
public class JapaneseHiraganaUppercaseFilterFactory extends TokenFilterFactory { | ||
|
||
/** SPI name */ | ||
public static final String NAME = "japaneseHiraganaUppercase"; | ||
|
||
public JapaneseHiraganaUppercaseFilterFactory(Map<String, String> args) { | ||
super(args); | ||
if (!args.isEmpty()) { | ||
throw new IllegalArgumentException("Unknown parameters: " + args); | ||
} | ||
} | ||
|
||
/** Default ctor for compatibility with SPI */ | ||
public JapaneseHiraganaUppercaseFilterFactory() { | ||
throw defaultCtorException(); | ||
} | ||
|
||
@Override | ||
public TokenStream create(TokenStream input) { | ||
return new JapaneseHiraganaUppercaseFilter(input); | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,99 @@ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one or more | ||
* contributor license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright ownership. | ||
* The ASF licenses this file to You under the Apache License, Version 2.0 | ||
* (the "License"); you may not use this file except in compliance with | ||
* the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
package org.apache.lucene.analysis.ja; | ||
|
||
import java.io.IOException; | ||
import java.util.Map; | ||
import org.apache.lucene.analysis.TokenFilter; | ||
import org.apache.lucene.analysis.TokenStream; | ||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; | ||
|
||
/** | ||
* A {@link TokenFilter} that normalizes small letters (捨て仮名) in katakana into normal letters. For | ||
* instance, "ストップウォッチ" will be translated to "ストツプウオツチ". | ||
* | ||
* <p>This filter is useful if you want to search against old style Japanese text such as patents, | ||
* legal, contract policies, etc. | ||
*/ | ||
public final class JapaneseKatakanaUppercaseFilter extends TokenFilter { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This seems to be mostly the same as the other filter, so maybe we can combine them? E.g you can either pass the mapping as a constructor parameter and provide 2 constants mapping There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @dungba88 How should the constructor look like? Like this?
Note that Katakana has an exceptional character There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You are right, maybe we can consolidate them with a base class as a follow-up. This LGTM. |
||
private static final Map<Character, Character> s2l; | ||
|
||
static { | ||
// supported characters are: | ||
// ァ ィ ゥ ェ ォ ヵ ㇰ ヶ ㇱ ㇲ ッ ㇳ ㇴ ㇵ ㇶ ㇷ ㇷ゚ ㇸ ㇹ ㇺ ャ ュ ョ ㇻ ㇼ ㇽ ㇾ ㇿ ヮ | ||
s2l = | ||
Map.ofEntries( | ||
Map.entry('ァ', 'ア'), | ||
Map.entry('ィ', 'イ'), | ||
Map.entry('ゥ', 'ウ'), | ||
Map.entry('ェ', 'エ'), | ||
Map.entry('ォ', 'オ'), | ||
Map.entry('ヵ', 'カ'), | ||
Map.entry('ㇰ', 'ク'), | ||
Map.entry('ヶ', 'ケ'), | ||
Map.entry('ㇱ', 'シ'), | ||
Map.entry('ㇲ', 'ス'), | ||
Map.entry('ッ', 'ツ'), | ||
Map.entry('ㇳ', 'ト'), | ||
Map.entry('ㇴ', 'ヌ'), | ||
Map.entry('ㇵ', 'ハ'), | ||
Map.entry('ㇶ', 'ヒ'), | ||
Map.entry('ㇷ', 'フ'), | ||
Map.entry('ㇸ', 'ヘ'), | ||
Map.entry('ㇹ', 'ホ'), | ||
Map.entry('ㇺ', 'ム'), | ||
Map.entry('ャ', 'ヤ'), | ||
Map.entry('ュ', 'ユ'), | ||
Map.entry('ョ', 'ヨ'), | ||
Map.entry('ㇻ', 'ラ'), | ||
Map.entry('ㇼ', 'リ'), | ||
Map.entry('ㇽ', 'ル'), | ||
Map.entry('ㇾ', 'レ'), | ||
Map.entry('ㇿ', 'ロ'), | ||
Map.entry('ヮ', 'ワ')); | ||
} | ||
|
||
private final CharTermAttribute termAttr = addAttribute(CharTermAttribute.class); | ||
|
||
public JapaneseKatakanaUppercaseFilter(TokenStream input) { | ||
super(input); | ||
} | ||
|
||
@Override | ||
public boolean incrementToken() throws IOException { | ||
if (input.incrementToken()) { | ||
String term = termAttr.toString(); | ||
// Small letter "ㇷ゚" is not single character, so it should be converted to "プ" as String | ||
term = term.replace("ㇷ゚", "プ"); | ||
char[] src = term.toCharArray(); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You could instead call There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks, but it will affect length of result character array and break the tests. So let me keep current implementation. Here is the example of test result.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The buffer return the internal byte[] of the CharTermAttribute, which might has more bytes than the actual term length. You need to use term.length() as well. |
||
char[] result = new char[src.length]; | ||
for (int i = 0; i < src.length; i++) { | ||
Character c = s2l.get(src[i]); | ||
if (c != null) { | ||
result[i] = c; | ||
} else { | ||
result[i] = src[i]; | ||
} | ||
} | ||
String resultTerm = String.copyValueOf(result); | ||
termAttr.setEmpty().append(resultTerm); | ||
return true; | ||
} else { | ||
return false; | ||
} | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one or more | ||
* contributor license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright ownership. | ||
* The ASF licenses this file to You under the Apache License, Version 2.0 | ||
* (the "License"); you may not use this file except in compliance with | ||
* the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
package org.apache.lucene.analysis.ja; | ||
|
||
import java.util.Map; | ||
import org.apache.lucene.analysis.TokenFilterFactory; | ||
import org.apache.lucene.analysis.TokenStream; | ||
|
||
/** | ||
* Factory for {@link JapaneseKatakanaUppercaseFilter}. | ||
* | ||
* @lucene.spi {@value #NAME} | ||
*/ | ||
public class JapaneseKatakanaUppercaseFilterFactory extends TokenFilterFactory { | ||
|
||
/** SPI name */ | ||
public static final String NAME = "japaneseKatakanaUppercase"; | ||
|
||
public JapaneseKatakanaUppercaseFilterFactory(Map<String, String> args) { | ||
super(args); | ||
if (!args.isEmpty()) { | ||
throw new IllegalArgumentException("Unknown parameters: " + args); | ||
} | ||
} | ||
|
||
/** Default ctor for compatibility with SPI */ | ||
public JapaneseKatakanaUppercaseFilterFactory() { | ||
throw defaultCtorException(); | ||
} | ||
|
||
@Override | ||
public TokenStream create(TokenStream input) { | ||
return new JapaneseKatakanaUppercaseFilter(input); | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,85 @@ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one or more | ||
* contributor license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright ownership. | ||
* The ASF licenses this file to You under the Apache License, Version 2.0 | ||
* (the "License"); you may not use this file except in compliance with | ||
* the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
package org.apache.lucene.analysis.ja; | ||
|
||
import java.io.IOException; | ||
import org.apache.lucene.analysis.Analyzer; | ||
import org.apache.lucene.analysis.Tokenizer; | ||
import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase; | ||
import org.apache.lucene.tests.analysis.MockTokenizer; | ||
|
||
/** Tests for {@link JapaneseHiraganaUppercaseFilter} */ | ||
public class TestJapaneseHiraganaUppercaseFilter extends BaseTokenStreamTestCase { | ||
private Analyzer keywordAnalyzer, japaneseAnalyzer; | ||
|
||
@Override | ||
public void setUp() throws Exception { | ||
super.setUp(); | ||
keywordAnalyzer = | ||
new Analyzer() { | ||
@Override | ||
protected TokenStreamComponents createComponents(String fieldName) { | ||
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); | ||
return new TokenStreamComponents( | ||
tokenizer, new JapaneseHiraganaUppercaseFilter(tokenizer)); | ||
} | ||
}; | ||
japaneseAnalyzer = | ||
new Analyzer() { | ||
@Override | ||
protected TokenStreamComponents createComponents(String fieldName) { | ||
Tokenizer tokenizer = | ||
new JapaneseTokenizer( | ||
newAttributeFactory(), null, false, JapaneseTokenizer.Mode.SEARCH); | ||
return new TokenStreamComponents( | ||
tokenizer, new JapaneseHiraganaUppercaseFilter(tokenizer)); | ||
} | ||
}; | ||
} | ||
|
||
@Override | ||
public void tearDown() throws Exception { | ||
keywordAnalyzer.close(); | ||
japaneseAnalyzer.close(); | ||
super.tearDown(); | ||
} | ||
|
||
public void testKanaUppercase() throws IOException { | ||
assertAnalyzesTo(keywordAnalyzer, "ぁぃぅぇぉっゃゅょゎゕゖ", new String[] {"あいうえおつやゆよわかけ"}); | ||
assertAnalyzesTo(keywordAnalyzer, "ちょっとまって", new String[] {"ちよつとまつて"}); | ||
} | ||
|
||
public void testKanaUppercaseWithSurrogatePair() throws IOException { | ||
// 𠀋 : \uD840\uDC0B | ||
assertAnalyzesTo( | ||
keywordAnalyzer, | ||
"\uD840\uDC0Bちょっとまって ちょっと\uD840\uDC0Bまって ちょっとまって\uD840\uDC0B", | ||
new String[] {"\uD840\uDC0Bちよつとまつて", "ちよつと\uD840\uDC0Bまつて", "ちよつとまつて\uD840\uDC0B"}); | ||
} | ||
|
||
public void testKanaUppercaseWithJapaneseTokenizer() throws IOException { | ||
assertAnalyzesTo(japaneseAnalyzer, "ちょっとまって", new String[] {"ちよつと", "まつ", "て"}); | ||
} | ||
|
||
public void testRandomData() throws IOException { | ||
checkRandomData(random(), keywordAnalyzer, 200 * RANDOM_MULTIPLIER); | ||
} | ||
|
||
public void testEmptyTerm() throws IOException { | ||
assertAnalyzesTo(keywordAnalyzer, "", new String[] {}); | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could you please add the standard Apache copyright header, if that's OK with you? Thanks! I think this will also make the GitHub Actions checks (`./gradlew check`) happy.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm happy to do, thanks!