From d4e71d81dcc852ce156a733d7d9f5f9359bf6c95 Mon Sep 17 00:00:00 2001 From: tballison Date: Mon, 5 Mar 2018 14:31:02 -0500 Subject: [PATCH] LUCENE-8186 -- check for multitermaware tokenizer in CustomAnalyzer in normalize(). --- .../analysis/custom/CustomAnalyzer.java | 9 +++++++ .../analysis/custom/TestCustomAnalyzer.java | 24 +++++++++++++++++-- 2 files changed, 31 insertions(+), 2 deletions(-) diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/custom/CustomAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/custom/CustomAnalyzer.java index a697cced51af..38ab9f1ae7c1 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/custom/CustomAnalyzer.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/custom/CustomAnalyzer.java @@ -146,6 +146,15 @@ protected TokenStreamComponents createComponents(String fieldName) { @Override protected TokenStream normalize(String fieldName, TokenStream in) { TokenStream result = in; + + TokenizerFactory tokenizerFactory = getTokenizerFactory(); + if (tokenizerFactory instanceof MultiTermAwareComponent) { + AbstractAnalysisFactory mtTokenizerFactory = ((MultiTermAwareComponent)tokenizerFactory).getMultiTermComponent(); + if (mtTokenizerFactory instanceof TokenFilterFactory && mtTokenizerFactory instanceof MultiTermAwareComponent) { + TokenFilterFactory filter = (TokenFilterFactory)((MultiTermAwareComponent)mtTokenizerFactory).getMultiTermComponent(); + result = filter.create(result); + } + } for (TokenFilterFactory filter : tokenFilters) { if (filter instanceof MultiTermAwareComponent) { filter = (TokenFilterFactory) ((MultiTermAwareComponent) filter).getMultiTermComponent(); diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/custom/TestCustomAnalyzer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/custom/TestCustomAnalyzer.java index d9ea43c23453..ca55b0a96e8d 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/custom/TestCustomAnalyzer.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/custom/TestCustomAnalyzer.java @@ -34,7 +34,9 @@ import org.apache.lucene.analysis.core.KeywordTokenizerFactory; import org.apache.lucene.analysis.core.LowerCaseFilterFactory; import org.apache.lucene.analysis.core.LowerCaseTokenizer; +import org.apache.lucene.analysis.core.LowerCaseTokenizerFactory; import org.apache.lucene.analysis.core.StopFilterFactory; +import org.apache.lucene.analysis.core.WhitespaceTokenizer; import org.apache.lucene.analysis.core.WhitespaceTokenizerFactory; import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilterFactory; import org.apache.lucene.analysis.standard.ClassicTokenizerFactory; @@ -247,7 +249,24 @@ public void testStopWordsFromFileAbsolute() throws Exception { assertAnalyzesTo(a, "foo Foo Bar", new String[0]); a.close(); } - + + public void testNormalizeWithLCTokenizer() throws Exception { + CustomAnalyzer a = CustomAnalyzer.builder() + .withTokenizer(LowerCaseTokenizerFactory.class) + .build(); + assertEquals(new BytesRef("hello"), a.normalize("f", "HellO")); + a.close(); + } + + public void testNormalizeWithCharFilterAndLCTokenizer() throws Exception { + CustomAnalyzer a = CustomAnalyzer.builder() + .addCharFilter(DummyMultiTermAwareCharFilterFactory.class) + .withTokenizer(LowerCaseTokenizerFactory.class) + .build(); + assertEquals(new BytesRef("ab2c"), a.normalize("f", "AB0C")); + a.close(); + } + // Now test misconfigurations: public void testIncorrectOrder() throws Exception { @@ -431,7 +450,8 @@ public DummyMultiTermAwareTokenizerFactory(Map args) { @Override public AbstractAnalysisFactory getMultiTermComponent() { - return new KeywordTokenizerFactory(getOriginalArgs()); + Map modifiableArgs = new HashMap<>(getOriginalArgs()); + return new KeywordTokenizerFactory(modifiableArgs); } }