From c87778c50472ab81c6bfae7a5371f36a105544b3 Mon Sep 17 00:00:00 2001 From: Jim Ferenczi Date: Thu, 11 Oct 2018 13:49:14 +0100 Subject: [PATCH] LUCENE-8526: Add javadocs in CJKBigramFilter explaining the behavior of the StandardTokenizer on Hangul syllables. --- .../org/apache/lucene/analysis/cjk/CJKBigramFilter.java | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKBigramFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKBigramFilter.java index bf4f6218bb91..7d79b8427cc5 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKBigramFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKBigramFilter.java @@ -43,6 +43,14 @@ * flag in {@link CJKBigramFilter#CJKBigramFilter(TokenStream, int, boolean)}. * This can be used for a combined unigram+bigram approach. * <p>

+ * Unlike ICUTokenizer, StandardTokenizer does not split at script boundaries. + * Korean Hangul characters are treated the same as many other scripts' + * letters, and as a result, StandardTokenizer can produce tokens that mix + * Hangul and non-Hangul characters, e.g. "한국abc". Such mixed-script tokens + * are typed as <code>&lt;ALPHANUM&gt;</code> rather than + * <code>&lt;HANGUL&gt;</code>, and as a result, will not be converted to + * bigrams by CJKBigramFilter. + * <p> * In all cases, all non-CJK input is passed thru unmodified. */ public final class CJKBigramFilter extends TokenFilter {