-
Notifications
You must be signed in to change notification settings - Fork 2.6k
Expand file tree
/
Copy pathStandardTokenizerImpl.jflex
More file actions
202 lines (176 loc) · 8.68 KB
/
StandardTokenizerImpl.jflex
File metadata and controls
202 lines (176 loc) · 8.68 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
package org.apache.lucene.analysis.standard;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
/**
* This class implements Word Break rules from the Unicode Text Segmentation
* algorithm, as specified in
* <a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>.
* <p>
* Tokens produced are of the following types:
* <ul>
* <li>&lt;ALPHANUM&gt;: A sequence of alphabetic and numeric characters</li>
* <li>&lt;NUM&gt;: A number</li>
* <li>&lt;SOUTHEAST_ASIAN&gt;: A sequence of characters from South and Southeast
* Asian languages, including Thai, Lao, Myanmar, and Khmer</li>
* <li>&lt;IDEOGRAPHIC&gt;: A single CJKV ideographic character</li>
* <li>&lt;HIRAGANA&gt;: A single hiragana character</li>
* <li>&lt;KATAKANA&gt;: A sequence of katakana characters</li>
* <li>&lt;HANGUL&gt;: A sequence of Hangul characters</li>
* </ul>
*/
@SuppressWarnings("fallthrough")
%%
%unicode 6.3
%integer
%final
%public
%class StandardTokenizerImpl
%function getNextToken
%char
%buffer 255
// UAX#29 WB4. X (Extend | Format)* --> X
//
// Every macro below is a base character class followed by any number of
// WB:Format / WB:Extend characters, so that such trailing characters are
// absorbed into the preceding base character wherever a macro is used.
//
// NOTE(review): property value names are spelled both with and without
// underscores here (HebrewLetter vs. Hebrew_Letter, SingleQuote vs.
// Single_Quote, RegionalIndicator). Presumably JFlex's loose property-name
// matching treats these spellings as the same property -- confirm before
// normalizing them.
//
// Hangul-script characters that are also ALetter or Hebrew_Letter.
HangulEx = [\p{Script:Hangul}&&[\p{WB:ALetter}\p{WB:Hebrew_Letter}]] [\p{WB:Format}\p{WB:Extend}]*
// Any Hebrew_Letter or ALetter character.
HebrewOrALetterEx = [\p{WB:HebrewLetter}\p{WB:ALetter}] [\p{WB:Format}\p{WB:Extend}]*
// WB:Numeric, plus decimal digits (Nd) found in the HalfAndFullForms block.
NumericEx = [\p{WB:Numeric}[\p{Blk:HalfAndFullForms}&&\p{Nd}]] [\p{WB:Format}\p{WB:Extend}]*
KatakanaEx = \p{WB:Katakana} [\p{WB:Format}\p{WB:Extend}]*
// Characters allowed between two letters (used for WB6/WB7 in the word rule).
MidLetterEx = [\p{WB:MidLetter}\p{WB:MidNumLet}\p{WB:SingleQuote}] [\p{WB:Format}\p{WB:Extend}]*
// Characters allowed between two digits (used for WB11/WB12 in the numeric and word rules).
MidNumericEx = [\p{WB:MidNum}\p{WB:MidNumLet}\p{WB:SingleQuote}] [\p{WB:Format}\p{WB:Extend}]*
ExtendNumLetEx = \p{WB:ExtendNumLet} [\p{WB:Format}\p{WB:Extend}]*
HanEx = \p{Script:Han} [\p{WB:Format}\p{WB:Extend}]*
HiraganaEx = \p{Script:Hiragana} [\p{WB:Format}\p{WB:Extend}]*
SingleQuoteEx = \p{WB:Single_Quote} [\p{WB:Format}\p{WB:Extend}]*
DoubleQuoteEx = \p{WB:Double_Quote} [\p{WB:Format}\p{WB:Extend}]*
HebrewLetterEx = \p{WB:Hebrew_Letter} [\p{WB:Format}\p{WB:Extend}]*
RegionalIndicatorEx = \p{WB:RegionalIndicator} [\p{WB:Format}\p{WB:Extend}]*
// Line_Break = Complex_Context characters; runs of these are kept together as one token.
ComplexContextEx = \p{LB:Complex_Context} [\p{WB:Format}\p{WB:Extend}]*
%{
/** Alphanumeric sequences */
public static final int WORD_TYPE = StandardTokenizer.ALPHANUM;
/** Numbers */
public static final int NUMERIC_TYPE = StandardTokenizer.NUM;
/**
 * Chars in class \p{Line_Break = Complex_Context} are from South East Asian
 * scripts (Thai, Lao, Myanmar, Khmer, etc.). Sequences of these are kept
 * together as a single token rather than broken up, because the logic
 * required to break them at word boundaries is too complex for UAX#29.
 * <p>
 * See Unicode Line Breaking Algorithm: http://www.unicode.org/reports/tr14/#SA
 */
public static final int SOUTH_EAST_ASIAN_TYPE = StandardTokenizer.SOUTHEAST_ASIAN;
/** Returned by the rule that matches a single {HanEx} (one Han-script character). */
public static final int IDEOGRAPHIC_TYPE = StandardTokenizer.IDEOGRAPHIC;
/** Returned by the rule that matches a single {HiraganaEx} (one hiragana character). */
public static final int HIRAGANA_TYPE = StandardTokenizer.HIRAGANA;
/** Returned by the rule that matches a run of one or more {KatakanaEx}. */
public static final int KATAKANA_TYPE = StandardTokenizer.KATAKANA;
/** Returned by the rule that matches a run of one or more {HangulEx}. */
public static final int HANGUL_TYPE = StandardTokenizer.HANGUL;
/**
 * Exposes the scanner's {@code yychar} counter, which is maintained because
 * this specification enables the {@code %char} directive.
 * <p>
 * NOTE(review): per the JFlex manual this counter is presumably the number of
 * characters from the beginning of input to the start of the current token --
 * confirm against the JFlex version in use.
 *
 * @return the current value of {@code yychar}
 */
public final int yychar() {
  return this.yychar;
}
/**
 * Copies the current token text into the supplied attribute.
 * <p>
 * The text is the slice of the scanner buffer that begins at
 * {@code zzStartRead} and ends just before {@code zzMarkedPos}.
 *
 * @param t attribute whose buffer is overwritten with the token's characters
 */
public final void getText(CharTermAttribute t) {
  final int tokenLength = zzMarkedPos - zzStartRead;
  t.copyBuffer(zzBuffer, zzStartRead, tokenLength);
}
/**
 * Sets the scanner buffer size in chars.
 * <p>
 * Records {@code numChars} in {@code ZZ_BUFFERSIZE} and swaps the scanner
 * buffer for a freshly allocated array of that length. The leading chars of
 * the old buffer are carried over, up to whichever of the two lengths is
 * smaller.
 * <p>
 * NOTE(review): when the new size is smaller than the old buffer, chars beyond
 * the new length are not copied -- presumably callers only resize when no
 * unread input is buffered; confirm against callers.
 *
 * @param numChars length of the new scanner buffer, in chars
 */
public final void setBufferSize(int numChars) {
  ZZ_BUFFERSIZE = numChars;
  final char[] resized = new char[ZZ_BUFFERSIZE];
  final int carriedOver = Math.min(zzBuffer.length, ZZ_BUFFERSIZE);
  System.arraycopy(zzBuffer, 0, resized, 0, carriedOver);
  zzBuffer = resized;
}
%}
%%
// UAX#29 WB1. sot ÷
// WB2. ÷ eot
//
// At end of input, hand the scanner's YYEOF constant back to the caller.
<<EOF>> { return YYEOF; }
// UAX#29 WB8. Numeric × Numeric
// WB11. Numeric (MidNum | MidNumLet | Single_Quote) × Numeric
// WB12. Numeric × (MidNum | MidNumLet | Single_Quote) Numeric
// WB13a. (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
// WB13b. ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana)
//
// Shape of the pattern: an optional leading run of ExtendNumLet, then one or
// more NumericEx where each adjacent pair is joined either by a (possibly
// empty) ExtendNumLet run or by exactly one MidNumericEx, then an optional
// trailing run of ExtendNumLet.
{ExtendNumLetEx}* {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx} )* {ExtendNumLetEx}*
{ return NUMERIC_TYPE; }
// subset of the below for typing purposes only!
//
// Input matched by these two rules is also matched by the general word rule
// that follows; they exist only so that pure Hangul and pure Katakana runs get
// a more specific token type than WORD_TYPE.
// NOTE(review): this presumably relies on JFlex choosing the earlier rule when
// two rules match the same length of input, so these rules must stay above the
// word rule -- confirm against the JFlex manual.
{HangulEx}+
{ return HANGUL_TYPE; }
{KatakanaEx}+
{ return KATAKANA_TYPE; }
// UAX#29 WB5. (ALetter | Hebrew_Letter) × (ALetter | Hebrew_Letter)
// WB6. (ALetter | Hebrew_Letter) × (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter)
// WB7. (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote) × (ALetter | Hebrew_Letter)
// WB7a. Hebrew_Letter × Single_Quote
// WB7b. Hebrew_Letter × Double_Quote Hebrew_Letter
// WB7c. Hebrew_Letter Double_Quote × Hebrew_Letter
// WB9. (ALetter | Hebrew_Letter) × Numeric
// WB10. Numeric × (ALetter | Hebrew_Letter)
// WB13. Katakana × Katakana
// WB13a. (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
// WB13b. ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana)
//
// Overall shape of the pattern:
//
//   ExtendNumLet*  GROUP  ( ExtendNumLet+  GROUP )*  ExtendNumLet*
//
// where GROUP is either
//   - a Katakana run (KatakanaEx joined by optional ExtendNumLet runs), or
//   - one or more of: a Hebrew letter followed by a single quote or by a
//     double quote plus another Hebrew letter; a numeric run; a letter run
//     (letters joined by an optional ExtendNumLet run or one MidLetterEx).
//
// The GROUP sub-expression is written out twice below (once for the first
// group, once inside the repetition); keep the two copies identical when
// editing.
{ExtendNumLetEx}* ( {KatakanaEx} ( {ExtendNumLetEx}* {KatakanaEx} )*
| ( {HebrewLetterEx} ( {SingleQuoteEx} | {DoubleQuoteEx} {HebrewLetterEx} )
| {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx} )*
| {HebrewOrALetterEx} ( ( {ExtendNumLetEx}* | {MidLetterEx} ) {HebrewOrALetterEx} )*
)+
)
({ExtendNumLetEx}+ ( {KatakanaEx} ( {ExtendNumLetEx}* {KatakanaEx} )*
| ( {HebrewLetterEx} ( {SingleQuoteEx} | {DoubleQuoteEx} {HebrewLetterEx} )
| {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx} )*
| {HebrewOrALetterEx} ( ( {ExtendNumLetEx}* | {MidLetterEx} ) {HebrewOrALetterEx} )*
)+
)
)*
{ExtendNumLetEx}*
{ return WORD_TYPE; }
// From UAX #29:
//
// [C]haracters with the Line_Break property values of Contingent_Break (CB),
// Complex_Context (SA/South East Asian), and XX (Unknown) are assigned word
// boundary property values based on criteria outside of the scope of this
// annex. That means that satisfactory treatment of languages like Chinese
// or Thai requires special handling.
//
// In Unicode 6.3, only one character has the \p{Line_Break = Contingent_Break}
// property: U+FFFC OBJECT REPLACEMENT CHARACTER.
//
// In the ICU implementation of UAX#29, \p{Line_Break = Complex_Context}
// character sequences (from South East Asian scripts like Thai, Myanmar, Khmer,
// Lao, etc.) are kept together. This grammar does the same below.
//
// See also the Unicode Line Breaking Algorithm:
//
// http://www.unicode.org/reports/tr14/#SA
//
// A run of one or more Complex_Context characters is returned as one token.
{ComplexContextEx}+ { return SOUTH_EAST_ASIAN_TYPE; }
// UAX#29 WB14. Any ÷ Any
//
// Neither pattern is repeated, so each Han character and each hiragana
// character (with its trailing Extend/Format characters) is its own token.
{HanEx} { return IDEOGRAPHIC_TYPE; }
{HiraganaEx} { return HIRAGANA_TYPE; }
// UAX#29 WB3. CR × LF
// WB3a. (Newline | CR | LF) ÷
// WB3b. ÷ (Newline | CR | LF)
// WB13c. Regional_Indicator × Regional_Indicator
// WB14. Any ÷ Any
//
// Catch-all: a run of two or more regional indicators, or any single character
// that no earlier rule matched. The action returns no token type for it.
{RegionalIndicatorEx} {RegionalIndicatorEx}+ | [^]
{ /* Break so we don't hit fall-through warning: */ break; /* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */ }