/
TokenStream.cs
198 lines (187 loc) · 10.3 KB
/
TokenStream.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
using Lucene.Net.Analysis.TokenAttributes;
using System;
using System.IO;
namespace Lucene.Net.Analysis
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
using AttributeSource = Lucene.Net.Util.AttributeSource;
/// <summary>
/// A <see cref="TokenStream"/> enumerates the sequence of tokens, either from
/// <see cref="Documents.Field"/>s of a <see cref="Documents.Document"/> or from query text.
/// <para/>
/// This is an abstract class; concrete subclasses are:
/// <list type="bullet">
/// <item><description><see cref="Tokenizer"/>, a <see cref="TokenStream"/> whose input is a <see cref="TextReader"/>; and</description></item>
/// <item><description><see cref="TokenFilter"/>, a <see cref="TokenStream"/> whose input is another
/// <see cref="TokenStream"/>.</description></item>
/// </list>
/// A new <see cref="TokenStream"/> API has been introduced with Lucene 2.9. This API
/// has moved from being <see cref="Token"/>-based to <see cref="Util.IAttribute"/>-based. While
/// <see cref="Token"/> still exists in 2.9 as a convenience class, the preferred way
/// to store the information of a <see cref="Token"/> is to use <see cref="Attribute"/>s.
/// <para/>
/// <see cref="TokenStream"/> now extends <see cref="AttributeSource"/>, which provides
/// access to all of the token <see cref="Util.IAttribute"/>s for the <see cref="TokenStream"/>.
/// Note that only one instance per <see cref="Attribute"/> is created and reused
/// for every token. This approach reduces object creation and allows local
/// caching of references to the <see cref="Attribute"/>s. See
/// <see cref="IncrementToken()"/> for further details.
/// <para/>
/// <b>The workflow of the new <see cref="TokenStream"/> API is as follows:</b>
/// <list type="number">
/// <item><description>Instantiation of <see cref="TokenStream"/>/<see cref="TokenFilter"/>s which add/get
/// attributes to/from the <see cref="AttributeSource"/>.</description></item>
/// <item><description>The consumer calls <see cref="TokenStream.Reset()"/>.</description></item>
/// <item><description>The consumer retrieves attributes from the stream and stores local
/// references to all attributes it wants to access.</description></item>
/// <item><description>The consumer calls <see cref="IncrementToken()"/> until it returns false
/// consuming the attributes after each call.</description></item>
/// <item><description>The consumer calls <see cref="End()"/> so that any end-of-stream operations
/// can be performed.</description></item>
/// <item><description>The consumer calls <see cref="Dispose()"/> to release any resource when finished
/// using the <see cref="TokenStream"/>.</description></item>
/// </list>
/// To make sure that filters and consumers know which attributes are available,
/// the attributes must be added during instantiation. Filters and consumers are
/// not required to check for availability of attributes in
/// <see cref="IncrementToken()"/>.
/// <para/>
/// You can find some example code for the new API in the analysis
/// documentation.
/// <para/>
/// Sometimes it is desirable to capture a current state of a <see cref="TokenStream"/>,
/// e.g., for buffering purposes (see <see cref="CachingTokenFilter"/>,
/// TeeSinkTokenFilter). For this use case,
/// <see cref="AttributeSource.CaptureState"/> and <see cref="AttributeSource.RestoreState"/>
/// can be used.
/// <para/>The <see cref="TokenStream"/>-API in Lucene is based on the decorator pattern.
/// Therefore all non-abstract subclasses must be sealed or have at least a sealed
/// implementation of <see cref="IncrementToken()"/>! This is checked when assertions are enabled.
/// </summary>
public abstract class TokenStream : AttributeSource, IDisposable
{
    /// <summary>
    /// Creates a <see cref="TokenStream"/> that uses the default attribute factory.
    /// </summary>
    protected TokenStream()
    {
        // LUCENENET: the Java AssertFinal() reflection check is intentionally omitted here;
        // a Roslyn code analyzer enforces the sealed / sealed-IncrementToken rule at compile time.
    }

    /// <summary>
    /// Creates a <see cref="TokenStream"/> that shares the attributes of the supplied
    /// <see cref="AttributeSource"/>.
    /// </summary>
    protected TokenStream(AttributeSource input)
        : base(input)
    {
        // LUCENENET: the Java AssertFinal() reflection check is intentionally omitted here;
        // a Roslyn code analyzer enforces the sealed / sealed-IncrementToken rule at compile time.
    }

    /// <summary>
    /// Creates a <see cref="TokenStream"/> that uses the supplied
    /// <see cref="AttributeSource.AttributeFactory"/> to create new
    /// <see cref="Util.IAttribute"/> instances.
    /// </summary>
    protected TokenStream(AttributeFactory factory)
        : base(factory)
    {
        // LUCENENET: the Java AssertFinal() reflection check is intentionally omitted here;
        // a Roslyn code analyzer enforces the sealed / sealed-IncrementToken rule at compile time.
    }

    /// <summary>
    /// Advances the stream to the next token. Consumers (i.e., <see cref="Index.IndexWriter"/>)
    /// call this repeatedly; implementing classes must update the appropriate
    /// <see cref="Lucene.Net.Util.IAttribute"/>s with the values of the next token.
    /// <para/>
    /// The producer must make no assumptions about the attributes after this method
    /// has returned: the caller may change them arbitrarily. If state must survive
    /// subsequent calls, copy it with <see cref="AttributeSource.CaptureState"/>.
    /// <para/>
    /// This method runs once for every token of a document, so an efficient
    /// implementation is crucial for good performance. Retrieve references to all
    /// <see cref="Lucene.Net.Util.IAttribute"/>s this stream uses during instantiation
    /// rather than calling <see cref="AttributeSource.AddAttribute{T}"/> or
    /// <see cref="AttributeSource.GetAttribute{T}"/> here.
    /// <para/>
    /// All attributes must be added during instantiation so that filters and
    /// consumers know which attributes are available; they are not required to
    /// check for availability inside <see cref="IncrementToken()"/>.
    /// </summary>
    /// <returns><c>false</c> for end of stream; <c>true</c> otherwise.</returns>
    public abstract bool IncrementToken();

    /// <summary>
    /// Called by the consumer after the last token has been consumed, i.e. after
    /// <see cref="IncrementToken()"/> returned <c>false</c> (using the new
    /// <see cref="TokenStream"/> API). Streams implementing the old API should
    /// upgrade to use this feature.
    /// <para/>
    /// Use this hook for end-of-stream operations such as setting the final offset
    /// of a stream — which may differ from the offset of the last token, e.g. when
    /// trailing whitespace followed the last token under a WhitespaceTokenizer —
    /// or folding skipped positions (such as those removed by a stop filter) into
    /// the position increment.
    /// <para/>
    /// If you override this method, always call <c>base.End();</c>.
    /// </summary>
    /// <exception cref="IOException">If an I/O error occurs.</exception>
    public virtual void End()
    {
        // LUCENE-3849: wipe any leftover ("dirty") attribute values from the
        // final IncrementToken() call before publishing end-of-stream state.
        ClearAttributes();
        if (HasAttribute<IPositionIncrementAttribute>())
        {
            GetAttribute<IPositionIncrementAttribute>().PositionIncrement = 0;
        }
    }

    /// <summary>
    /// Called by a consumer before it begins consumption via
    /// <see cref="IncrementToken()"/>.
    /// <para/>
    /// Resets this stream to a clean state. Stateful implementations must
    /// implement this method so they can be reused, exactly as if newly created.
    /// <para/>
    /// If you override this method, always call <c>base.Reset()</c>; otherwise
    /// some internal state will not be reset correctly (e.g., a reused
    /// <see cref="Tokenizer"/> will throw <see cref="InvalidOperationException"/>).
    /// </summary>
    public virtual void Reset()
    {
    }

    // LUCENENET specific - implementing proper dispose pattern
    public void Dispose()
    {
        Dispose(disposing: true);
        GC.SuppressFinalize(this);
    }

    /// <summary>
    /// Releases resources associated with this stream.
    /// <para/>
    /// If you override this method, always call <c>base.Dispose(disposing)</c>;
    /// otherwise some internal state will not be reset correctly (e.g., a reused
    /// <see cref="Tokenizer"/> will throw <see cref="InvalidOperationException"/>).
    /// </summary>
    protected virtual void Dispose(bool disposing)
    {
    }
}
}