-
Notifications
You must be signed in to change notification settings - Fork 624
/
Lucene40PostingsFormat.cs
279 lines (267 loc) · 17.4 KB
/
Lucene40PostingsFormat.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
using Lucene.Net.Diagnostics;
using System;
namespace Lucene.Net.Codecs.Lucene40
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
using SegmentReadState = Lucene.Net.Index.SegmentReadState;
using SegmentWriteState = Lucene.Net.Index.SegmentWriteState;
/// <summary>
/// Lucene 4.0 Postings format.
/// <para>
/// Files:
/// <list type="bullet">
/// <item><description><tt>.tim</tt>: <a href="#Termdictionary">Term Dictionary</a></description></item>
/// <item><description><tt>.tip</tt>: <a href="#Termindex">Term Index</a></description></item>
/// <item><description><tt>.frq</tt>: <a href="#Frequencies">Frequencies</a></description></item>
/// <item><description><tt>.prx</tt>: <a href="#Positions">Positions</a></description></item>
/// </list>
/// </para>
/// <para/>
/// <a name="Termdictionary" id="Termdictionary"></a>
/// <h3>Term Dictionary</h3>
///
/// <para>The .tim file contains the list of terms in each
/// field along with per-term statistics (such as docfreq)
/// and pointers to the frequencies, positions and
/// skip data in the .frq and .prx files.
/// See <see cref="BlockTreeTermsWriter"/> for more details on the format.
/// </para>
///
/// <para>NOTE: The term dictionary can plug into different postings implementations:
/// the postings writer/reader are actually responsible for encoding
/// and decoding the Postings Metadata and Term Metadata sections described here:</para>
/// <list type="bullet">
/// <item><description>Postings Metadata --> Header, SkipInterval, MaxSkipLevels, SkipMinimum</description></item>
/// <item><description>Term Metadata --> FreqDelta, SkipDelta?, ProxDelta?</description></item>
/// <item><description>Header --> CodecHeader (<see cref="CodecUtil.WriteHeader(Store.DataOutput, string, int)"/>) </description></item>
/// <item><description>SkipInterval,MaxSkipLevels,SkipMinimum --> Uint32 (<see cref="Store.DataOutput.WriteInt32(int)"/>) </description></item>
/// <item><description>SkipDelta,FreqDelta,ProxDelta --> VLong (<see cref="Store.DataOutput.WriteVInt64(long)"/>) </description></item>
/// </list>
/// <para>Notes:</para>
/// <list type="bullet">
/// <item><description>Header is a CodecHeader (<see cref="CodecUtil.WriteHeader(Store.DataOutput, string, int)"/>) storing the version information
/// for the postings.</description></item>
/// <item><description>SkipInterval is the fraction of TermDocs stored in skip tables. It is used to accelerate
/// <see cref="Search.DocIdSetIterator.Advance(int)"/>. Larger values result in smaller indexes, greater
/// acceleration, but fewer accelerable cases, while smaller values result in bigger indexes,
/// less acceleration (in case of a small value for MaxSkipLevels) and more accelerable cases.
/// </description></item>
/// <item><description>MaxSkipLevels is the max. number of skip levels stored for each term in the .frq file. A
/// low value results in smaller indexes but less acceleration, a larger value results in
/// slightly larger indexes but greater acceleration. See format of .frq file for more
/// information about skip levels.</description></item>
/// <item><description>SkipMinimum is the minimum document frequency a term must have in order to write any
/// skip data at all.</description></item>
/// <item><description>FreqDelta determines the position of this term's TermFreqs within the .frq
/// file. In particular, it is the difference between the position of this term's
/// data in that file and the position of the previous term's data (or zero, for
/// the first term in the block).</description></item>
/// <item><description>ProxDelta determines the position of this term's TermPositions within the
/// .prx file. In particular, it is the difference between the position of this
/// term's data in that file and the position of the previous term's data (or zero,
/// for the first term in the block). For fields that omit position data, this will
/// be 0 since prox information is not stored.</description></item>
/// <item><description>SkipDelta determines the position of this term's SkipData within the .frq
/// file. In particular, it is the number of bytes after TermFreqs that the
/// SkipData starts. In other words, it is the length of the TermFreq data.
/// SkipDelta is only stored if DocFreq is not smaller than SkipMinimum.</description></item>
/// </list>
/// <a name="Termindex" id="Termindex"></a>
/// <h3>Term Index</h3>
/// <para>The .tip file contains an index into the term dictionary, so that it can be
/// accessed randomly. See <see cref="BlockTreeTermsWriter"/> for more details on the format.</para>
/// <a name="Frequencies" id="Frequencies"></a>
/// <h3>Frequencies</h3>
/// <para>The .frq file contains the lists of documents which contain each term, along
/// with the frequency of the term in that document (except when frequencies are
/// omitted: <see cref="Index.IndexOptions.DOCS_ONLY"/>).</para>
/// <list type="bullet">
/// <item><description>FreqFile (.frq) --> Header, &lt;TermFreqs, SkipData?&gt; <sup>TermCount</sup></description></item>
/// <item><description>Header --> CodecHeader (<see cref="CodecUtil.WriteHeader(Store.DataOutput, string, int)"/>) </description></item>
/// <item><description>TermFreqs --> &lt;TermFreq&gt; <sup>DocFreq</sup></description></item>
/// <item><description>TermFreq --> DocDelta[, Freq?]</description></item>
/// <item><description>SkipData --> &lt;&lt;SkipLevelLength, SkipLevel&gt;
/// <sup>NumSkipLevels-1</sup>, SkipLevel&gt; &lt;SkipDatum&gt;</description></item>
/// <item><description>SkipLevel --> &lt;SkipDatum&gt; <sup>DocFreq/(SkipInterval^(Level +
/// 1))</sup></description></item>
/// <item><description>SkipDatum -->
/// DocSkip,PayloadLength?,OffsetLength?,FreqSkip,ProxSkip,SkipChildLevelPointer?</description></item>
/// <item><description>DocDelta,Freq,DocSkip,PayloadLength,OffsetLength,FreqSkip,ProxSkip --> VInt (<see cref="Store.DataOutput.WriteVInt32(int)"/>) </description></item>
/// <item><description>SkipChildLevelPointer --> VLong (<see cref="Store.DataOutput.WriteVInt64(long)"/>) </description></item>
/// </list>
/// <para>TermFreqs are ordered by term (the term is implicit, from the term dictionary).</para>
/// <para>TermFreq entries are ordered by increasing document number.</para>
/// <para>DocDelta: if frequencies are indexed, this determines both the document
/// number and the frequency. In particular, DocDelta/2 is the difference between
/// this document number and the previous document number (or zero when this is the
/// first document in a TermFreqs). When DocDelta is odd, the frequency is one.
/// When DocDelta is even, the frequency is read as another VInt. If frequencies
/// are omitted, DocDelta contains the gap (not multiplied by 2) between document
/// numbers and no frequency information is stored.</para>
/// <para>For example, the TermFreqs for a term which occurs once in document seven
/// and three times in document eleven, with frequencies indexed, would be the
/// following sequence of VInts:</para>
/// <para>15, 8, 3</para>
/// <para>If frequencies were omitted (<see cref="Index.IndexOptions.DOCS_ONLY"/>) it would be this
/// sequence of VInts instead:</para>
/// <para>7,4</para>
/// <para>DocSkip records the document number before every SkipInterval <sup>th</sup>
/// document in TermFreqs. If payloads and offsets are disabled for the term's field, then
/// DocSkip represents the difference from the previous value in the sequence. If
/// payloads and/or offsets are enabled for the term's field, then DocSkip/2 represents the
/// difference from the previous value in the sequence. In this case when
/// DocSkip is odd, then PayloadLength and/or OffsetLength are stored indicating the length of
/// the last payload/offset before the SkipInterval<sup>th</sup> document in TermPositions.</para>
/// <para>PayloadLength indicates the length of the last payload.</para>
/// <para>OffsetLength indicates the length of the last offset (endOffset-startOffset).</para>
/// <para>
/// FreqSkip and ProxSkip record the position of every SkipInterval <sup>th</sup>
/// entry in FreqFile and ProxFile, respectively. For the first SkipDatum, file positions
/// are relative to the start of TermFreqs and Positions; for each later SkipDatum, they
/// are relative to the previous SkipDatum in the sequence.</para>
/// <para>For example, if DocFreq=35 and SkipInterval=16, then there are two SkipData
/// entries, containing the 15 <sup>th</sup> and 31 <sup>st</sup> document numbers
/// in TermFreqs. The first FreqSkip names the number of bytes after the beginning
/// of TermFreqs that the 16 <sup>th</sup> SkipDatum starts, and the second the
/// number of bytes after that that the 32 <sup>nd</sup> starts. The first ProxSkip
/// names the number of bytes after the beginning of Positions that the 16
/// <sup>th</sup> SkipDatum starts, and the second the number of bytes after that
/// that the 32 <sup>nd</sup> starts.</para>
/// <para>Each term can have multiple skip levels. The amount of skip levels for a
/// term is NumSkipLevels = Min(MaxSkipLevels,
/// floor(log(DocFreq/log(SkipInterval)))). The number of SkipData entries for a
/// skip level is DocFreq/(SkipInterval^(Level + 1)), whereas the lowest skip level
/// is Level=0.
/// <para/>
/// Example: SkipInterval = 4, MaxSkipLevels = 2, DocFreq = 35. Then skip level 0
/// has 8 SkipData entries, containing the 3<sup>rd</sup>, 7<sup>th</sup>,
/// 11<sup>th</sup>, 15<sup>th</sup>, 19<sup>th</sup>, 23<sup>rd</sup>,
/// 27<sup>th</sup>, and 31<sup>st</sup> document numbers in TermFreqs. Skip level
/// 1 has 2 SkipData entries, containing the 15<sup>th</sup> and 31<sup>st</sup>
/// document numbers in TermFreqs.
/// <para/>
/// The SkipData entries on all upper levels > 0 contain a SkipChildLevelPointer
/// referencing the corresponding SkipData entry in level-1. In the example, entry 15
/// on level 1 has a pointer to entry 15 on level 0, and entry 31 on level 1 has a
/// pointer to entry 31 on level 0.
/// </para>
/// <a name="Positions" id="Positions"></a>
/// <h3>Positions</h3>
/// <para>The .prx file contains the lists of positions that each term occurs at
/// within documents. Note that fields omitting positional data do not store
/// anything into this file, and if all fields in the index omit positional data
/// then the .prx file will not exist.</para>
/// <list type="bullet">
/// <item><description>ProxFile (.prx) --> Header, &lt;TermPositions&gt; <sup>TermCount</sup></description></item>
/// <item><description>Header --> CodecHeader (<see cref="CodecUtil.WriteHeader(Store.DataOutput, string, int)"/>) </description></item>
/// <item><description>TermPositions --> &lt;Positions&gt; <sup>DocFreq</sup></description></item>
/// <item><description>Positions --> &lt;PositionDelta,PayloadLength?,OffsetDelta?,OffsetLength?,PayloadData?&gt; <sup>Freq</sup></description></item>
/// <item><description>PositionDelta,OffsetDelta,OffsetLength,PayloadLength --> VInt (<see cref="Store.DataOutput.WriteVInt32(int)"/>) </description></item>
/// <item><description>PayloadData --> byte (<see cref="Store.DataOutput.WriteByte(byte)"/>) <sup>PayloadLength</sup></description></item>
/// </list>
/// <para>TermPositions are ordered by term (the term is implicit, from the term dictionary).</para>
/// <para>Positions entries are ordered by increasing document number (the document
/// number is implicit from the .frq file).</para>
/// <para>PositionDelta is, if payloads are disabled for the term's field, the
/// difference between the position of the current occurrence in the document and
/// the previous occurrence (or zero, if this is the first occurrence in this
/// document). If payloads are enabled for the term's field, then PositionDelta/2
/// is the difference between the current and the previous position. If payloads
/// are enabled and PositionDelta is odd, then PayloadLength is stored, indicating
/// the length of the payload at the current term position.</para>
/// <para>For example, the TermPositions for a term which occurs as the fourth term in
/// one document, and as the fifth and ninth term in a subsequent document, would
/// be the following sequence of VInts (payloads disabled):</para>
/// <para>4, 5, 4</para>
/// <para>PayloadData is metadata associated with the current term position. If
/// PayloadLength is stored at the current position, then it indicates the length
/// of this payload. If PayloadLength is not stored, then this payload has the same
/// length as the payload at the previous position.</para>
/// <para>OffsetDelta/2 is the difference between this position's startOffset and that of the
/// previous occurrence (or zero, if this is the first occurrence in this document).
/// If OffsetDelta is odd, then the length (endOffset-startOffset) differs from the
/// previous occurrence and an OffsetLength follows. Offset data is only written for
/// <see cref="Index.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS"/>.</para>
/// </summary>
// TODO: this class could be created by wrapping
// BlockTreeTermsDict around Lucene40PostingsBaseFormat; ie
// we should not duplicate the code from that class here:
[Obsolete("Only for reading old 4.0 segments")]
[PostingsFormatName("Lucene40")] // LUCENENET specific - using PostingsFormatName attribute to ensure the default name passed from subclasses is the same as this class name
public class Lucene40PostingsFormat : PostingsFormat
{
    /// <summary>
    /// File extension of the freq postings file (<c>.frq</c>). </summary>
    internal const string FREQ_EXTENSION = "frq";

    /// <summary>
    /// File extension of the prox postings file (<c>.prx</c>). </summary>
    internal const string PROX_EXTENSION = "prx";

    /// <summary>
    /// Lower bound on the number of items (terms or sub-blocks) in a BlockTree block.
    /// Within this class the value is only stored and reported by <see cref="ToString()"/>.
    /// </summary>
    protected readonly int m_minBlockSize;

    /// <summary>
    /// Upper bound on the number of items (terms or sub-blocks) in a BlockTree block.
    /// Within this class the value is only stored and reported by <see cref="ToString()"/>.
    /// </summary>
    protected readonly int m_maxBlockSize;

    /// <summary>
    /// Initializes a <see cref="Lucene40PostingsFormat"/> using
    /// <see cref="BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE"/> and
    /// <see cref="BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE"/> as the block sizes.
    /// </summary>
    public Lucene40PostingsFormat()
        : this(BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE, BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE)
    {
    }

    /// <summary>
    /// Initializes a <see cref="Lucene40PostingsFormat"/> with explicit block sizes.
    /// </summary>
    /// <param name="minBlockSize">Minimum items per block; asserted to be greater than 1
    /// when <c>Debugging.AssertsEnabled</c> is set.</param>
    /// <param name="maxBlockSize">Maximum items per block; stored without validation.</param>
    /// <seealso cref="BlockTreeTermsWriter.BlockTreeTermsWriter(SegmentWriteState,PostingsWriterBase,int,int)"/>
    private Lucene40PostingsFormat(int minBlockSize, int maxBlockSize)
    {
        if (Debugging.AssertsEnabled)
        {
            Debugging.Assert(minBlockSize > 1);
        }

        m_minBlockSize = minBlockSize;
        m_maxBlockSize = maxBlockSize;
    }

    /// <summary>
    /// Not supported: this format exists only to read old segments, so every call
    /// throws the exception produced by <c>UnsupportedOperationException.Create</c>.
    /// </summary>
    /// <param name="state">Ignored.</param>
    /// <returns>Never returns normally.</returns>
    public override FieldsConsumer FieldsConsumer(SegmentWriteState state)
    {
        throw UnsupportedOperationException.Create("this codec can only be used for reading");
    }

    /// <summary>
    /// Opens the postings of a segment by creating a <see cref="Lucene40PostingsReader"/>
    /// and handing it to a new <see cref="BlockTreeTermsReader"/>.
    /// </summary>
    /// <param name="state">Supplies the directory, field infos, segment info, IO context,
    /// segment suffix and terms index divisor passed to the readers.</param>
    /// <returns>The newly created <see cref="BlockTreeTermsReader"/>.</returns>
    public override FieldsProducer FieldsProducer(SegmentReadState state)
    {
        PostingsReaderBase postingsReader = new Lucene40PostingsReader(
            state.Directory,
            state.FieldInfos,
            state.SegmentInfo,
            state.Context,
            state.SegmentSuffix);

        // Stays true until the terms reader has been constructed; if construction
        // throws, the postings reader is disposed here before the exception propagates.
        bool disposePostings = true;
        try
        {
            FieldsProducer termsReader = new BlockTreeTermsReader(
                state.Directory,
                state.FieldInfos,
                state.SegmentInfo,
                postingsReader,
                state.Context,
                state.SegmentSuffix,
                state.TermsIndexDivisor);
            disposePostings = false;
            return termsReader;
        }
        finally
        {
            if (disposePostings)
            {
                postingsReader.Dispose();
            }
        }
    }

    /// <summary>
    /// Returns the format name followed by the configured block sizes, in the form
    /// <c>Name(minBlockSize=X maxBlockSize=Y)</c>.
    /// </summary>
    public override string ToString()
    {
        return Name
            + "(minBlockSize=" + m_minBlockSize
            + " maxBlockSize=" + m_maxBlockSize
            + ")";
    }
}
}