-
Notifications
You must be signed in to change notification settings - Fork 624
/
Lucene40DocValuesFormat.cs
212 lines (188 loc) · 13.8 KB
/
Lucene40DocValuesFormat.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
using System;
using System.Runtime.CompilerServices;
namespace Lucene.Net.Codecs.Lucene40
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
using IndexFileNames = Lucene.Net.Index.IndexFileNames;
using SegmentReadState = Lucene.Net.Index.SegmentReadState;
using SegmentWriteState = Lucene.Net.Index.SegmentWriteState;
/// <summary>
/// Lucene 4.0 DocValues format.
/// <para/>
/// Files:
/// <list type="bullet">
/// <item><description><c>.dv.cfs</c>: compound container (<see cref="Store.CompoundFileDirectory"/>)</description></item>
/// <item><description><c>.dv.cfe</c>: compound entries (<see cref="Store.CompoundFileDirectory"/>)</description></item>
/// </list>
/// Entries within the compound file:
/// <list type="bullet">
/// <item><description><c>&lt;segment&gt;_&lt;fieldNumber&gt;.dat</c>: data values</description></item>
/// <item><description><c>&lt;segment&gt;_&lt;fieldNumber&gt;.idx</c>: index into the .dat for DEREF types</description></item>
/// </list>
/// <para>
/// There are many types of <see cref="Index.DocValues"/> with different encodings.
/// From the perspective of filenames, all types store their values in <c>.dat</c>
/// entries within the compound file. In the case of dereferenced/sorted types, the <c>.dat</c>
/// actually contains only the unique values, and an additional <c>.idx</c> file contains
/// pointers to these unique values.
/// </para>
/// Formats:
/// <list type="bullet">
/// <item><description><see cref="LegacyDocValuesType.VAR_INTS"/> .dat --> Header, PackedType, MinValue,
/// DefaultValue, PackedStream</description></item>
/// <item><description><see cref="LegacyDocValuesType.FIXED_INTS_8"/> .dat --> Header, ValueSize,
/// Byte (<see cref="Store.DataOutput.WriteByte(byte)"/>) <sup>maxdoc</sup></description></item>
/// <item><description><see cref="LegacyDocValuesType.FIXED_INTS_16"/> .dat --> Header, ValueSize,
/// Short (<see cref="Store.DataOutput.WriteInt16(short)"/>) <sup>maxdoc</sup></description></item>
/// <item><description><see cref="LegacyDocValuesType.FIXED_INTS_32"/> .dat --> Header, ValueSize,
/// Int32 (<see cref="Store.DataOutput.WriteInt32(int)"/>) <sup>maxdoc</sup></description></item>
/// <item><description><see cref="LegacyDocValuesType.FIXED_INTS_64"/> .dat --> Header, ValueSize,
/// Int64 (<see cref="Store.DataOutput.WriteInt64(long)"/>) <sup>maxdoc</sup></description></item>
/// <item><description><see cref="LegacyDocValuesType.FLOAT_32"/> .dat --> Header, ValueSize, Float32<sup>maxdoc</sup></description></item>
/// <item><description><see cref="LegacyDocValuesType.FLOAT_64"/> .dat --> Header, ValueSize, Float64<sup>maxdoc</sup></description></item>
/// <item><description><see cref="LegacyDocValuesType.BYTES_FIXED_STRAIGHT"/> .dat --> Header, ValueSize,
/// (Byte (<see cref="Store.DataOutput.WriteByte(byte)"/>) * ValueSize)<sup>maxdoc</sup></description></item>
/// <item><description><see cref="LegacyDocValuesType.BYTES_VAR_STRAIGHT"/> .idx --> Header, TotalBytes, Addresses</description></item>
/// <item><description><see cref="LegacyDocValuesType.BYTES_VAR_STRAIGHT"/> .dat --> Header,
/// (Byte (<see cref="Store.DataOutput.WriteByte(byte)"/>) * <i>variable ValueSize</i>)<sup>maxdoc</sup></description></item>
/// <item><description><see cref="LegacyDocValuesType.BYTES_FIXED_DEREF"/> .idx --> Header, NumValues, Addresses</description></item>
/// <item><description><see cref="LegacyDocValuesType.BYTES_FIXED_DEREF"/> .dat --> Header, ValueSize,
/// (Byte (<see cref="Store.DataOutput.WriteByte(byte)"/>) * ValueSize)<sup>NumValues</sup></description></item>
/// <item><description><see cref="LegacyDocValuesType.BYTES_VAR_DEREF"/> .idx --> Header, TotalVarBytes, Addresses</description></item>
/// <item><description><see cref="LegacyDocValuesType.BYTES_VAR_DEREF"/> .dat --> Header,
/// (LengthPrefix + Byte (<see cref="Store.DataOutput.WriteByte(byte)"/>) * <i>variable ValueSize</i>)<sup>NumValues</sup></description></item>
/// <item><description><see cref="LegacyDocValuesType.BYTES_FIXED_SORTED"/> .idx --> Header, NumValues, Ordinals</description></item>
/// <item><description><see cref="LegacyDocValuesType.BYTES_FIXED_SORTED"/> .dat --> Header, ValueSize,
/// (Byte (<see cref="Store.DataOutput.WriteByte(byte)"/>) * ValueSize)<sup>NumValues</sup></description></item>
/// <item><description><see cref="LegacyDocValuesType.BYTES_VAR_SORTED"/> .idx --> Header, TotalVarBytes, Addresses, Ordinals</description></item>
/// <item><description><see cref="LegacyDocValuesType.BYTES_VAR_SORTED"/> .dat --> Header,
/// (Byte (<see cref="Store.DataOutput.WriteByte(byte)"/>) * <i>variable ValueSize</i>)<sup>NumValues</sup></description></item>
/// </list>
/// Data Types:
/// <list type="bullet">
/// <item><description>Header --> CodecHeader (<see cref="CodecUtil.WriteHeader(Store.DataOutput, string, int)"/>) </description></item>
/// <item><description>PackedType --> Byte (<see cref="Store.DataOutput.WriteByte(byte)"/>)</description></item>
/// <item><description>MaxAddress, MinValue, DefaultValue --> Int64 (<see cref="Store.DataOutput.WriteInt64(long)"/>) </description></item>
/// <item><description>PackedStream, Addresses, Ordinals --> <see cref="Util.Packed.PackedInt32s"/></description></item>
/// <item><description>ValueSize, NumValues --> Int32 (<see cref="Store.DataOutput.WriteInt32(int)"/>) </description></item>
/// <item><description>Float32 --> 32-bit float encoded with <see cref="J2N.BitConversion.SingleToRawInt32Bits(float)"/>
/// then written as Int32 (<see cref="Store.DataOutput.WriteInt32(int)"/>) </description></item>
/// <item><description>Float64 --> 64-bit float encoded with <see cref="J2N.BitConversion.DoubleToRawInt64Bits(double)"/>
/// then written as Int64 (<see cref="Store.DataOutput.WriteInt64(long)"/>) </description></item>
/// <item><description>TotalBytes --> VLong (<see cref="Store.DataOutput.WriteVInt64(long)"/>) </description></item>
/// <item><description>TotalVarBytes --> Int64 (<see cref="Store.DataOutput.WriteInt64(long)"/>) </description></item>
/// <item><description>LengthPrefix --> Length of the data value as VInt (<see cref="Store.DataOutput.WriteVInt32(int)"/>) (maximum
/// of 2 bytes)</description></item>
/// </list>
/// Notes:
/// <list type="bullet">
/// <item><description>PackedType is a 0 when compressed, 1 when the stream is written as 64-bit integers.</description></item>
/// <item><description>Addresses stores pointers to the actual byte location (indexed by docid). In the VAR_STRAIGHT
/// case, each entry can have a different length, so to determine the length, docid+1 is
/// retrieved. A sentinel address is written at the end for the VAR_STRAIGHT case, so the Addresses
/// stream contains maxdoc+1 indices. For the deduplicated VAR_DEREF case, each length
/// is encoded as a prefix to the data itself as a VInt (<see cref="Store.DataOutput.WriteVInt32(int)"/>)
/// (maximum of 2 bytes).</description></item>
/// <item><description>Ordinals stores the term ID in sorted order (indexed by docid). In the FIXED_SORTED case,
/// the address into the .dat can be computed from the ordinal as
/// <c>Header+ValueSize+(ordinal*ValueSize)</c> because the byte length is fixed.
/// In the VAR_SORTED case, there is double indirection (docid -> ordinal -> address), but
/// an additional sentinel ordinal+address is always written (so there are NumValues+1 ordinals). To
/// determine the length, ord+1's address is looked up as well.</description></item>
/// <item><description><see cref="LegacyDocValuesType.BYTES_VAR_STRAIGHT"/> in contrast to other straight
/// variants uses a <c>.idx</c> file to improve lookup performance. In contrast to
/// <see cref="LegacyDocValuesType.BYTES_VAR_DEREF"/> it doesn't apply deduplication of the document values.
/// </description></item>
/// </list>
/// <para/>
/// Limitations:
/// <list type="bullet">
/// <item><description> Binary doc values can be at most <see cref="MAX_BINARY_FIELD_LENGTH"/> in length.</description></item>
/// </list>
/// </summary>
[Obsolete("Only for reading old 4.0 and 4.1 segments")]
[DocValuesFormatName("Lucene40")] // LUCENENET specific - using DocValuesFormatName attribute to ensure the default name passed from subclasses is the same as this class name
public class Lucene40DocValuesFormat : DocValuesFormat
{
    // NOTE: kept for backward compatibility only! This format is not registered
    // in SPI, doesn't respect the segment suffix, etc.

    /// <summary>
    /// Upper bound on the length of a single binary doc values field
    /// (<c>(1 &lt;&lt; 15) - 2</c>, i.e. 32766).
    /// </summary>
    public static readonly int MAX_BINARY_FIELD_LENGTH = (1 << 15) - 2;

    /// <summary>
    /// Sole constructor.
    /// </summary>
    public Lucene40DocValuesFormat()
    {
        // Nothing to initialize; the parameterless base constructor runs implicitly.
    }

    /// <summary>
    /// Writing is not supported by this format.
    /// </summary>
    /// <param name="state">Ignored.</param>
    /// <returns>Never returns normally.</returns>
    /// <exception cref="NotSupportedException">Always thrown.</exception>
    public override DocValuesConsumer FieldsConsumer(SegmentWriteState state)
    {
        throw new NotSupportedException("this codec can only be used for reading");
    }

    /// <summary>
    /// Creates a <see cref="Lucene40DocValuesReader"/> for the segment described by
    /// <paramref name="state"/>.
    /// </summary>
    /// <param name="state">Read state; its <c>SegmentInfo.Name</c> is used to build the file name.</param>
    /// <returns>A reader constructed with the segment's <c>"dv"</c> compound file name and
    /// <see cref="Lucene40FieldInfosReader.LEGACY_DV_TYPE_KEY"/>.</returns>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public override DocValuesProducer FieldsProducer(SegmentReadState state)
    {
        string segmentName = state.SegmentInfo.Name;
        string compoundFileName = IndexFileNames.SegmentFileName(
            segmentName,
            "dv",
            IndexFileNames.COMPOUND_FILE_EXTENSION);

        return new Lucene40DocValuesReader(
            state,
            compoundFileName,
            Lucene40FieldInfosReader.LEGACY_DV_TYPE_KEY);
    }

    // ---- VAR_INTS ----
    internal const string VAR_INTS_CODEC_NAME = "PackedInts";
    internal const int VAR_INTS_VERSION_START = 0;
    internal const int VAR_INTS_VERSION_CURRENT = VAR_INTS_VERSION_START;
    // NOTE(review): these two presumably correspond to the PackedType byte described
    // in the class summary (0 = compressed, 1 = 64-bit integers) - confirm against the reader.
    internal const sbyte VAR_INTS_PACKED = 0x00;
    internal const sbyte VAR_INTS_FIXED_64 = 0x01;

    // ---- FIXED_INTS_8, FIXED_INTS_16, FIXED_INTS_32, FIXED_INTS_64 ----
    internal const string INTS_CODEC_NAME = "Ints";
    internal const int INTS_VERSION_START = 0;
    internal const int INTS_VERSION_CURRENT = INTS_VERSION_START;

    // ---- FLOAT_32, FLOAT_64 ----
    internal const string FLOATS_CODEC_NAME = "Floats";
    internal const int FLOATS_VERSION_START = 0;
    internal const int FLOATS_VERSION_CURRENT = FLOATS_VERSION_START;

    // ---- BYTES_FIXED_STRAIGHT ----
    internal const string BYTES_FIXED_STRAIGHT_CODEC_NAME = "FixedStraightBytes";
    internal const int BYTES_FIXED_STRAIGHT_VERSION_START = 0;
    internal const int BYTES_FIXED_STRAIGHT_VERSION_CURRENT = BYTES_FIXED_STRAIGHT_VERSION_START;

    // ---- BYTES_VAR_STRAIGHT ----
    internal const string BYTES_VAR_STRAIGHT_CODEC_NAME_IDX = "VarStraightBytesIdx";
    internal const string BYTES_VAR_STRAIGHT_CODEC_NAME_DAT = "VarStraightBytesDat";
    internal const int BYTES_VAR_STRAIGHT_VERSION_START = 0;
    internal const int BYTES_VAR_STRAIGHT_VERSION_CURRENT = BYTES_VAR_STRAIGHT_VERSION_START;

    // ---- BYTES_FIXED_DEREF ----
    internal const string BYTES_FIXED_DEREF_CODEC_NAME_IDX = "FixedDerefBytesIdx";
    internal const string BYTES_FIXED_DEREF_CODEC_NAME_DAT = "FixedDerefBytesDat";
    internal const int BYTES_FIXED_DEREF_VERSION_START = 0;
    internal const int BYTES_FIXED_DEREF_VERSION_CURRENT = BYTES_FIXED_DEREF_VERSION_START;

    // ---- BYTES_VAR_DEREF ----
    internal const string BYTES_VAR_DEREF_CODEC_NAME_IDX = "VarDerefBytesIdx";
    internal const string BYTES_VAR_DEREF_CODEC_NAME_DAT = "VarDerefBytesDat";
    internal const int BYTES_VAR_DEREF_VERSION_START = 0;
    internal const int BYTES_VAR_DEREF_VERSION_CURRENT = BYTES_VAR_DEREF_VERSION_START;

    // ---- BYTES_FIXED_SORTED ----
    internal const string BYTES_FIXED_SORTED_CODEC_NAME_IDX = "FixedSortedBytesIdx";
    internal const string BYTES_FIXED_SORTED_CODEC_NAME_DAT = "FixedSortedBytesDat";
    internal const int BYTES_FIXED_SORTED_VERSION_START = 0;
    internal const int BYTES_FIXED_SORTED_VERSION_CURRENT = BYTES_FIXED_SORTED_VERSION_START;

    // ---- BYTES_VAR_SORTED ----
    // NOTE: the names below are NOT a copy/paste bug. 4.0 itself got this wrong, so
    // VAR_SORTED and VAR_DEREF share the same codec header; do not "fix" these values.
    internal const string BYTES_VAR_SORTED_CODEC_NAME_IDX = "VarDerefBytesIdx";
    internal const string BYTES_VAR_SORTED_CODEC_NAME_DAT = "VarDerefBytesDat";
    internal const int BYTES_VAR_SORTED_VERSION_START = 0;
    internal const int BYTES_VAR_SORTED_VERSION_CURRENT = BYTES_VAR_SORTED_VERSION_START;
}
}