-
Notifications
You must be signed in to change notification settings - Fork 624
/
SimilarityBase.cs
326 lines (289 loc) · 14.2 KB
/
SimilarityBase.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
using Lucene.Net.Diagnostics;
using System;
using AtomicReaderContext = Lucene.Net.Index.AtomicReaderContext;
using BytesRef = Lucene.Net.Util.BytesRef;
using FieldInvertState = Lucene.Net.Index.FieldInvertState;
using NumericDocValues = Lucene.Net.Index.NumericDocValues;
using SmallSingle = Lucene.Net.Util.SmallSingle;
namespace Lucene.Net.Search.Similarities
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <summary>
/// A subclass of <see cref="Similarity"/> that provides a simplified API for its
/// descendants. Subclasses are only required to implement the <see cref="Score(BasicStats, float, float)"/>
/// and <see cref="ToString()"/> methods. Implementing
/// <see cref="Explain(Explanation, BasicStats, int, float, float)"/> is optional,
/// inasmuch as <see cref="SimilarityBase"/> already provides a basic explanation of the score
/// and the term frequency. However, implementers of a subclass are encouraged to
/// include as much detail about the scoring method as possible.
/// <para/>
/// Note: multi-word queries such as phrase queries are scored in a different way
/// than Lucene's default ranking algorithm: whereas it "fakes" an IDF value for
/// the phrase as a whole (since it does not know it), this class instead scores
/// phrases as a summation of the individual term scores.
/// <para/>
/// @lucene.experimental
/// </summary>
public abstract class SimilarityBase : Similarity
{
/// <summary>
/// The natural logarithm of 2, used as the divisor in <see cref="Log2(double)"/>.
/// Computed once at type initialization so it is not recomputed on every call.
/// </summary>
private static readonly double LOG_2 = Math.Log(2);
/// <summary>
/// True if overlap tokens (tokens with a position increment of zero) are
/// discounted from the document's length. Defaults to <c>true</c>; read by
/// <see cref="ComputeNorm(FieldInvertState)"/> and exposed through
/// <see cref="DiscountOverlaps"/>.
/// </summary>
private bool discountOverlaps = true; // LUCENENET Specific: made private, since it can be get/set through property
/// <summary>
/// Sole constructor. (For invocation by subclass
/// constructors, typically implicit.) Performs no work of its own.
/// </summary>
protected SimilarityBase() // LUCENENET: CA1012: Abstract types should not have constructors (marked protected)
{
}
/// <summary>
/// Gets or sets whether overlap tokens (tokens with a position increment
/// of 0) are left out of the token count when the norm is computed.
/// The default is <c>true</c>, i.e. overlap tokens are not counted.
/// <para/>
/// @lucene.experimental
/// </summary>
/// <seealso cref="ComputeNorm(FieldInvertState)"/>
public virtual bool DiscountOverlaps
{
    get { return discountOverlaps; }
    set { discountOverlaps = value; }
}
/// <summary>
/// Builds one <see cref="BasicStats"/> per entry of <paramref name="termStats"/>
/// (created via <see cref="NewStats(string, float)"/> and populated via
/// <see cref="FillBasicStats(BasicStats, CollectionStatistics, TermStatistics)"/>).
/// </summary>
/// <param name="queryBoost"> the boost passed on to every created stats object. </param>
/// <param name="collectionStats"> the field-level statistics; its field name is used for every stats object. </param>
/// <param name="termStats"> the statistics of each term. </param>
/// <returns>
/// The single <see cref="BasicStats"/> when exactly one term was supplied; otherwise a
/// <see cref="MultiSimilarity.MultiStats"/> wrapping all of them.
/// </returns>
public override sealed SimWeight ComputeWeight(float queryBoost, CollectionStatistics collectionStats, params TermStatistics[] termStats)
{
    BasicStats[] perTerm = new BasicStats[termStats.Length];
    int slot = 0;
    foreach (TermStatistics term in termStats)
    {
        BasicStats current = NewStats(collectionStats.Field, queryBoost);
        perTerm[slot++] = current;
        FillBasicStats(current, collectionStats, term);
    }

    if (perTerm.Length == 1)
    {
        // Single-term case: no wrapper needed.
        return perTerm[0];
    }

    return new MultiSimilarity.MultiStats(perTerm);
}
/// <summary>
/// Factory method for the stats object used by this similarity. The default
/// implementation creates a plain <see cref="BasicStats"/>; subclasses may
/// override it to return a custom stats object.
/// </summary>
/// <param name="field"> the field name handed to the stats object. </param>
/// <param name="queryBoost"> the query boost handed to the stats object. </param>
/// <returns> a new stats instance. </returns>
protected internal virtual BasicStats NewStats(string field, float queryBoost)
    => new BasicStats(field, queryBoost);
/// <summary>
/// Populates every member of <see cref="BasicStats"/> on <paramref name="stats"/>
/// from the collection-level and term-level statistics.
/// Subclasses can override this method to fill additional stats.
/// </summary>
/// <param name="stats"> the stats object to populate. </param>
/// <param name="collectionStats"> supplies the document count and the field's total token count. </param>
/// <param name="termStats"> supplies the term's document frequency and total term frequency. </param>
protected internal virtual void FillBasicStats(BasicStats stats, CollectionStatistics collectionStats, TermStatistics termStats)
{
    // #positions(field) must be >= #positions(term)
    if (Debugging.AssertsEnabled)
    {
        Debugging.Assert(collectionStats.SumTotalTermFreq == -1 || collectionStats.SumTotalTermFreq >= termStats.TotalTermFreq);
    }

    long docCount = collectionStats.MaxDoc;
    long termDocFreq = termStats.DocFreq;

    // A value of -1 means the codec does not supply totalTermFreq: substitute docFreq.
    long reportedTermFreq = termStats.TotalTermFreq;
    long termTotalFreq = reportedTermFreq == -1 ? termDocFreq : reportedTermFreq;

    // A non-positive sum means the field does not exist, the codec does not supply
    // the measure, or frequencies were omitted for the field. Something sensible has
    // to be provided in that case, because negative values cause NaN/Inf for some scorers.
    long fieldTokenSum = collectionStats.SumTotalTermFreq;
    bool tokenSumUsable = fieldTokenSum > 0;

    long fieldTokens = tokenSumUsable ? fieldTokenSum : termDocFreq;
    float averageLength = tokenSumUsable ? (float)fieldTokens / docCount : 1;

    // TODO: add sumDocFreq for field (numberOfFieldPostings)
    stats.NumberOfDocuments = docCount;
    stats.NumberOfFieldTokens = fieldTokens;
    stats.AvgFieldLength = averageLength;
    stats.DocFreq = termDocFreq;
    stats.TotalTermFreq = termTotalFreq;
}
/// <summary>
/// Computes the score of a single document from the supplied statistics.
/// <para>Subclasses must apply their scoring formula in this method.</para> </summary>
/// <param name="stats"> the corpus level statistics. </param>
/// <param name="freq"> the term frequency. </param>
/// <param name="docLen"> the document length. When called through the scorer created by
/// <see cref="GetSimScorer(SimWeight, AtomicReaderContext)"/>, this is the decoded norm of the
/// document, or <c>1</c> if no norms are available. </param>
/// <returns> the score. </returns>
public abstract float Score(BasicStats stats, float freq, float docLen);
/// <summary>
/// Subclasses should implement this method to explain the score. <paramref name="expl"/>
/// already contains the score, the name of the class and the doc id, as well
/// as the term frequency and its explanation; subclasses can add additional
/// clauses to explain details of their scoring formulae.
/// <para>The default implementation does nothing.</para>
/// </summary>
/// <param name="expl"> the explanation to extend with details. </param>
/// <param name="stats"> the corpus level statistics. </param>
/// <param name="doc"> the document id. </param>
/// <param name="freq"> the term frequency. </param>
/// <param name="docLen"> the document length. </param>
protected internal virtual void Explain(Explanation expl, BasicStats stats, int doc, float freq, float docLen)
{
    // Intentionally empty: this is an optional extension hook for subclasses.
}
/// <summary>
/// Produces the explanation of a document's score. The description has the form
/// <em>score(name-of-similarity, doc=doc-id, freq=term-frequency), computed from:</em>.
/// The explanation's value is obtained from <see cref="Score(BasicStats, float, float)"/>,
/// the term frequency explanation is attached as a detail, and finally
/// <see cref="Explain(Explanation, BasicStats, int, float, float)"/> is invoked so that
/// subclasses can append further details.
/// </summary>
/// <param name="stats"> the corpus level statistics. </param>
/// <param name="doc"> the document id. </param>
/// <param name="freq"> the term frequency and its explanation. </param>
/// <param name="docLen"> the document length. </param>
/// <returns> the explanation. </returns>
public virtual Explanation Explain(BasicStats stats, int doc, Explanation freq, float docLen)
{
    float termFreq = freq.Value;

    Explanation explanation = new Explanation
    {
        Value = Score(stats, termFreq, docLen),
        Description = $"score({GetType().Name}, doc={doc}, freq={termFreq}), computed from:"
    };
    explanation.AddDetail(freq);

    // Give subclasses the chance to add formula-specific details.
    Explain(explanation, stats, doc, termFreq, docLen);
    return explanation;
}
/// <summary>
/// Creates the scorer for the given weight. A <see cref="MultiSimilarity.MultiStats"/>
/// (a multi term query, e.g. a phrase) yields a scorer that combines one scorer per
/// sub-weight; any other weight is treated as a single <see cref="BasicStats"/>.
/// </summary>
/// <param name="stats"> the weight previously produced by <see cref="ComputeWeight(float, CollectionStatistics, TermStatistics[])"/>. </param>
/// <param name="context"> the reader context whose norms are looked up per field. </param>
/// <returns> the scorer. </returns>
public override SimScorer GetSimScorer(SimWeight stats, AtomicReaderContext context)
{
    // Builds the scorer for one BasicStats, fetching the norms of its field.
    SimScorer CreateScorer(SimWeight weight)
    {
        BasicStats basic = (BasicStats)weight;
        return new BasicSimScorer(this, basic, context.AtomicReader.GetNormValues(basic.Field));
    }

    if (!(stats is MultiSimilarity.MultiStats multiStats))
    {
        return CreateScorer(stats);
    }

    // a multi term query (e.g. phrase). return the summation,
    // scoring almost as if it were boolean query
    SimWeight[] parts = multiStats.subStats;
    SimScorer[] scorers = new SimScorer[parts.Length];
    for (int index = 0; index < scorers.Length; index++)
    {
        scorers[index] = CreateScorer(parts[index]);
    }
    return new MultiSimilarity.MultiSimScorer(scorers);
}
/// <summary>
/// Subclasses must override this method to return the name of the <see cref="Similarity"/>
/// and preferably the values of parameters (if any) as well.
/// </summary>
/// <returns> a textual description of this similarity. </returns>
public override abstract string ToString();
// ------------------------------ Norm handling ------------------------------
/// <summary>
/// Norm -> document length map, indexed by the encoded norm byte (0..255). </summary>
private static readonly float[] NORM_TABLE = LoadNormTable();

/// <summary>
/// Builds the 256-entry lookup table behind <see cref="NORM_TABLE"/>. Each entry is
/// <c>1 / (f * f)</c>, where <c>f</c> is the value that
/// <see cref="SmallSingle.SByte315ToSingle(sbyte)"/> decodes from the entry's index.
/// </summary>
private static float[] LoadNormTable() // LUCENENET: Avoid static constructors (see https://github.com/apache/lucenenet/pull/224#issuecomment-469284006)
{
    float[] table = new float[256];
    for (int code = 0; code < table.Length; code++)
    {
        float decoded = SmallSingle.SByte315ToSingle((sbyte)code);
        table[code] = 1.0f / (decoded * decoded);
    }
    return table;
}
/// <summary>
/// Encodes the document length in the same way as <see cref="TFIDFSimilarity"/>.
/// The length is the field's token count, minus the overlap tokens when
/// overlaps are discounted; it is encoded together with the field boost via
/// <see cref="EncodeNormValue(float, float)"/>.
/// </summary>
/// <param name="state"> the inversion state supplying length, overlap count and boost. </param>
/// <returns> the encoded norm. </returns>
public override long ComputeNorm(FieldInvertState state)
{
    float numTerms = discountOverlaps
        ? state.Length - state.NumOverlap
        : state.Length;

    return EncodeNormValue(state.Boost, numTerms);
}
/// <summary>
/// Decodes a normalization factor (document length) stored in an index by
/// looking it up in the precomputed norm table.
/// </summary>
/// <param name="norm"> the encoded norm byte. </param>
/// <returns> the decoded document length. </returns>
/// <seealso cref="EncodeNormValue(float, float)"/>
protected internal virtual float DecodeNormValue(byte norm)
{
    // A C# byte is unsigned (0..255), so it can index the 256-entry table directly.
    int slot = norm;
    return NORM_TABLE[slot];
}
/// <summary>
/// Encodes <c>boost / sqrt(length)</c> to a single byte via <see cref="SmallSingle"/>.
/// </summary>
/// <param name="boost"> the boost to fold into the norm. </param>
/// <param name="length"> the document length. </param>
/// <returns> the encoded byte. </returns>
protected internal virtual byte EncodeNormValue(float boost, float length)
{
    float lengthRoot = (float)Math.Sqrt(length);
    float normalized = boost / lengthRoot;
    return SmallSingle.SingleToByte315(normalized);
}
// ----------------------------- Static methods ------------------------------
/// <summary>
/// Returns the base two logarithm of <paramref name="x"/>, computed as the natural
/// logarithm of <paramref name="x"/> divided by the precomputed natural logarithm of 2.
/// </summary>
/// <param name="x"> the value whose logarithm is wanted. </param>
/// <returns> the base two logarithm of <paramref name="x"/>. </returns>
// Put this to a 'util' class if we need more of these.
public static double Log2(double x) => Math.Log(x) / LOG_2;
// --------------------------------- Classes ---------------------------------
/// <summary>
/// Scorer that forwards <see cref="Score(int, float)"/> and
/// <see cref="Explain(int, Explanation)"/> to
/// <see cref="SimilarityBase.Score(BasicStats, float, float)"/> and
/// <see cref="SimilarityBase.Explain(BasicStats, int, Explanation, float)"/>
/// of the owning similarity, supplying the document length decoded from the norms.
/// </summary>
private class BasicSimScorer : SimScorer
{
    private readonly SimilarityBase outerInstance;
    private readonly BasicStats stats;
    private readonly NumericDocValues norms;

    /// <summary>
    /// Creates a scorer bound to the given similarity, statistics and norms.
    /// </summary>
    /// <param name="outerInstance"> the similarity that performs the actual scoring. </param>
    /// <param name="stats"> the statistics passed to every scoring call. </param>
    /// <param name="norms"> the per-document norms; may be <c>null</c> when norms are omitted. </param>
    internal BasicSimScorer(SimilarityBase outerInstance, BasicStats stats, NumericDocValues norms)
    {
        this.outerInstance = outerInstance;
        this.stats = stats;
        this.norms = norms;
    }

    /// <summary>
    /// Returns the document length for <paramref name="doc"/>: the decoded norm,
    /// or <c>1</c> when no norms are available.
    /// </summary>
    private float DocumentLength(int doc)
    {
        // We have to supply something in case norms are omitted
        if (norms == null)
        {
            return 1F;
        }
        return outerInstance.DecodeNormValue((byte)norms.Get(doc));
    }

    /// <summary>
    /// Scores <paramref name="doc"/> with the owning similarity's scoring formula.
    /// </summary>
    public override float Score(int doc, float freq)
    {
        float docLen = DocumentLength(doc);
        return outerInstance.Score(stats, freq, docLen);
    }

    /// <summary>
    /// Explains the score of <paramref name="doc"/> via the owning similarity.
    /// </summary>
    public override Explanation Explain(int doc, Explanation freq)
    {
        float docLen = DocumentLength(doc);
        return outerInstance.Explain(stats, doc, freq, docLen);
    }

    /// <summary>
    /// Returns <c>1 / (distance + 1)</c>.
    /// </summary>
    public override float ComputeSlopFactor(int distance) => 1.0f / (distance + 1);

    /// <summary>
    /// Always returns <c>1</c>; the payload is not consulted.
    /// </summary>
    public override float ComputePayloadFactor(int doc, int start, int end, BytesRef payload) => 1f;
}
}
}