-
Notifications
You must be signed in to change notification settings - Fork 623
/
IBSimilarity.cs
157 lines (146 loc) · 7.25 KB
/
IBSimilarity.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
using Lucene.Net.Support;
namespace Lucene.Net.Search.Similarities
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <summary>
/// Provides a framework for the family of information-based models, as described
/// in Stéphane Clinchant and Eric Gaussier. 2010. Information-based
/// models for ad hoc IR. In Proceedings of the 33rd international ACM SIGIR
/// conference on Research and development in information retrieval (SIGIR '10).
/// ACM, New York, NY, USA, 234-241.
/// <para>The retrieval function is of the form <em>RSV(q, d) = ∑
/// -x<sup>q</sup><sub>w</sub> log Prob(X<sub>w</sub> &gt;=
/// t<sup>d</sup><sub>w</sub> | λ<sub>w</sub>)</em>, where
/// <list type="bullet">
///     <item><description><em>x<sup>q</sup><sub>w</sub></em> is the query boost;</description></item>
///     <item><description><em>X<sub>w</sub></em> is a random variable that counts the occurrences
///         of word <em>w</em>;</description></item>
///     <item><description><em>t<sup>d</sup><sub>w</sub></em> is the normalized term frequency;</description></item>
///     <item><description><em>λ<sub>w</sub></em> is a parameter.</description></item>
/// </list>
/// </para>
/// <para>The framework described in the paper has many similarities to the DFR
/// framework (see <see cref="DFRSimilarity"/>). It is possible that the two
/// Similarities will be merged at one point.</para>
/// <para>To construct an <see cref="IBSimilarity"/>, you must specify the implementations for
/// all three components of the Information-Based model.
/// <list type="table">
///     <listheader>
///         <term>Component</term>
///         <term>Implementations</term>
///     </listheader>
///     <item>
///         <term><see cref="Distribution"/>: Probabilistic distribution used to
///             model term occurrence</term>
///         <term>
///             <list type="bullet">
///                 <item><description><see cref="DistributionLL"/>: Log-logistic</description></item>
///                 <item><description><see cref="DistributionSPL"/>: Smoothed power-law</description></item>
///             </list>
///         </term>
///     </item>
///     <item>
///         <term><see cref="Lambda"/>: λ<sub>w</sub> parameter of the
///             probability distribution</term>
///         <term>
///             <list type="bullet">
///                 <item><description><see cref="LambdaDF"/>: <c>N<sub>w</sub>/N</c> or average
///                     number of documents where w occurs</description></item>
///                 <item><description><see cref="LambdaTTF"/>: <c>F<sub>w</sub>/N</c> or
///                     average number of occurrences of w in the collection</description></item>
///             </list>
///         </term>
///     </item>
///     <item>
///         <term><see cref="Normalization"/>: Term frequency normalization</term>
///         <term>Any supported DFR normalization (listed in
///             <see cref="DFRSimilarity"/>)
///         </term>
///     </item>
/// </list>
/// </para>
/// @lucene.experimental
/// </summary>
/// <seealso cref="DFRSimilarity"/>
[ExceptionToClassNameConvention]
public class IBSimilarity : SimilarityBase
{
    /// <summary>
    /// The probabilistic distribution used to model term occurrence.
    /// Assigned once in the constructor and never <c>null</c>.
    /// </summary>
    protected internal readonly Distribution m_distribution;

    /// <summary>
    /// The <em>lambda (λ<sub>w</sub>)</em> parameter.
    /// Assigned once in the constructor and never <c>null</c>.
    /// </summary>
    protected internal readonly Lambda m_lambda;

    /// <summary>
    /// The term frequency normalization.
    /// Assigned once in the constructor and never <c>null</c>.
    /// </summary>
    protected internal readonly Normalization m_normalization;

    /// <summary>
    /// Creates IBSimilarity from the three components.
    /// <para/>
    /// Note that <c>null</c> values are not allowed:
    /// if you want no normalization, instead pass
    /// <see cref="Normalization.NoNormalization"/>.
    /// </summary>
    /// <param name="distribution"> probabilistic distribution modeling term occurrence </param>
    /// <param name="lambda"> distribution's λ<sub>w</sub> parameter </param>
    /// <param name="normalization"> term frequency normalization </param>
    /// <exception cref="System.ArgumentNullException">
    /// <paramref name="distribution"/>, <paramref name="lambda"/> or
    /// <paramref name="normalization"/> is <c>null</c>.
    /// </exception>
    public IBSimilarity(Distribution distribution, Lambda lambda, Normalization normalization)
    {
        // Enforce the documented non-null contract up front, so a bad argument is
        // reported here rather than as a NullReferenceException during scoring.
        if (distribution == null)
        {
            throw new System.ArgumentNullException(nameof(distribution));
        }
        if (lambda == null)
        {
            throw new System.ArgumentNullException(nameof(lambda));
        }
        if (normalization == null)
        {
            throw new System.ArgumentNullException(nameof(normalization));
        }

        m_distribution = distribution;
        m_lambda = lambda;
        m_normalization = normalization;
    }

    /// <summary>
    /// Computes the score as the total boost multiplied by the distribution's score
    /// for the normalized term frequency and the λ<sub>w</sub> value.
    /// </summary>
    /// <param name="stats"> statistics handed to the normalization, lambda and distribution components;
    /// also supplies the total boost </param>
    /// <param name="freq"> frequency value forwarded to the normalization </param>
    /// <param name="docLen"> document length forwarded to the normalization </param>
    /// <returns> the boosted distribution score </returns>
    public override float Score(BasicStats stats, float freq, float docLen)
    {
        float tfn = m_normalization.Tfn(stats, freq, docLen);
        float lambda = m_lambda.CalculateLambda(stats);
        return stats.TotalBoost * m_distribution.Score(stats, tfn, lambda);
    }

    /// <summary>
    /// Appends the details of the score computation to <paramref name="expl"/>:
    /// the boost (only when it differs from 1), then the normalization, the lambda
    /// and finally the distribution, which is explained using the values of the
    /// normalization and lambda explanations.
    /// </summary>
    /// <param name="expl"> the explanation that receives the details </param>
    /// <param name="stats"> statistics handed to each component </param>
    /// <param name="doc"> document id; not used by this implementation </param>
    /// <param name="freq"> frequency value forwarded to the normalization </param>
    /// <param name="docLen"> document length forwarded to the normalization </param>
    protected internal override void Explain(Explanation expl, BasicStats stats, int doc, float freq, float docLen)
    {
        // A boost of exactly 1 contributes nothing to the score, so it is left out.
        if (stats.TotalBoost != 1.0f)
        {
            expl.AddDetail(new Explanation(stats.TotalBoost, "boost"));
        }

        Explanation normExpl = m_normalization.Explain(stats, freq, docLen);
        Explanation lambdaExpl = m_lambda.Explain(stats);
        expl.AddDetail(normExpl);
        expl.AddDetail(lambdaExpl);
        expl.AddDetail(m_distribution.Explain(stats, normExpl.Value, lambdaExpl.Value));
    }

    /// <summary>
    /// The name of IB methods follow the pattern
    /// <c>IB &lt;distribution&gt;-&lt;lambda&gt;&lt;normalization&gt;</c>. The name of the
    /// distribution is the same as in the original paper; for the names of lambda
    /// parameters, refer to the doc of the <see cref="Similarities.Lambda"/> classes.
    /// </summary>
    public override string ToString()
    {
        return "IB " + m_distribution.ToString() + "-" + m_lambda.ToString() + m_normalization.ToString();
    }

    /// <summary>
    /// Returns the distribution
    /// </summary>
    public virtual Distribution Distribution => m_distribution;

    /// <summary>
    /// Returns the distribution's lambda parameter
    /// </summary>
    public virtual Lambda Lambda => m_lambda;

    /// <summary>
    /// Returns the term frequency normalization
    /// </summary>
    public virtual Normalization Normalization => m_normalization;
}
}