-
Notifications
You must be signed in to change notification settings - Fork 623
/
FieldCacheTermsFilter.cs
145 lines (139 loc) · 5.86 KB
/
FieldCacheTermsFilter.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
namespace Lucene.Net.Search
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
using AtomicReaderContext = Lucene.Net.Index.AtomicReaderContext;
using IBits = Lucene.Net.Util.IBits;
using BytesRef = Lucene.Net.Util.BytesRef;
using FixedBitSet = Lucene.Net.Util.FixedBitSet;
using SortedDocValues = Lucene.Net.Index.SortedDocValues;
/// <summary>
/// A <see cref="Filter"/> that only accepts documents whose single
/// term value in the specified field is contained in the
/// provided set of allowed terms.
///
/// <para/>
///
/// This is the same functionality as TermsFilter (from
/// queries/), except this filter requires that the
/// field contains only a single term for all documents.
/// Because of drastically different implementations, they
/// also have different performance characteristics, as
/// described below.
///
/// <para/>
///
/// The first invocation of this filter on a given field will
/// be slower, since a <see cref="SortedDocValues"/> must be
/// created. Subsequent invocations using the same field
/// will re-use this cache. However, as with all
/// functionality based on <see cref="IFieldCache"/>, persistent RAM
/// is consumed to hold the cache, and is not freed until the
/// <see cref="Index.IndexReader"/> is disposed. In contrast, TermsFilter
/// has no persistent RAM consumption.
///
///
/// <para/>
///
/// With each search, this filter translates the specified
/// set of <see cref="Index.Terms"/> into a private <see cref="FixedBitSet"/> keyed by
/// term number per unique <see cref="Index.IndexReader"/> (normally one
/// reader per segment). Then, during matching, the term
/// number for each docID is retrieved from the cache and
/// then checked for inclusion using the <see cref="FixedBitSet"/>.
/// Since all testing is done using RAM resident data
/// structures, performance should be very fast, most likely
/// fast enough to not require further caching of the
/// <see cref="DocIdSet"/> for each possible combination of terms.
/// However, because docIDs are simply scanned linearly, an
/// index with a great many small documents may find this
/// linear scan too costly.
///
/// <para/>
///
/// In contrast, TermsFilter builds up a <see cref="FixedBitSet"/>,
/// keyed by docID, every time it's created, by enumerating
/// through all matching docs using <see cref="Index.DocsEnum"/> to seek
/// and scan through each term's docID list. While there is
/// no linear scan of all docIDs, besides the allocation of
/// the underlying array in the <see cref="FixedBitSet"/>, this
/// approach requires a number of "disk seeks" in proportion
/// to the number of terms, which can be exceptionally costly
/// when there are cache misses in the OS's IO cache.
///
/// <para/>
///
/// Generally, this filter will be slower on the first
/// invocation for a given field, but subsequent invocations,
/// even if you change the allowed set of <see cref="Index.Terms"/>, should be
/// faster than TermsFilter, especially as the number of
/// <see cref="Index.Terms"/> being matched increases. If you are matching only
/// a very small number of terms, and those terms in turn
/// match a very small number of documents, TermsFilter may
/// perform faster.
///
/// <para/>
///
/// Which filter is best is very application dependent.
/// </summary>
public class FieldCacheTermsFilter : Filter
{
    /// <summary>Name of the field passed to the field cache when building the terms index.</summary>
    private readonly string field;

    /// <summary>The allowed terms; a document matches when its term ordinal maps to one of these.</summary>
    private readonly BytesRef[] terms;

    /// <summary>
    /// Creates a filter over <paramref name="field"/> that accepts the given terms.
    /// The supplied array is stored by reference (no copy is made).
    /// </summary>
    /// <param name="field">Field whose terms index is consulted.</param>
    /// <param name="terms">Allowed terms.</param>
    public FieldCacheTermsFilter(string field, params BytesRef[] terms)
    {
        this.terms = terms;
        this.field = field;
    }

    /// <summary>
    /// Creates a filter over <paramref name="field"/> that accepts the given terms,
    /// wrapping each string in a new <see cref="BytesRef"/>.
    /// </summary>
    /// <param name="field">Field whose terms index is consulted.</param>
    /// <param name="terms">Allowed terms, as strings.</param>
    public FieldCacheTermsFilter(string field, params string[] terms)
    {
        this.field = field;

        int count = terms.Length;
        BytesRef[] converted = new BytesRef[count];
        for (int index = 0; index < count; index++)
        {
            converted[index] = new BytesRef(terms[index]);
        }
        this.terms = converted;
    }

    /// <summary>
    /// The <see cref="IFieldCache"/> used to obtain the terms index.
    /// Returns <c>Search.FieldCache.DEFAULT</c>; declared virtual so it can be overridden.
    /// </summary>
    public virtual IFieldCache FieldCache
    {
        get { return Search.FieldCache.DEFAULT; }
    }

    /// <summary>
    /// Builds the <see cref="DocIdSet"/> for one reader context: the allowed terms are
    /// looked up in the field's terms index and their ordinals recorded in a
    /// <see cref="FixedBitSet"/>; a document then matches when its ordinal is set there.
    /// </summary>
    /// <param name="context">Reader context whose terms index and <c>MaxDoc</c> are used.</param>
    /// <param name="acceptDocs">Passed through unchanged to the resulting <see cref="FieldCacheDocIdSet"/>.</param>
    /// <returns>A <see cref="FieldCacheDocIdSet"/> whose per-document test checks the ordinal bit set.</returns>
    public override DocIdSet GetDocIdSet(AtomicReaderContext context, IBits acceptDocs)
    {
        SortedDocValues termsIndex = FieldCache.GetTermsIndex((context.AtomicReader), field);

        // One bit per ordinal in the terms index; set bits mark the allowed terms.
        FixedBitSet allowedOrds = new FixedBitSet(termsIndex.ValueCount);
        foreach (BytesRef term in terms)
        {
            int termOrd = termsIndex.LookupTerm(term);
            if (termOrd < 0)
            {
                // Negative lookup result: nothing to mark for this term.
                continue;
            }
            allowedOrds.Set(termOrd);
        }

        return new FieldCacheDocIdSet(context.Reader.MaxDoc, acceptDocs, doc =>
        {
            int docOrd = termsIndex.GetOrd(doc);
            // An ordinal of -1 means the document has no value (missing), so it never matches.
            return docOrd != -1 && allowedOrds.Get(docOrd);
        });
    }
}
}