src/Lucene.Net/Search/FieldCacheTermsFilter.cs

namespace Lucene.Net.Search
{
    /*
     * Licensed to the Apache Software Foundation (ASF) under one or more
     * contributor license agreements.  See the NOTICE file distributed with
     * this work for additional information regarding copyright ownership.
     * The ASF licenses this file to You under the Apache License, Version 2.0
     * (the "License"); you may not use this file except in compliance with
     * the License.  You may obtain a copy of the License at
     *
     *     http://www.apache.org/licenses/LICENSE-2.0
     *
     * Unless required by applicable law or agreed to in writing, software
     * distributed under the License is distributed on an "AS IS" BASIS,
     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     * See the License for the specific language governing permissions and
     * limitations under the License.
     */

    using AtomicReaderContext = Lucene.Net.Index.AtomicReaderContext;
    using IBits = Lucene.Net.Util.IBits;
    using BytesRef = Lucene.Net.Util.BytesRef;
    using FixedBitSet = Lucene.Net.Util.FixedBitSet;
    using SortedDocValues = Lucene.Net.Index.SortedDocValues;

    /// <summary>
    /// A <see cref="Filter"/> that only accepts documents whose single
    /// term value in the specified field is contained in the
    /// provided set of allowed terms.
    ///
    /// <para/>
    ///
    /// This is the same functionality as TermsFilter (from
    /// queries/), except this filter requires that the
    /// field contains at most a single term per document.
    /// Because of drastically different implementations, they
    /// also have different performance characteristics, as
    /// described below.
    ///
    /// <para/>
    ///
    /// The first invocation of this filter on a given field will
    /// be slower, since a <see cref="SortedDocValues"/> must be
    /// created.  Subsequent invocations using the same field
    /// will re-use this cache.  However, as with all
    /// functionality based on <see cref="IFieldCache"/>, persistent RAM
    /// is consumed to hold the cache, and is not freed until the
    /// <see cref="Index.IndexReader"/> is disposed.  In contrast, TermsFilter
    /// has no persistent RAM consumption.
    ///
    ///
    /// <para/>
    ///
    /// With each search, this filter translates the specified
    /// set of <see cref="Index.Terms"/> into a private <see cref="FixedBitSet"/> keyed by
    /// term number per unique <see cref="Index.IndexReader"/> (normally one
    /// reader per segment).  Then, during matching, the term
    /// number for each docID is retrieved from the cache and
    /// then checked for inclusion using the <see cref="FixedBitSet"/>.
    /// Since all testing is done using RAM resident data
    /// structures, performance should be very fast, most likely
    /// fast enough to not require further caching of the
    /// <see cref="DocIdSet"/> for each possible combination of terms.
    /// However, because docIDs are simply scanned linearly, an
    /// index with a great many small documents may find this
    /// linear scan too costly.
    ///
    /// <para/>
    ///
    /// In contrast, TermsFilter builds up a <see cref="FixedBitSet"/>,
    /// keyed by docID, every time it's created, by enumerating
    /// through all matching docs using <see cref="Index.DocsEnum"/> to seek
    /// and scan through each term's docID list.  While there is
    /// no linear scan of all docIDs, besides the allocation of
    /// the underlying array in the <see cref="FixedBitSet"/>, this
    /// approach requires a number of "disk seeks" in proportion
    /// to the number of terms, which can be exceptionally costly
    /// when there are cache misses in the OS's IO cache.
    ///
    /// <para/>
    ///
    /// Generally, this filter will be slower on the first
    /// invocation for a given field, but subsequent invocations,
    /// even if you change the allowed set of <see cref="Index.Terms"/>, should be
    /// faster than TermsFilter, especially as the number of
    /// <see cref="Index.Terms"/> being matched increases.  If you are matching only
    /// a very small number of terms, and those terms in turn
    /// match a very small number of documents, TermsFilter may
    /// perform faster.
    ///
    /// <para/>
    ///
    /// Which filter is best is very application dependent.
    /// </summary>
    public class FieldCacheTermsFilter : Filter
    {
        /// <summary>Name of the field whose per-document term ordinal is tested.</summary>
        private readonly string field;

        /// <summary>The allowed terms; a document matches when its term is one of these.</summary>
        private readonly BytesRef[] terms;

        /// <summary>
        /// Creates a filter that accepts documents whose term in
        /// <paramref name="field"/> is one of <paramref name="terms"/>.
        /// </summary>
        /// <param name="field">The field to read terms from.</param>
        /// <param name="terms">The allowed terms. The array reference is stored as-is (not copied).</param>
        /// <exception cref="System.ArgumentNullException"><paramref name="terms"/> is <c>null</c>.</exception>
        public FieldCacheTermsFilter(string field, params BytesRef[] terms)
        {
            // Fail fast: a null array would otherwise be stored silently and only
            // surface as a NullReferenceException later, inside GetDocIdSet.
            if (terms == null)
            {
                throw new System.ArgumentNullException(nameof(terms));
            }

            this.field = field;
            this.terms = terms;
        }

        /// <summary>
        /// Creates a filter that accepts documents whose term in
        /// <paramref name="field"/> is one of <paramref name="terms"/>.
        /// Each string is wrapped in a new <see cref="BytesRef"/>.
        /// </summary>
        /// <param name="field">The field to read terms from.</param>
        /// <param name="terms">The allowed terms, as strings.</param>
        /// <exception cref="System.ArgumentNullException"><paramref name="terms"/> is <c>null</c>.</exception>
        public FieldCacheTermsFilter(string field, params string[] terms)
        {
            // Report the offending argument by name instead of letting
            // terms.Length raise a bare NullReferenceException.
            if (terms == null)
            {
                throw new System.ArgumentNullException(nameof(terms));
            }

            this.field = field;

            BytesRef[] converted = new BytesRef[terms.Length];
            for (int i = 0; i < converted.Length; i++)
            {
                converted[i] = new BytesRef(terms[i]);
            }
            this.terms = converted;
        }

        /// <summary>
        /// The <see cref="IFieldCache"/> used by <see cref="GetDocIdSet"/> to obtain the
        /// terms index. Returns <c>Search.FieldCache.DEFAULT</c>; declared virtual so a
        /// subclass can supply a different cache.
        /// </summary>
        public virtual IFieldCache FieldCache => Search.FieldCache.DEFAULT;

        /// <summary>
        /// Builds the <see cref="DocIdSet"/> of documents in <paramref name="context"/>
        /// whose term ordinal for the field corresponds to one of the allowed terms.
        /// </summary>
        /// <param name="context">The reader context to match against.</param>
        /// <param name="acceptDocs">Passed through unchanged to the returned <c>FieldCacheDocIdSet</c>.</param>
        /// <returns>
        /// A <c>FieldCacheDocIdSet</c> whose per-document predicate looks up the document's
        /// ordinal and tests it against the bit set of allowed ordinals.
        /// </returns>
        public override DocIdSet GetDocIdSet(AtomicReaderContext context, IBits acceptDocs)
        {
            SortedDocValues fcsi = FieldCache.GetTermsIndex((context.AtomicReader), field);

            // One bit per term ordinal in this reader; a bit is set for every
            // allowed term whose lookup yields a non-negative ordinal.
            FixedBitSet bits = new FixedBitSet(fcsi.ValueCount);
            foreach (BytesRef term in terms)
            {
                int termOrd = fcsi.LookupTerm(term);
                if (termOrd >= 0)
                {
                    bits.Set(termOrd);
                }
            }

            return new FieldCacheDocIdSet(context.Reader.MaxDoc, acceptDocs, (doc) =>
            {
                int docOrd = fcsi.GetOrd(doc);

                // An ordinal of -1 marks a document with no value for the field;
                // such a document never matches.
                return docOrd != -1 && bits.Get(docOrd);
            });
        }
    }
}