Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

LUCENE-10444: Support alternate aggregation functions in association facets #718

Merged
merged 10 commits into from
Apr 6, 2022
2 changes: 2 additions & 0 deletions lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,8 @@ New Features
* LUCENE-10237: Add MergeOnFlushMergePolicy to sandbox.
(Michael Froh, Anand Kotriwal)

* LUCENE-10444: Support alternate aggregation functions in association facets. (Greg Miller)

Improvements
---------------------

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,11 @@
import org.apache.lucene.facet.Facets;
import org.apache.lucene.facet.FacetsCollector;
import org.apache.lucene.facet.FacetsConfig;
import org.apache.lucene.facet.taxonomy.AssociationAggregationFunction;
import org.apache.lucene.facet.taxonomy.FloatAssociationFacetField;
import org.apache.lucene.facet.taxonomy.IntAssociationFacetField;
import org.apache.lucene.facet.taxonomy.TaxonomyFacetSumFloatAssociations;
import org.apache.lucene.facet.taxonomy.TaxonomyFacetSumIntAssociations;
import org.apache.lucene.facet.taxonomy.TaxonomyFacetFloatAssociations;
import org.apache.lucene.facet.taxonomy.TaxonomyFacetIntAssociations;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter;
Expand Down Expand Up @@ -102,8 +103,12 @@ private List<FacetResult> sumAssociations() throws IOException {
// you'd use a "normal" query:
FacetsCollector.search(searcher, new MatchAllDocsQuery(), 10, fc);

Facets tags = new TaxonomyFacetSumIntAssociations("$tags", taxoReader, config, fc);
Facets genre = new TaxonomyFacetSumFloatAssociations("$genre", taxoReader, config, fc);
Facets tags =
new TaxonomyFacetIntAssociations(
"$tags", taxoReader, config, fc, AssociationAggregationFunction.SUM);
Facets genre =
new TaxonomyFacetFloatAssociations(
"$genre", taxoReader, config, fc, AssociationAggregationFunction.SUM);

// Retrieve results
List<FacetResult> results = new ArrayList<>();
Expand Down Expand Up @@ -132,7 +137,9 @@ private FacetResult drillDown() throws IOException {
FacetsCollector.search(searcher, q, 10, fc);

// Retrieve results
Facets facets = new TaxonomyFacetSumFloatAssociations("$genre", taxoReader, config, fc);
Facets facets =
new TaxonomyFacetFloatAssociations(
"$genre", taxoReader, config, fc, AssociationAggregationFunction.SUM);
FacetResult result = facets.getTopChildren(10, "genre");

indexReader.close();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,8 @@
import org.apache.lucene.facet.Facets;
import org.apache.lucene.facet.FacetsCollector;
import org.apache.lucene.facet.FacetsConfig;
import org.apache.lucene.facet.taxonomy.TaxonomyFacetSumValueSource;
import org.apache.lucene.facet.taxonomy.AssociationAggregationFunction;
import org.apache.lucene.facet.taxonomy.TaxonomyFacetFloatAssociations;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter;
Expand Down Expand Up @@ -105,8 +106,12 @@ private FacetResult search() throws IOException, ParseException {

// Retrieve results
Facets facets =
new TaxonomyFacetSumValueSource(
taxoReader, config, fc, expr.getDoubleValuesSource(bindings));
new TaxonomyFacetFloatAssociations(
taxoReader,
config,
fc,
AssociationAggregationFunction.SUM,
expr.getDoubleValuesSource(bindings));
FacetResult result = facets.getTopChildren(10, "A");

indexReader.close();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,8 @@
* <li>Taxonomy-based methods rely on a separate taxonomy index to map hierarchical facet paths to
* global int ordinals for fast counting at search time; these methods can compute counts
* ({@link org.apache.lucene.facet.taxonomy.FastTaxonomyFacetCounts}) or aggregate long or
* double values {@link org.apache.lucene.facet.taxonomy.TaxonomyFacetSumIntAssociations},
* {@link org.apache.lucene.facet.taxonomy.TaxonomyFacetSumFloatAssociations}, {@link
* org.apache.lucene.facet.taxonomy.TaxonomyFacetSumValueSource}. Add {@link
* double values {@link org.apache.lucene.facet.taxonomy.TaxonomyFacetIntAssociations}, {@link
* org.apache.lucene.facet.taxonomy.TaxonomyFacetFloatAssociations}. Add {@link
* org.apache.lucene.facet.FacetField} or {@link
* org.apache.lucene.facet.taxonomy.AssociationFacetField} to your documents at index time to
* use taxonomy-based methods.
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.facet.taxonomy;

/**
 * Strategy for combining two association values into one, used by {@link
 * TaxonomyFacetIntAssociations} and {@link TaxonomyFacetFloatAssociations}.
 *
 * <p>Each implementation supplies the same operation twice: once for {@code int} values and once
 * for {@code float} values. Two ready-made strategies are exposed as constants: {@link #MAX} and
 * {@link #SUM}.
 */
public abstract class AssociationAggregationFunction {

  /** Aggregation that computes the maximum value */
  public static final AssociationAggregationFunction MAX = new MaxAggregation();

  /** Aggregation that computes the sum */
  public static final AssociationAggregationFunction SUM = new SumAggregation();

  /** Sole constructor. */
  protected AssociationAggregationFunction() {}

  /**
   * Combine two integer values.
   *
   * @param existingVal the first operand (the previously held value)
   * @param newVal the second operand (the value being folded in)
   * @return the combined value
   */
  public abstract int aggregate(int existingVal, int newVal);

  /**
   * Combine two float values.
   *
   * @param existingVal the first operand (the previously held value)
   * @param newVal the second operand (the value being folded in)
   * @return the combined value
   */
  public abstract float aggregate(float existingVal, float newVal);

  /** Keeps the larger of the two operands. */
  private static final class MaxAggregation extends AssociationAggregationFunction {
    @Override
    public int aggregate(int existingVal, int newVal) {
      return existingVal < newVal ? newVal : existingVal;
    }

    @Override
    public float aggregate(float existingVal, float newVal) {
      // Math.max is kept deliberately: it defines the result for NaN and for signed zeros,
      // which a plain comparison would not reproduce.
      return Math.max(existingVal, newVal);
    }
  }

  /** Adds the two operands using plain primitive addition (no overflow check). */
  private static final class SumAggregation extends AssociationAggregationFunction {
    @Override
    public int aggregate(int existingVal, int newVal) {
      final int total = existingVal + newVal;
      return total;
    }

    @Override
    public float aggregate(float existingVal, float newVal) {
      final float total = existingVal + newVal;
      return total;
    }
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ public FastTaxonomyFacetCounts(TaxonomyReader taxoReader, FacetsConfig config, F
public FastTaxonomyFacetCounts(
String indexFieldName, TaxonomyReader taxoReader, FacetsConfig config, FacetsCollector fc)
throws IOException {
super(indexFieldName, taxoReader, config, fc);
super(indexFieldName, taxoReader, config, AssociationAggregationFunction.SUM, fc);
count(fc.getMatchingDocs());
}

Expand All @@ -65,7 +65,7 @@ public FastTaxonomyFacetCounts(
public FastTaxonomyFacetCounts(
String indexFieldName, IndexReader reader, TaxonomyReader taxoReader, FacetsConfig config)
throws IOException {
super(indexFieldName, taxoReader, config, null);
super(indexFieldName, taxoReader, config, AssociationAggregationFunction.SUM, null);
countAll(reader);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@

/**
* Add an instance of this to your {@link Document} to add a facet label associated with a float.
* Use {@link TaxonomyFacetSumFloatAssociations} to aggregate float values per facet label at search
* Use {@link TaxonomyFacetFloatAssociations} to aggregate float values per facet label at search
* time.
*
* @lucene.experimental
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,13 +29,21 @@ abstract class FloatTaxonomyFacets extends TaxonomyFacets {

// TODO: also use native hash map for sparse collection, like IntTaxonomyFacets

/** Aggregation function used for combining values. */
final AssociationAggregationFunction aggregationFunction;

/** Per-ordinal value. */
final float[] values;

/** Sole constructor. */
FloatTaxonomyFacets(String indexFieldName, TaxonomyReader taxoReader, FacetsConfig config)
FloatTaxonomyFacets(
String indexFieldName,
TaxonomyReader taxoReader,
AssociationAggregationFunction aggregationFunction,
FacetsConfig config)
throws IOException {
super(indexFieldName, taxoReader, config);
this.aggregationFunction = aggregationFunction;
values = new float[taxoReader.getSize()];
}

Expand All @@ -49,22 +57,24 @@ void rollup() throws IOException {
if (ft.hierarchical && ft.multiValued == false) {
int dimRootOrd = taxoReader.getOrdinal(new FacetLabel(dim));
assert dimRootOrd > 0;
values[dimRootOrd] += rollup(children[dimRootOrd]);
float newValue =
aggregationFunction.aggregate(values[dimRootOrd], rollup(children[dimRootOrd]));
gsmiller marked this conversation as resolved.
Show resolved Hide resolved
values[dimRootOrd] = newValue;
}
}
}

private float rollup(int ord) throws IOException {
int[] children = getChildren();
int[] siblings = getSiblings();
float sum = 0;
float aggregationValue = 0f;
while (ord != TaxonomyReader.INVALID_ORDINAL) {
float childValue = values[ord] + rollup(children[ord]);
float childValue = aggregationFunction.aggregate(values[ord], rollup(children[ord]));
values[ord] = childValue;
sum += childValue;
aggregationValue = aggregationFunction.aggregate(aggregationValue, childValue);
ord = siblings[ord];
}
return sum;
return aggregationValue;
}

@Override
Expand Down Expand Up @@ -106,13 +116,13 @@ public FacetResult getTopChildren(int topN, String dim, String... path) throws I
int[] siblings = getSiblings();

int ord = children[dimOrd];
float sumValues = 0;
float aggregatedValue = 0;
int childCount = 0;

TopOrdAndFloatQueue.OrdAndValue reuse = null;
while (ord != TaxonomyReader.INVALID_ORDINAL) {
if (values[ord] > 0) {
sumValues += values[ord];
aggregatedValue = aggregationFunction.aggregate(aggregatedValue, values[ord]);
childCount++;
if (values[ord] > bottomValue) {
if (reuse == null) {
Expand All @@ -130,16 +140,16 @@ public FacetResult getTopChildren(int topN, String dim, String... path) throws I
ord = siblings[ord];
}

if (sumValues == 0) {
if (aggregatedValue == 0) {
return null;
}

if (dimConfig.multiValued) {
if (dimConfig.requireDimCount) {
sumValues = values[dimOrd];
aggregatedValue = values[dimOrd];
} else {
// Our sum'd count is not correct, in general:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

our "aggregated" count?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's not necessarily a "count" though here right? It's an aggregated weight associated with the value.

sumValues = -1;
aggregatedValue = -1;
}
} else {
// Our sum'd dim count is accurate, so we keep it
Expand All @@ -160,6 +170,6 @@ public FacetResult getTopChildren(int topN, String dim, String... path) throws I
labelValues[i] = new LabelAndValue(bulkPath[i].components[cp.length], values[i]);
}

return new FacetResult(dim, path, sumValues, labelValues, childCount);
return new FacetResult(dim, path, aggregatedValue, labelValues, childCount);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@

/**
* Add an instance of this to your {@link Document} to add a facet label associated with an int. Use
* {@link TaxonomyFacetSumIntAssociations} to aggregate int values per facet label at search time.
* {@link TaxonomyFacetIntAssociations} to aggregate int values per facet label at search time.
*
* @lucene.experimental
*/
Expand Down
Loading