Skip to content

Commit

Permalink
Merge pull request #377 from bento-platform/fix-cat-counts
Browse files Browse the repository at this point in the history
Fix categorical counts for public stats endpoint
  • Loading branch information
davidlougheed committed Feb 3, 2023
2 parents 72b3a9a + f23544c commit 656d1b4
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 10 deletions.
2 changes: 1 addition & 1 deletion chord_metadata_service/package.cfg
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
[package]
name = katsu
version = 2.17.0
version = 2.17.1
authors = Ksenia Zaytseva, David Lougheed, Simon Chénard, Romain Grégoire, Paul Pillot, Son Chau
36 changes: 27 additions & 9 deletions chord_metadata_service/restapi/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -302,13 +302,16 @@ def queryset_stats_for_field(queryset, field: str, add_missing=False) -> Mapping
"""
Computes counts of distinct values for a queryset.
"""

# values() restricts the table of results to this COLUMN
# annotate() creates a `total` column for the aggregation
# Count("*") aggregates results including nulls

annotated_queryset = queryset.values(field).annotate(total=Count("*"))
num_missing = 0

stats: dict[str, int] = dict()
stats: dict[str, int] = {}

for item in annotated_queryset:
key = item[field]
if key is None:
Expand Down Expand Up @@ -389,11 +392,17 @@ def get_categorical_stats(field_props: dict) -> list[BinWithValue]:
stats = stats_for_field(model, field_name, add_missing=True)

# Enforce values order from config and apply policies
labels: list[str] = field_props["config"]["enum"]
labels: Optional[list[str]] = field_props["config"].get("enum")
derived_labels: bool = labels is None

# Special case: for some fields, values are based on what's present in the
# dataset. Apply lexical sort, and exclude the "missing" value which will
# be appended at the end if it is set.
if labels is None:
# dataset (enum is null in the public JSON).
# - Here, apply lexical sort, and exclude the "missing" value which will
# be appended at the end if it is set.
# - Note that in this situation, we explicitly MUST remove rounded-down 0-counts
# (below the threshold) below, otherwise we LEAK that there are 1 <= x <= threshold
# matching entries in the DB.
if derived_labels:
labels = sorted(
[k for k in stats.keys() if k != "missing"],
key=lambda x: x.lower()
Expand All @@ -403,12 +412,21 @@ def get_categorical_stats(field_props: dict) -> list[BinWithValue]:
bins: list[BinWithValue] = []

for category in labels:
v = stats.get(category, 0)
if v and v <= threshold:
v = 0
v: int = stats.get(category, 0)

# Censor small counts by rounding them to 0
if v <= threshold:
# We cannot append 0-counts for derived labels, since that indicates
# there is a non-0 count for this label in the database - i.e., if the label is pulled
# from the values in the database, someone could otherwise learn 1 <= this field <= threshold
# given it being present at all.
if derived_labels:
continue
v = 0 # Otherwise (pre-made labels, so we aren't leaking anything), censor the small count

bins.append({"label": category, "value": v})

if stats["missing"] > 0:
if stats["missing"]:
bins.append({"label": "missing", "value": stats["missing"]})

return bins
Expand Down

0 comments on commit 656d1b4

Please sign in to comment.