Merge pull request #377 from bento-platform/fix-cat-counts

Fix categorical counts for public stats endpoint
bento-platform · Feb 3, 2023 · 656d1b4 · 656d1b4
2 parents 72b3a9a + f23544c
commit 656d1b4
Show file tree

Hide file tree

Showing 2 changed files with 28 additions and 10 deletions.
diff --git a/chord_metadata_service/package.cfg b/chord_metadata_service/package.cfg
@@ -1,4 +1,4 @@
 [package]
 name = katsu
-version = 2.17.0
+version = 2.17.1
 authors = Ksenia Zaytseva, David Lougheed, Simon Chénard, Romain Grégoire, Paul Pillot, Son Chau
diff --git a/chord_metadata_service/restapi/utils.py b/chord_metadata_service/restapi/utils.py
@@ -302,13 +302,16 @@ def queryset_stats_for_field(queryset, field: str, add_missing=False) -> Mapping
     """
     Computes counts of distinct values for a queryset.
     """
+
     # values() restricts the table of results to this COLUMN
     # annotate() creates a `total` column for the aggregation
     # Count("*") aggregates results including nulls
+
     annotated_queryset = queryset.values(field).annotate(total=Count("*"))
     num_missing = 0
 
-    stats: dict[str, int] = dict()
+    stats: dict[str, int] = {}
+
     for item in annotated_queryset:
         key = item[field]
         if key is None:
@@ -389,11 +392,17 @@ def get_categorical_stats(field_props: dict) -> list[BinWithValue]:
     stats = stats_for_field(model, field_name, add_missing=True)
 
     # Enforce values order from config and apply policies
-    labels: list[str] = field_props["config"]["enum"]
+    labels: Optional[list[str]] = field_props["config"].get("enum")
+    derived_labels: bool = labels is None
+
     # Special case: for some fields, values are based on what's present in the
-    # dataset. Apply lexical sort, and exclude the "missing" value which will
-    # be appended at the end if it is set.
-    if labels is None:
+    # dataset (enum is null in the public JSON).
+    # - Here, apply lexical sort, and exclude the "missing" value which will
+    #   be appended at the end if it is set.
+    # - Note that in this situation, we explicitly MUST remove rounded-down 0-counts
+    #   (below the threshold) below, otherwise we LEAK that there are 1 <= x <= threshold
+    #   matching entries in the DB.
+    if derived_labels:
         labels = sorted(
             [k for k in stats.keys() if k != "missing"],
             key=lambda x: x.lower()
@@ -403,12 +412,21 @@ def get_categorical_stats(field_props: dict) -> list[BinWithValue]:
     bins: list[BinWithValue] = []
 
     for category in labels:
-        v = stats.get(category, 0)
-        if v and v <= threshold:
-            v = 0
+        v: int = stats.get(category, 0)
+
+        # Censor small counts by rounding them to 0
+        if v <= threshold:
+            # We cannot append 0-counts for derived labels, since that indicates
+            # there is a non-0 count for this label in the database - i.e., if the label is pulled
+            # from the values in the database, someone could otherwise learn 1 <= this field <= threshold
+            # given it being present at all.
+            if derived_labels:
+                continue
+            v = 0  # Otherwise (pre-made labels, so we aren't leaking anything), censor the small count
+
         bins.append({"label": category, "value": v})
 
-    if stats["missing"] > 0:
+    if stats["missing"]:
         bins.append({"label": "missing", "value": stats["missing"]})
 
     return bins