Skip to content

Commit

Permalink
Merge pull request #339 from bento-platform/features/stats_list
Browse files Browse the repository at this point in the history
Features/stats list
  • Loading branch information
ppillot committed Sep 23, 2022
2 parents ff172dc + 827405c commit 31cb9c8
Show file tree
Hide file tree
Showing 6 changed files with 165 additions and 32 deletions.
28 changes: 23 additions & 5 deletions chord_metadata_service/patients/api_views.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@
from chord_metadata_service.restapi.pagination import LargeResultsSetPagination, BatchResultsSetPagination
from chord_metadata_service.restapi.utils import (
get_field_options,
filter_queryset_field_value
filter_queryset_field_value,
get_queryset_stats
)
from chord_metadata_service.restapi.negociation import FormatInPostContentNegotiation

Expand Down Expand Up @@ -139,8 +140,25 @@ def get(self, request, *args, **kwargs):
*(e.error_list if hasattr(e, "error_list") else e.error_dict.items()),
))

if filtered_qs.count() > settings.CONFIG_PUBLIC["rules"]["count_threshold"]:
return Response({"count": filtered_qs.count()})
else:
# the count < threshold when there is no match in db the queryset is empty, count = 0
if filtered_qs.count() <= settings.CONFIG_PUBLIC["rules"]["count_threshold"]:
return Response(settings.INSUFFICIENT_DATA_AVAILABLE)

tissues_count, sampled_tissues = get_queryset_stats(
filtered_qs,
"phenopackets__biosamples__sampled_tissue__label"
)
experiments_count, experiment_type = get_queryset_stats(
filtered_qs,
"phenopackets__biosamples__experiment__experiment_type"
)
return Response({
"count": filtered_qs.count(),
"biosamples": {
"count": tissues_count,
"sampled_tissue": sampled_tissues
},
"experiments": {
"count": experiments_count,
"experiment_type": experiment_type
}
})
4 changes: 4 additions & 0 deletions chord_metadata_service/patients/tests/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -288,6 +288,10 @@ def test_public_get(self):
self.assertEqual(response_obj, settings.INSUFFICIENT_DATA_AVAILABLE)
else:
self.assertEqual(Individual.objects.all().count(), response_obj['count'])
self.assertEqual(response_obj['biosamples']['count'], 0)
self.assertIsInstance(response_obj['biosamples']['sampled_tissue'], list)
self.assertEqual(response_obj['experiments']['count'], 0)
self.assertIsInstance(response_obj['experiments']['experiment_type'], list)

@override_settings(CONFIG_PUBLIC={})
def test_public_get_no_config(self):
Expand Down
67 changes: 55 additions & 12 deletions chord_metadata_service/restapi/api_views.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,11 @@
from rest_framework.decorators import api_view, permission_classes

from chord_metadata_service.restapi.utils import (
get_age_numeric_binned,
get_field_options,
parse_individual_age,
stats_for_field,
compute_binned_ages,
get_field_bins,
queryset_stats_for_field,
get_categorical_stats,
get_date_stats,
get_range_stats
Expand Down Expand Up @@ -71,16 +71,7 @@ def overview(_request):
diseases_stats = stats_for_field(pheno_models.Phenopacket, "diseases__term__label")
diseases_count = len(diseases_stats)

# age_numeric is computed at ingestion time of phenopackets. On some instances
# it might be unavailable and as a fallback must be computed from the age JSON field which
# has two alternate formats (hence more complex and slower to process)
individuals_age = get_field_bins(patients_models.Individual, "age_numeric", OVERVIEW_AGE_BIN_SIZE)
if None in individuals_age: # fallback
del individuals_age[None]
individuals_age = Counter(individuals_age)
individuals_age.update(
compute_binned_ages(OVERVIEW_AGE_BIN_SIZE) # single update instead of creating iterables in a loop
)
individuals_age = get_age_numeric_binned(patients_models.Individual.objects.all(), OVERVIEW_AGE_BIN_SIZE)

r = {
"phenopackets": phenopackets_count,
Expand Down Expand Up @@ -138,6 +129,58 @@ def overview(_request):
return Response(r)


@api_view(["GET", "POST"])
@permission_classes([OverrideOrSuperUserOnly])
def search_overview(request):
    """
    get+post:
    Overview statistics of a list of patients (associated with a search result)
    - Parameter
      - id: a list of patient ids (GET: repeated `id` query parameter,
        POST: `id` key of the request body). When no id is given, the
        statistics are computed over all the individuals.
    """
    if request.method == "GET":
        raw_ids = request.GET.getlist("id")
    else:
        payload = request.data
        # Form-encoded bodies are parsed into a QueryDict, whose get() only
        # returns the last value of a repeated key; getlist() keeps them all.
        raw_ids = payload.getlist("id") if hasattr(payload, "getlist") else payload.get("id", [])

    # Normalise to a list: a lone string would otherwise be iterated character
    # by character by the `id__in` lookup, and None does not support len().
    if raw_ids is None:
        individual_id = []
    elif isinstance(raw_ids, (list, tuple, set)):
        individual_id = list(raw_ids)
    else:
        individual_id = [raw_ids]

    queryset = patients_models.Individual.objects.all()
    if individual_id:
        queryset = queryset.filter(id__in=individual_id)

    # NOTE(review): these counts are taken over the rows produced by following
    # the relations from Individual, not over distinct ids. Confirm that
    # individuals without any biosample/experiment and rows repeated through
    # several phenopackets are counted as intended.
    biosamples_count = queryset.values("phenopackets__biosamples__id").count()
    experiments_count = queryset.values("phenopackets__biosamples__experiment__id").count()

    # Sex related fields stats are precomputed here and post processed later
    # to include missing values inferred from the schema
    individuals_sex = queryset_stats_for_field(queryset, "sex")

    r = {
        "biosamples": {
            "count": biosamples_count,
            "sampled_tissue": queryset_stats_for_field(queryset, "phenopackets__biosamples__sampled_tissue__label"),
            "histological_diagnosis": queryset_stats_for_field(
                queryset,
                "phenopackets__biosamples__histological_diagnosis__label"
            ),
        },
        "diseases": {
            "term": queryset_stats_for_field(queryset, "phenopackets__diseases__term__label"),
        },
        "individuals": {
            # every value declared in the SEX choices gets an entry, 0 when absent from the results
            "sex": {k: individuals_sex.get(k, 0) for k in (s[0] for s in pheno_models.Individual.SEX)},
            "age": get_age_numeric_binned(queryset, OVERVIEW_AGE_BIN_SIZE),
        },
        "phenotypic_features": {
            "type": queryset_stats_for_field(queryset, "phenopackets__phenotypic_features__pftype__label")
        },
        "experiments": {
            "count": experiments_count,
            "experiment_type": queryset_stats_for_field(
                queryset,
                "phenopackets__biosamples__experiment__experiment_type"
            ),
        },
    }
    return Response(r)


# Cache page for the requested url for 2 hours
@cache_page(60 * 60 * 2)
@api_view(["GET"])
Expand Down
12 changes: 12 additions & 0 deletions chord_metadata_service/restapi/tests/test_api.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import json
import os
from copy import deepcopy

Expand Down Expand Up @@ -103,6 +104,17 @@ def test_overview(self):
self.assertEqual(response_obj['data_type_specific']['instruments']['platform']['Illumina'], 2)
self.assertEqual(response_obj['data_type_specific']['instruments']['model']['Illumina HiSeq 4000'], 2)

def test_search_overview(self):
    """POST one individual id to the search overview endpoint and check the returned statistics."""
    payload = json.dumps({'id': [ph_c.VALID_INDIVIDUAL_1['id']]})
    response = self.client.post(reverse('search-overview'), payload, content_type='application/json')
    # Check the status before decoding the body, so that an error response is
    # reported as a status mismatch rather than as a JSON decoding failure.
    self.assertEqual(response.status_code, status.HTTP_200_OK)
    response_obj = response.json()
    self.assertIsInstance(response_obj, dict)
    self.assertEqual(response_obj['biosamples']['count'], 1)
    self.assertIn('wall of urinary bladder', response_obj['biosamples']['sampled_tissue'])
    self.assertIn('Proptosis', response_obj['phenotypic_features']['type'])
    self.assertIn(ph_c.VALID_DISEASE_1['term']['label'], response_obj['diseases']['term'])


class McodeOverviewTest(APITestCase):
# create 2 mcodepackets for 2 individuals
Expand Down
14 changes: 8 additions & 6 deletions chord_metadata_service/restapi/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
public_search_fields,
public_overview,
public_dataset,
search_overview
)
from chord_metadata_service.restapi.routers import BatchListRouter

Expand Down Expand Up @@ -72,30 +73,31 @@
urlpatterns = [
path('', include(router.urls)),
path('', include(batch_router.urls)),

# apps schemas
path('chord_phenopacket_schema', phenopacket_views.get_chord_phenopacket_schema,
name="chord-phenopacket-schema"),
path('experiment_schema', experiment_views.get_experiment_schema,
name="experiment-schema"),
path('mcode_schema', mcode_views.get_mcode_schema,
name="mcode-schema"),
# overview

# overviews (statistics)
path('overview', overview, name="overview"),
# mcode overview
path('mcode_overview', mcode_overview, name="mcode-overview"),
path('search_overview', search_overview, name="search-overview"),

# autocomplete URLs
path('disease_term_autocomplete', DiseaseTermAutocomplete.as_view(), name='disease-term-autocomplete',),
path('phenotypic_feature_type_autocomplete', PhenotypicFeatureTypeAutocomplete.as_view(),
name='phenotypic-feature-type-autocomplete',),
path('biosample_sampled_tissue_autocomplete', BiosampleSampledTissueAutocomplete.as_view(),
name='biosample-sampled-tissue-autocomplete',),
# public search results

# public endpoints (no confidential information leak)
path('public', individual_views.PublicListIndividuals.as_view(),
name='public',),
# public search fields schema
path('public_search_fields', public_search_fields, name='public-search-fields',),
# public overview
path('public_overview', public_overview, name='public-overview',),
# public dataset properties
path('public_dataset', public_dataset, name='public-dataset'),
]
72 changes: 63 additions & 9 deletions chord_metadata_service/restapi/utils.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import isodate
import datetime

from collections import defaultdict
from collections import defaultdict, Counter
from typing import Tuple, Mapping
from calendar import month_abbr

Expand All @@ -13,6 +13,9 @@
from chord_metadata_service.experiments import models as experiments_models


OVERVIEW_AGE_BIN_SIZE = 10


def camel_case_field_names(string):
""" Function to convert snake_case field names to camelCase """
# Capitalize every part except the first
Expand Down Expand Up @@ -193,13 +196,21 @@ def stats_for_field(model, field: str, add_missing=False) -> Mapping[str, int]:
Computes counts of distinct values for a given field. Mainly applicable to
char fields representing categories
"""
queryset = model.objects.all()
return queryset_stats_for_field(queryset, field, add_missing)


def queryset_stats_for_field(queryset, field: str, add_missing=False) -> Mapping[str, int]:
"""
Computes counts of distinct values for a queryset.
"""
# values() restrict the table of results to this COLUMN
# annotate() creates a `total` column for the aggregation
# Count() aggregates the results by performing a GROUP BY on the field
query_set = model.objects.all().values(field).annotate(total=Count(field))
queryset = queryset.values(field).annotate(total=Count(field))

stats: Mapping[str, int] = dict()
for item in query_set:
for item in queryset:
key = item[field]
if key is None:
continue
Expand All @@ -212,17 +223,17 @@ def stats_for_field(model, field: str, add_missing=False) -> Mapping[str, int]:

if add_missing:
isnull_filter = {f"{field}__isnull": True}
stats['missing'] = model.objects.all().values(field).filter(**isnull_filter).count()
stats['missing'] = queryset.values(field).filter(**isnull_filter).count()

return stats


def get_field_bins(model, field, bin_size):
def get_field_bins(query_set, field, bin_size):
# computes a new column "binned" by subtracting the value modulo the bin size
# from the value which requires binning (e.g. 28 => 28 - 28 % 10 = 20)
# cast to integer to avoid numbers such as 60.00 if that was a decimal,
# and aggregate over this value.
query_set = model.objects.all().annotate(
query_set = query_set.annotate(
binned=Cast(
F(field) - Func(F(field), bin_size, function="MOD"),
IntegerField()
Expand All @@ -232,13 +243,17 @@ def get_field_bins(model, field, bin_size):
return stats


def compute_binned_ages(bin_size: int):
def compute_binned_ages(individual_queryset, bin_size: int):
"""
When age_numeric field is not available, use this function to process
the age field in its various formats.
Returns an array of values floored to the closest decade (e.g. 25 --> 20)
Params:
- individual_queryset: a queryset made on the individual model, containing
the age and age_numeric fields
- bin_size: how many years there are per bin
Returns a list of values floored to the closest decade (e.g. 25 --> 20)
"""
a = pheno_models.Individual.objects.filter(age_numeric__isnull=True).values('age')
a = individual_queryset.filter(age_numeric__isnull=True).values('age')
binned_ages = []
for r in a.iterator(): # reduce memory footprint (no caching)
if r["age"] is None:
Expand All @@ -248,6 +263,45 @@ def compute_binned_ages(bin_size: int):
return binned_ages


def get_age_numeric_binned(individual_queryset, bin_size):
    """
    Returns the binned age statistics of a queryset of individuals.

    The bins are primarily built from age_numeric, which is computed at
    ingestion time of phenopackets. On some instances that field might be
    unavailable; the individuals concerned are then binned from the age JSON
    field, which has two alternate formats (hence more complex and slower to
    process).
    Params:
        - individual_queryset: a queryset made on the individual model
        - bin_size: width of a bin, forwarded to the binning helpers
    """
    binned = get_field_bins(individual_queryset, "age_numeric", bin_size)
    if None in binned:
        # Fallback: drop the bin of individuals lacking age_numeric and count
        # them again from the values derived from their age field.
        del binned[None]
        binned = Counter(binned)
        # single update instead of creating iterables in a loop
        binned.update(compute_binned_ages(individual_queryset, bin_size))
    return binned


def get_queryset_stats(queryset, field):
    """
    Computes the public statistics of a field for an already filtered queryset.

    Every per-value count which does not exceed the configured
    `count_threshold` is reported as 0, and so is the total when it does not
    exceed the threshold either, to avoid leaking too small sets of results.
    Returns a tuple (total, bins) where bins is a list of
    {"label": ..., "value": ...} dicts, one per value of the field (the
    `missing` entry included).
    """
    counts = queryset_stats_for_field(queryset, field, add_missing=True)
    cutoff = settings.CONFIG_PUBLIC["rules"]["count_threshold"]

    bins = [
        {"label": label, "value": count if count > cutoff else 0}
        for label, count in counts.items()
    ]

    # NOTE(review): the total is the sum of the raw counts, including those
    # censored to 0 in the bins above - confirm this is the intended policy.
    total = sum(counts.values())
    return (total if total > cutoff else 0), bins


def get_categorical_stats(field_props):
"""
Fetches statistics for a given categorical field and apply privacy policies
Expand Down

0 comments on commit 31cb9c8

Please sign in to comment.