Skip to content

Commit

Permalink
Merge pull request #352 from bento-platform/bugfix/bento-public-missing
Browse files Browse the repository at this point in the history
Bugfix/bento public missing
  • Loading branch information
gsfk committed Nov 9, 2022
2 parents 545d99e + 4dbd8b1 commit 4538f23
Show file tree
Hide file tree
Showing 3 changed files with 109 additions and 38 deletions.
16 changes: 6 additions & 10 deletions chord_metadata_service/patients/api_views.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,8 @@
from chord_metadata_service.restapi.utils import (
get_field_options,
filter_queryset_field_value,
get_queryset_stats
biosample_tissue_stats,
experiment_type_stats
)
from chord_metadata_service.restapi.negociation import FormatInPostContentNegotiation
from drf_spectacular.utils import extend_schema, inline_serializer
Expand Down Expand Up @@ -192,14 +193,9 @@ def get(self, request, *args, **kwargs):
if filtered_qs.count() <= settings.CONFIG_PUBLIC["rules"]["count_threshold"]:
return Response(settings.INSUFFICIENT_DATA_AVAILABLE)

tissues_count, sampled_tissues = get_queryset_stats(
filtered_qs,
"phenopackets__biosamples__sampled_tissue__label"
)
experiments_count, experiment_type = get_queryset_stats(
filtered_qs,
"phenopackets__biosamples__experiment__experiment_type"
)
tissues_count, sampled_tissues = biosample_tissue_stats(filtered_qs)
experiments_count, experiment_types = experiment_type_stats(filtered_qs)

return Response({
"count": filtered_qs.count(),
"biosamples": {
Expand All @@ -208,6 +204,6 @@ def get(self, request, *args, **kwargs):
},
"experiments": {
"count": experiments_count,
"experiment_type": experiment_type
"experiment_type": experiment_types
}
})
63 changes: 63 additions & 0 deletions chord_metadata_service/patients/tests/test_api.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from copy import deepcopy
import json
import uuid
import csv
import io
from django.conf import settings
Expand All @@ -12,6 +13,12 @@
from chord_metadata_service.restapi.utils import iso_duration_to_years
from chord_metadata_service.phenopackets.tests import constants as ph_c
from chord_metadata_service.phenopackets import models as ph_m
from chord_metadata_service.phenopackets.models import Biosample, Procedure, MetaData, Phenopacket
from chord_metadata_service.experiments.models import Experiment, ExperimentResult, Instrument
from chord_metadata_service.experiments.tests import constants as exp_c
from chord_metadata_service.chord.models import Project, Dataset, TableOwnership, Table
from chord_metadata_service.chord.data_types import DATA_TYPE_EXPERIMENT
from chord_metadata_service.chord.tests import constants as chord_c

from . import constants as c

Expand Down Expand Up @@ -650,3 +657,59 @@ def test_public_filtering_age_range_min_and_max_no_config(self):
self.assertIsInstance(response_obj, dict)
self.assertIsInstance(response_obj, dict)
self.assertEqual(response_obj, settings.NO_PUBLIC_DATA_AVAILABLE)


class PublicFilteringBiosampleAndExperimentStatsTest(APITestCase):
    """
    Test for the biosample and experiment stats returned by api/public GET.

    The fixture holds a batch of individuals without any phenopacket, plus a
    single individual ('patient:1') whose phenopacket carries two biosamples,
    one of which is linked to one experiment.
    """

    response_threshold = 0
    num_individuals_without_phenopackets = 30

    def setUp(self) -> None:
        # project / dataset / table that will own the phenopacket
        self.project = Project.objects.create(**chord_c.VALID_PROJECT_1)
        self.dataset = Dataset.objects.create(**chord_c.valid_dataset_1(self.project))
        to, tr = chord_c.valid_table_1(self.dataset.identifier, model_compatible=True)
        TableOwnership.objects.create(**to)
        self.table = Table.objects.create(**tr)

        # plain individuals, no phenopackets: they contribute no biosample or experiment
        self.individuals = [c.generate_valid_individual() for _ in range(self.num_individuals_without_phenopackets)]
        for individual in self.individuals:
            Individual.objects.create(**individual)

        # ... but add someone with 2 biosamples and one experiment
        self.individual, _ = Individual.objects.get_or_create(
            id='patient:1', sex='FEMALE', age={"age": "P25Y3M2D"})
        self.procedure = Procedure.objects.create(**ph_c.VALID_PROCEDURE_1)
        self.biosample_1 = Biosample.objects.create(**ph_c.valid_biosample_1(self.individual, self.procedure))
        # the second biosample is created without an individual; it is only
        # reachable through the phenopacket it is attached to below
        self.biosample_2 = Biosample.objects.create(**ph_c.valid_biosample_2(None, self.procedure))
        self.meta_data = MetaData.objects.create(**ph_c.VALID_META_DATA_1)
        self.phenopacket = Phenopacket.objects.create(
            id="phenopacket_id:1",
            subject=self.individual,
            meta_data=self.meta_data,
            table=self.table
        )
        self.phenopacket.biosamples.set([self.biosample_1, self.biosample_2])

        # table for experiments metadata
        to_exp = TableOwnership.objects.create(table_id=uuid.uuid4(), service_id=uuid.uuid4(),
                                               service_artifact="experiments", dataset=self.dataset)
        self.t_exp = Table.objects.create(ownership_record=to_exp, name="Table 2", data_type=DATA_TYPE_EXPERIMENT)

        # add Experiments metadata and link to self.biosample_1
        self.instrument = Instrument.objects.create(**exp_c.valid_instrument())
        self.experiment_result = ExperimentResult.objects.create(**exp_c.valid_experiment_result())
        self.experiment = Experiment.objects.create(**exp_c.valid_experiment(
            biosample=self.biosample_1, instrument=self.instrument, table=self.t_exp))
        self.experiment.experiment_results.set([self.experiment_result])

    @override_settings(CONFIG_PUBLIC=CONFIG_PUBLIC_TEST)
    def test_public_get_stats(self):
        response = self.client.get('/api/public?sex=FEMALE')
        self.assertEqual(response.status_code, status.HTTP_200_OK)
        response_obj = response.json()
        # both biosamples of the phenopacket are counted, including the one
        # that has no experiment attached
        self.assertEqual(response_obj['biosamples']['count'], 2)
        self.assertIsInstance(response_obj['biosamples']['sampled_tissue'], list)
        # only biosample_1 has an experiment
        self.assertEqual(response_obj['experiments']['count'], 1)
        self.assertIsInstance(response_obj['experiments']['experiment_type'], list)
68 changes: 40 additions & 28 deletions chord_metadata_service/restapi/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -226,8 +226,8 @@ def monthly_generator(start: str, end: str) -> Tuple[int, int]:
[start_year, start_month] = [int(k) for k in start.split("-")]
[end_year, end_month] = [int(k) for k in end.split("-")]
last_month_nb = (end_year - start_year) * 12 + end_month
for month_nb in range(start_month, last_month_nb):
year = start_year + month_nb // 12
for month_nb in range(start_month, last_month_nb + 1):
year = start_year + (month_nb - 1) // 12
month = month_nb % 12 or 12
yield year, month

Expand Down Expand Up @@ -271,13 +271,15 @@ def queryset_stats_for_field(queryset, field: str, add_missing=False) -> Mapping
"""
# values() restrict the table of results to this COLUMN
# annotate() creates a `total` column for the aggregation
# Count() aggregates the results by performing a GROUP BY on the field
queryset = queryset.values(field).annotate(total=Count(field))
# Count("*") aggregates results including nulls
annotated_queryset = queryset.values(field).annotate(total=Count("*"))
num_missing = 0

stats: Mapping[str, int] = dict()
for item in queryset:
for item in annotated_queryset:
key = item[field]
if key is None:
num_missing = item["total"]
continue

if not isinstance(key, str):
Expand All @@ -291,8 +293,7 @@ def queryset_stats_for_field(queryset, field: str, add_missing=False) -> Mapping
stats[key] = item["total"]

if add_missing:
isnull_filter = {f"{field}__isnull": True}
stats['missing'] = queryset.values(field).filter(**isnull_filter).count()
stats["missing"] = num_missing

return stats

Expand Down Expand Up @@ -350,27 +351,6 @@ def get_age_numeric_binned(individual_queryset, bin_size):
return individuals_age


def get_queryset_stats(queryset, field):
    """
    Fetches public statistics for a field within a given queryset. This function
    is used to compute statistics after filtering has been applied.
    A cutoff is applied to all counts to avoid leaking too small sets of results.

    :param queryset: queryset to aggregate
    :param field: name of the field to group by, passed to queryset_stats_for_field()
    :return: a (total, bins) tuple where `bins` is a list of
        {"label": ..., "value": ...} dicts (one per key, including "missing")
        and `total` is the sum of the raw counts. Any bin value, and the
        total itself, that does not exceed the configured count threshold is
        reported as 0.
    """
    stats = queryset_stats_for_field(queryset, field, add_missing=True)
    threshold = settings.CONFIG_PUBLIC["rules"]["count_threshold"]

    # Each bin is censored individually ...
    bins = [
        {
            "label": key,
            "value": value if value > threshold else 0
        }
        for key, value in stats.items()
    ]

    # ... while the total is computed on raw counts and censored as a whole.
    total = sum(stats.values())
    if total <= threshold:
        total = 0
    return total, bins


def get_categorical_stats(field_props):
"""
Fetches statistics for a given categorical field and apply privacy policies
Expand Down Expand Up @@ -597,3 +577,35 @@ def filter_queryset_field_value(qs, field_props, value: str):
condition = {f"{field}__startswith": val}

return qs.filter(**condition)


def experiment_type_stats(queryset):
    """
    Returns count and bento_public format list of stats for experiment type.

    The queryset is grouped by the experiment type reached through the
    phenopackets -> biosamples -> experiment relations, and the distinct
    experiments are counted within each group.

    Note that queryset_stats_for_field() does not count "missing" correctly
    when the field has multiple foreign keys.

    :param queryset: queryset supporting the `phenopackets__biosamples__experiment`
        lookup path (presumably a queryset of individuals; confirm against callers)
    :return: the (total, stats_list) tuple built by
        bento_public_format_count_and_stats_list()
    """
    e_types = queryset.values(label=F("phenopackets__biosamples__experiment__experiment_type")).annotate(
        value=Count("phenopackets__biosamples__experiment", distinct=True))
    return bento_public_format_count_and_stats_list(e_types)


def biosample_tissue_stats(queryset):
    """
    Returns count and bento_public format list of stats for biosample sampled_tissue.

    The queryset is grouped by the sampled tissue label reached through the
    phenopackets -> biosamples relations, and the distinct biosamples are
    counted within each group.

    :param queryset: queryset supporting the `phenopackets__biosamples` lookup
        path (presumably a queryset of individuals; confirm against callers)
    :return: the (total, stats_list) tuple built by
        bento_public_format_count_and_stats_list()
    """
    b_tissue = queryset.values(label=F("phenopackets__biosamples__sampled_tissue__label")).annotate(
        value=Count("phenopackets__biosamples", distinct=True))
    return bento_public_format_count_and_stats_list(b_tissue)


def bento_public_format_count_and_stats_list(annotated_queryset):
    """
    Converts grouped counts into a total and a bento_public format stats list.

    :param annotated_queryset: iterable of mappings, each with a "label" key
        and a "value" key holding a count (anything accepted by int())
    :return: a (total, stats_list) tuple. `total` is the sum of all values.
        `stats_list` holds one {"label": ..., "value": ...} dict per row, in
        iteration order, leaving out rows whose label is None.
    """
    stats_list = []
    total = 0
    for row in annotated_queryset:
        label = row["label"]
        value = int(row["value"])
        # Rows without a label still count towards the total; they are only
        # left out of the per-label breakdown.
        total += value
        if label is not None:
            stats_list.append({"label": label, "value": value})

    return total, stats_list

0 comments on commit 4538f23

Please sign in to comment.