
Commit

Merge pull request #399 from bento-platform/features/explorer_biosample_experiment_search

Features/explorer biosample experiment search
noctillion committed May 30, 2023
2 parents f96de2d + dbf09a6 commit 9417501
Showing 11 changed files with 388 additions and 43 deletions.
16 changes: 13 additions & 3 deletions chord_metadata_service/chord/tests/test_api_search.py
@@ -483,7 +483,7 @@ def test_private_table_search_values_list_invalid_field_syntax(self):

def test_private_table_search_bento_search_results(self):
# Valid query to search for biosample id in list
# Output as Bento search (a list of 4 values)
# Output as Bento search (a list of 5 values)

d = {
"query": TEST_SEARCH_QUERY_10,
@@ -495,11 +495,21 @@ def test_private_table_search_bento_search_results(self):
self.assertEqual(r.status_code, status.HTTP_200_OK)
c = r.json()
self.assertEqual(len(c["results"]), 1) # 1 matching phenopacket
self.assertEqual(len(c["results"][0]), 4) # 4 columns by result
self.assertEqual(len(c["results"][0]), 5) # 5 columns by result
self.assertEqual(
{"subject_id", "alternate_ids", "biosamples", "num_experiments"},
{"subject_id", "alternate_ids", "biosamples", "experiments_with_biosamples", "num_experiments"},
set(c["results"][0].keys()))
self.assertIsInstance(c["results"][0]["alternate_ids"], list)
self.assertIsInstance(c["results"][0]["experiments_with_biosamples"], list)
for biosample in c["results"][0]["experiments_with_biosamples"]:
self.assertIn("biosample_id", biosample)
self.assertIn("sampled_tissue", biosample)
self.assertIn("id", biosample["sampled_tissue"])
self.assertIn("label", biosample["sampled_tissue"])
self.assertIn("experiment", biosample)
self.assertIn("experiment_id", biosample["experiment"])
self.assertIn("experiment_type", biosample["experiment"])
self.assertIn("study_type", biosample["experiment"])

def test_private_search_bento_search_results(self):
# Valid query to search for biosample id in list
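For context, the assertions above expect each bento_search_result row to expose five keys, with experiments_with_biosamples holding one nested entry per biosample. A sketch of the expected shape, with purely illustrative values (real identifiers and labels depend on the loaded dataset):

    # Illustrative shape only; the key names come from the assertions above, the values do not.
    example_result_row = {
        "subject_id": "patient:1",
        "alternate_ids": ["alt-id-1"],
        "biosamples": ["biosample:1"],
        "num_experiments": 1,
        "experiments_with_biosamples": [
            {
                "biosample_id": "biosample:1",
                "sampled_tissue": {"id": "UBERON:0002107", "label": "liver"},
                "experiment": {
                    "experiment_id": "experiment:1",
                    "experiment_type": "RNA-Seq",
                    "study_type": "Transcriptomics",
                },
            },
        ],
    }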
86 changes: 60 additions & 26 deletions chord_metadata_service/chord/views_search.py
@@ -37,13 +37,16 @@
from chord_metadata_service.patients.models import Individual

from chord_metadata_service.phenopackets.api_views import PHENOPACKET_SELECT_REL, PHENOPACKET_PREFETCH
from chord_metadata_service.phenopackets.models import Phenopacket
from chord_metadata_service.phenopackets.models import Phenopacket, Biosample
from chord_metadata_service.phenopackets.serializers import PhenopacketSerializer

from .data_types import DATA_TYPE_EXPERIMENT, DATA_TYPE_MCODEPACKET, DATA_TYPE_PHENOPACKET, DATA_TYPES
from .models import Dataset, TableOwnership, Table
from .permissions import ReadOnly, OverrideOrSuperUserOnly

from collections import defaultdict


OUTPUT_FORMAT_VALUES_LIST = "values_list"
OUTPUT_FORMAT_BENTO_SEARCH_RESULT = "bento_search_result"

@@ -269,6 +272,25 @@ def mcodepacket_query_results(query, params, options=None):
return queryset


def get_biosamples_with_experiment_details(subject_ids):
"""
The function returns a queryset where each entry represents a biosample obtained from a subject, along with
details of any associated experiment. If a biosample does not have an associated experiment, the experiment
details are returned as None.
"""
biosamples_exp_tissue_details = Biosample.objects.filter(phenopacket__subject_id__in=subject_ids)\
.values(
subject_id=F("phenopacket__subject_id"),
biosample_id=F("id"),
experiment_id=F("experiment__id"),
experiment_type=F("experiment__experiment_type"),
study_type=F("experiment__study_type"),
tissue_id=F("sampled_tissue__id"),
tissue_label=F("sampled_tissue__label")
)
return biosamples_exp_tissue_details


def phenopacket_query_results(query, params, options=None):
queryset = Phenopacket.objects \
.filter(id__in=data_type_results(query, params, "id"))
@@ -282,29 +304,42 @@
if "add_field" in options:
fields.append(options["add_field"])

# Results displayed as 4/5 columns:
# "individuals ID", "table ID" (optional), [Alternate ids list], number of experiments, [Biosamples list...]
return queryset.values(
*fields,
alternate_ids=Coalesce(F("subject__alternate_ids"), [])
).annotate(
# Weird bug with Django 4.1 here: num_experiments must come before the use of ArrayAgg or biosamples
# is considered as an ArrayField...
num_experiments=Count("biosamples__experiment"),
# Postgre specific: aggregates multiple values in a list
biosamples=Coalesce(
ArrayAgg("biosamples__id", distinct=True, filter=Q(biosamples__id__isnull=False)),
[]
)
)
results = queryset.values(
*fields,
alternate_ids=Coalesce(F("subject__alternate_ids"), []),
).annotate(
num_experiments=Count("biosamples__experiment"),
biosamples=Coalesce(ArrayAgg("biosamples__id", distinct=True, filter=Q(biosamples__id__isnull=False)), []),
)

# To expand further on this query : the select_related call
# will join on these tables we'd call anyway, thus 2 less request
# to the DB. prefetch_related works on M2M relationships and makes
# sure that, for instance, when querying diseases, we won't make multiple call
# for the same set of data
return queryset.select_related(*PHENOPACKET_SELECT_REL) \
.prefetch_related(*PHENOPACKET_PREFETCH)
# Get the biosamples with experiments data
phenopacket_ids = [result['subject_id'] for result in results]
biosamples_experiments_details = get_biosamples_with_experiment_details(phenopacket_ids)

# Group the experiments with biosamples by subject_id
experiments_with_biosamples = defaultdict(list)
for b in biosamples_experiments_details:
experiments_with_biosamples[b["subject_id"]].append({
"biosample_id": b["biosample_id"],
"sampled_tissue": {
"id": b["tissue_id"],
"label": b["tissue_label"]
},
"experiment": {
"experiment_id": b["experiment_id"],
"experiment_type": b["experiment_type"],
"study_type": b["study_type"]
}
})

# Add the experiments_with_biosamples data to the results
for result in results:
result["experiments_with_biosamples"] = experiments_with_biosamples[result['subject_id']]

return results
else:
return queryset.select_related(*PHENOPACKET_SELECT_REL) \
.prefetch_related(*PHENOPACKET_PREFETCH)


QUERY_RESULTS_FN: Dict[str, Callable] = {
@@ -391,8 +426,8 @@ def search(request, internal_data=False):
"data_type": data_type,
"matches": list(serializer_class(p).data for p in table_objects)
} for table_id, table_objects in itertools.groupby(
queryset,
key=lambda o: str(o.table_id) # object here
queryset if queryset is not None else [],
key=lambda o: str(o.table_id) # object here
)
}, start))

@@ -621,7 +656,6 @@ def chord_table_search(search_params, table_id, start, internal=False) -> Tuple[
params=search_params["params"] + (table_id,),
options=search_params
)

if not internal:
return queryset.exists(), None # True if at least one match

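To make the helper above concrete: get_biosamples_with_experiment_details() yields one flat dict per biosample, and the experiment-related keys come back as None when a biosample has no associated experiment; phenopacket_query_results() then regroups those rows by subject_id into the nested experiments_with_biosamples entries. A minimal sketch with made-up identifiers:

    # Shape of one row from the .values(...) queryset above; values are illustrative.
    row_without_experiment = {
        "subject_id": "patient:2",
        "biosample_id": "biosample:7",
        "experiment_id": None,      # no experiment linked to this biosample
        "experiment_type": None,
        "study_type": None,
        "tissue_id": "UBERON:0002048",
        "tissue_label": "lung",
    }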
58 changes: 55 additions & 3 deletions chord_metadata_service/experiments/api_views.py
@@ -1,4 +1,4 @@
from rest_framework import viewsets
from rest_framework import viewsets, mixins
from rest_framework.settings import api_settings
from rest_framework.decorators import api_view, permission_classes
from rest_framework.permissions import AllowAny
@@ -11,9 +11,18 @@
from .models import Experiment, ExperimentResult
from .schemas import EXPERIMENT_SCHEMA
from .filters import ExperimentFilter, ExperimentResultFilter
from chord_metadata_service.restapi.pagination import LargeResultsSetPagination
from chord_metadata_service.restapi.pagination import LargeResultsSetPagination, BatchResultsSetPagination
from drf_spectacular.utils import extend_schema, inline_serializer
from rest_framework import serializers
from rest_framework import serializers, status


from chord_metadata_service.restapi.api_renderers import (
FHIRRenderer,
PhenopacketsRenderer,
ExperimentCSVRenderer,
)

from chord_metadata_service.restapi.negociation import FormatInPostContentNegotiation

__all__ = [
"EXPERIMENT_SELECT_REL",
@@ -28,6 +37,7 @@

EXPERIMENT_PREFETCH = (
"experiment_results",
"biosample__individual"
)


@@ -54,6 +64,48 @@ def dispatch(self, *args, **kwargs):
return super(ExperimentViewSet, self).dispatch(*args, **kwargs)


class BatchViewSet(mixins.ListModelMixin, viewsets.GenericViewSet):
"""
A viewset that only implements the 'list' action.
To be used with the BatchListRouter which maps the POST method to .list()
"""
pass


class ExperimentBatchViewSet(BatchViewSet):
"""
get:
Return a list of all existing experiments
post:
Return a list of experiments based on a list of IDs
"""

serializer_class = ExperimentSerializer
pagination_class = BatchResultsSetPagination
renderer_classes = (*api_settings.DEFAULT_RENDERER_CLASSES, FHIRRenderer,
PhenopacketsRenderer, ExperimentCSVRenderer)
content_negotiation_class = FormatInPostContentNegotiation

def get_queryset(self):
experiment_ids = self.request.data.get("id", None)
filter_by_id = {"id__in": experiment_ids} if experiment_ids else {}
queryset = Experiment.objects.filter(**filter_by_id)\
.select_related(*EXPERIMENT_SELECT_REL)\
.prefetch_related(*EXPERIMENT_PREFETCH)\
.order_by("id")

return queryset

def create(self, request, *args, **kwargs):
ids_list = request.data.get('id', [])
request.data["id"] = ids_list
queryset = self.get_queryset()

serializer = ExperimentSerializer(queryset, many=True)
return Response(serializer.data, status=status.HTTP_200_OK)


class ExperimentResultViewSet(viewsets.ModelViewSet):
"""
get:
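A usage sketch for the new batch endpoint; the /api/batch/experiments path and the {"id": [...]} payload come from the tests added later in this commit (chord_metadata_service/experiments/tests/test_api.py), while the base URL and experiment IDs here are placeholders for a local deployment:

    # Minimal sketch; the base URL and IDs are assumptions, not part of this PR.
    import requests

    resp = requests.post(
        "http://localhost:8000/api/batch/experiments",    # assumed local instance
        json={"id": ["experiment:1", "experiment:2"]},    # omit "id" to fetch all experiments
    )
    resp.raise_for_status()
    for experiment in resp.json():                        # the endpoint returns a plain list
        print(experiment["id"], experiment["experiment_type"])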
20 changes: 20 additions & 0 deletions chord_metadata_service/experiments/serializers.py
@@ -1,5 +1,6 @@
from chord_metadata_service.restapi.serializers import GenericSerializer
from .models import Experiment, ExperimentResult, Instrument
from chord_metadata_service.patients.models import Individual


__all__ = ["ExperimentSerializer"]
@@ -17,9 +18,28 @@ class Meta:
fields = "__all__"


# This enables dynamic field selection, allowing the serializer to include/exclude fields in the output
class DynamicFieldsMixin:
def __init__(self, *args, **kwargs):
fields = kwargs.pop("fields", None)
super().__init__(*args, **kwargs)
if fields is not None:
allowed = set(fields)
existing = set(self.fields)
for field_name in existing - allowed:
self.fields.pop(field_name)


class IndividualSerializer(DynamicFieldsMixin, GenericSerializer):
class Meta:
model = Individual
fields = "__all__"


class ExperimentSerializer(GenericSerializer):
experiment_results = ExperimentResultSerializer(read_only=True, many=True)
instrument = InstrumentSerializer()
biosample_individual = IndividualSerializer(source='biosample.individual', read_only=True, fields=['id'])

class Meta:
model = Experiment
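A short sketch of the dynamic field selection added here, mirroring how ExperimentSerializer restricts the nested individual to its id (the individual variable is a placeholder for any Individual instance):

    # Illustrative use of the fields=... kwarg provided by DynamicFieldsMixin.
    from chord_metadata_service.experiments.serializers import IndividualSerializer

    serializer = IndividualSerializer(individual, fields=["id"])
    serializer.data  # -> {"id": "..."}; every other Individual field is dropped from the output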
82 changes: 82 additions & 0 deletions chord_metadata_service/experiments/tests/test_api.py
@@ -2,6 +2,11 @@
from jsonschema.validators import Draft7Validator
from uuid import uuid4

from django.test import TestCase
from chord_metadata_service.restapi.api_renderers import ExperimentCSVRenderer
import csv
import io

from rest_framework import status
from rest_framework.test import APITestCase
from chord_metadata_service.chord.models import Project, Dataset, TableOwnership, Table
@@ -135,3 +140,80 @@ def test_combine_filters_experiment_results_2(self):
response_data = response.json()
self.assertEqual(response_data["count"], 0)
self.assertEqual(len(response_data["results"]), 0)

def test_post_experiment_batch_no_data(self):
response = self.client.post('/api/batch/experiments', format='json')
self.assertEqual(response.status_code, status.HTTP_200_OK)
self.assertEqual(len(response.json()), 2)

def test_post_experiment_batch_with_ids(self):
response = self.client.post('/api/batch/experiments', {'id': ['experiment:1']}, format='json')
self.assertEqual(response.status_code, status.HTTP_200_OK)
response_data = response.json()
self.assertEqual(len(response_data), 1)
self.assertEqual(response_data[0]['id'], 'experiment:1')


class TestExperimentCSVRenderer(TestCase):
"""
Test the CSV renderer for the experiment API
"""
def setUp(self):
self.renderer = ExperimentCSVRenderer()
self.data = [{
'id': 'id1',
'study_type': 'study_type1',
'experiment_type': 'experiment_type1',
'molecule': 'molecule1',
'library_strategy': 'library_strategy1',
'library_source': 'library_source1',
'library_selection': 'library_selection1',
'library_layout': 'library_layout1',
'created': 'created1',
'updated': 'updated1',
'biosample': 'biosample1',
'biosample_individual': {'id': 'individual_id1'},
}]

def test_csv_headers(self):
response = self.renderer.render(self.data)
csv_content = response.content.decode()
csv_file = io.StringIO(csv_content)
reader = csv.DictReader(csv_file)
expected_headers = ['Id', 'Study type', 'Experiment type', 'Molecule', 'Library strategy',
'Library source', 'Library selection', 'Library layout',
'Created', 'Updated', 'Biosample', 'Individual id']
self.assertListEqual(list(reader.fieldnames), expected_headers)

def test_csv_render_with_missing_fields(self):
data_with_missing_fields = [{
'id': 'id3',
'study_type': 'study_type3',
'experiment_type': 'experiment_type3',
'molecule': 'molecule3',
'library_strategy': 'library_strategy3',
'library_source': 'library_source3',
# 'library_selection' intentionally missing
'library_layout': 'library_layout3',
'created': 'created3',
'updated': 'updated3',
'biosample': 'biosample3',
'biosample_individual': {'id': 'individual_id3'},
}]
response = self.renderer.render(data_with_missing_fields)
csv_content = response.content.decode()
csv_file = io.StringIO(csv_content)
reader = csv.DictReader(csv_file)
row = next(reader)
self.assertIsNone(row.get('library_selection'))

def test_csv_render_with_empty_data(self):
data_empty = [{}]
response = self.renderer.render(data_empty)
csv_content = response.content.decode()
csv_file = io.StringIO(csv_content)
reader = csv.DictReader(csv_file)
row = next(reader)
for row in reader:
for key in row:
self.assertEqual(row[key], '')
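For reference, the renderer under test can also be driven directly; a minimal sketch assuming serialized experiment dicts like self.data above (the exact field-to-column mapping is the renderer's concern and not shown in this diff):

    # Minimal sketch; `serialized_experiments` stands in for ExperimentSerializer(queryset, many=True).data.
    from chord_metadata_service.restapi.api_renderers import ExperimentCSVRenderer

    renderer = ExperimentCSVRenderer()
    response = renderer.render(serialized_experiments)
    print(response.content.decode())  # CSV text with headers such as "Id,Study type,Experiment type,..."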
1 change: 1 addition & 0 deletions chord_metadata_service/patients/filters.py
@@ -159,6 +159,7 @@ def filter_search(self, qs, name, value):
"phenopackets__biosamples__experiment__experiment_results__description",
"phenopackets__biosamples__experiment__experiment_results__filename",
"phenopackets__biosamples__experiment__experiment_results__file_format",
"phenopackets__biosamples__experiment__experiment_results__genome_assembly_id",
"phenopackets__biosamples__experiment__experiment_results__data_output_type",
"phenopackets__biosamples__experiment__experiment_results__usage",
"phenopackets__biosamples__experiment__experiment_results__creation_date",
