Commit
Merge pull request #164 from bento-platform/develop
Version 1.1.0
zxenia committed Oct 15, 2020
2 parents a978bf2 + d0ff72b commit fd7f79c
Showing 22 changed files with 655 additions and 48 deletions.
20 changes: 20 additions & 0 deletions .env-sample
@@ -0,0 +1,20 @@
# If Django runs in DEBUG mode
export CHORD_DEBUG=false
export SERVICE_SECRET_KEY=some_secret_key

export POSTGRES_DATABASE=
export POSTGRES_USER=
export POSTGRES_PASSWORD=
# Should only need to use a host or socket dir
export POSTGRES_HOST=
export POSTGRES_SOCKET_DIR=
export POSTGRES_PORT=

# CHORD-specific
export CHORD_URL=
export CHORD_PERMISSIONS=
export SERVICE_ID=

# CanDIG-specific
export INSIDE_CANDIG=true
export CANDIG_OPA_URL=http://0.0.0.0:8181
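As the README changes in this PR note, these variables are read with python-dotenv. A minimal sketch of how a settings module might consume them — the `env_flag` helper is our own illustration, not katsu code, and python-dotenv is assumed to be installed (the code degrades gracefully if it is not):

```python
import os

try:
    # python-dotenv, if available, loads a local .env file into os.environ;
    # by default it does not override variables already set in the environment.
    from dotenv import load_dotenv
    load_dotenv()
except ImportError:
    pass  # fall back to plain environment variables


def env_flag(name: str, default: str = "false") -> bool:
    """Interpret an environment variable as a boolean flag."""
    return os.environ.get(name, default).strip().lower() in ("1", "true", "yes")


DEBUG = env_flag("CHORD_DEBUG")
```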
1 change: 1 addition & 0 deletions .gitignore
@@ -13,6 +13,7 @@ passwords.txt

# virtualenv
env/
.env

# IDE related (intelliJ, vim files)
.idea/
46 changes: 23 additions & 23 deletions README.md
@@ -1,21 +1,23 @@
# CHORD Metadata Service

![Build Status](https://api.travis-ci.com/c3g/chord_metadata_service.svg?branch=master)
[![codecov](https://codecov.io/gh/c3g/chord_metadata_service/branch/master/graph/badge.svg)](https://codecov.io/gh/c3g/chord_metadata_service)
# Katsu Metadata Service

![Build Status](https://travis-ci.com/bento-platform/katsu.svg?branch=master)
[![codecov](https://codecov.io/gh/bento-platform/katsu/branch/master/graph/badge.svg)](https://codecov.io/gh/bento-platform/katsu)

## License

The majority of the CHORD Metadata Service is licensed under the LGPLv3 license; copyright (c) 2019-2020 the Canadian
The majority of the Katsu Metadata Service is licensed under the LGPLv3 license; copyright (c) 2019-2020 the Canadian
Centre for Computational Genomics.

Portions are copyright (c) 2019 Julius OB Jacobsen, Peter N Robinson, Christopher J Mungall (Phenopackets); licensed
under the BSD 3-clause license.

## Funding

Katsu Metadata service development is funded by CANARIE under the CHORD project.

## Architecture

CHORD Metadata Service is a service to store epigenomic metadata.
Katsu Metadata Service is a service to store epigenomic metadata.

1. Patients service handles anonymized individual’s data (individual id, sex, age or date of birth)
* Data model: aggregated profile from GA4GH Phenopackets Individual, FHIR Patient and mCODE Patient.
@@ -65,22 +67,11 @@ The service uses PostgreSQL database for data storage.

* Create and activate virtual environment
* Run: `pip install -r requirements.txt`
* Configure database connection in settings.py
* To configure the application (such as the DB credentials) we are using python-dotenv:
- Take a look at the .env-sample file at the root of the project
- You can export these in your virtualenv or simply `cp .env-sample .env`
- python-dotenv can handle either (a local .env will override env vars though)

e.g. settings if running the database on localhost (the default PostgreSQL port is 5432):

```python
DATABASES = {
'default': {
'ENGINE': 'django.db.backends.postgresql_psycopg2',
'NAME': 'database_name',
'USER': 'user',
'PASSWORD': 'password',
'HOST': 'localhost',
'PORT': '5432',
}
}
```
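Under the dotenv-based setup, a settings module would build the same `DATABASES` structure from the `POSTGRES_*` variables in `.env-sample`. A hypothetical sketch — the variable names follow the sample file, but the exact settings code katsu uses may differ:

```python
import os


def postgres_settings() -> dict:
    """Assemble the Django 'default' database config from POSTGRES_* env vars."""
    return {
        "ENGINE": "django.db.backends.postgresql_psycopg2",
        "NAME": os.environ.get("POSTGRES_DATABASE", ""),
        "USER": os.environ.get("POSTGRES_USER", ""),
        "PASSWORD": os.environ.get("POSTGRES_PASSWORD", ""),
        # Per .env-sample, only a host *or* a socket directory should be needed.
        "HOST": os.environ.get("POSTGRES_HOST") or os.environ.get("POSTGRES_SOCKET_DIR", "localhost"),
        "PORT": os.environ.get("POSTGRES_PORT", "5432"),
    }


DATABASES = {"default": postgres_settings()}
```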

* Run:

@@ -113,7 +104,7 @@ for a standalone instance of this server, so it can be swapped out.

### Note On Permissions

By default, `chord_metadata_service` uses the CHORD permission system, which
By default, `katsu` uses the CHORD permission system, which
functions as follows:

* URLs under the `/private` namespace are assumed to be protected by an
@@ -126,6 +117,15 @@ functions as follows:
This can be turned off with the `CHORD_PERMISSIONS` environment variable and/or
Django setting, or with the `AUTH_OVERRIDE` Django setting.

### Authorization inside CanDIG

When run inside the CanDIG context, you will have to do the following to
properly implement authorization:

1. Make sure `CHORD_PERMISSIONS` is set to `"false"`
2. Set `INSIDE_CANDIG` to `"true"`
3. Provide the URL of the OPA instance in `CANDIG_OPA_URL`

## Developing

### Branching
@@ -167,7 +167,7 @@ coverage html
### Accessing the Django Shell from inside a CHORD Container

Assuming `chord_singularity` is being used, the following commands can be used
to bootstrap your way to a `chord_metadata_service` environment within a CHORD
to bootstrap your way to a `katsu` environment within a CHORD
container:

```bash
43 changes: 33 additions & 10 deletions chord_metadata_service/chord/ingest.py
@@ -23,6 +23,7 @@
__all__ = [
"METADATA_WORKFLOWS",
"WORKFLOWS_PATH",
"IngestError",
"ingest_resource",
"WORKFLOW_INGEST_FUNCTION_MAP",
]
@@ -176,6 +177,10 @@
WORKFLOWS_PATH = os.path.join(os.path.dirname(os.path.realpath(__file__)), "workflows")


class IngestError(Exception):
pass


def create_phenotypic_feature(pf):
pf_obj = pm.PhenotypicFeature(
description=pf.get("description", ""),
@@ -184,7 +189,8 @@ def create_phenotypic_feature(pf):
severity=pf.get("severity"),
modifier=pf.get("modifier", []), # TODO: Validate ontology term in schema...
onset=pf.get("onset"),
evidence=pf.get("evidence") # TODO: Separate class?
evidence=pf.get("evidence"), # TODO: Separate class?
extra_properties=pf.get("extra_properties", {})
)

pf_obj.save()
@@ -208,7 +214,9 @@ def ingest_resource(resource: dict) -> rm.Resource:
namespace_prefix=namespace_prefix,
url=resource["url"],
version=version,
iri_prefix=resource["iri_prefix"]
iri_prefix=resource["iri_prefix"],
extra_properties=resource.get("extra_properties", {})
# TODO extra_properties
)

return rs_obj
@@ -268,7 +276,11 @@ def ingest_phenopacket(phenopacket_data, table_id) -> pm.Phenopacket:
subject_query = _query_and_check_nulls(subject, "date_of_birth", transform=isoparse)
for k in ("alternate_ids", "age", "sex", "karyotypic_sex", "taxonomy"):
subject_query.update(_query_and_check_nulls(subject, k))
subject, _ = pm.Individual.objects.get_or_create(id=subject["id"], **subject_query)
subject, _ = pm.Individual.objects.get_or_create(id=subject["id"],
race=subject.get("race", ""),
ethnicity=subject.get("ethnicity", ""),
extra_properties=subject.get("extra_properties", {}),
**subject_query)

phenotypic_features_db = [create_phenotypic_feature(pf) for pf in phenotypic_features]

@@ -288,6 +300,7 @@ def ingest_phenopacket(phenopacket_data, table_id) -> pm.Phenopacket:
procedure=procedure,
is_control_sample=bs.get("is_control_sample", False),
diagnostic_markers=bs.get("diagnostic_markers", []),
extra_properties=bs.get("extra_properties", {}),
**bs_query
)

@@ -307,7 +320,8 @@ def ingest_phenopacket(phenopacket_data, table_id) -> pm.Phenopacket:
g_obj, _ = pm.Gene.objects.get_or_create(
id=g["id"],
alternate_ids=g.get("alternate_ids", []),
symbol=g["symbol"]
symbol=g["symbol"],
extra_properties=g.get("extra_properties", {})
)
genes_db.append(g_obj)

@@ -318,6 +332,7 @@ def ingest_phenopacket(phenopacket_data, table_id) -> pm.Phenopacket:
term=disease["term"],
disease_stage=disease.get("disease_stage", []),
tnm_finding=disease.get("tnm_finding", []),
extra_properties=disease.get("extra_properties", {}),
**_query_and_check_nulls(disease, "onset")
)
diseases_db.append(d_obj.id)
@@ -330,7 +345,7 @@ def ingest_phenopacket(phenopacket_data, table_id) -> pm.Phenopacket:
hts_format=htsfile["hts_format"],
genome_assembly=htsfile["genome_assembly"],
individual_to_sample_identifiers=htsfile.get("individual_to_sample_identifiers", None),
extra_properties=htsfile.get("extra_properties", None)
extra_properties=htsfile.get("extra_properties", {})
)
hts_files_db.append(htsf_obj)

@@ -340,7 +355,8 @@ def ingest_phenopacket(phenopacket_data, table_id) -> pm.Phenopacket:
created_by=meta_data["created_by"],
submitted_by=meta_data.get("submitted_by"),
phenopacket_schema_version="1.0.0-RC3",
external_references=meta_data.get("external_references", [])
external_references=meta_data.get("external_references", []),
extra_properties=meta_data.get("extra_properties", {})
)
meta_data_obj.save()

@@ -369,8 +385,15 @@ def _map_if_list(fn, data, *args):
return [fn(d, *args) for d in data] if isinstance(data, list) else fn(data, *args)


def _get_output_or_raise(workflow_outputs, key):
if key not in workflow_outputs:
raise IngestError(f"Missing workflow output: {key}")

return workflow_outputs[key]


def ingest_experiments_workflow(workflow_outputs, table_id):
with open(workflow_outputs["json_document"], "r") as jf:
with open(_get_output_or_raise(workflow_outputs, "json_document"), "r") as jf:
json_data = json.load(jf)

dataset = TableOwnership.objects.get(table_id=table_id).dataset
@@ -382,13 +405,13 @@ def ingest_experiments_workflow(workflow_outputs, table_id):


def ingest_phenopacket_workflow(workflow_outputs, table_id):
with open(workflow_outputs["json_document"], "r") as jf:
with open(_get_output_or_raise(workflow_outputs, "json_document"), "r") as jf:
json_data = json.load(jf)
return _map_if_list(ingest_phenopacket, json_data, table_id)


def ingest_fhir_workflow(workflow_outputs, table_id):
with open(workflow_outputs["patients"], "r") as pf:
with open(_get_output_or_raise(workflow_outputs, "patients"), "r") as pf:
patients_data = json.load(pf)
phenopacket_ids = ingest_patients(
patients_data,
@@ -413,7 +436,7 @@


def ingest_mcode_fhir_workflow(workflow_outputs, table_id):
with open(workflow_outputs["json_document"], "r") as jf:
with open(_get_output_or_raise(workflow_outputs, "json_document"), "r") as jf:
json_data = json.load(jf)
mcodepacket = parse_bundle(json_data)
ingest_mcodepacket(mcodepacket, table_id)
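The pattern this diff introduces — a typed `IngestError` raised by `_get_output_or_raise` instead of a bare `KeyError` — can be exercised in isolation. A minimal standalone sketch (names mirror the diff, but this is an illustration, not the module itself):

```python
class IngestError(Exception):
    """Raised for malformed ingest requests; the view layer maps it to HTTP 400."""


def get_output_or_raise(workflow_outputs: dict, key: str):
    # Fail with a descriptive, catchable error naming the missing key,
    # rather than a bare KeyError.
    if key not in workflow_outputs:
        raise IngestError(f"Missing workflow output: {key}")
    return workflow_outputs[key]


outputs = {"json_document": "/tmp/phenopackets.json"}  # hypothetical workflow outputs
try:
    get_output_or_raise(outputs, "patients")
except IngestError as e:
    print(e)  # prints: Missing workflow output: patients
```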
15 changes: 10 additions & 5 deletions chord_metadata_service/chord/views_ingest.py
@@ -1,5 +1,6 @@
import json
import os
import traceback
import uuid

from django.core.exceptions import ValidationError
@@ -14,7 +15,7 @@
from bento_lib.responses import errors
from bento_lib.workflows import get_workflow, get_workflow_resource, workflow_exists

from .ingest import METADATA_WORKFLOWS, WORKFLOWS_PATH, WORKFLOW_INGEST_FUNCTION_MAP
from .ingest import METADATA_WORKFLOWS, WORKFLOWS_PATH, WORKFLOW_INGEST_FUNCTION_MAP, IngestError
from .models import Table


@@ -88,10 +89,8 @@ def ingest(request):
# Wrap ingestion in a transaction, so if it fails we don't end up in a partial state in the database.
WORKFLOW_INGEST_FUNCTION_MAP[workflow_id](workflow_outputs, table_id)

except KeyError:
# Tried to access a non-existant workflow output
# TODO: More precise error (which key?)
return Response(errors.bad_request_error("Missing workflow output"), status=400)
except IngestError as e:
return Response(errors.bad_request_error(f"Encountered ingest error: {e}"), status=400)

except json.decoder.JSONDecodeError as e:
return Response(errors.bad_request_error(f"Invalid JSON provided for ingest document (message: {e})"),
@@ -103,6 +102,12 @@
*(e.error_list if hasattr(e, "error_list") else e.error_dict.items()),
))

except Exception as e:
# Encountered some other error from the ingestion attempt, return a somewhat detailed message
print(f"Encountered an exception while processing an ingest attempt:\n{traceback.format_exc()}")
return Response(errors.internal_server_error(f"Encountered an exception while processing an ingest attempt "
f"(error: {repr(e)})"), status=500)

# TODO: Schema validation
# TODO: Rollback in case of failures
return Response(status=204)
4 changes: 4 additions & 0 deletions chord_metadata_service/experiments/api_views.py
@@ -3,10 +3,12 @@
from rest_framework.decorators import api_view, permission_classes
from rest_framework.permissions import AllowAny
from rest_framework.response import Response
from django_filters.rest_framework import DjangoFilterBackend

from .serializers import ExperimentSerializer
from .models import Experiment
from .schemas import EXPERIMENT_SCHEMA
from .filters import ExperimentFilter
from chord_metadata_service.restapi.pagination import LargeResultsSetPagination


@@ -23,6 +25,8 @@ class ExperimentViewSet(viewsets.ModelViewSet):
serializer_class = ExperimentSerializer
pagination_class = LargeResultsSetPagination
renderer_classes = tuple(api_settings.DEFAULT_RENDERER_CLASSES)
filter_backends = [DjangoFilterBackend]
filter_class = ExperimentFilter


@api_view(["GET"])
14 changes: 14 additions & 0 deletions chord_metadata_service/experiments/filters.py
@@ -0,0 +1,14 @@
import django_filters
from .models import Experiment


class ExperimentFilter(django_filters.rest_framework.FilterSet):
experiment_type = django_filters.CharFilter(lookup_expr='iexact')
molecule = django_filters.CharFilter(lookup_expr='iexact')
library_strategy = django_filters.CharFilter(lookup_expr='iexact')

class Meta:
model = Experiment
fields = ["id", "reference_registry_id",
"experiment_type", "molecule",
"library_strategy", "biosample"]
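The three `iexact` filters above give case-insensitive exact matching on query parameters, so a query like `?experiment_type=rna-seq` would match a stored value of `RNA-Seq`. A sketch of the lookup's semantics, independent of Django (the sample data is hypothetical):

```python
def iexact(stored: str, query: str) -> bool:
    # django-filter's lookup_expr='iexact': exact match, ignoring case
    return stored.lower() == query.lower()


experiments = [
    {"experiment_type": "RNA-Seq"},
    {"experiment_type": "WES"},
]
# Equivalent in spirit to ?experiment_type=rna-seq against the viewset
matches = [e for e in experiments if iexact(e["experiment_type"], "rna-seq")]
```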
