Merge pull request #373 from bento-platform/garbage-collect
Cleanup functionality + revamped workflows for new WES + fixes for DB startup errors
davidlougheed committed Jan 26, 2023
2 parents 9090b6a + 13126ed commit 75663ee
Showing 36 changed files with 754 additions and 109 deletions.
10 changes: 7 additions & 3 deletions bento.Dockerfile
@@ -1,6 +1,10 @@
FROM ghcr.io/bento-platform/bento_base_image:python-debian-latest
FROM ghcr.io/bento-platform/bento_base_image:python-debian-2023.01.17

RUN pip install --no-cache-dir "uvicorn[standard]==0.20.0"
# Install Postgres client for checking if database is ready
# Install uvicorn to serve the API
RUN apt-get update -y && \
apt-get install -y postgresql-client && \
pip install --no-cache-dir "uvicorn[standard]==0.20.0"

# Backwards-compatible with old BentoV2 container layout
WORKDIR /app
@@ -16,4 +20,4 @@ COPY . .
# Create temporary directory for downloading files etc.
RUN mkdir -p tmp

CMD [ "sh", "./entrypoint.sh" ]
CMD [ "bash", "./entrypoint.bash" ]
11 changes: 6 additions & 5 deletions bento.dev.Dockerfile
@@ -1,4 +1,8 @@
FROM ghcr.io/bento-platform/bento_base_image:python-debian-latest
FROM ghcr.io/bento-platform/bento_base_image:python-debian-2023.01.17

# Install Postgres client for checking if database is ready
RUN apt-get update -y && \
apt-get install -y postgresql-client

# Backwards-compatible with old BentoV2 container layout
WORKDIR /app
@@ -9,10 +13,7 @@ COPY requirements-dev.txt requirements-dev.txt
# Install production dependencies
RUN pip install --no-cache-dir -r requirements-dev.txt

# Copy all application code
COPY . .

# Create temporary directory for downloading files etc.
RUN mkdir -p tmp

CMD [ "sh", "./entrypoint.dev.sh" ]
CMD [ "bash", "./entrypoint.dev.bash" ]
26 changes: 23 additions & 3 deletions chord_metadata_service/chord/api_views.py
@@ -1,15 +1,20 @@
from rest_framework import viewsets
import logging

from rest_framework import status, viewsets
from rest_framework.permissions import BasePermission, SAFE_METHODS
from rest_framework.response import Response
from rest_framework.settings import api_settings

from django_filters.rest_framework import DjangoFilterBackend

from chord_metadata_service.cleanup import run_all_cleanup
from chord_metadata_service.restapi.api_renderers import PhenopacketsRenderer, JSONLDDatasetRenderer, RDFDatasetRenderer
from chord_metadata_service.restapi.pagination import LargeResultsSetPagination

from .models import Project, Dataset, TableOwnership, Table
from .permissions import OverrideOrSuperUserOnly
from .serializers import ProjectSerializer, DatasetSerializer, TableOwnershipSerializer, TableSerializer
from .filters import AuthorizedDatasetFilter
from django_filters.rest_framework import DjangoFilterBackend
import logging

logger = logging.getLogger(__name__)

@@ -89,3 +94,18 @@ class TableViewSet(CHORDPublicModelViewSet):

queryset = Table.objects.all().prefetch_related("ownership_record").order_by("ownership_record_id")
serializer_class = TableSerializer

def destroy(self, request, *args, **kwargs):
# First, delete the table record itself
# - use the cascade from the ownership record rather than the default DRF behaviour
table = self.get_object()
table_id = table.ownership_record_id
table.ownership_record.delete()
table.delete()

# Then, run cleanup
logger.info(f"Running cleanup after deleting table {table_id} via DRF API")
n_removed = run_all_cleanup()
logger.info(f"Cleanup: removed {n_removed} objects in total")

return Response(status=status.HTTP_204_NO_CONTENT)
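The overridden destroy() deletes the TableOwnership record so the database cascade removes the table and its dependents, then garbage-collects anything left orphaned via run_all_cleanup(). A hedged sketch of how this endpoint might be exercised from a DRF test; the "table-detail" route name and the make_test_table() fixture helper are assumptions for illustration only, not part of this diff:

from django.urls import reverse
from rest_framework import status
from rest_framework.test import APITestCase

from chord_metadata_service.chord.models import Table

class TableDestroyTest(APITestCase):
    def test_destroy_deletes_table_and_runs_cleanup(self):
        # make_test_table() is a hypothetical fixture helper; replace with the
        # project's real test setup (project -> dataset -> table ownership -> table).
        table = make_test_table()
        url = reverse("table-detail", kwargs={"pk": table.ownership_record_id})  # assumed router basename
        r = self.client.delete(url)
        self.assertEqual(r.status_code, status.HTTP_204_NO_CONTENT)
        self.assertFalse(Table.objects.filter(ownership_record_id=table.ownership_record_id).exists())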
17 changes: 17 additions & 0 deletions chord_metadata_service/chord/tests/test_api_search.py
@@ -49,6 +2,8 @@

POST_GET = ("POST", "GET")

DATA_TYPE_NOT_REAL = "not_a_real_data_type"


class DataTypeTest(APITestCase):
def test_data_type_list(self):
@@ -72,18 +74,33 @@ def test_data_type_detail(self):
**DATA_TYPES[DATA_TYPE_PHENOPACKET],
})

def test_data_type_detail_404(self):
r = self.client.get(reverse("data-type-detail", kwargs={"data_type": DATA_TYPE_NOT_REAL}))
self.assertEqual(r.status_code, status.HTTP_404_NOT_FOUND)
r.json() # assert json response

def test_data_type_schema(self):
r = self.client.get(reverse("data-type-schema", kwargs={"data_type": DATA_TYPE_PHENOPACKET}))
self.assertEqual(r.status_code, status.HTTP_200_OK)
c = r.json()
self.assertDictEqual(c, DATA_TYPES[DATA_TYPE_PHENOPACKET]["schema"])

def test_data_type_schema_404(self):
r = self.client.get(reverse("data-type-schema", kwargs={"data_type": DATA_TYPE_NOT_REAL}))
self.assertEqual(r.status_code, status.HTTP_404_NOT_FOUND)
r.json() # assert json response

def test_data_type_metadata_schema(self):
r = self.client.get(reverse("data-type-metadata-schema", kwargs={"data_type": DATA_TYPE_PHENOPACKET}))
self.assertEqual(r.status_code, status.HTTP_200_OK)
c = r.json()
self.assertDictEqual(c, DATA_TYPES[DATA_TYPE_PHENOPACKET]["metadata_schema"])

def test_data_type_metadata_schema_404(self):
r = self.client.get(reverse("data-type-metadata-schema", kwargs={"data_type": DATA_TYPE_NOT_REAL}))
self.assertEqual(r.status_code, status.HTTP_404_NOT_FOUND)
r.json() # assert json response


class TableTest(APITestCase):
@staticmethod
25 changes: 19 additions & 6 deletions chord_metadata_service/chord/views_search.py
@@ -5,6 +5,7 @@

from bento_lib.responses import errors
from bento_lib.search import build_search_response, postgres

from datetime import datetime
from django.db import connection
from django.db.models import Count, F, Q
@@ -13,12 +14,16 @@
from django.conf import settings
from django.views.decorators.cache import cache_page
from psycopg2 import sql
from chord_metadata_service.restapi.utils import get_field_bins, queryset_stats_for_field
from rest_framework.decorators import api_view, permission_classes
from rest_framework.permissions import AllowAny
from rest_framework.response import Response

from typing import Any, Callable, Dict, Optional, Tuple, Union

from chord_metadata_service.cleanup import run_all_cleanup
from chord_metadata_service.logger import logger
from chord_metadata_service.restapi.utils import get_field_bins, queryset_stats_for_field

from chord_metadata_service.experiments.api_views import EXPERIMENT_SELECT_REL, EXPERIMENT_PREFETCH
from chord_metadata_service.experiments.models import Experiment
from chord_metadata_service.experiments.serializers import ExperimentSerializer
@@ -27,7 +32,7 @@
from chord_metadata_service.mcode.serializers import MCodePacketSerializer

from chord_metadata_service.metadata.elastic import es
from chord_metadata_service.metadata.settings import DEBUG, CHORD_SERVICE_ARTIFACT, CHORD_SERVICE_ID
from chord_metadata_service.metadata.settings import CHORD_SERVICE_ARTIFACT, CHORD_SERVICE_ID

from chord_metadata_service.patients.models import Individual

@@ -42,9 +47,6 @@
OUTPUT_FORMAT_VALUES_LIST = "values_list"
OUTPUT_FORMAT_BENTO_SEARCH_RESULT = "bento_search_result"

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG if DEBUG else logging.INFO)


@api_view(["GET"])
@permission_classes([AllowAny])
@@ -149,11 +151,22 @@ def table_detail(request, table_id): # pragma: no cover
# Or just always use owner...
try:
table = Table.objects.get(ownership_record_id=table_id)
table_ownership = TableOwnership.objects.get(table_id=table_id)
except Table.DoesNotExist:
return Response(errors.not_found_error(f"Table with ID {table_id} not found"), status=404)
except TableOwnership.DoesNotExist:
return Response(
errors.not_found_error(f"Table ownership record for table with ID {table_id} not found"), status=404)

if request.method == "DELETE":
table.delete()
# First, delete the table record itself - use the cascade from the ownership record
table_ownership.delete() # also deletes table

# Then, run cleanup
logger.info(f"Running cleanup after deleting table {table_id} via /tables Bento data service endpoint")
n_removed = run_all_cleanup()
logger.info(f"Cleanup: removed {n_removed} objects in total")

return Response(status=204)

# GET
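Both this endpoint and the DRF destroy() above rely on the foreign key between Table and TableOwnership cascading, which is why deleting the ownership record "also deletes table". The real model definitions are not part of this diff; a minimal sketch of the relationship that behaviour assumes (field names other than ownership_record / table_id, and all field types, are illustrative):

from django.db import models

class TableOwnership(models.Model):
    # table_id is the identifier both endpoints look records up by
    table_id = models.CharField(primary_key=True, max_length=200)

class Table(models.Model):
    # on_delete=CASCADE is the key assumption: deleting the TableOwnership row
    # removes the Table row as well, without going through DRF's default destroy()
    ownership_record = models.OneToOneField(TableOwnership, primary_key=True, on_delete=models.CASCADE)
    name = models.CharField(max_length=200)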
28 changes: 17 additions & 11 deletions chord_metadata_service/chord/workflows/metadata.py
@@ -16,6 +2,8 @@
"WORKFLOWS_PATH",
]

from typing import Optional

from chord_metadata_service.chord.data_types import (
DATA_TYPE_EXPERIMENT,
DATA_TYPE_EXPERIMENT_RESULT,
@@ -46,11 +48,15 @@ def json_file_input(id_: str, required: bool = True):
}


def json_file_output(id_: str):
def json_file_output(id_: str, output_name: Optional[str] = None):
return {
"id": id_,
"type": "file",
"value": f"{{{id_}}}",

# this triple {} abomination, with e.g. id_=json_document, turns into the string '{json_document}'
# the 'output_name or' part is a bit of a hack until we move to a new ingest system which can actually read
# Cromwell output JSON to grab the right files or something.
"value": output_name or f"{{{id_}}}",
}


@@ -63,7 +69,7 @@ def json_file_output(id_: str):
"data_type": DATA_TYPE_PHENOPACKET,
"file": "phenopackets_json.wdl",
"inputs": [json_file_input("json_document")],
"outputs": [json_file_output("json_document")],
"outputs": [json_file_output("json_document", "ingest.json")],
},
WORKFLOW_EXPERIMENTS_JSON: {
"name": "Bento Experiments JSON",
@@ -72,7 +78,7 @@ def json_file_output(id_: str):
"data_type": DATA_TYPE_EXPERIMENT,
"file": "experiments_json.wdl",
"inputs": [json_file_input("json_document")],
"outputs": [json_file_output("json_document")]
"outputs": [json_file_output("json_document", "ingest.json")]
},
WORKFLOW_FHIR_JSON: {
"name": "FHIR Resources JSON",
@@ -94,10 +100,10 @@ def json_file_output(id_: str):

],
"outputs": [
json_file_output("patients"),
json_file_output("observations"),
json_file_output("conditions"),
json_file_output("specimens"),
json_file_output("patients", "patients.json"),
json_file_output("observations", "observations.json"),
json_file_output("conditions", "conditions.json"),
json_file_output("specimens", "specimens.json"),
{
"id": "created_by",
"type": "string",
@@ -114,7 +120,7 @@ def json_file_output(id_: str):
"data_type": DATA_TYPE_MCODEPACKET,
"file": "mcode_fhir_json.wdl",
"inputs": [json_file_input("json_document")],
"outputs": [json_file_output("json_document")],
"outputs": [json_file_output("json_document", "ingest.json")],
},
WORKFLOW_MCODE_JSON: {
"name": "MCODE Resources JSON",
@@ -123,7 +129,7 @@ def json_file_output(id_: str):
"data_type": DATA_TYPE_MCODEPACKET,
"file": "mcode_json.wdl",
"inputs": [json_file_input("json_document")],
"outputs": [json_file_output("json_document")],
"outputs": [json_file_output("json_document", "ingest.json")],
},
WORKFLOW_READSET: {
"name": "Readset",
@@ -154,7 +160,7 @@ def json_file_output(id_: str):
"data_type": DATA_TYPE_EXPERIMENT_RESULT,
"file": "maf_derived_from_vcf_json.wdl",
"inputs": [json_file_input("json_document")],
"outputs": [json_file_output("json_document")],
"outputs": [json_file_output("json_document", "ingest.json")],
},
},
"analysis": {
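For clarity, here is what json_file_output() produces with and without the new output_name argument, derived directly from the function as defined above: without it, the triple braces yield the literal template string "{json_document}" for WES to substitute; with it, the output is pinned to the fixed file name that the reworked copy_task WDL tasks now write (e.g. ingest.json).

>>> json_file_output("json_document")
{'id': 'json_document', 'type': 'file', 'value': '{json_document}'}
>>> json_file_output("json_document", "ingest.json")
{'id': 'json_document', 'type': 'file', 'value': 'ingest.json'}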
12 changes: 8 additions & 4 deletions chord_metadata_service/chord/workflows/wdls/experiments_json.wdl
@@ -1,17 +1,21 @@
workflow experiments_json {
File json_document

call identity_task {
call copy_task {
input: json_document_in = json_document
}

output {
File json_document_out = copy_task.json_document
}
}

task identity_task {
task copy_task {
File json_document_in
command {
true
cp "${json_document_in}" ingest.json
}
output {
File json_document = "${json_document_in}"
File json_document = "ingest.json"
}
}
11 changes: 6 additions & 5 deletions chord_metadata_service/chord/workflows/wdls/fhir_json.wdl
@@ -4,8 +4,8 @@ workflow fhir_json {
File? conditions
File? specimens

call identity_task {
input: json_in = patients
call copy_task {
input: json_in = patients, out_name = "patients.json"
}

call optional_fhir_json_task as ofjt1 {
@@ -19,15 +19,16 @@
}
}

task identity_task {
task copy_task {
File json_in
String out_name

command {
true
cp "${json_in}" "${out_name}"
}

output {
File json_out = "${json_in}"
File json_out = "${out_name}"
}
}

chord_metadata_service/chord/workflows/wdls/maf_derived_from_vcf_json.wdl
@@ -1,17 +1,21 @@
workflow maf_derived_from_vcf_json {
File json_document

call identity_task {
call copy_task {
input: json_document_in = json_document
}

output {
File json_document_out = copy_task.json_document
}
}

task identity_task {
task copy_task {
File json_document_in
command {
true
cp "${json_document_in}" ingest.json
}
output {
File json_document = "${json_document_in}"
File json_document = "ingest.json"
}
}
13 changes: 7 additions & 6 deletions chord_metadata_service/chord/workflows/wdls/mcode_fhir_json.wdl
@@ -1,20 +1,21 @@
workflow mcode_fhir_json {
File json_document

call identity_task {
call copy_task {
input: json_document_in = json_document
}

output {
File json_document_out = copy_task.json_document
}
}

task identity_task {
task copy_task {
File json_document_in

command {
true
cp "${json_document_in}" ingest.json
}

output {
File json_document = "${json_document_in}"
File json_document = "ingest.json"
}
}
