Merge pull request #373 from bento-platform/garbage-collect
Cleanup functionality + revamped workflows for new WES + fixes for DB startup errors
davidlougheed committed Jan 26, 2023
2 parents 9090b6a + 13126ed commit 75663ee
Showing 36 changed files with 754 additions and 109 deletions.
10 changes: 7 additions & 3 deletions bento.Dockerfile
@@ -1,6 +1,10 @@
FROM ghcr.io/bento-platform/bento_base_image:python-debian-latest
FROM ghcr.io/bento-platform/bento_base_image:python-debian-2023.01.17

RUN pip install --no-cache-dir "uvicorn[standard]==0.20.0"
# Install Postgres client for checking if database is ready
# Install uvicorn to serve the API
RUN apt-get update -y && \
apt-get install -y postgresql-client && \
pip install --no-cache-dir "uvicorn[standard]==0.20.0"

# Backwards-compatible with old BentoV2 container layout
WORKDIR /app
@@ -16,4 +20,4 @@ COPY . .
# Create temporary directory for downloading files etc.
RUN mkdir -p tmp

CMD [ "sh", "./entrypoint.sh" ]
CMD [ "bash", "./entrypoint.bash" ]
11 changes: 6 additions & 5 deletions bento.dev.Dockerfile
@@ -1,4 +1,8 @@
FROM ghcr.io/bento-platform/bento_base_image:python-debian-latest
FROM ghcr.io/bento-platform/bento_base_image:python-debian-2023.01.17

# Install Postgres client for checking if database is ready
RUN apt-get update -y && \
apt-get install -y postgresql-client

# Backwards-compatible with old BentoV2 container layout
WORKDIR /app
@@ -9,10 +13,7 @@ COPY requirements-dev.txt requirements-dev.txt
# Install production dependencies
RUN pip install --no-cache-dir -r requirements-dev.txt

# Copy all application code
COPY . .

# Create temporary directory for downloading files etc.
RUN mkdir -p tmp

CMD [ "sh", "./entrypoint.dev.sh" ]
CMD [ "bash", "./entrypoint.dev.bash" ]
26 changes: 23 additions & 3 deletions chord_metadata_service/chord/api_views.py
@@ -1,15 +1,20 @@
from rest_framework import viewsets
import logging

from rest_framework import status, viewsets
from rest_framework.permissions import BasePermission, SAFE_METHODS
from rest_framework.response import Response
from rest_framework.settings import api_settings

from django_filters.rest_framework import DjangoFilterBackend

from chord_metadata_service.cleanup import run_all_cleanup
from chord_metadata_service.restapi.api_renderers import PhenopacketsRenderer, JSONLDDatasetRenderer, RDFDatasetRenderer
from chord_metadata_service.restapi.pagination import LargeResultsSetPagination

from .models import Project, Dataset, TableOwnership, Table
from .permissions import OverrideOrSuperUserOnly
from .serializers import ProjectSerializer, DatasetSerializer, TableOwnershipSerializer, TableSerializer
from .filters import AuthorizedDatasetFilter
from django_filters.rest_framework import DjangoFilterBackend
import logging

logger = logging.getLogger(__name__)

@@ -89,3 +94,18 @@ class TableViewSet(CHORDPublicModelViewSet):

queryset = Table.objects.all().prefetch_related("ownership_record").order_by("ownership_record_id")
serializer_class = TableSerializer

def destroy(self, request, *args, **kwargs):
# First, delete the table record itself
# - use the cascade from the ownership record rather than the default DRF behaviour
table = self.get_object()
table_id = table.ownership_record_id
table.ownership_record.delete()
table.delete()

# Then, run cleanup
logger.info(f"Running cleanup after deleting table {table_id} via DRF API")
n_removed = run_all_cleanup()
logger.info(f"Cleanup: removed {n_removed} objects in total")

return Response(status=status.HTTP_204_NO_CONTENT)
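The overridden destroy() deletes the TableOwnership record so the database cascade removes the table and its dependents, then garbage-collects anything left orphaned via run_all_cleanup(). A hedged sketch of how this endpoint might be exercised from a DRF test; the "table-detail" route name and the make_test_table() fixture helper are assumptions for illustration only, not part of this diff:

from django.urls import reverse
from rest_framework import status
from rest_framework.test import APITestCase

from chord_metadata_service.chord.models import Table

class TableDestroyTest(APITestCase):
    def test_destroy_deletes_table_and_runs_cleanup(self):
        # make_test_table() is a hypothetical fixture helper; replace with the
        # project's real test setup (project -> dataset -> table ownership -> table).
        table = make_test_table()
        url = reverse("table-detail", kwargs={"pk": table.ownership_record_id})  # assumed router basename
        r = self.client.delete(url)
        self.assertEqual(r.status_code, status.HTTP_204_NO_CONTENT)
        self.assertFalse(Table.objects.filter(ownership_record_id=table.ownership_record_id).exists())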
17 changes: 17 additions & 0 deletions chord_metadata_service/chord/tests/test_api_search.py
@@ -49,6 +2,8 @@

POST_GET = ("POST", "GET")

DATA_TYPE_NOT_REAL = "not_a_real_data_type"


class DataTypeTest(APITestCase):
def test_data_type_list(self):
@@ -72,18 +74,33 @@ def test_data_type_detail(self):
**DATA_TYPES[DATA_TYPE_PHENOPACKET],
})

def test_data_type_detail_404(self):
r = self.client.get(reverse("data-type-detail", kwargs={"data_type": DATA_TYPE_NOT_REAL}))
self.assertEqual(r.status_code, status.HTTP_404_NOT_FOUND)
r.json() # assert json response

def test_data_type_schema(self):
r = self.client.get(reverse("data-type-schema", kwargs={"data_type": DATA_TYPE_PHENOPACKET}))
self.assertEqual(r.status_code, status.HTTP_200_OK)
c = r.json()
self.assertDictEqual(c, DATA_TYPES[DATA_TYPE_PHENOPACKET]["schema"])

def test_data_type_schema_404(self):
r = self.client.get(reverse("data-type-schema", kwargs={"data_type": DATA_TYPE_NOT_REAL}))
self.assertEqual(r.status_code, status.HTTP_404_NOT_FOUND)
r.json() # assert json response

def test_data_type_metadata_schema(self):
r = self.client.get(reverse("data-type-metadata-schema", kwargs={"data_type": DATA_TYPE_PHENOPACKET}))
self.assertEqual(r.status_code, status.HTTP_200_OK)
c = r.json()
self.assertDictEqual(c, DATA_TYPES[DATA_TYPE_PHENOPACKET]["metadata_schema"])

def test_data_type_metadata_schema_404(self):
r = self.client.get(reverse("data-type-metadata-schema", kwargs={"data_type": DATA_TYPE_NOT_REAL}))
self.assertEqual(r.status_code, status.HTTP_404_NOT_FOUND)
r.json() # assert json response


class TableTest(APITestCase):
@staticmethod
25 changes: 19 additions & 6 deletions chord_metadata_service/chord/views_search.py
@@ -5,6 +5,7 @@

from bento_lib.responses import errors
from bento_lib.search import build_search_response, postgres

from datetime import datetime
from django.db import connection
from django.db.models import Count, F, Q
@@ -13,12 +14,16 @@
from django.conf import settings
from django.views.decorators.cache import cache_page
from psycopg2 import sql
from chord_metadata_service.restapi.utils import get_field_bins, queryset_stats_for_field
from rest_framework.decorators import api_view, permission_classes
from rest_framework.permissions import AllowAny
from rest_framework.response import Response

from typing import Any, Callable, Dict, Optional, Tuple, Union

from chord_metadata_service.cleanup import run_all_cleanup
from chord_metadata_service.logger import logger
from chord_metadata_service.restapi.utils import get_field_bins, queryset_stats_for_field

from chord_metadata_service.experiments.api_views import EXPERIMENT_SELECT_REL, EXPERIMENT_PREFETCH
from chord_metadata_service.experiments.models import Experiment
from chord_metadata_service.experiments.serializers import ExperimentSerializer
@@ -27,7 +32,7 @@
from chord_metadata_service.mcode.serializers import MCodePacketSerializer

from chord_metadata_service.metadata.elastic import es
from chord_metadata_service.metadata.settings import DEBUG, CHORD_SERVICE_ARTIFACT, CHORD_SERVICE_ID
from chord_metadata_service.metadata.settings import CHORD_SERVICE_ARTIFACT, CHORD_SERVICE_ID

from chord_metadata_service.patients.models import Individual

@@ -42,9 +47,6 @@
OUTPUT_FORMAT_VALUES_LIST = "values_list"
OUTPUT_FORMAT_BENTO_SEARCH_RESULT = "bento_search_result"

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG if DEBUG else logging.INFO)


@api_view(["GET"])
@permission_classes([AllowAny])
@@ -149,11 +151,22 @@ def table_detail(request, table_id): # pragma: no cover
# Or just always use owner...
try:
table = Table.objects.get(ownership_record_id=table_id)
table_ownership = TableOwnership.objects.get(table_id=table_id)
except Table.DoesNotExist:
return Response(errors.not_found_error(f"Table with ID {table_id} not found"), status=404)
except TableOwnership.DoesNotExist:
return Response(
errors.not_found_error(f"Table ownership record for table with ID {table_id} not found"), status=404)

if request.method == "DELETE":
table.delete()
# First, delete the table record itself - use the cascade from the ownership record
table_ownership.delete() # also deletes table

# Then, run cleanup
logger.info(f"Running cleanup after deleting table {table_id} via /tables Bento data service endpoint")
n_removed = run_all_cleanup()
logger.info(f"Cleanup: removed {n_removed} objects in total")

return Response(status=204)

# GET
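Both this endpoint and the DRF destroy() above rely on the foreign key between Table and TableOwnership cascading, which is why deleting the ownership record "also deletes table". The real model definitions are not part of this diff; a minimal sketch of the relationship that behaviour assumes (field names other than ownership_record / table_id, and all field types, are illustrative):

from django.db import models

class TableOwnership(models.Model):
    # table_id is the identifier both endpoints look records up by
    table_id = models.CharField(primary_key=True, max_length=200)

class Table(models.Model):
    # on_delete=CASCADE is the key assumption: deleting the TableOwnership row
    # removes the Table row as well, without going through DRF's default destroy()
    ownership_record = models.OneToOneField(TableOwnership, primary_key=True, on_delete=models.CASCADE)
    name = models.CharField(max_length=200)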
28 changes: 17 additions & 11 deletions chord_metadata_service/chord/workflows/metadata.py
@@ -16,6 +2,8 @@
"WORKFLOWS_PATH",
]

from typing import Optional

from chord_metadata_service.chord.data_types import (
DATA_TYPE_EXPERIMENT,
DATA_TYPE_EXPERIMENT_RESULT,
@@ -46,11 +48,15 @@ def json_file_input(id_: str, required: bool = True):
}


def json_file_output(id_: str):
def json_file_output(id_: str, output_name: Optional[str] = None):
return {
"id": id_,
"type": "file",
"value": f"{{{id_}}}",

# this triple {} abomination, with e.g. id_=json_document, turns into the string '{json_document}'
# the 'output_name or' part is a bit of a hack until we move to a new ingest system which can actually read
# Cromwell output JSON to grab the right files or something.
"value": output_name or f"{{{id_}}}",
}


@@ -63,7 +69,7 @@ def json_file_output(id_: str):
"data_type": DATA_TYPE_PHENOPACKET,
"file": "phenopackets_json.wdl",
"inputs": [json_file_input("json_document")],
"outputs": [json_file_output("json_document")],
"outputs": [json_file_output("json_document", "ingest.json")],
},
WORKFLOW_EXPERIMENTS_JSON: {
"name": "Bento Experiments JSON",
@@ -72,7 +78,7 @@ def json_file_output(id_: str):
"data_type": DATA_TYPE_EXPERIMENT,
"file": "experiments_json.wdl",
"inputs": [json_file_input("json_document")],
"outputs": [json_file_output("json_document")]
"outputs": [json_file_output("json_document", "ingest.json")]
},
WORKFLOW_FHIR_JSON: {
"name": "FHIR Resources JSON",
@@ -94,10 +100,10 @@ def json_file_output(id_: str):

],
"outputs": [
json_file_output("patients"),
json_file_output("observations"),
json_file_output("conditions"),
json_file_output("specimens"),
json_file_output("patients", "patients.json"),
json_file_output("observations", "observations.json"),
json_file_output("conditions", "conditions.json"),
json_file_output("specimens", "specimens.json"),
{
"id": "created_by",
"type": "string",
@@ -114,7 +120,7 @@ def json_file_output(id_: str):
"data_type": DATA_TYPE_MCODEPACKET,
"file": "mcode_fhir_json.wdl",
"inputs": [json_file_input("json_document")],
"outputs": [json_file_output("json_document")],
"outputs": [json_file_output("json_document", "ingest.json")],
},
WORKFLOW_MCODE_JSON: {
"name": "MCODE Resources JSON",
@@ -123,7 +129,7 @@ def json_file_output(id_: str):
"data_type": DATA_TYPE_MCODEPACKET,
"file": "mcode_json.wdl",
"inputs": [json_file_input("json_document")],
"outputs": [json_file_output("json_document")],
"outputs": [json_file_output("json_document", "ingest.json")],
},
WORKFLOW_READSET: {
"name": "Readset",
@@ -154,7 +160,7 @@ def json_file_output(id_: str):
"data_type": DATA_TYPE_EXPERIMENT_RESULT,
"file": "maf_derived_from_vcf_json.wdl",
"inputs": [json_file_input("json_document")],
"outputs": [json_file_output("json_document")],
"outputs": [json_file_output("json_document", "ingest.json")],
},
},
"analysis": {
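For clarity, here is what json_file_output() produces with and without the new output_name argument, derived directly from the function as defined above: without it, the triple braces yield the literal template string "{json_document}" for WES to substitute; with it, the output is pinned to the fixed file name that the reworked copy_task WDL tasks now write (e.g. ingest.json).

>>> json_file_output("json_document")
{'id': 'json_document', 'type': 'file', 'value': '{json_document}'}
>>> json_file_output("json_document", "ingest.json")
{'id': 'json_document', 'type': 'file', 'value': 'ingest.json'}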
12 changes: 8 additions & 4 deletions chord_metadata_service/chord/workflows/wdls/experiments_json.wdl
@@ -1,17 +1,21 @@
workflow experiments_json {
File json_document

call identity_task {
call copy_task {
input: json_document_in = json_document
}

output {
File json_document_out = copy_task.json_document
}
}

task identity_task {
task copy_task {
File json_document_in
command {
true
cp "${json_document_in}" ingest.json
}
output {
File json_document = "${json_document_in}"
File json_document = "ingest.json"
}
}
11 changes: 6 additions & 5 deletions chord_metadata_service/chord/workflows/wdls/fhir_json.wdl
@@ -4,8 +4,8 @@ workflow fhir_json {
File? conditions
File? specimens

call identity_task {
input: json_in = patients
call copy_task {
input: json_in = patients, out_name = "patients.json"
}

call optional_fhir_json_task as ofjt1 {
@@ -19,15 +19,16 @@
}
}

task identity_task {
task copy_task {
File json_in
String out_name

command {
true
cp "${json_in}" "${out_name}"
}

output {
File json_out = "${json_in}"
File json_out = "${out_name}"
}
}

chord_metadata_service/chord/workflows/wdls/maf_derived_from_vcf_json.wdl
@@ -1,17 +1,21 @@
workflow maf_derived_from_vcf_json {
File json_document

call identity_task {
call copy_task {
input: json_document_in = json_document
}

output {
File json_document_out = copy_task.json_document
}
}

task identity_task {
task copy_task {
File json_document_in
command {
true
cp "${json_document_in}" ingest.json
}
output {
File json_document = "${json_document_in}"
File json_document = "ingest.json"
}
}
13 changes: 7 additions & 6 deletions chord_metadata_service/chord/workflows/wdls/mcode_fhir_json.wdl
@@ -1,20 +1,21 @@
workflow mcode_fhir_json {
File json_document

call identity_task {
call copy_task {
input: json_document_in = json_document
}

output {
File json_document_out = copy_task.json_document
}
}

task identity_task {
task copy_task {
File json_document_in

command {
true
cp "${json_document_in}" ingest.json
}

output {
File json_document = "${json_document_in}"
File json_document = "ingest.json"
}
}
