Allow excluding cases from in-house database (#579)

Closes: #579 Related-Issue: #579 Projected-Results-Impact: require-revalidation
varfish-org · Aug 26, 2022 · bc37b6c · bc37b6c
1 parent ff98581
commit bc37b6c
Show file tree

Hide file tree

Showing 8 changed files with 157 additions and 7 deletions.
diff --git a/HISTORY.rst b/HISTORY.rst
@@ -43,6 +43,7 @@ End-User Summary
   A full re-import after re-annotation with varfish-annotator v0.23 or above is recommended.
   Alternatively, you can use ``python manage.py svs_sv_fill_nulls`` to update the records on the fly.
 - Implement new in-house background database for structural variants (#32).
+- Allow excluding cases from the in-house database through project settings (#579).
 
 Full Change List
 ================
@@ -70,7 +71,7 @@ Full Change List
 - Fixed broken VariantValidator query (#523).
 - Converted not cooperative tooltip to standard title on Filter & Display button (#508).
 - Fixed smallvariant flags filter query (#502).
-- Added flags `segregates`, `doesnt_segregate` and `no_disease_association` to file export (#502).
+- Added flags ``segregates``, ``doesnt_segregate`` and ``no_disease_association`` to file export (#502).
 - Adjusting path to new varfish-annotator db download (#546).
 - Fixing issue with sync-from-remote when no remote is defined (#570).
 - Adding feature to enable and configure link-out to HGMD (#576).
@@ -81,6 +82,7 @@ Full Change List
 - Form template reports error if genomebuild variable is not set (#607).
 - Making ``keyvalue`` more robust to failure (#613).
 - Implement new in-house background database for structural variants (#32).
+- Allow excluding cases from the in-house database through project settings (#579).
 
 ------
 v1.2.0

diff --git a/docs_manual/admin_upgrade.rst b/docs_manual/admin_upgrade.rst
@@ -30,6 +30,18 @@ You can find out more details, give feedback, and ask for help `in this Github d
 v1.2.* to v2.*.*
 ----------------
 
+**In-House Background Database.**
+A number of changes were made to the implementation of the background database.
+The upgrade will re-create the in-house database as empty.
+
+You will have to re-build the database manually with the command ``python manage.py rebuild_variant_summary``.
+Assuming that you are running within ``varfish-docker-compose``, you can use the following command directly.
+
+::
+
+    $ docker exec -it varfish-docker-compose_varfish-web_1 python /usr/src/app/manage.py \
+        rebuild_variant_summary
+
 **Structural Variants.**
 In case that the support for structural variants has been used, it is **strongly recommended** to re-annotate the structural variants with an updated version of ``varfish-annotator`` (v0.24 or above).
 You will need to use ``varfish-cli`` in a recent version (v0.3.4 or above) for being able to import the data into VarFish.

diff --git a/svs/bg_db.py b/svs/bg_db.py
@@ -25,6 +25,7 @@
 from intervaltree import Interval, IntervalTree
 from projectroles.plugins import get_backend_api
 import psutil
+from projectroles.templatetags.projectroles_common_tags import get_app_setting
 from sqlalchemy import delete
 
 from svs.models import (
@@ -42,7 +43,7 @@
 
 #: Logger to use in this module.
 from variants.helpers import get_engine, get_meta
-from variants.models import CHROMOSOME_NAMES, CHROMOSOME_STR_TO_CHROMOSOME_INT
+from variants.models import CHROMOSOME_NAMES, CHROMOSOME_STR_TO_CHROMOSOME_INT, Case
 
 LOGGER = logging.getLogger(__name__)
 
@@ -548,6 +549,12 @@ def log(msg: str):
         genomebuild=job.genomebuild, varfish_version=varfish_version, state="building"
     )
 
+    log("Obtain IDs of cases marked for exclusion")
+    excluded_case_ids = {}
+    for case in Case.objects.prefetch_related("project").iterator():
+        if get_app_setting("variants", "exclude_from_inhouse_db", project=case.project):
+            excluded_case_ids.add(case.id)
+
     log("Starting actual clustering")
     params = ClusterAlgoParams()
     algo = ClusterSvAlgorithm(params)
@@ -563,6 +570,8 @@ def log(msg: str):
                     chunk_size=chunk_size
                 )
             ):
+                if db_record.case_id in excluded_case_ids:
+                    continue  # skip excluded cases
                 sv_record = sv_model_to_attrs(db_record)
                 algo.push(sv_record)
                 record_count += 1

diff --git a/variants/migrations/0033_smallvariantsummary.py b/variants/migrations/0033_smallvariantsummary.py
@@ -64,8 +64,8 @@
                     FROM variants_smallvariant AS variants
                 ) AS variants_per_case
                 GROUP BY (release, chromosome, position, reference, alternative)
-            WITH DATA;
-    
+            WITH NO DATA;
+
             CREATE UNIQUE INDEX variants_smallvariantsummary_id ON variants_smallvariantsummary(id);
             CREATE INDEX variants_smallvariantsummary_coord ON variants_smallvariantsummary(
                 release, chromosome, position, reference, alternative

diff --git a/variants/migrations/0044_adjust_smallvariantstats.py b/variants/migrations/0044_adjust_smallvariantstats.py
@@ -75,7 +75,7 @@
                     FROM variants_smallvariant AS variants
                 ) AS variants_per_case
                 GROUP BY (release, chromosome, start, "end", bin, reference, alternative)
-            WITH DATA;
+            WITH NO DATA;
 
             CREATE UNIQUE INDEX variants_smallvariantsummary_id ON variants_smallvariantsummary(id);
             CREATE INDEX variants_smallvariantsummary_coord ON variants_smallvariantsummary(

diff --git a/variants/migrations/0046_partition_smallvariants_table.py b/variants/migrations/0046_partition_smallvariants_table.py
@@ -47,7 +47,7 @@
                 FROM variants_smallvariant AS variants
             ) AS variants_per_case
             GROUP BY (release, chromosome, start, "end", bin, reference, alternative)
-        WITH DATA;
+        WITH NO DATA;
 
         CREATE UNIQUE INDEX variants_smallvariantsummary_id ON variants_smallvariantsummary(id);
         CREATE INDEX variants_smallvariantsummary_coord ON variants_smallvariantsummary(
@@ -240,7 +240,7 @@
                     FROM variants_smallvariant AS variants
                 ) AS variants_per_case
                 GROUP BY (release, chromosome, chromosome_no, start, "end", bin, reference, alternative)
-            WITH DATA;
+            WITH NO DATA;
 
             CREATE UNIQUE INDEX variants_smallvariantsummary_id ON variants_smallvariantsummary(id);
             CREATE INDEX variants_smallvariantsummary_coord ON variants_smallvariantsummary(

diff --git a/variants/migrations/0086_smallvariantsummary_excludefrominhousedb.py b/variants/migrations/0086_smallvariantsummary_excludefrominhousedb.py
@@ -0,0 +1,116 @@
+# -*- coding: utf-8 -*-
+"""Adjust the small variants statistics materialized view to not use excluded cases.
+
+This is done by recreating it.
+"""
+
+from django.conf import settings
+from django.db import migrations, models
+
+SQL_OUTER = r"""
+DROP MATERIALIZED VIEW IF EXISTS variants_smallvariantsummary;
+
+CREATE MATERIALIZED VIEW variants_smallvariantsummary
+AS
+    %s
+WITH NO DATA;
+
+CREATE UNIQUE INDEX variants_smallvariantsummary_id ON variants_smallvariantsummary(id);
+CREATE INDEX variants_smallvariantsummary_coord ON variants_smallvariantsummary(
+    release, chromosome, start, "end", bin, reference, alternative
+);
+"""
+
+SQL_INNER_FORWARD = r"""
+WITH excluded_case_ids AS (
+    SELECT DISTINCT variants_case.id AS case_id
+    FROM variants_case
+    JOIN projectroles_project ON variants_case.project_id = projectroles_project.id
+    JOIN projectroles_appsetting ON
+        projectroles_project.id = projectroles_appsetting.project_id AND
+        projectroles_appsetting.name = 'exclude_from_inhouse_db' AND
+        projectroles_appsetting.value = '1'
+)
+SELECT
+    row_number() OVER (PARTITION BY true) AS id,
+    release,
+    chromosome,
+    start,
+    "end",
+    bin,
+    reference,
+    alternative,
+    sum(num_hom_ref) AS count_hom_ref,
+    sum(num_het) AS count_het,
+    sum(num_hom_alt) AS count_hom_alt,
+    sum(num_hemi_ref) AS count_hemi_ref,
+    sum(num_hemi_alt) AS count_hemi_alt
+FROM (
+    SELECT DISTINCT
+        variants.release,
+        variants.chromosome,
+        variants.start,
+        variants."end",
+        variants.bin,
+        variants.reference,
+        variants.alternative,
+        variants.num_hom_ref,
+        variants.num_het,
+        variants.num_hom_alt,
+        variants.num_hemi_ref,
+        variants.num_hemi_alt,
+        variants.case_id
+    FROM variants_smallvariant AS variants
+    WHERE NOT EXISTS (SELECT 1 from excluded_case_ids AS e WHERE e.case_id = variants.case_id)
+) AS variants_per_case
+GROUP BY (release, chromosome, start, "end", bin, reference, alternative)
+"""
+
+SQL_INNER_REVERSE = r"""
+SELECT
+    row_number() OVER (PARTITION BY true) AS id,
+    release,
+    chromosome,
+    start,
+    "end",
+    bin,
+    reference,
+    alternative,
+    sum(num_hom_ref) AS count_hom_ref,
+    sum(num_het) AS count_het,
+    sum(num_hom_alt) AS count_hom_alt,
+    sum(num_hemi_ref) AS count_hemi_ref,
+    sum(num_hemi_alt) AS count_hemi_alt
+FROM (
+    SELECT DISTINCT
+        variants.release,
+        variants.chromosome,
+        variants.start,
+        variants."end",
+        variants.bin,
+        variants.reference,
+        variants.alternative,
+        variants.num_hom_ref,
+        variants.num_het,
+        variants.num_hom_alt,
+        variants.num_hemi_ref,
+        variants.num_hemi_alt,
+        variants.case_id
+    FROM variants_smallvariant AS variants
+) AS variants_per_case
+GROUP BY (release, chromosome, start, "end", bin, reference, alternative)
+"""
+
+if settings.IS_TESTING:  # no SQL is run when settings.IS_TESTING is set; the view is left untouched
+    operations = []
+else:  # forward: view without cases of excluding projects; reverse: view over all cases; both created WITH NO DATA
+    operations = [migrations.RunSQL(SQL_OUTER % SQL_INNER_FORWARD, SQL_OUTER % SQL_INNER_REVERSE,)]
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ("variants", "0085_add_variant_index"),
+    ]
+
+    operations = operations
diff --git a/variants/plugins.py b/variants/plugins.py
@@ -119,6 +119,17 @@ class ProjectAppPlugin(ProjectAppPluginPoint):
                 "Use this if the sex is unknown."
             ),
         },
+        "exclude_from_inhouse_db": {
+            "scope": SODAR_CONSTANTS["APP_SETTING_SCOPE_PROJECT"],
+            "type": "BOOLEAN",
+            "default": False,
+            "label": "Exclude from in-house database",
+            "description": (
+                "Exclude project's cases from in-house database.  This is intended to be used for cases containing "
+                "training data that may exist with multiple copies and thus introduce artifacts in the in-house "
+                "database (such as no variant of a case showing up because being in the in-house database many times)."
+            ),
+        },
     }
 
     #: Additional columns to display for the projects.