Skip to content

Commit

Permalink
Allow excluding cases from in-house database (#579)
Browse files Browse the repository at this point in the history
Closes: #579
Related-Issue: #579
Projected-Results-Impact: require-revalidation
  • Loading branch information
holtgrewe committed Aug 26, 2022
1 parent ff98581 commit bc37b6c
Show file tree
Hide file tree
Showing 8 changed files with 157 additions and 7 deletions.
4 changes: 3 additions & 1 deletion HISTORY.rst
Expand Up @@ -43,6 +43,7 @@ End-User Summary
A full re-import after re-annotation with varfish-annotator v0.23 or above is recommended.
Alternatively, you can use ``python manage.py svs_sv_fill_nulls`` to update the records on the fly.
- Implement new in-house background database for structural variants (#32).
- Allow excluding cases from the in-house database through project settings (#579).

Full Change List
================
Expand Down Expand Up @@ -70,7 +71,7 @@ Full Change List
- Fixed broken VariantValidator query (#523).
- Converted not cooperative tooltip to standard title on Filter & Display button (#508).
- Fixed smallvariant flags filter query (#502).
- Added flags `segregates`, `doesnt_segregate` and `no_disease_association` to file export (#502).
- Added flags ``segregates``, ``doesnt_segregate`` and ``no_disease_association`` to file export (#502).
- Adjusting path to new varfish-annotator db download (#546).
- Fixing issue with sync-from-remote when no remote is defined (#570).
- Adding feature to enable and configure link-out to HGMD (#576).
Expand All @@ -81,6 +82,7 @@ Full Change List
- Form template reports error if genomebuild variable is not set (#607).
- Making ``keyvalue`` more robust to failure (#613).
- Implement new in-house background database for structural variants (#32).
- Allow excluding cases from the in-house database through project settings (#579).

------
v1.2.0
Expand Down
12 changes: 12 additions & 0 deletions docs_manual/admin_upgrade.rst
Expand Up @@ -30,6 +30,18 @@ You can find out more details, give feedback, and ask for help `in this Github d
v1.2.* to v2.*.*
----------------

**In-House Background Database.**
A number of changes were made to the implementation of the background database.
The upgrade will re-create the in-house database as empty.

You will have to re-build the database manually with the command ``python manage.py rebuild_variant_summary``.
Assuming that you are running within ``varfish-docker-compose``, you can use the following command directly.

::

$ docker exec -it varfish-docker-compose_varfish-web_1 python /usr/src/app/manage.py \
rebuild_variant_summary

**Structural Variants.**
If support for structural variants has been used, it is **strongly recommended** to re-annotate the structural variants with an updated version of ``varfish-annotator`` (v0.24 or above).
You will need a recent version of ``varfish-cli`` (v0.3.4 or above) to be able to import the data into VarFish.
Expand Down
11 changes: 10 additions & 1 deletion svs/bg_db.py
Expand Up @@ -25,6 +25,7 @@
from intervaltree import Interval, IntervalTree
from projectroles.plugins import get_backend_api
import psutil
from projectroles.templatetags.projectroles_common_tags import get_app_setting
from sqlalchemy import delete

from svs.models import (
Expand All @@ -42,7 +43,7 @@

#: Logger to use in this module.
from variants.helpers import get_engine, get_meta
from variants.models import CHROMOSOME_NAMES, CHROMOSOME_STR_TO_CHROMOSOME_INT
from variants.models import CHROMOSOME_NAMES, CHROMOSOME_STR_TO_CHROMOSOME_INT, Case

LOGGER = logging.getLogger(__name__)

Expand Down Expand Up @@ -548,6 +549,12 @@ def log(msg: str):
genomebuild=job.genomebuild, varfish_version=varfish_version, state="building"
)

log("Obtain IDs of cases marked for exclusion")
# Collect the IDs in a real ``set``: ``{}`` would create an empty *dict*, which has no
# ``add()`` method and would raise ``AttributeError`` on the first excluded case.
excluded_case_ids = set()
# ``select_related`` fetches the project via a JOIN in the same query.  ``prefetch_related``
# is ignored by ``iterator()`` on Django < 4.1 and requires an explicit ``chunk_size`` on
# later versions, so it would either cost one extra query per case or fail outright.
for case in Case.objects.select_related("project").iterator():
    # The exclusion flag is a project-scoped app setting of the ``variants`` app.
    if get_app_setting("variants", "exclude_from_inhouse_db", project=case.project):
        excluded_case_ids.add(case.id)

log("Starting actual clustering")
params = ClusterAlgoParams()
algo = ClusterSvAlgorithm(params)
Expand All @@ -563,6 +570,8 @@ def log(msg: str):
chunk_size=chunk_size
)
):
if db_record.case_id in excluded_case_ids:
continue # skip excluded cases
sv_record = sv_model_to_attrs(db_record)
algo.push(sv_record)
record_count += 1
Expand Down
4 changes: 2 additions & 2 deletions variants/migrations/0033_smallvariantsummary.py
Expand Up @@ -64,8 +64,8 @@
FROM variants_smallvariant AS variants
) AS variants_per_case
GROUP BY (release, chromosome, position, reference, alternative)
WITH DATA;
WITH NO DATA;
CREATE UNIQUE INDEX variants_smallvariantsummary_id ON variants_smallvariantsummary(id);
CREATE INDEX variants_smallvariantsummary_coord ON variants_smallvariantsummary(
release, chromosome, position, reference, alternative
Expand Down
2 changes: 1 addition & 1 deletion variants/migrations/0044_adjust_smallvariantstats.py
Expand Up @@ -75,7 +75,7 @@
FROM variants_smallvariant AS variants
) AS variants_per_case
GROUP BY (release, chromosome, start, "end", bin, reference, alternative)
WITH DATA;
WITH NO DATA;
CREATE UNIQUE INDEX variants_smallvariantsummary_id ON variants_smallvariantsummary(id);
CREATE INDEX variants_smallvariantsummary_coord ON variants_smallvariantsummary(
Expand Down
4 changes: 2 additions & 2 deletions variants/migrations/0046_partition_smallvariants_table.py
Expand Up @@ -47,7 +47,7 @@
FROM variants_smallvariant AS variants
) AS variants_per_case
GROUP BY (release, chromosome, start, "end", bin, reference, alternative)
WITH DATA;
WITH NO DATA;
CREATE UNIQUE INDEX variants_smallvariantsummary_id ON variants_smallvariantsummary(id);
CREATE INDEX variants_smallvariantsummary_coord ON variants_smallvariantsummary(
Expand Down Expand Up @@ -240,7 +240,7 @@
FROM variants_smallvariant AS variants
) AS variants_per_case
GROUP BY (release, chromosome, chromosome_no, start, "end", bin, reference, alternative)
WITH DATA;
WITH NO DATA;
CREATE UNIQUE INDEX variants_smallvariantsummary_id ON variants_smallvariantsummary(id);
CREATE INDEX variants_smallvariantsummary_coord ON variants_smallvariantsummary(
Expand Down
116 changes: 116 additions & 0 deletions variants/migrations/0086_smallvariantsummary_excludefrominhousedb.py
@@ -0,0 +1,116 @@
# -*- coding: utf-8 -*-
"""Adjust the small variants statistics materialized view to not use excluded cases.
This is done by recreating it.
"""

from django.conf import settings
from django.db import migrations, models

#: Outer SQL template shared by the forward and reverse direction of this migration.
#:
#: It drops the ``variants_smallvariantsummary`` materialized view if it exists and re-creates it
#: from the inner ``SELECT`` that is substituted for the ``%s`` placeholder.  The view is created
#: ``WITH NO DATA``, i.e., this statement does not populate it.  Afterwards, a unique index on
#: ``id`` and a coordinate index are created on the view.
SQL_OUTER = r"""
DROP MATERIALIZED VIEW IF EXISTS variants_smallvariantsummary;
CREATE MATERIALIZED VIEW variants_smallvariantsummary
AS
%s
WITH NO DATA;
CREATE UNIQUE INDEX variants_smallvariantsummary_id ON variants_smallvariantsummary(id);
CREATE INDEX variants_smallvariantsummary_coord ON variants_smallvariantsummary(
release, chromosome, start, "end", bin, reference, alternative
);
"""

#: Inner query for the forward direction: summary counts that skip excluded cases.
#:
#: The ``excluded_case_ids`` CTE selects the IDs of all cases whose project has an app setting
#: named ``exclude_from_inhouse_db`` with the value ``'1'``.  The main query then sums the
#: genotype counts per variant over the distinct per-case rows of ``variants_smallvariant``,
#: leaving out every row whose ``case_id`` appears in that CTE.
SQL_INNER_FORWARD = r"""
WITH excluded_case_ids AS (
SELECT DISTINCT variants_case.id AS case_id
FROM variants_case
JOIN projectroles_project ON variants_case.project_id = projectroles_project.id
JOIN projectroles_appsetting ON
projectroles_project.id = projectroles_appsetting.project_id AND
projectroles_appsetting.name = 'exclude_from_inhouse_db' AND
projectroles_appsetting.value = '1'
)
SELECT
row_number() OVER (PARTITION BY true) AS id,
release,
chromosome,
start,
"end",
bin,
reference,
alternative,
sum(num_hom_ref) AS count_hom_ref,
sum(num_het) AS count_het,
sum(num_hom_alt) AS count_hom_alt,
sum(num_hemi_ref) AS count_hemi_ref,
sum(num_hemi_alt) AS count_hemi_alt
FROM (
SELECT DISTINCT
variants.release,
variants.chromosome,
variants.start,
variants."end",
variants.bin,
variants.reference,
variants.alternative,
variants.num_hom_ref,
variants.num_het,
variants.num_hom_alt,
variants.num_hemi_ref,
variants.num_hemi_alt,
variants.case_id
FROM variants_smallvariant AS variants
WHERE NOT EXISTS (SELECT 1 from excluded_case_ids AS e WHERE e.case_id = variants.case_id)
) AS variants_per_case
GROUP BY (release, chromosome, start, "end", bin, reference, alternative)
"""

#: Inner query for the reverse direction: summary counts over all cases.
#:
#: Same aggregation as the forward query (sums of the genotype counts per variant over the
#: distinct per-case rows of ``variants_smallvariant``) but without the ``excluded_case_ids``
#: CTE and without the ``NOT EXISTS`` filter, so no case is left out.
SQL_INNER_REVERSE = r"""
SELECT
row_number() OVER (PARTITION BY true) AS id,
release,
chromosome,
start,
"end",
bin,
reference,
alternative,
sum(num_hom_ref) AS count_hom_ref,
sum(num_het) AS count_het,
sum(num_hom_alt) AS count_hom_alt,
sum(num_hemi_ref) AS count_hemi_ref,
sum(num_hemi_alt) AS count_hemi_alt
FROM (
SELECT DISTINCT
variants.release,
variants.chromosome,
variants.start,
variants."end",
variants.bin,
variants.reference,
variants.alternative,
variants.num_hom_ref,
variants.num_het,
variants.num_hom_alt,
variants.num_hemi_ref,
variants.num_hemi_alt,
variants.case_id
FROM variants_smallvariant AS variants
) AS variants_per_case
GROUP BY (release, chromosome, start, "end", bin, reference, alternative)
"""

# Full statements for both directions: the same outer template wrapped around the inner query
# with (forward) and without (reverse) the exclusion of cases.
_forward_sql = SQL_OUTER % SQL_INNER_FORWARD
_reverse_sql = SQL_OUTER % SQL_INNER_REVERSE

# No operations are registered when ``settings.IS_TESTING`` is set; otherwise a single reversible
# ``RunSQL`` operation re-creates the materialized view.
operations = (
    []
    if settings.IS_TESTING
    else [migrations.RunSQL(sql=_forward_sql, reverse_sql=_reverse_sql)]
)


class Migration(migrations.Migration):
    """Re-create the ``variants_smallvariantsummary`` materialized view without excluded cases.

    The operations list is empty when ``settings.IS_TESTING`` is set (see the module-level
    ``operations`` variable).
    """

    # This migration depends on the ``0085_add_variant_index`` migration of the ``variants`` app.
    dependencies = [
        ("variants", "0085_add_variant_index"),
    ]

    # Re-use the module-level list that was selected based on ``settings.IS_TESTING``.
    operations = operations
11 changes: 11 additions & 0 deletions variants/plugins.py
Expand Up @@ -119,6 +119,17 @@ class ProjectAppPlugin(ProjectAppPluginPoint):
"Use this if the sex is unknown."
),
},
"exclude_from_inhouse_db": {
"scope": SODAR_CONSTANTS["APP_SETTING_SCOPE_PROJECT"],
"type": "BOOLEAN",
"default": False,
"label": "Exclude from in-house database",
"description": (
"Exclude project's cases from in-house database. This is intended to be used for cases containing "
"training data that may exist with multiple copies and thus introduce artifacts in the in-house "
"database (such as no variant of a case showing up because being in the in-house database many times)."
),
},
}

#: Additional columns to display for the projects.
Expand Down

0 comments on commit bc37b6c

Please sign in to comment.