Adding pg_dump admin command and documentation (#430)
Related-Issue: #430
Closes: #430
Projected-Results-Impact: none
holtgrewe committed Apr 7, 2022
1 parent bf8d5f0 commit d54c879
Showing 4 changed files with 133 additions and 0 deletions.
1 change: 1 addition & 0 deletions HISTORY.rst
@@ -21,6 +21,7 @@ Full Change List
- Schema and documentation for case QC info (#428)
- Adding support for HGNC IDs in gene allow lists (#432)
- PanelApp will now populate the gene allow list with HGNC gene IDs (#432)
- Adding ``pg_dump`` admin command and documentation (#430)

------
v1.1.2
42 changes: 42 additions & 0 deletions docs_manual/admin_backup.rst
@@ -0,0 +1,42 @@
.. _admin_backup:

============
Data Backups
============

This section describes how to create data backups in VarFish.
The assumption is that you are running VarFish in the recommended way via Docker Compose.

All valuable state is kept in the VarFish PostgreSQL database.
VarFish provides a convenient way to call the PostgreSQL tool ``pg_dump``.

When VarFish is running under Docker Compose (with the PostgreSQL container up as well), you can call it as follows.

::

    # docker exec -it varfish-docker-compose_varfish-web_1 \
        python /usr/src/app/manage.py pg_dump --mode=MODE

This will execute ``python /usr/src/app/manage.py pg_dump --mode=MODE`` in the docker container that is running the VarFish web server.

You can use one of the following dump modes.

``full``
    This will perform a full data dump including all background data.

``backup-large``
    This will exclude the huge background data tables, e.g., dbSNP and gnomAD.

``backup-small``
    This will also exclude all imported variant data.
    The assumption is that you have a separate backup of the imported TSV files or can easily regenerate them from the VCF files that you still have.
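
The command also accepts ``--output-file`` to write the dump to a file (as seen from inside the container) instead of standard output, and ``--force-overwrite`` to replace an existing output file.
For example (the target path ``/data/varfish.sql`` is only an illustration and depends on the volumes you have mounted):

::

    # docker exec -it varfish-docker-compose_varfish-web_1 \
        python /usr/src/app/manage.py pg_dump --mode=backup-small \
        --output-file=/data/varfish.sql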

Here is an example of how to create a compressed "small" dump file named ``varfish-${day_of_week}.sql.gz`` such that you get a rotating daily dump.
Note that we use ``-i`` instead of ``-it`` for ``docker exec`` here, as allocating a TTY would interfere with piping the output.

::

    # docker exec -i varfish-docker-compose_varfish-web_1 \
        python /usr/src/app/manage.py pg_dump --mode=backup-small \
        | gzip -c \
        > varfish-$(date +%a).sql.gz
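
To get the rotation without manual work, you can install this pipeline as a cron job on the Docker host.
A minimal sketch, assuming a backup directory ``/backups`` and the container name from above (note that ``%`` characters must be escaped as ``\%`` in crontab entries):

::

    # run every night at 02:00
    0 2 * * * docker exec -i varfish-docker-compose_varfish-web_1 python /usr/src/app/manage.py pg_dump --mode=backup-small | gzip -c > /backups/varfish-$(date +\%a).sql.gz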

1 change: 1 addition & 0 deletions docs_manual/index.rst
@@ -102,6 +102,7 @@ Currently, the main focus is on small/sequence variants called from high-throughput
   admin_tuning
   admin_upgrade
   admin_pap
   admin_backup

.. raw:: latex

89 changes: 89 additions & 0 deletions importer/management/commands/pg_dump.py
@@ -0,0 +1,89 @@
"""Django command that is a convenience wrapper around ``pg_dump``"""

from itertools import chain
import os
import subprocess
import sys

from django.core.management.base import BaseCommand, CommandError
from django.conf import settings

#: The available dump modes.
DUMP_MODES = ("full", "backup-large", "backup-small")
#: The tables to be ignored in ``backup-large`` mode.
IGNORE_LARGE = (
    "clinvar_clinvar",
    "conservation_knowngeneaa",
    "dbsnp_dbsnp",
    "extra_annos_extraanno",
    "extra_annos_extraannofield",
    "frequencies_exac",
    "frequencies_gnomadexomes",
    "frequencies_gnomadgenomes",
    "frequencies_helixmtdb",
    "frequencies_mitomap",
    "frequencies_mtdb",
    "frequencies_thousandgenomes",
)
#: The tables to be additionally ignored in ``backup-small`` mode.
IGNORE_THIN = (
    "variants_smallvariant_[0-9]*",
    "svs_structuralvariant[0-9]+",
    "svs_structuralvariantgeneannotation[0-9]+",
)


class Command(BaseCommand):
    """Implementation wrapping ``pg_dump`` to support creating dumps of the underlying PostgreSQL database."""

    #: Help message displayed on the command line.
    help = "Easily create database dumps with ``pg_dump``"

    def add_arguments(self, parser):
        """Add the command's arguments to the ``parser``."""
        parser.add_argument("--mode", help="Backup mode, one of %s" % (DUMP_MODES,), required=True)
        parser.add_argument(
            "--output-file", help="Optional path to write output to, default is stdout"
        )
        parser.add_argument(
            "--force-overwrite", default=False, action="store_true", help="Overwrite output file"
        )

    def handle(self, *args, **options):
        """Validate the options and delegate the actual work to ``_run()``."""
        if options["mode"] not in DUMP_MODES:
            # Guard against typos silently falling back to a full dump.
            raise CommandError(
                "Invalid mode %s, must be one of %s" % (options["mode"], DUMP_MODES)
            )
        if options["output_file"]:
            if os.path.exists(options["output_file"]) and not options["force_overwrite"]:
                self.stderr.write(
                    self.style.ERROR(
                        "Refusing to overwrite %s; use --force-overwrite to force overwriting"
                        % options["output_file"]
                    )
                )
                raise CommandError("Refusing to overwrite %s" % options["output_file"])
            with open(options["output_file"], "wb") as outputf:
                self._run(options, outputf)
        else:
            self._run(options, sys.stdout)

    def _run(self, options, outputf):
        database = settings.DATABASES["default"]
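        # ``pg_dump`` does not accept the password as a command line argument, so it is
        # handed over via the ``PGPASSWORD`` environment variable instead.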
        env = dict(os.environ)
        env["PGPASSWORD"] = database["PASSWORD"]

        cmd = [
            "/usr/bin/pg_dump",
            "--dbname=%s" % database["NAME"],
            "--host=%s" % database["HOST"],
            "--username=%s" % database["USER"],
        ]

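        # ``--exclude-table-data`` skips the rows of the matching tables but keeps their
        # schema, so the resulting dump can still be restored into an empty database.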
        if options["mode"] == "backup-large":
            cmd += ["--exclude-table-data=%s" % pat for pat in IGNORE_LARGE]
        elif options["mode"] == "backup-small":
            cmd += ["--exclude-table-data=%s" % pat for pat in chain(IGNORE_LARGE, IGNORE_THIN)]

        self.stderr.write(self.style.NOTICE("Running the following command: %s" % cmd))
        subprocess.check_call(cmd, stdout=outputf, env=env)
        outputf.flush()
        self.stderr.write(self.style.NOTICE("All done. Have a nice day!"))
