Skip to content

Commit

Permalink
Merge pull request #117 from erikyao/master
Browse files Browse the repository at this point in the history
Fixes to Issue #33, #91, #110, and #115
  • Loading branch information
newgene committed Apr 21, 2021
2 parents cb9c113 + 024c9aa commit f3ad848
Show file tree
Hide file tree
Showing 12 changed files with 537 additions and 109 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,5 @@ src/www/static/docs
src/config_prod.py
src/run/*.pickle
src/run/done/*.pickle
.vscode
src/.idea/
.vscode
2 changes: 1 addition & 1 deletion src/hub/dataload/sources/cadd/cadd_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,5 +210,5 @@ def fetch_generator(tabix, contig):

def load_data(data_folder):
for contig in [i for i in range(1,24)] + ["X","Y"]:
load_contig(config)
load_contig(contig)

24 changes: 14 additions & 10 deletions src/hub/dataload/sources/clingen/upload.py
Original file line number Diff line number Diff line change
@@ -1,29 +1,33 @@
import os, glob

from hub.dataload.storage import MyVariantBasicStorage
import biothings.hub.dataload.uploader as uploader
from .parser import load_data


class ClingenUploader(uploader.ParallelizedSourceUploader):
name = "clingen"
__metadata__ = {"mapper": 'observed',
"assembly": "hg38",
"src_meta": {
"url": "https://www.clinicalgenome.org",
"license_url": "https://www.clinicalgenome.org/about/terms-of-use",
"license_url_short": "http://bit.ly/2kAtyoH",
"licence": "CC0 1.0 Universal",
}
}
__metadata__ = {
"mapper": 'observed',
"assembly": "hg38",
"src_meta": {
"url": "https://www.clinicalgenome.org",
"license_url": "https://www.clinicalgenome.org/about/terms-of-use",
"license_url_short": "http://bit.ly/2kAtyoH",
"licence": "CC0 1.0 Universal",
}
}

GLOB_PATTERN = "mvi_ca.split.*"

storage_class = MyVariantBasicStorage

def jobs(self):
# tuple(input_file,) (only one arg, but still need a tuple
return map(lambda e: (e, ),
glob.glob(os.path.join(self.data_folder, self.__class__.GLOB_PATTERN)))

def load_data(self,input_file):
def load_data(self, input_file):
# there's one vcf file there, let's get it
assert os.path.exists(input_file), "Can't find input file '%s'" % input_file
self.logger.info("Load data from file '%s'" % input_file)
Expand Down
40 changes: 21 additions & 19 deletions src/hub/dataload/sources/clinvar/clinvar_upload.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,22 @@
from .clinvar_xml_parser import load_data as load_common
from hub.dataload.uploader import SnpeffPostUpdateUploader
from hub.dataload.storage import MyVariantTrimmingStorage


SRC_META = {
"url" : "https://www.ncbi.nlm.nih.gov/clinvar/",
"license_url" : "https://www.ncbi.nlm.nih.gov/clinvar/intro/",
"license_url_short": "http://bit.ly/2SQdcI0"
}
"url": "https://www.ncbi.nlm.nih.gov/clinvar/",
"license_url": "https://www.ncbi.nlm.nih.gov/clinvar/intro/",
"license_url_short": "http://bit.ly/2SQdcI0"
}


class ClinvarBaseUploader(SnpeffPostUpdateUploader):
storage_class = MyVariantTrimmingStorage

def get_pinfo(self):
pinfo = super(ClinvarBaseUploader,self).get_pinfo()
# clinvar parser has some memory requirements, ~1.5G
pinfo.setdefault("__reqs__",{})["mem"] = 1.5 * (1024**3)
pinfo.setdefault("__reqs__", {})["mem"] = 1.5 * (1024**3)
return pinfo

@classmethod
Expand Down Expand Up @@ -223,18 +226,18 @@ class ClinvarHG19Uploader(ClinvarBaseUploader):
name = "clinvar_hg19"
main_source = "clinvar"
__metadata__ = {
"mapper" : 'observed_skipidtoolong',
"assembly" : "hg19",
"src_meta" : SRC_META,
}
"mapper": 'observed_skipidtoolong',
"assembly": "hg19",
"src_meta": SRC_META,
}

def load_data(self,data_folder):
def load_data(self, data_folder):
self.logger.info("Load data from folder '%s'" % data_folder)
try:
return load_common(data_folder,"hg19")
return load_common(data_folder, "hg19")
except Exception as e:
import traceback
self.logger.error("Error while uploading, %s:\n%s" % (e,traceback.format_exc()))
self.logger.error("Error while uploading, %s:\n%s" % (e, traceback.format_exc()))
raise


Expand All @@ -243,17 +246,16 @@ class ClinvarHG38Uploader(ClinvarBaseUploader):
name = "clinvar_hg38"
main_source = "clinvar"
__metadata__ = {
"mapper" : 'observed_skipidtoolong',
"assembly" : "hg38",
"src_meta" : SRC_META,
}
"mapper": 'observed_skipidtoolong',
"assembly": "hg38",
"src_meta": SRC_META,
}

def load_data(self,data_folder):
def load_data(self, data_folder):
self.logger.info("Load data from folder '%s'" % data_folder)
try:
return load_common(data_folder,"hg38")
return load_common(data_folder, "hg38")
except Exception as e:
import traceback
self.logger.error("Error while uploading, %s:\n%s" % (e,traceback.format_exc()))
raise

3 changes: 2 additions & 1 deletion src/hub/dataload/sources/wellderly/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
from .wellderly_upload import WellderlyFactoryUploader
from .wellderly_dumper import WellderlyDumper
from .wellderly_upload import WellderlyUploader

16 changes: 16 additions & 0 deletions src/hub/dataload/sources/wellderly/wellderly_dumper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import os
import biothings, config
biothings.config_for_app(config)

from config import DATA_ARCHIVE_ROOT
from biothings.hub.dataload.dumper import ManualDumper


class WellderlyDumper(ManualDumper):
    """Manual dumper for the Wellderly data source.

    No automatic download is performed: the per-chromosome
    ``Wellderly.chr*.g.vcf.gz.tsv`` files are expected to have been placed
    under ``SRC_ROOT_FOLDER`` by hand before the upload step runs.
    """

    SRC_NAME = "wellderly"
    # Data lives under <DATA_ARCHIVE_ROOT>/wellderly
    SRC_ROOT_FOLDER = os.path.join(DATA_ARCHIVE_ROOT, SRC_NAME)

    def __init__(self, *args, **kwargs):
        super(WellderlyDumper, self).__init__(*args, **kwargs)
        # Fixed grammar in the log message ("manual downloaded" -> "manually downloaded")
        self.logger.info("Assuming Wellderly.chr*.g.vcf.gz.tsv files were manually downloaded")
167 changes: 167 additions & 0 deletions src/hub/dataload/sources/wellderly/wellderly_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
import pandas as pd
from collections import namedtuple
from itertools import combinations_with_replacement as cwr
from utils.hgvs import _normalized_vcf

# (`start`, `end`) can be different from `pos`.
# E.g. `chr1:g.10429_10433del` with a genotype `CCCTAA/C` has pos == 10428, start == 10429 and end == 10433
HgvsID = namedtuple('HgvsID', ['id', 'chrom', 'start', 'end', 'vartype'])


def format_hgvs(chrom, pos, ref, alt):
    """
    Get a valid hgvs name from VCF-style "chrom, pos, ref, alt" data.

    This function is adapted from utils.hgvs.format_hgvs.
    TODO: if utils.hgvs.format_hgvs is refactored to return (start, end), delete this and use that function instead
    See https://github.com/biothings/myvariant.info/issues/111

    Args:
        chrom: chromosome name, with or without a leading "chr" prefix
        pos: 1-based VCF position (int, or anything `int()` accepts)
        ref: reference allele string
        alt: alternate allele string

    Returns:
        HgvsID namedtuple (id, chrom, start, end, vartype), where vartype is
        one of "snp", "del", "ins" or "delins".

    Raises:
        ValueError: if (chrom, pos, ref, alt) cannot be converted to HGVS.
    """
    # Python ints are unbounded, so a single int() conversion is sufficient
    # (a duplicated `pos = int(pos)` with a misleading 32-bit-range comment was removed)
    chrom, pos = str(chrom), int(pos)
    if chrom.lower().startswith('chr'):
        # trim off leading "chr" if any
        chrom = chrom[3:]

    if len(ref) == len(alt) == 1:
        # this is a SNP
        _id = 'chr{0}:g.{1}{2}>{3}'.format(chrom, pos, ref, alt)
        return HgvsID(id=_id, chrom=chrom, start=pos, end=pos, vartype="snp")

    if len(ref) > 1 and len(alt) == 1:
        # this is a deletion:
        if ref[0] == alt:
            # REF and ALT share the leading base; the deleted run starts one base after `pos`
            start = pos + 1
            end = pos + len(ref) - 1

            if start == end:
                _id = 'chr{0}:g.{1}del'.format(chrom, start)
            else:
                _id = 'chr{0}:g.{1}_{2}del'.format(chrom, start, end)
            return HgvsID(id=_id, chrom=chrom, start=start, end=end, vartype="del")
        else:
            # no shared leading base: the whole REF span is replaced by the single ALT base
            start = pos
            end = start + len(ref) - 1

            _id = 'chr{0}:g.{1}_{2}delins{3}'.format(chrom, start, end, alt)
            return HgvsID(id=_id, chrom=chrom, start=start, end=end, vartype="delins")

    if len(ref) == 1 and len(alt) > 1:
        # this is an insertion
        if alt[0] == ref:
            # inserted sequence sits between `pos` and `pos + 1`
            start, end = pos, pos + 1
            ins_seq = alt[1:]

            _id = 'chr{0}:g.{1}_{2}ins{3}'.format(chrom, start, end, ins_seq)
            return HgvsID(id=_id, chrom=chrom, start=start, end=end, vartype="ins")
        else:
            # the single REF base is replaced by the whole ALT sequence
            _id = 'chr{0}:g.{1}delins{2}'.format(chrom, pos, alt)
            return HgvsID(id=_id, chrom=chrom, start=pos, end=pos, vartype="delins")

    if len(ref) > 1 and len(alt) > 1:
        if ref[0] == alt[0]:
            # if ref and alt overlap from the left, trim them first
            _chrom, _pos, _ref, _alt = _normalized_vcf(chrom, pos, ref, alt)
            return format_hgvs(_chrom, _pos, _ref, _alt)
        else:
            start, end = pos, pos + len(ref) - 1

            _id = 'chr{0}:g.{1}_{2}delins{3}'.format(chrom, start, end, alt)
            return HgvsID(id=_id, chrom=chrom, start=start, end=end, vartype="delins")

    # reached only for degenerate input (e.g. empty REF or ALT)
    raise ValueError("Cannot convert {} into HGVS id.".format((chrom, pos, ref, alt)))


class Genotype:
    """A diploid genotype: an ordered pair of alleles, rendered as "A/B"."""

    def __init__(self, first_allele, second_allele):
        self.first_allele = first_allele
        self.second_allele = second_allele

    def __str__(self):
        # e.g. Genotype("A", "T") renders as "A/T"
        return "/".join((self.first_allele, self.second_allele))


def genotype_combinations(ref: str, alt_list: list) -> list:
    """
    Return all genotypes for (REF + ALT) alleles, in Wellderly `GC`-column order.

    The `GC` column (not VCF-standard) in the Wellderly tsv files counts genotypes in the
    order produced by expanding the squared sum of allele frequencies a, b, c, ..., z:
    n = 2 alleles: (a+b)^2 = a^2 + 2ab + b^2
    n = 3 alleles: (a+b+c)^2 = (a+b)^2 + 2(a+b)c + c^2 = a^2 + 2ab + b^2 + 2ac + 2bc + c^2
    n = 4 alleles: (a+b+c+d)^2 = (a+b+c)^2 + 2(a+b+c)d + d^2 = ...
    i.e. genotypes appear as 0/0, 0/1, 1/1, 0/2, 1/2, 2/2, 0/3, 1/3, 2/3, 3/3, ...
    Note that this expansion scheme seems to apply only to the Wellderly tsv files.
    For n alleles there are {(n+1) choose 2} genotypes regardless of ordering.
    """
    # The REF must be the first allele in the list
    alleles = [ref] + alt_list
    genotypes = []
    # For each "second" allele j, pair it with every allele i <= j:
    # j=0 -> 0/0; j=1 -> 0/1, 1/1; j=2 -> 0/2, 1/2, 2/2; ...
    for j, second in enumerate(alleles):
        for first in alleles[:j + 1]:
            genotypes.append(Genotype(first_allele=first, second_allele=second))
    return genotypes


class WellderlyTsvReader:
    """Parse Wellderly TSV files into variant documents keyed by HGVS id."""

    @classmethod
    def generate_document(cls, row: pd.Series, assembly="hg19"):
        """
        Yield one document per ALT allele of a parsed TSV row.

        Args:
            row: a Series with "CHROM:POS", "REF", "ALT", "AC", "AF" and "GC"
                 fields (ALT/AC/AF/GC are comma-separated strings)
            assembly: genome assembly key used for the start/end sub-document

        Yields:
            dict with "_id" (HGVS id) and a "wellderly" sub-document carrying
            position, allele frequencies and genotype counts/frequencies.

        Raises:
            ValueError: if ALT, AC and AF have inconsistent lengths.
        """
        # Read REF and ALT alleles
        ref = row["REF"]

        alt_list = row["ALT"].split(",")
        alt_cnt = [int(ac) for ac in row["AC"].split(",")]
        alt_freq = [float(af) for af in row["AF"].split(",")]
        if not len(alt_list) == len(alt_cnt) == len(alt_freq):
            # Fixed: pd.Series has no `to_str` method; the correct name is `to_string`.
            # The original would have masked this ValueError with an AttributeError.
            raise ValueError("Inconsistent length between ALT, AC and AF fields. Got row={}".format(row.to_string()))

        # Collect all alleles (ALT first, then REF — REF frequency is derived, not stored)
        allele_list = alt_list + [ref]
        allele_freq = alt_freq + [1 - sum(alt_freq)]  # the `AF` column does not have the `REF` allele frequencies
        allele_json = [{"allele": allele, "freq": freq} for (allele, freq) in zip(allele_list, allele_freq)]

        # Collect all genotypes; `GC` order matches genotype_combinations()
        genotype_cnt = [int(gc) for gc in row["GC"].split(",")]
        genotype_list = genotype_combinations(ref=ref, alt_list=alt_list)

        genotype_total = sum(genotype_cnt)
        if genotype_total > 0:
            genotype_freq = [cnt / genotype_total for cnt in genotype_cnt]
        else:  # In some rare cases, `GC` can be nothing but 0's
            genotype_freq = [0 for _ in genotype_cnt]
        # Zero-count genotypes are dropped from the output
        genotype_json = [{"count": cnt, "freq": freq, 'genotype': str(gt)} for (cnt, freq, gt)
                         in zip(genotype_cnt, genotype_freq, genotype_list) if cnt > 0]

        # Generate ID and document for each (REF, ALT) pair
        chrom, pos = row["CHROM:POS"].split(":")
        for alt in alt_list:
            hgvsID = format_hgvs(chrom, pos, ref, alt)

            document = {
                '_id': hgvsID.id,
                'wellderly': {
                    'chrom': hgvsID.chrom,
                    'pos': pos, 'ref': ref, 'alt': alt,
                    assembly: {'start': hgvsID.start, 'end': hgvsID.end},
                    'vartype': hgvsID.vartype,
                    'alleles': allele_json,
                    'genotypes': genotype_json
                }
            }

            yield document

    @classmethod
    def load_data(cls, file, assembly="hg19"):
        """Read a Wellderly TSV file and yield documents for every row."""
        # Skip 'AN' column; read everything as strings and parse ourselves
        data_types = {"CHROM:POS": str, "REF": str, "ALT": str, "AC": str, "GC": str, "AF": str}
        # usecols as an explicit list: older pandas versions reject dict key views
        df = pd.read_csv(file, sep="\t", usecols=list(data_types.keys()), dtype=data_types)
        for _, row in df.iterrows():
            yield from cls.generate_document(row, assembly)

0 comments on commit f3ad848

Please sign in to comment.