Skip to content

Commit

Permalink
Merge pull request #117 from erikyao/master
Browse files Browse the repository at this point in the history
Fixes to Issue #33, #91, #110, and #115
  • Loading branch information
newgene committed Apr 21, 2021
2 parents cb9c113 + 024c9aa commit f3ad848
Show file tree
Hide file tree
Showing 12 changed files with 537 additions and 109 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,5 @@ src/www/static/docs
src/config_prod.py
src/run/*.pickle
src/run/done/*.pickle
.vscode
src/.idea/
.vscode
2 changes: 1 addition & 1 deletion src/hub/dataload/sources/cadd/cadd_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,5 +210,5 @@ def fetch_generator(tabix, contig):

def load_data(data_folder):
for contig in [i for i in range(1,24)] + ["X","Y"]:
load_contig(config)
load_contig(contig)

24 changes: 14 additions & 10 deletions src/hub/dataload/sources/clingen/upload.py
Original file line number Diff line number Diff line change
@@ -1,29 +1,33 @@
import os, glob

from hub.dataload.storage import MyVariantBasicStorage
import biothings.hub.dataload.uploader as uploader
from .parser import load_data


class ClingenUploader(uploader.ParallelizedSourceUploader):
name = "clingen"
__metadata__ = {"mapper": 'observed',
"assembly": "hg38",
"src_meta": {
"url": "https://www.clinicalgenome.org",
"license_url": "https://www.clinicalgenome.org/about/terms-of-use",
"license_url_short": "http://bit.ly/2kAtyoH",
"licence": "CC0 1.0 Universal",
}
}
__metadata__ = {
"mapper": 'observed',
"assembly": "hg38",
"src_meta": {
"url": "https://www.clinicalgenome.org",
"license_url": "https://www.clinicalgenome.org/about/terms-of-use",
"license_url_short": "http://bit.ly/2kAtyoH",
"licence": "CC0 1.0 Universal",
}
}

GLOB_PATTERN = "mvi_ca.split.*"

storage_class = MyVariantBasicStorage

def jobs(self):
# tuple(input_file,) (only one arg, but still need a tuple
return map(lambda e: (e, ),
glob.glob(os.path.join(self.data_folder, self.__class__.GLOB_PATTERN)))

def load_data(self,input_file):
def load_data(self, input_file):
# there's one vcf file there, let's get it
assert os.path.exists(input_file), "Can't find input file '%s'" % input_file
self.logger.info("Load data from file '%s'" % input_file)
Expand Down
40 changes: 21 additions & 19 deletions src/hub/dataload/sources/clinvar/clinvar_upload.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,22 @@
from .clinvar_xml_parser import load_data as load_common
from hub.dataload.uploader import SnpeffPostUpdateUploader
from hub.dataload.storage import MyVariantTrimmingStorage


SRC_META = {
"url" : "https://www.ncbi.nlm.nih.gov/clinvar/",
"license_url" : "https://www.ncbi.nlm.nih.gov/clinvar/intro/",
"license_url_short": "http://bit.ly/2SQdcI0"
}
"url": "https://www.ncbi.nlm.nih.gov/clinvar/",
"license_url": "https://www.ncbi.nlm.nih.gov/clinvar/intro/",
"license_url_short": "http://bit.ly/2SQdcI0"
}


class ClinvarBaseUploader(SnpeffPostUpdateUploader):
storage_class = MyVariantTrimmingStorage

def get_pinfo(self):
pinfo = super(ClinvarBaseUploader,self).get_pinfo()
# clinvar parser has some memory requirements, ~1.5G
pinfo.setdefault("__reqs__",{})["mem"] = 1.5 * (1024**3)
pinfo.setdefault("__reqs__", {})["mem"] = 1.5 * (1024**3)
return pinfo

@classmethod
Expand Down Expand Up @@ -223,18 +226,18 @@ class ClinvarHG19Uploader(ClinvarBaseUploader):
name = "clinvar_hg19"
main_source = "clinvar"
__metadata__ = {
"mapper" : 'observed_skipidtoolong',
"assembly" : "hg19",
"src_meta" : SRC_META,
}
"mapper": 'observed_skipidtoolong',
"assembly": "hg19",
"src_meta": SRC_META,
}

def load_data(self,data_folder):
def load_data(self, data_folder):
self.logger.info("Load data from folder '%s'" % data_folder)
try:
return load_common(data_folder,"hg19")
return load_common(data_folder, "hg19")
except Exception as e:
import traceback
self.logger.error("Error while uploading, %s:\n%s" % (e,traceback.format_exc()))
self.logger.error("Error while uploading, %s:\n%s" % (e, traceback.format_exc()))
raise


Expand All @@ -243,17 +246,16 @@ class ClinvarHG38Uploader(ClinvarBaseUploader):
name = "clinvar_hg38"
main_source = "clinvar"
__metadata__ = {
"mapper" : 'observed_skipidtoolong',
"assembly" : "hg38",
"src_meta" : SRC_META,
}
"mapper": 'observed_skipidtoolong',
"assembly": "hg38",
"src_meta": SRC_META,
}

def load_data(self,data_folder):
def load_data(self, data_folder):
self.logger.info("Load data from folder '%s'" % data_folder)
try:
return load_common(data_folder,"hg38")
return load_common(data_folder, "hg38")
except Exception as e:
import traceback
self.logger.error("Error while uploading, %s:\n%s" % (e,traceback.format_exc()))
raise

3 changes: 2 additions & 1 deletion src/hub/dataload/sources/wellderly/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
from .wellderly_upload import WellderlyFactoryUploader
from .wellderly_dumper import WellderlyDumper
from .wellderly_upload import WellderlyUploader

16 changes: 16 additions & 0 deletions src/hub/dataload/sources/wellderly/wellderly_dumper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import os
import biothings, config
biothings.config_for_app(config)

from config import DATA_ARCHIVE_ROOT
from biothings.hub.dataload.dumper import ManualDumper


class WellderlyDumper(ManualDumper):
    """Manual dumper for the Wellderly data source.

    No automatic download is performed: the per-chromosome
    ``Wellderly.chr*.g.vcf.gz.tsv`` files are expected to have been placed
    under ``SRC_ROOT_FOLDER`` by hand before the upload step runs.
    """

    SRC_NAME = "wellderly"
    # Data lives under <DATA_ARCHIVE_ROOT>/wellderly
    SRC_ROOT_FOLDER = os.path.join(DATA_ARCHIVE_ROOT, SRC_NAME)

    def __init__(self, *args, **kwargs):
        super(WellderlyDumper, self).__init__(*args, **kwargs)
        # Fixed grammar in the log message ("manual downloaded" -> "manually downloaded")
        self.logger.info("Assuming Wellderly.chr*.g.vcf.gz.tsv files were manually downloaded")
167 changes: 167 additions & 0 deletions src/hub/dataload/sources/wellderly/wellderly_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
import pandas as pd
from collections import namedtuple
from itertools import combinations_with_replacement as cwr
from utils.hgvs import _normalized_vcf

# (`start`, `end`) can be different from `pos`.
# E.g. `chr1:g.10429_10433del` with a genotype `CCCTAA/C` has pos == 10428, start == 10429 and end == 10433
HgvsID = namedtuple('HgvsID', ['id', 'chrom', 'start', 'end', 'vartype'])


def format_hgvs(chrom, pos, ref, alt):
    """
    Get a valid hgvs name from VCF-style "chrom, pos, ref, alt" data.

    This function is adapted from utils.hgvs.format_hgvs.
    TODO: if utils.hgvs.format_hgvs is refactored to return (start, end), delete this and use that function instead
    See https://github.com/biothings/myvariant.info/issues/111

    Args:
        chrom: chromosome name, with or without a leading "chr" prefix
        pos: 1-based VCF position (int, or anything `int()` accepts)
        ref: reference allele string
        alt: alternate allele string

    Returns:
        HgvsID namedtuple (id, chrom, start, end, vartype), where vartype is
        one of "snp", "del", "ins" or "delins".

    Raises:
        ValueError: if (chrom, pos, ref, alt) cannot be converted to HGVS.
    """
    # Python ints are unbounded, so a single int() conversion is sufficient
    # (a duplicated `pos = int(pos)` with a misleading 32-bit-range comment was removed)
    chrom, pos = str(chrom), int(pos)
    if chrom.lower().startswith('chr'):
        # trim off leading "chr" if any
        chrom = chrom[3:]

    if len(ref) == len(alt) == 1:
        # this is a SNP
        _id = 'chr{0}:g.{1}{2}>{3}'.format(chrom, pos, ref, alt)
        return HgvsID(id=_id, chrom=chrom, start=pos, end=pos, vartype="snp")

    if len(ref) > 1 and len(alt) == 1:
        # this is a deletion:
        if ref[0] == alt:
            # REF and ALT share the leading base; the deleted run starts one base after `pos`
            start = pos + 1
            end = pos + len(ref) - 1

            if start == end:
                _id = 'chr{0}:g.{1}del'.format(chrom, start)
            else:
                _id = 'chr{0}:g.{1}_{2}del'.format(chrom, start, end)
            return HgvsID(id=_id, chrom=chrom, start=start, end=end, vartype="del")
        else:
            # no shared leading base: the whole REF span is replaced by the single ALT base
            start = pos
            end = start + len(ref) - 1

            _id = 'chr{0}:g.{1}_{2}delins{3}'.format(chrom, start, end, alt)
            return HgvsID(id=_id, chrom=chrom, start=start, end=end, vartype="delins")

    if len(ref) == 1 and len(alt) > 1:
        # this is an insertion
        if alt[0] == ref:
            # inserted sequence sits between `pos` and `pos + 1`
            start, end = pos, pos + 1
            ins_seq = alt[1:]

            _id = 'chr{0}:g.{1}_{2}ins{3}'.format(chrom, start, end, ins_seq)
            return HgvsID(id=_id, chrom=chrom, start=start, end=end, vartype="ins")
        else:
            # the single REF base is replaced by the whole ALT sequence
            _id = 'chr{0}:g.{1}delins{2}'.format(chrom, pos, alt)
            return HgvsID(id=_id, chrom=chrom, start=pos, end=pos, vartype="delins")

    if len(ref) > 1 and len(alt) > 1:
        if ref[0] == alt[0]:
            # if ref and alt overlap from the left, trim them first
            _chrom, _pos, _ref, _alt = _normalized_vcf(chrom, pos, ref, alt)
            return format_hgvs(_chrom, _pos, _ref, _alt)
        else:
            start, end = pos, pos + len(ref) - 1

            _id = 'chr{0}:g.{1}_{2}delins{3}'.format(chrom, start, end, alt)
            return HgvsID(id=_id, chrom=chrom, start=start, end=end, vartype="delins")

    # reached only for degenerate input (e.g. empty REF or ALT)
    raise ValueError("Cannot convert {} into HGVS id.".format((chrom, pos, ref, alt)))


class Genotype:
    """A diploid genotype: an ordered pair of alleles, rendered as "A/B"."""

    def __init__(self, first_allele, second_allele):
        self.first_allele = first_allele
        self.second_allele = second_allele

    def __str__(self):
        # e.g. Genotype("A", "T") renders as "A/T"
        return "/".join((self.first_allele, self.second_allele))


def genotype_combinations(ref: str, alt_list: list) -> list:
    """
    Return all genotypes for (REF + ALT) alleles, in Wellderly `GC`-column order.

    The `GC` column (not VCF-standard) in the Wellderly tsv files counts genotypes in the
    order produced by expanding the squared sum of allele frequencies a, b, c, ..., z:
    n = 2 alleles: (a+b)^2 = a^2 + 2ab + b^2
    n = 3 alleles: (a+b+c)^2 = (a+b)^2 + 2(a+b)c + c^2 = a^2 + 2ab + b^2 + 2ac + 2bc + c^2
    n = 4 alleles: (a+b+c+d)^2 = (a+b+c)^2 + 2(a+b+c)d + d^2 = ...
    i.e. genotypes appear as 0/0, 0/1, 1/1, 0/2, 1/2, 2/2, 0/3, 1/3, 2/3, 3/3, ...
    Note that this expansion scheme seems to apply only to the Wellderly tsv files.
    For n alleles there are {(n+1) choose 2} genotypes regardless of ordering.
    """
    # The REF must be the first allele in the list
    alleles = [ref] + alt_list
    genotypes = []
    # For each "second" allele j, pair it with every allele i <= j:
    # j=0 -> 0/0; j=1 -> 0/1, 1/1; j=2 -> 0/2, 1/2, 2/2; ...
    for j, second in enumerate(alleles):
        for first in alleles[:j + 1]:
            genotypes.append(Genotype(first_allele=first, second_allele=second))
    return genotypes


class WellderlyTsvReader:
    """Parse Wellderly TSV files into variant documents keyed by HGVS id."""

    @classmethod
    def generate_document(cls, row: pd.Series, assembly="hg19"):
        """
        Yield one document per ALT allele of a parsed TSV row.

        Args:
            row: a Series with "CHROM:POS", "REF", "ALT", "AC", "AF" and "GC"
                 fields (ALT/AC/AF/GC are comma-separated strings)
            assembly: genome assembly key used for the start/end sub-document

        Yields:
            dict with "_id" (HGVS id) and a "wellderly" sub-document carrying
            position, allele frequencies and genotype counts/frequencies.

        Raises:
            ValueError: if ALT, AC and AF have inconsistent lengths.
        """
        # Read REF and ALT alleles
        ref = row["REF"]

        alt_list = row["ALT"].split(",")
        alt_cnt = [int(ac) for ac in row["AC"].split(",")]
        alt_freq = [float(af) for af in row["AF"].split(",")]
        if not len(alt_list) == len(alt_cnt) == len(alt_freq):
            # Fixed: pd.Series has no `to_str` method; the correct name is `to_string`.
            # The original would have masked this ValueError with an AttributeError.
            raise ValueError("Inconsistent length between ALT, AC and AF fields. Got row={}".format(row.to_string()))

        # Collect all alleles (ALT first, then REF — REF frequency is derived, not stored)
        allele_list = alt_list + [ref]
        allele_freq = alt_freq + [1 - sum(alt_freq)]  # the `AF` column does not have the `REF` allele frequencies
        allele_json = [{"allele": allele, "freq": freq} for (allele, freq) in zip(allele_list, allele_freq)]

        # Collect all genotypes; `GC` order matches genotype_combinations()
        genotype_cnt = [int(gc) for gc in row["GC"].split(",")]
        genotype_list = genotype_combinations(ref=ref, alt_list=alt_list)

        genotype_total = sum(genotype_cnt)
        if genotype_total > 0:
            genotype_freq = [cnt / genotype_total for cnt in genotype_cnt]
        else:  # In some rare cases, `GC` can be nothing but 0's
            genotype_freq = [0 for _ in genotype_cnt]
        # Zero-count genotypes are dropped from the output
        genotype_json = [{"count": cnt, "freq": freq, 'genotype': str(gt)} for (cnt, freq, gt)
                         in zip(genotype_cnt, genotype_freq, genotype_list) if cnt > 0]

        # Generate ID and document for each (REF, ALT) pair
        chrom, pos = row["CHROM:POS"].split(":")
        for alt in alt_list:
            hgvsID = format_hgvs(chrom, pos, ref, alt)

            document = {
                '_id': hgvsID.id,
                'wellderly': {
                    'chrom': hgvsID.chrom,
                    'pos': pos, 'ref': ref, 'alt': alt,
                    assembly: {'start': hgvsID.start, 'end': hgvsID.end},
                    'vartype': hgvsID.vartype,
                    'alleles': allele_json,
                    'genotypes': genotype_json
                }
            }

            yield document

    @classmethod
    def load_data(cls, file, assembly="hg19"):
        """Read a Wellderly TSV file and yield documents for every row."""
        # Skip 'AN' column; read everything as strings and parse ourselves
        data_types = {"CHROM:POS": str, "REF": str, "ALT": str, "AC": str, "GC": str, "AF": str}
        # usecols as an explicit list: older pandas versions reject dict key views
        df = pd.read_csv(file, sep="\t", usecols=list(data_types.keys()), dtype=data_types)
        for _, row in df.iterrows():
            yield from cls.generate_document(row, assembly)

0 comments on commit f3ad848

Please sign in to comment.