Skip to content

Commit

Permalink
Merge branch 'master' into master
Browse files Browse the repository at this point in the history
  • Loading branch information
erikyao committed Jul 16, 2021
2 parents 21b1c24 + 2bfba4e commit 1fb8739
Show file tree
Hide file tree
Showing 2 changed files with 50 additions and 36 deletions.
41 changes: 28 additions & 13 deletions src/hub/dataload/sources/dbsnp/dbsnp_json_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
# TODO
# Index geneid field as string


def parse_one_rec(assembly, record):
"""Restructure JSON
"""
Expand Down Expand Up @@ -84,6 +85,7 @@ def restructure_allele_freq_info(allele_annotations):
alleles_data.append(freq)
return alleles_data


"""
def normalize_delins_hgvs(hgvs):
# handle delins, where no deleted nucleotides is specified
Expand All @@ -105,6 +107,7 @@ def normalize_delins_hgvs(hgvs):
return hgvs
"""


def restructure_gene_info(allele_annotations):
"""Restructure information related to gene
"""
Expand Down Expand Up @@ -150,35 +153,47 @@ def accession_2_chr(accession):


def get_hgvs_and_vcf(assembly, placements):
    """Yield ``(hgvs, vcf)`` pairs for alleles placed on the requested assembly.

    Args:
        assembly: Genome build key; must be a key of ``ASSEMBLY_NAME_MAPPING``
            (``"hg19"`` or ``"hg38"``).
        placements: Iterable of placement dicts from a record, or a falsy
            value (``None`` / empty list) when the record has none.

    Yields:
        ``(hgvs, vcf)`` for every allele of a matching placement whose
        deleted and inserted sequences differ and whose HGVS string starts
        with ``"NC"``. ``hgvs`` is ``"chr" + accession_2_chr(...) + ":" +
        <part after the last ':'>`` and ``vcf`` is the tuple
        ``(chrom, position + 1, deleted_sequence, inserted_sequence)``.
        A final ``(None, None)`` is always yielded after the placements
        have been walked.

    Raises:
        KeyError: If ``assembly`` is not a known key (raised only once a
            placement carrying assembly traits is inspected).
        ValueError: If a placement is on the expected GRCh build but a
            different patch release (e.g. ``GRCh38.p12`` vs ``GRCh38.p13``).
    """
    # Note that hg38 data of dbsnp release 154 are based on "GRCh38.p12",
    # while release 155 is based on "GRCh38.p13".
    ASSEMBLY_NAME_MAPPING = {"hg19": "GRCh37.p13", "hg38": "GRCh38.p13"}

    for _placement in placements or ():
        seq = _placement.get('placement_annot').get('seq_id_traits_by_assembly')
        if not seq:
            continue

        # Only the first trait entry is consulted for the assembly name.
        placement_assembly_name = seq[0].get('assembly_name')
        expected_assembly_name = ASSEMBLY_NAME_MAPPING[assembly]

        if placement_assembly_name != expected_assembly_name:
            # Take "GRCh38.p13" as an example:
            # "GRCh38" is the build number; "p13" is the release number.
            placement_grch_build = placement_assembly_name.split(".", 1)[0]
            expected_grch_build = expected_assembly_name.split(".", 1)[0]

            # Same build but a different release means the mapping above is
            # out of date for this data file; fail loudly rather than
            # silently dropping every variant.
            if placement_grch_build == expected_grch_build:
                raise ValueError("GRCh release numbers do not match. Expect {}. Got {}.".format(
                    expected_assembly_name, placement_assembly_name))
            continue

        for _allele in _placement.get('alleles'):
            spdi = _allele.get('allele').get('spdi')
            ref = spdi.get('deleted_sequence')
            alt = spdi.get('inserted_sequence')
            allele_hgvs = _allele.get('hgvs')

            # Skip reference alleles (ref == alt) and non-"NC" accessions.
            if ref != alt and allele_hgvs.startswith('NC'):
                hgvs = 'chr' + accession_2_chr(allele_hgvs) + ":" + allele_hgvs.split(':')[-1]
                # NOTE(review): "+ 1" presumably converts a 0-based SPDI
                # position to a 1-based VCF position -- confirm.
                vcf = (accession_2_chr(spdi.get('seq_id')),
                       spdi.get('position') + 1,
                       ref,
                       alt)
                yield hgvs, vcf

    yield None, None


def load_data_file(input_file, version):
    """Yield one document per variant parsed from a single input file.

    Each line of the file is decoded from bytes, parsed as JSON and handed
    to ``parse_one_rec``; every record it produces becomes a document of
    the form ``{'_id': <trimmed hgvs id>, 'dbsnp': <remaining fields>}``.

    Args:
        input_file: Path passed to ``open_compressed_file``.
        version: Assembly key forwarded to ``parse_one_rec`` (e.g. ``"hg19"``).

    Yields:
        dict: A document with keys ``'_id'`` and ``'dbsnp'``.
    """
    # NOTE(review): assumes open_compressed_file returns a binary file-like
    # object exposing close() -- confirm against its definition.
    f = open_compressed_file(input_file)
    try:
        for line in f:
            for _doc in parse_one_rec(version, json.loads(line.decode())):
                # '_id' is popped first so it is not duplicated under 'dbsnp'.
                yield {
                    '_id': trim_delseq_from_hgvs(_doc.pop('_id')),
                    'dbsnp': _doc,
                }
    finally:
        # Release the handle even if the consumer stops iterating early
        # or a line fails to parse.
        f.close()
Expand All @@ -187,5 +202,5 @@ def load_data_file(input_file, version):
# load path and find files, pass to data_generator
def load_data(path_glob, version='hg19'):
    """Yield documents from every file matching ``path_glob``.

    Args:
        path_glob: Glob pattern selecting the input files.
        version: Assembly key forwarded to ``load_data_file``.

    Yields:
        The documents produced by ``load_data_file`` for each matching
        file, with files visited in sorted path order.
    """
    for input_file in sorted(glob.glob(path_glob)):
        yield from load_data_file(input_file, version)
45 changes: 22 additions & 23 deletions src/hub/dataload/sources/dbsnp/dbsnp_upload.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import itertools, glob, os
import glob, os

from .dbsnp_json_parser import load_data_file
import biothings.hub.dataload.uploader as uploader
Expand All @@ -7,33 +7,32 @@


# Source metadata shared by the uploader classes via __metadata__["src_meta"].
SRC_META = {
    "url": "https://www.ncbi.nlm.nih.gov/projects/SNP/",
    "license_url": "https://www.ncbi.nlm.nih.gov/home/about/policies/",
    "license_url_short": "http://bit.ly/2AqoLOc",
}


class DBSNPBaseUploader(uploader.ParallelizedSourceUploader,
SnpeffPostUpdateUploader):
class DBSNPBaseUploader(uploader.ParallelizedSourceUploader, SnpeffPostUpdateUploader):

storage_class = MyVariantIgnoreDuplicatedStorage

def jobs(self):
    """Return one job per ``refsnp-chr*.json.bz2`` file in ``self.data_folder``.

    Returns:
        list: 1-tuples ``(file_path,)``, one for each matching file, in the
        order ``glob.glob`` reports them.
    """
    pattern = os.path.join(self.data_folder, "refsnp-chr*.json.bz2")
    return [(path, ) for path in glob.glob(pattern)]

def load_data(self, input_file):
    """Log the file being loaded and return its document generator.

    Args:
        input_file: Path of the file to parse.

    Returns:
        The result of ``load_data_file`` for ``input_file``, using the
        ``"assembly"`` entry of the concrete class's ``__metadata__``.
    """
    self.logger.info("Load data from '%s'", input_file)
    # The assembly is read from the class so each subclass selects its build.
    assembly = self.__class__.__metadata__["assembly"]
    return load_data_file(input_file, assembly)

def post_update_data(self, *args, **kwargs):
    """Run the parent post-update step, then index ``dbsnp.rsid``.

    All positional and keyword arguments are forwarded unchanged to the
    parent implementation.
    """
    super(DBSNPBaseUploader, self).post_update_data(*args, **kwargs)
    self.logger.info("Indexing 'rsid'")
    # background=true or it'll lock the whole database...
    self.collection.create_index("dbsnp.rsid", background=True)

@classmethod
def get_mapping(klass):
def get_mapping(cls):
mapping = {
"dbsnp": {
"properties": {
Expand Down Expand Up @@ -260,19 +259,19 @@ class DBSNPHg19Uploader(DBSNPBaseUploader):
main_source = "dbsnp"
name = "dbsnp_hg19"
__metadata__ = {
"mapper" : 'observed_skipidtoolong',
"assembly" : "hg19",
"src_meta" : SRC_META
}
"mapper": 'observed_skipidtoolong',
"assembly": "hg19",
"src_meta": SRC_META
}


class DBSNPHg38Uploader(DBSNPBaseUploader):
    """Uploader for the ``dbsnp_hg38`` sub-source of ``dbsnp``."""

    main_source = "dbsnp"
    name = "dbsnp_hg38"
    # "assembly" is what DBSNPBaseUploader.load_data passes to the parser.
    __metadata__ = {
        "mapper": 'observed_skipidtoolong',
        "assembly": "hg38",
        "src_meta": SRC_META,
    }

0 comments on commit 1fb8739

Please sign in to comment.