Skip to content

Commit

Permalink
clingen added + renamed pantherdb
Browse files Browse the repository at this point in the history
  • Loading branch information
sirloon committed Oct 29, 2019
1 parent cb0a5ef commit 970d438
Show file tree
Hide file tree
Showing 9 changed files with 279 additions and 0 deletions.
1 change: 1 addition & 0 deletions src/hub/dataload/sources/clingen/README
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Originally from https://github.com/chvbs2000/ClinGen
2 changes: 2 additions & 0 deletions src/hub/dataload/sources/clingen/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
from .dump import ClingenDumper
from .upload import ClingenUploader
49 changes: 49 additions & 0 deletions src/hub/dataload/sources/clingen/dump.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
import os

import biothings, config
biothings.config_for_app(config)
from config import DATA_ARCHIVE_ROOT

from biothings.utils.common import uncompressall

import biothings.hub.dataload.dumper


class ClingenDumper(biothings.hub.dataload.dumper.LastModifiedHTTPDumper):
    """Dumper for the ClinGen gene-disease validity CSV export.

    Downloads the file listed in ``SRC_URLS`` and derives the release
    string from the date embedded in the ``Content-Disposition`` header
    of that URL.
    """

    SRC_NAME = "clingen"
    SRC_ROOT_FOLDER = os.path.join(DATA_ARCHIVE_ROOT, SRC_NAME)
    # The download is a plain CSV file; post_dump() only uncompresses
    # when this flag is switched on.
    UNCOMPRESS = False
    SRC_URLS = ['https://search.clinicalgenome.org/kb/gene-validity.csv']
    # crontab-style expression (minute 0, hour 6, every day).
    SCHEDULE = "0 6 * * *"
    __metadata__ = {
        "src_meta": {
            'license_url': 'https://www.clinicalgenome.org/docs/terms-of-use/',
            'license': 'CC0 1.0 Universal',
            'url': 'https://search.clinicalgenome.org/kb/gene-validity'
        }
    }

    def post_dump(self, *args, **kwargs):
        """Uncompress the downloaded archives when ``UNCOMPRESS`` is set."""
        if self.__class__.UNCOMPRESS:
            self.logger.info("Uncompress all archive files in '%s'" %
                             self.new_data_folder)
            uncompressall(self.new_data_folder)

    def get_release(self):
        """Return the release date (``YYYY-MM-DD``) of the remote file.

        The date is the first ``YYYY-MM-DD`` pattern found in the
        ``Content-Disposition`` header returned by a HEAD request on
        ``SRC_URLS[0]``.

        Raises:
            KeyError: the response has no ``Content-Disposition`` header.
            IndexError: the header does not contain any date.
        """
        import re

        response = self.client.head(self.SRC_URLS[0])
        text = response.headers["Content-Disposition"]
        dates = re.findall(r'\d{4}-\d\d-\d\d', text)
        if not dates:
            # IndexError is kept so the failure type matches what indexing
            # an empty match list used to raise, but with a usable message.
            raise IndexError(
                "no YYYY-MM-DD date found in Content-Disposition header: %r"
                % text)

        return dates[0]

    def set_release(self):
        """Store the value of :meth:`get_release` in ``self.release``."""
        self.release = self.get_release()
148 changes: 148 additions & 0 deletions src/hub/dataload/sources/clingen/parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
import os
import unicodedata
from collections import defaultdict
import csv
from datetime import date
from biothings.utils.dataload import dict_sweep, open_anyfile
import requests
import json


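# Functions in this module: load_data, parse_data, hgnc2entrez
# load_data:
# 1. parse_data reads the ClinGen CSV and builds one document per gene
# 2. hgnc2entrez converts the HGNC gene IDs to Entrez gene IDs
# Classification values are converted to lower case.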

def load_data(data_access):
    """Yield, one by one, the gene documents parsed from *data_access*."""
    yield from parse_data(data_access)

def _locate_input_file(data_access):
    """Return the path of the ClinGen CSV file inside *data_access*.

    The file stamped with today's date is preferred. If it is absent
    (e.g. the upload runs on a later day than the dump), fall back to the
    most recent ``ClinGen-Gene-Disease-Summary-*.csv`` and finally to
    ``gene-validity.csv``.

    Raises:
        FileNotFoundError: none of the candidate files exists.
    """
    import glob

    dated_name = "ClinGen-Gene-Disease-Summary{}.csv".format(
        date.today().strftime("-%Y-%m-%d"))
    candidates = [os.path.join(data_access, dated_name)]

    # ISO dates sort lexicographically, so reverse order means newest first.
    pattern = os.path.join(glob.escape(data_access),
                           "ClinGen-Gene-Disease-Summary-*.csv")
    candidates.extend(sorted(glob.glob(pattern), reverse=True))

    # NOTE(review): "gene-validity.csv" is the basename of the download URL;
    # presumably the dumper may store the file under that name -- confirm.
    candidates.append(os.path.join(data_access, "gene-validity.csv"))

    for path in candidates:
        if os.path.exists(path):
            return path
    raise FileNotFoundError(
        "input file '%s' does not exist" % candidates[0])


def parse_data(data_access):
    """Parse the ClinGen gene-disease CSV found in *data_access*.

    Each usable row becomes a document keyed by its HGNC gene ID; rows
    sharing a gene are merged into one document whose
    ``clingen.clinical_validity`` is a list. The documents are then passed
    to ``hgnc2entrez`` and its result is returned.

    Args:
        data_access: folder containing the downloaded CSV file.

    Returns:
        The list of documents returned by ``hgnc2entrez``.

    Raises:
        FileNotFoundError: no input file could be found in *data_access*.
        ValueError: the file ends before the first data row.
    """
    data_dir = _locate_input_file(data_access)

    with open_anyfile(data_dir) as input_file:
        try:
            # Skip the 4 lines preceding the column header.
            for _ in range(4):
                next(input_file)
            header_line = next(input_file)
            # Skip the line that follows the column header.
            next(input_file)
        except StopIteration:
            raise ValueError(
                "input file '%s' ends before the data rows" % data_dir
            ) from None

        # Let the csv module split the header so quoted names are handled.
        header = next(csv.reader([header_line.strip()]))
        # Drop exact duplicate lines while keeping the file order, so the
        # output is deterministic from one run to the next.
        lines = list(dict.fromkeys(input_file))

    reader = csv.DictReader(lines, fieldnames=header, delimiter=",")
    key_list = ['DISEASE LABEL', 'DISEASE ID (MONDO)', 'SOP',
                'CLASSIFICATION', 'ONLINE REPORT']
    output = defaultdict(list)

    for row in reader:
        # Skip rows without an HGNC ID.
        raw_hgnc = row.get('GENE ID (HGNC)')
        if not raw_hgnc:
            continue
        # "HGNC:1234" -> "1234"; rows without the prefix separator are skipped.
        parts = raw_hgnc.split(':')
        if len(parts) < 2 or not parts[1]:
            continue
        hgnc_id = parts[1]

        validity = {}
        for key in key_list:
            # DictReader yields None for columns missing from a short row.
            value = row.get(key)
            if key == 'DISEASE ID (MONDO)':
                # Disease value: "MONDO_ID" -> "MONDO:ID".
                field = 'mondo'
                if value is not None:
                    value = value.replace("_", ":")
            else:
                # Field name: lower case with underscores.
                field = key.lower().replace(' ', '_')
                if key == 'CLASSIFICATION' and value is not None:
                    value = value.lower()
            validity[field] = value

        gene = {'_id': hgnc_id, 'clingen': {'clinical_validity': validity}}
        gene = dict_sweep(gene, vals=['', 'null', 'N/A', None, [], {}])
        output[gene['_id']].append(gene)

    # Merge duplicates: a gene linked to several diseases gets a single
    # document holding a list of clinical validity records.
    temp_output = []
    for value in output.values():
        if len(value) == 1:
            temp_output.append(value[0])
        else:
            temp_output.append({
                '_id': value[0]['_id'],
                'clingen': {
                    'clinical_validity': [
                        v['clingen']['clinical_validity'] for v in value
                    ]
                }
            })

    return hgnc2entrez(temp_output)

def hgnc2entrez(data_dict_list):
    """Replace the HGNC ``_id`` of each document with the mygene.info ``_id``.

    The HGNC IDs are looked up on mygene.info (``scopes=HGNC``) in batches.
    Documents whose HGNC ID has no match are logged and left out of the
    result instead of aborting the whole conversion.

    Args:
        data_dict_list: list of dicts, each carrying an HGNC ID in ``_id``.
            The dicts are updated in place.

    Returns:
        The list of documents that could be converted, in input order.

    Raises:
        requests.HTTPError: mygene.info answered with an error status.
    """
    import logging

    # Nothing to convert: avoid sending an empty query.
    if not data_dict_list:
        return []

    # Unique HGNC IDs, in first-seen order.
    hgnc_ids = list(dict.fromkeys(
        str(element['_id']) for element in data_dict_list))

    # Build the HGNC -> mygene.info ID conversion dictionary.
    entrez_hgnc_dict = {}
    batch_size = 1000  # documented per-request limit for POST batch queries
    for start in range(0, len(hgnc_ids), batch_size):
        batch = hgnc_ids[start:start + batch_size]
        res = requests.post(
            'https://mygene.info/v3/query',
            data={'q': ','.join(batch), 'scopes': 'HGNC', 'fields': '_id'},
        )
        res.raise_for_status()
        for hit in res.json():
            # Unmatched queries come back without an '_id' field.
            if '_id' in hit:
                entrez_hgnc_dict[str(hit['query'])] = hit['_id']

    final_output = []
    logger = logging.getLogger(__name__)
    for element in data_dict_list:
        entrez_id = entrez_hgnc_dict.get(str(element['_id']))
        if entrez_id is None:
            logger.warning(
                "No mygene.info match for HGNC:%s, document skipped",
                element['_id'])
            continue
        element['_id'] = entrez_id
        final_output.append(element)

    return final_output





64 changes: 64 additions & 0 deletions src/hub/dataload/sources/clingen/upload.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
import os

import biothings, config
biothings.config_for_app(config)

import biothings.hub.dataload.uploader

# when code is exported, import becomes relative
try:
from ClinGen.parser import load_data as parser_func
except ImportError:
from .parser import load_data as parser_func


class ClingenUploader(biothings.hub.dataload.uploader.BaseSourceUploader):
    """Uploader for the ClinGen source.

    Documents come from the parser's ``load_data`` function, and
    ``get_mapping`` describes the ``clingen.clinical_validity`` fields.
    """

    name = "clingen"
    __metadata__ = {
        "src_meta": {
            'license_url': 'https://www.clinicalgenome.org/docs/terms-of-use/',
            'license': 'CC0 1.0 Universal',
            'url': 'https://search.clinicalgenome.org/kb/gene-validity'
        }
    }
    idconverter = None
    storage_class = biothings.hub.dataload.storage.IgnoreDuplicatedStorage

    def load_data(self, data_folder):
        """Return the documents parsed from *data_folder*."""
        self.logger.info("Load data from directory: '%s'" % data_folder)
        return parser_func(data_folder)

    @classmethod
    def get_mapping(klass):
        """Return the index mapping for the ``clingen`` field."""
        return {
            'clingen': {
                'properties': {
                    'clinical_validity': {
                        'properties': {
                            'classification': {
                                'normalizer': 'keyword_lowercase_normalizer',
                                'type': 'keyword'
                            },
                            'disease_label': {
                                'type': 'text'
                            },
                            'mondo': {
                                'copy_to': ['all'],
                                'normalizer': 'keyword_lowercase_normalizer',
                                'type': 'keyword'
                            },
                            'online_report': {
                                'index': False,
                                'type': 'text'
                            },
                            'sop': {
                                'normalizer': 'keyword_lowercase_normalizer',
                                'type': 'keyword'
                            }
                        }
                    }
                }
            }
        }

15 changes: 15 additions & 0 deletions src/hub/dataload/sources/clingen/version.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
def get_release(self):
    """Return the release date (``YYYY-MM-DD``) of the remote file.

    The date is the first ``YYYY-MM-DD`` pattern found in the
    ``Content-Disposition`` header returned by a HEAD request on
    ``self.SRC_URLS[0]``.

    Raises:
        KeyError: the response has no ``Content-Disposition`` header.
        IndexError: the header does not contain any date.
    """
    import re

    response = self.client.head(self.SRC_URLS[0])
    text = response.headers["Content-Disposition"]
    dates = re.findall(r'\d{4}-\d\d-\d\d', text)
    if not dates:
        # IndexError is kept so the failure type matches what indexing an
        # empty match list used to raise, but with a usable message.
        raise IndexError(
            "no YYYY-MM-DD date found in Content-Disposition header: %r"
            % text)

    return dates[0]


File renamed without changes.
File renamed without changes.
File renamed without changes.

0 comments on commit 970d438

Please sign in to comment.