clingen added + renamed pantherdb

biothings · Oct 29, 2019 · 970d438 · 970d438
1 parent cb0a5ef
commit 970d438
Show file tree

Hide file tree

Showing 9 changed files with 279 additions and 0 deletions.
diff --git a/src/hub/dataload/sources/clingen/README b/src/hub/dataload/sources/clingen/README
@@ -0,0 +1 @@
+Originally from https://github.com/chvbs2000/ClinGen
diff --git a/src/hub/dataload/sources/clingen/__init__.py b/src/hub/dataload/sources/clingen/__init__.py
@@ -0,0 +1,2 @@
+from .dump import ClingenDumper
+from .upload import ClingenUploader
diff --git a/src/hub/dataload/sources/clingen/dump.py b/src/hub/dataload/sources/clingen/dump.py
@@ -0,0 +1,49 @@
+import os
+
+import biothings, config
+biothings.config_for_app(config)
+from config import DATA_ARCHIVE_ROOT
+
+from biothings.utils.common import uncompressall
+
+import biothings.hub.dataload.dumper
+
+
+class ClingenDumper(biothings.hub.dataload.dumper.LastModifiedHTTPDumper):
+    """
+    Dumper settings for the ClinGen gene-validity CSV.
+
+    This subclass only supplies the source URL, schedule, metadata and the
+    release lookup; everything else is inherited from LastModifiedHTTPDumper.
+    """
+
+    SRC_NAME = "clingen"
+    SRC_ROOT_FOLDER = os.path.join(DATA_ARCHIVE_ROOT, SRC_NAME)
+    UNCOMPRESS = False
+    SRC_URLS = ['https://search.clinicalgenome.org/kb/gene-validity.csv']
+    # NOTE(review): looks like a cron expression (every day at 06:00) --
+    # confirm against the hub scheduler
+    SCHEDULE = "0 6 * * *"
+    __metadata__ = {
+        "src_meta": {
+            'license_url': 'https://www.clinicalgenome.org/docs/terms-of-use/',
+            # NOTE(review): key is spelled 'licence'; confirm whether the
+            # metadata consumers expect 'license' before renaming it
+            'licence': 'CC0 1.0 Universal',
+            'url': 'https://search.clinicalgenome.org/kb/gene-validity'
+        }
+    }
+
+    def post_dump(self, *args, **kwargs):
+        """
+        uncompress everything in the new data folder when UNCOMPRESS is set
+        """
+        if self.__class__.UNCOMPRESS:
+            self.logger.info("Uncompress all archive files in '%s'" %
+                             self.new_data_folder)
+            uncompressall(self.new_data_folder)
+
+    def get_release(self):
+        """
+        return the most updated version
+
+        The version is the first YYYY-MM-DD date found in the
+        Content-Disposition header returned by a HEAD request on SRC_URLS[0].
+
+        raises IndexError when that header contains no date
+        """
+
+        import re
+
+        response = self.client.head(self.SRC_URLS[0])
+        text = response.headers["Content-Disposition"]
+        dates = re.findall(r'\d{4}-\d\d-\d\d', text)
+        if not dates:
+            # same exception type as the former bare `date[0]`, but with a
+            # message that says what is actually wrong
+            raise IndexError(
+                "no release date found in Content-Disposition header: %r" % text)
+
+        return dates[0]
+
+    def set_release(self):
+        """
+        store the value returned by get_release() in self.release
+        """
+        self.release = self.get_release()
diff --git a/src/hub/dataload/sources/clingen/parser.py b/src/hub/dataload/sources/clingen/parser.py
@@ -0,0 +1,148 @@
+import os
+import unicodedata
+from collections import defaultdict
+import csv
+from datetime import date
+from biothings.utils.dataload import dict_sweep, open_anyfile
+import requests
+import json
+
+
+# functions: load_data, parse_data, hgnc2entrez
+# load_data:
+#   1. parse_data (builds the documents from the CSV file)
+#   2. hgnc2entrez (ID conversion, called by parse_data)
+# classification values are converted to lower case
+
+def load_data(data_access):
+    """
+    generator entry point: yield, one by one, the documents that
+    parse_data builds for the folder data_access
+    """
+
+    yield from parse_data(data_access)
+
+def _locate_summary_file(data_folder):
+    """
+    return the path of the ClinGen summary CSV found in data_folder
+
+    Today's "ClinGen-Gene-Disease-Summary-YYYY-MM-DD.csv" is preferred; when
+    it is missing (e.g. the upload runs on a later day than the download) the
+    most recently dated summary present in the folder is used instead.
+
+    raises AssertionError when no summary file can be found
+    """
+    import glob
+
+    prefix = "ClinGen-Gene-Disease-Summary"
+    current_time = date.today().strftime("-%Y-%m-%d")
+    expected = os.path.join(data_folder, "{}{}.csv".format(prefix, current_time))
+    if os.path.exists(expected):
+        return expected
+
+    # ISO dates sort chronologically, so the last match is the newest file
+    date_glob = "-[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9].csv"
+    pattern = os.path.join(glob.escape(data_folder), prefix + date_glob)
+    dated_files = sorted(glob.glob(pattern))
+    if dated_files:
+        return dated_files[-1]
+
+    # explicit raise instead of `assert`, which is stripped under `python -O`;
+    # AssertionError is kept so callers see the same exception type as before
+    raise AssertionError("input file '%s' does not exist" % expected)
+
+
+def _unique_lines(lines):
+    """
+    yield each distinct line once, in first-seen order
+    """
+    seen = set()
+    for line in lines:
+        if line not in seen:
+            seen.add(line)
+            yield line
+
+
+def parse_data(data_access):
+    """
+    return: the list produced by hgnc2entrez, i.e. one nested dictionary per
+    gene whose HGNC based _id has been converted by hgnc2entrez
+
+    data_access is the folder holding the ClinGen summary CSV. Each document
+    looks like {'_id': ..., 'clingen': {'clinical_validity': ...}} where
+    clinical_validity is a dictionary for a gene listed once and a list of
+    dictionaries for a gene listed several times.
+    """
+
+    data_dir = _locate_summary_file(data_access)
+    output = defaultdict(list)
+
+    # read file
+    with open_anyfile(data_dir) as input_file:
+
+        # the first four lines come before the column header
+        for _ in range(4):
+            next(input_file)
+
+        # let the csv module parse the header so quoted names are handled too
+        header = next(csv.reader([next(input_file)]))
+        # one more non-data line follows the header
+        next(input_file)
+
+        # identical lines are dropped; unlike set(), _unique_lines keeps the
+        # file order so the output is the same on every run
+        reader = csv.DictReader(_unique_lines(input_file), fieldnames = header, delimiter = ",")
+
+        for row in reader:
+            # skip samples with empty HGNC
+            hgnc_field = row.get('GENE ID (HGNC)')
+            if not hgnc_field:
+                continue
+            # "HGNC:<number>" -> "<number>"; skip values without the prefix
+            hgnc_parts = hgnc_field.split(':')
+            if len(hgnc_parts) < 2:
+                continue
+            hgnc_id = hgnc_parts[1]
+
+            # `or ''` guards short rows, for which DictReader returns None
+            validity = {
+                'disease_label': row.get('DISEASE LABEL'),
+                # disease value: "MONDO_ID" -> "MONDO:ID"
+                'mondo': (row.get('DISEASE ID (MONDO)') or '').replace("_", ":"),
+                'sop': row.get('SOP'),
+                # value to lower case
+                'classification': (row.get('CLASSIFICATION') or '').lower(),
+                'online_report': row.get('ONLINE REPORT'),
+            }
+
+            # store every gene's information into a nested dictionary
+            gene = {'_id': hgnc_id, 'clingen': {'clinical_validity': validity}}
+            gene = dict_sweep(gene, vals = ['','null','N/A',None, [],{}])
+            output[gene['_id']].append(gene)
+
+    temp_output = []
+
+    # merge duplicates, this may happen when a gene causes multiple diseases
+    # and has multiple labels
+    for records in output.values():
+        # genes without duplicate
+        if len(records) == 1:
+            temp_output.append(records[0])
+
+        # genes in duplicate
+        else:
+            temp_output.append({
+                '_id': records[0]['_id'],
+                'clingen': {
+                    'clinical_validity': [r['clingen']['clinical_validity'] for r in records]
+                    }
+                })
+
+    return hgnc2entrez(temp_output)
+
+def hgnc2entrez(data_dict_list):
+    """
+    converts HGNC_ID to ENTREZ_ID
+
+    data_dict_list is a list of documents whose '_id' is a numeric HGNC ID.
+    The IDs are sent to the mygene.info query service (scope HGNC) and each
+    document's '_id' is replaced, in place, by the '_id' returned for it.
+
+    return: the list of converted documents, in input order. Documents for
+    which mygene.info returns no match are left out (a warning is logged)
+    instead of aborting the whole conversion with a KeyError.
+
+    raises ValueError when an '_id' is not numeric
+    """
+    import logging
+
+    # nothing to convert: do not send an empty query
+    if not data_dict_list:
+        return []
+
+    # normalise each ID the way it is sent in the query (int -> str) so the
+    # 'query' field of the response can be matched back to the document
+    doc_keys = [str(int(element['_id'])) for element in data_dict_list]
+
+    # remove duplicate HGNC genes, keeping first-seen order
+    hgnc_ids = []
+    seen = set()
+    for key in doc_keys:
+        if key not in seen:
+            seen.add(key)
+            hgnc_ids.append(key)
+
+    # retrieve ENTREZ ID from mygene.info based on HGNC ID
+    headers = {'content-type':'application/x-www-form-urlencoded'}
+    # mygene.info accepts at most 1000 terms per batch query
+    batch_size = 1000
+
+    # build ID conversion dictionary
+    entrez_hgnc_dict = {}
+    for start in range(0, len(hgnc_ids), batch_size):
+        batch = hgnc_ids[start:start + batch_size]
+        params = 'q={}&scopes=HGNC&fields=_id'.format(','.join(batch))
+        res = requests.post('http://mygene.info/v3/query', data=params, headers=headers)
+        # fail on HTTP errors here rather than on a confusing payload below
+        res.raise_for_status()
+        for hit in json.loads(res.text):
+            # hits for unknown IDs carry no '_id'
+            if '_id' in hit:
+                entrez_hgnc_dict[hit['query']] = hit['_id']
+
+    final_output = []
+    unmapped = []
+
+    # store updated gene dictionary in final_output list
+    for key, element in zip(doc_keys, data_dict_list):
+        if key not in entrez_hgnc_dict:
+            unmapped.append(key)
+            continue
+        # convert HGNC ID to ENTREZ ID
+        element['_id'] = entrez_hgnc_dict[key]
+        final_output.append(element)
+
+    if unmapped:
+        logging.getLogger(__name__).warning(
+            "no mygene.info match for %d HGNC ID(s), documents skipped: %s",
+            len(unmapped), ", ".join(unmapped))
+
+    return final_output
+
+
+
+
+
diff --git a/src/hub/dataload/sources/clingen/upload.py b/src/hub/dataload/sources/clingen/upload.py
@@ -0,0 +1,64 @@
+import os
+
+import biothings, config
+biothings.config_for_app(config)
+
+import biothings.hub.dataload.uploader
+
+# when code is exported, import becomes relative
+try:
+    from ClinGen.parser import load_data as parser_func
+except ImportError:
+    from .parser import load_data as parser_func
+
+
+class ClingenUploader(biothings.hub.dataload.uploader.BaseSourceUploader):
+    """
+    Uploader for the "clingen" source: documents are produced by the parser
+    module's load_data and described by the mapping from get_mapping.
+    """
+
+    name = "clingen"
+    __metadata__ = {
+        "src_meta": dict(
+            license_url='https://www.clinicalgenome.org/docs/terms-of-use/',
+            licence='CC0 1.0 Universal',
+            url='https://search.clinicalgenome.org/kb/gene-validity',
+        )
+    }
+    idconverter = None
+    storage_class = biothings.hub.dataload.storage.IgnoreDuplicatedStorage
+
+    def load_data(self, data_folder):
+        """
+        log the folder being read, then hand it to the parser
+        """
+        message = "Load data from directory: '%s'" % data_folder
+        self.logger.info(message)
+        return parser_func(data_folder)
+
+    @classmethod
+    def get_mapping(klass):
+        """
+        return the index mapping for the fields under clingen.clinical_validity
+        """
+        # settings shared by every lowercase-normalized keyword field
+        lowercase_keyword = {
+            'normalizer': 'keyword_lowercase_normalizer',
+            'type': 'keyword',
+        }
+        validity_fields = {
+            'classification': dict(lowercase_keyword),
+            'disease_label': {'type': 'text'},
+            'mondo': {'copy_to': ['all'], **lowercase_keyword},
+            'online_report': {'index': False, 'type': 'text'},
+            'sop': dict(lowercase_keyword),
+        }
+        return {
+            'clingen': {
+                'properties': {
+                    'clinical_validity': {'properties': validity_fields}
+                }
+            }
+        }
+
diff --git a/src/hub/dataload/sources/clingen/version.py b/src/hub/dataload/sources/clingen/version.py
@@ -0,0 +1,15 @@
+def get_release(self):
+    """
+    return the most updated version
+
+    The version is the first YYYY-MM-DD date found in the Content-Disposition
+    header returned by a HEAD request on self.SRC_URLS[0], made through
+    self.client.
+
+    raises IndexError when that header contains no date
+    """
+
+    import re
+
+    response = self.client.head(self.SRC_URLS[0])
+    text = response.headers["Content-Disposition"]
+    dates = re.findall(r'\d{4}-\d\d-\d\d', text)
+    if not dates:
+        # same exception type as the former bare `date[0]`, but with a
+        # message that says what is actually wrong
+        raise IndexError(
+            "no release date found in Content-Disposition header: %r" % text)
+
+    return dates[0]
+
+
diff --git a/src/plugins/PantherDB/README.md → src/plugins/pantherdb/README.md b/src/plugins/PantherDB/README.md → src/plugins/pantherdb/README.md
diff --git a/src/plugins/PantherDB/manifest.json → src/plugins/pantherdb/manifest.json b/src/plugins/PantherDB/manifest.json → src/plugins/pantherdb/manifest.json
diff --git a/src/plugins/PantherDB/parser.py → src/plugins/pantherdb/parser.py b/src/plugins/PantherDB/parser.py → src/plugins/pantherdb/parser.py