NLM Pubmed record processing

belbio · Jan 19, 2018 · 059672f · 059672f
1 parent b74cab5
commit 059672f
Showing 1 changed file with 249 additions and 0 deletions.
diff --git a/bel/nanopub/pubmed.py b/bel/nanopub/pubmed.py
@@ -0,0 +1,249 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+Pubmed related utilities
+
+Given PMID - collect Pubmed data and Pubtator Bioconcepts used for the BELMgr
+or enhancing BEL Nanopubs
+"""
+
+import os
+import requests
+from typing import Mapping, Any
+from lxml import etree
+import re
+import copy
+import datetime
+
+import logging
+import logging.config
+
+from bel.Config import config
+import bel.utils
+
+# Replace pmid
+PUBMED_TMPL = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&retmode=xml&id=PMID'
+PUBTATOR_TMPL = 'https://www.ncbi.nlm.nih.gov/CBBresearch/Lu/Demo/RESTful/tmTool.cgi/BioConcept/PMID/JSON'
+
+pubtator_ns_convert = {'CHEBI': 'CHEBI', 'Species': 'TAX', 'Gene': 'EG', 'Chemical': 'MESH', 'Disease': 'MESH'}
+pubtator_entity_convert = {'Chemical': 'Abundance', 'Gene': 'Gene', 'Disease': 'Pathology', 'Species': 'Organism', }
+pubtator_annotation_convert = {'Disease': 'Pathology', 'Species': 'Organism', }
+pubtator_known_types = [key for key in pubtator_ns_convert.keys()]
+
+
+def get_pubtator(pmid):
+    """Get Pubtator Bioconcepts from Pubmed Abstract
+
+    Re-configure the denotations into an annotation dictionary format
+    and collapse duplicate terms so that their spans are in a list.
+    """
+    r = bel.utils.get_url(PUBTATOR_TMPL.replace('PMID', pmid), timeout=10)
+    if r and r.status_code == 200:
+        pubtator = r.json()
+    else:
+        return None
+
+    known_types = ['CHEBI', 'Chemical', 'Disease', 'Gene', 'Species', ]
+
+    for idx, anno in enumerate(pubtator["denotations"]):
+        s_match = re.match('(\w+):(\w+)', anno['obj'])
+        c_match = re.match('(\w+):(\w+):(\w+)', anno['obj'])
+        if c_match:
+            (ctype, namespace, cid) = (c_match.group(1), c_match.group(2), c_match.group(3), )
+
+            if ctype not in known_types:
+                log.info(f'{ctype} not in known_types for Pubtator')
+            if namespace not in known_types:
+                log.info(f'{namespace} not in known_types for Pubtator')
+
+            pubtator["denotations"][idx]['obj'] = f'{pubtator_ns_convert.get(namespace, "UNKNOWN")}:{cid}'
+            pubtator["denotations"][idx]['entity_type'] = pubtator_entity_convert.get(ctype, None)
+            pubtator["denotations"][idx]['annotation_type'] = pubtator_annotation_convert.get(ctype, None)
+        elif s_match:
+            (ctype, cid) = (s_match.group(1), s_match.group(2), )
+
+            if ctype not in known_types:
+                log.info(f'{ctype} not in known_types for Pubtator')
+
+            pubtator["denotations"][idx]['obj'] = f'{pubtator_ns_convert.get(ctype, "UNKNOWN")}:{cid}'
+            pubtator["denotations"][idx]['entity_type'] = pubtator_entity_convert.get(ctype, None)
+            pubtator["denotations"][idx]['annotation_type'] = pubtator_annotation_convert.get(ctype, None)
+
+    annotations = {}
+    for anno in pubtator['denotations']:
+        log.info(anno)
+        if anno['obj'] not in annotations:
+            annotations[anno['obj']] = {'spans': [anno['span']]}
+            annotations[anno['obj']]['entity_types'] = [anno['entity_type']]
+            annotations[anno['obj']]['annotation_types'] = [anno['annotation_type']]
+
+        else:
+            annotations[anno['obj']]['spans'].append(anno['span'])
+
+    del pubtator['denotations']
+    pubtator['annotations'] = copy.deepcopy(annotations)
+
+    return pubtator
+
+
+def process_pub_date(year, mon, day):
+    """Create pub_date from what Pubmed provides in Journal PubDate entry
+    """
+
+    pub_date = None
+    if year and re.match('[a-zA-Z]+', mon):
+        pub_date = datetime.datetime.strptime(f'{year}-{mon}-{day}', '%Y-%b-%d').strftime('%Y-%m-%d')
+    elif year:
+        pub_date = f'{year}-{mon}-{day}'
+
+    return pub_date
+
+
+def get_pubmed(pmid: str) -> Mapping[str, Any]:
+    """Get pubmed xml for pmid and convert to JSON
+
+    Remove MESH terms if they are duplicated in the compound term set
+
+    ArticleDate vs PubDate gets complicated: https://www.nlm.nih.gov/bsd/licensee/elements_descriptions.html see <ArticleDate> and <PubDate>
+    Only getting pub_year at this point from the <PubDate> element.
+
+    Args:
+        pmid: pubmed id number as a string
+
+    Returns:
+        pubmed json
+    """
+    r = bel.utils.get_url(PUBMED_TMPL.replace('PMID', pmid))
+
+    root = etree.fromstring(r.content)
+    doc = {}
+    doc['pmid'] = root.xpath("//PMID/text()")[0]
+    doc['title'] = next(iter(root.xpath("//ArticleTitle/text()")), '')
+    doc['abstract'] = next(iter(root.xpath('//Abstract/AbstractText/text()')), '')
+
+    doc['authors'] = []
+    for author in root.xpath('//Author'):
+        last_name = next(iter(author.xpath('LastName/text()')), '')
+        first_name = next(iter(author.xpath('ForeName/text()')), '')
+        initials = next(iter(author.xpath('Initials/text()')), '')
+        if not first_name and initials:
+            first_name = initials
+        doc['authors'].append(f'{last_name}, {first_name}')
+
+    pub_year = next(iter(root.xpath("//Journal/JournalIssue/PubDate/Year/text()")), None)
+    pub_mon = next(iter(root.xpath("//Journal/JournalIssue/PubDate/Month/text()")), 'Jan')
+    pub_day = next(iter(root.xpath("//Journal/JournalIssue/PubDate/Day/text()")), '01')
+
+    pub_date = process_pub_date(pub_year, pub_mon, pub_day)
+
+    doc['pub_date'] = pub_date
+    doc['journal_title'] = next(iter(root.xpath('//Journal/Title/text()')), '')
+    doc['joural_iso_title'] = next(iter(root.xpath('//Journal/ISOAbbreviation/text()')), '')
+    doc['doi'] = next(iter(root.xpath('//ArticleId[@IdType="doi"]/text()')), None)
+
+    doc['compounds'] = []
+    for chem in root.xpath("//ChemicalList/Chemical/NameOfSubstance"):
+        chem_id = chem.get('UI')
+        doc['compounds'].append({'id': f"MESH:{chem_id}", 'name': chem.text})
+
+    compounds = [cmpd['id'] for cmpd in doc['compounds']]
+    doc['mesh'] = []
+    for mesh in root.xpath("//MeshHeading/DescriptorName"):
+        mesh_id = f"MESH:{mesh.get('UI')}"
+        if mesh_id in compounds:
+            continue
+        doc['mesh'].append({'id': mesh_id, 'name': mesh.text})
+
+    return doc
+
+
+def enhance_pubmed_annotations(pubmed: Mapping[str, Any]) -> Mapping[str, Any]:
+    """Enhance pubmed namespace IDs
+
+    Add additional entity and annotation types to annotations
+    Use preferred id for namespaces as needed
+    Add strings from Title, Abstract matching Pubtator BioConcept spans
+
+    NOTE - basically duplicated code with bel_api:api.services.pubmed
+
+    Args:
+        pubmed
+
+    Returns:
+        pubmed object
+    """
+
+    text = pubmed['title'] + pubmed['abstract']
+
+    annotations = {}
+
+    for nsarg in pubmed['annotations']:
+        url = f'{config["bel_api"]["servers"]["api_url"]}/terms/{nsarg}'
+        log.info(f'URL: {url}')
+        r = bel.utils.get_url(url)
+        log.info(f'Result: {r}')
+        new_nsarg = ''
+        if r and r.status_code == 200:
+            term = r.json()
+            new_nsarg = bel.lang.bel_utils.convert_nsarg(term['id'], decanonicalize=True)
+
+            pubmed['annotations'][nsarg]['name'] = term['name']
+            pubmed['annotations'][nsarg]['label'] = term['label']
+            pubmed['annotations'][nsarg]['entity_types'] = list(set(pubmed['annotations'][nsarg]['entity_types'] + term.get('entity_types', [])))
+            pubmed['annotations'][nsarg]['annotation_types'] = list(set(pubmed['annotations'][nsarg]['annotation_types'] + term.get('annotation_types', [])))
+
+        if new_nsarg != nsarg:
+            annotations[new_nsarg] = copy.deepcopy(pubmed['annotations'][nsarg])
+        else:
+            annotations[nsarg] = copy.deepcopy(pubmed['annotations'][nsarg])
+
+    for nsarg in annotations:
+        for idx, span in enumerate(annotations[nsarg]['spans']):
+            string = text[span['begin'] - 1:span['end'] - 1]
+            annotations[nsarg]['spans'][idx]['text'] = string
+
+    pubmed['annotations'] = copy.deepcopy(annotations)
+
+    return pubmed
+
+
+def get_pubmed_for_beleditor(pmid: str) -> Mapping[str, Any]:
+    """Get fully annotated pubmed doc with Pubtator and full entity/annotation_types
+
+    Args:
+        pmid: Pubmed PMID
+
+    Returns:
+        Mapping[str, Any]: pubmed dictionary
+    """
+
+    pubmed = get_pubmed(pmid)
+    pubtator = get_pubtator(pmid)
+    pubmed['annotations'] = copy.deepcopy(pubtator['annotations'])
+
+    # Add entity types and annotation types to annotations
+    pubmed = enhance_pubmed_annotations(pubmed)
+
+    return pubmed
+
+
+def main():
+
+    pmid = '19894120'
+
+    pubmed = get_pubmed_for_beleditor(pmid)
+
+    import json
+    print('DumpVar:\n', json.dumps(pubmed, indent=4))
+
+
+# Module logger set-up and script entry point. `log` is used as a global by
+# the functions in this module, so it is bound in both branches below.
+if __name__ == '__main__':
+
+    # Run as a script: apply the logging settings from config['logging'],
+    # create the module logger, then run the sample lookup in main().
+    # NOTE(review): the order looks deliberate - dictConfig can disable
+    # loggers created before it runs (disable_existing_loggers) - confirm
+    # before hoisting the getLogger call above this block.
+    logging.config.dictConfig(config['logging'])
+    log = logging.getLogger(__name__)
+
+    main()
+
+else:
+    # Imported as a module: only create the logger; logging configuration
+    # is not touched here
+    log = logging.getLogger(__name__)