-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
1 changed file
with
249 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,249 @@ | ||
#!/usr/bin/env python | ||
# -*- coding: utf-8 -*- | ||
|
||
""" | ||
Pubmed related utilities | ||
Given PMID - collect Pubmed data and Pubtator Bioconcepts used for the BELMgr | ||
or enhancing BEL Nanopubs | ||
""" | ||
|
||
import os | ||
import requests | ||
from typing import Mapping, Any | ||
from lxml import etree | ||
import re | ||
import copy | ||
import datetime | ||
|
||
import logging | ||
import logging.config | ||
|
||
from bel.Config import config | ||
import bel.utils | ||
|
||
# URL templates: the literal token 'PMID' is replaced with the actual Pubmed ID
PUBMED_TMPL = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&retmode=xml&id=PMID'
PUBTATOR_TMPL = 'https://www.ncbi.nlm.nih.gov/CBBresearch/Lu/Demo/RESTful/tmTool.cgi/BioConcept/PMID/JSON'

# Pubtator concept type -> namespace prefix used when rebuilding the term ID
pubtator_ns_convert = dict(
    CHEBI='CHEBI',
    Species='TAX',
    Gene='EG',
    Chemical='MESH',
    Disease='MESH',
)

# Pubtator concept type -> entity type
pubtator_entity_convert = dict(
    Chemical='Abundance',
    Gene='Gene',
    Disease='Pathology',
    Species='Organism',
)

# Pubtator concept type -> annotation type
pubtator_annotation_convert = dict(
    Disease='Pathology',
    Species='Organism',
)

# Concept types that have a namespace conversion defined above
pubtator_known_types = list(pubtator_ns_convert)
|
||
|
||
# Pubtator 'obj' values look like "Type:ID" or "Type:Namespace:ID"
_PUBTATOR_COMPOUND_RE = re.compile(r'(\w+):(\w+):(\w+)')
_PUBTATOR_SIMPLE_RE = re.compile(r'(\w+):(\w+)')


def _convert_pubtator_obj(obj):
    """Convert a Pubtator 'obj' string to (term_id, entity_type, annotation_type).

    Strings that match neither "Type:Namespace:ID" nor "Type:ID" are returned
    unchanged with both types set to None.
    """
    compound = _PUBTATOR_COMPOUND_RE.match(obj)
    if compound:
        ctype, namespace, cid = compound.groups()
        if ctype not in pubtator_known_types:
            log.info(f'{ctype} not in known_types for Pubtator')
        if namespace not in pubtator_known_types:
            log.info(f'{namespace} not in known_types for Pubtator')
        # NOTE(review): the namespace part is looked up in a table keyed mostly by
        # concept type, so e.g. a 'MESH' namespace converts to 'UNKNOWN' - confirm intent
        ns_key = namespace
    else:
        simple = _PUBTATOR_SIMPLE_RE.match(obj)
        if not simple:
            log.info(f'Could not parse Pubtator obj: {obj}')
            return obj, None, None
        ctype, cid = simple.groups()
        if ctype not in pubtator_known_types:
            log.info(f'{ctype} not in known_types for Pubtator')
        ns_key = ctype

    term_id = f'{pubtator_ns_convert.get(ns_key, "UNKNOWN")}:{cid}'
    return term_id, pubtator_entity_convert.get(ctype), pubtator_annotation_convert.get(ctype)


def get_pubtator(pmid):
    """Get Pubtator Bioconcepts from Pubmed Abstract

    Re-configure the denotations into an annotation dictionary format
    and collapse duplicate terms so that their spans are in a list.

    Args:
        pmid: Pubmed ID as a string (substituted into PUBTATOR_TMPL)

    Returns:
        The Pubtator JSON document with its 'denotations' list replaced by an
        'annotations' dict keyed by converted term ID; each value holds
        'spans', 'entity_types' and 'annotation_types' lists.
        None if the request fails or does not return HTTP 200.
    """
    r = bel.utils.get_url(PUBTATOR_TMPL.replace('PMID', pmid), timeout=10)
    if not r or r.status_code != 200:
        return None
    pubtator = r.json()

    annotations = {}
    # pop() both removes the raw denotations and tolerates a response without them
    for anno in pubtator.pop('denotations', []):
        term_id, entity_type, annotation_type = _convert_pubtator_obj(anno['obj'])
        anno['obj'] = term_id
        anno['entity_type'] = entity_type
        anno['annotation_type'] = annotation_type
        log.info(anno)

        if term_id in annotations:
            # Same term seen again - only record the additional span
            annotations[term_id]['spans'].append(anno['span'])
        else:
            annotations[term_id] = {
                'spans': [anno['span']],
                'entity_types': [entity_type],
                'annotation_types': [annotation_type],
            }

    pubtator['annotations'] = annotations

    return pubtator
|
||
|
||
def process_pub_date(year, mon, day):
    """Create pub_date from what Pubmed provides in Journal PubDate entry

    Args:
        year: publication year as a string; a falsy value yields None
        mon: month as an abbreviated/full English name or a number string;
            a missing value is treated as '01'
        day: day of month as a string; a missing value is treated as '01'

    Returns:
        'YYYY-MM-DD' string when the parts form a valid date, the raw
        '{year}-{mon}-{day}' string when they cannot be parsed, or None
        when no year is given.
    """
    if not year:
        return None

    mon = str(mon) if mon else '01'
    day = str(day) if day else '01'
    raw = f'{year}-{mon}-{day}'

    if re.match(r'[a-zA-Z]+', mon):
        # Month given by name: accept both 'Jan' and 'January'
        for fmt in ('%Y-%b-%d', '%Y-%B-%d'):
            try:
                return datetime.datetime.strptime(raw, fmt).strftime('%Y-%m-%d')
            except ValueError:
                continue
        # Unparseable month text (e.g. a season or range) - pass through unchanged
        return raw

    if mon.isdigit() and day.isdigit():
        # Numeric parts: zero-pad so the result matches the named-month format
        try:
            return datetime.date(int(year), int(mon), int(day)).isoformat()
        except ValueError:
            return raw

    return raw
|
||
|
||
def _pubmed_node_text(node) -> str:
    """Return the full XPath string value of an element, inline markup included."""
    return node.xpath('string(.)')


def get_pubmed(pmid: str) -> Mapping[str, Any]:
    """Get pubmed xml for pmid and convert to JSON

    Remove MESH terms if they are duplicated in the compound term set

    ArticleDate vs PubDate gets complicated: https://www.nlm.nih.gov/bsd/licensee/elements_descriptions.html see <ArticleDate> and <PubDate>
    pub_date is built from the Journal <PubDate> element only.

    Args:
        pmid: pubmed id number as a string

    Returns:
        pubmed json with keys: pmid, title, abstract, authors, pub_date,
        journal_title, journal_iso_title (also under the legacy misspelled
        key 'joural_iso_title'), doi, compounds, mesh
    """
    # NOTE(review): a failed request (None / non-200 response) is not handled
    # here and will raise below - confirm how callers want this reported
    r = bel.utils.get_url(PUBMED_TMPL.replace('PMID', pmid))

    root = etree.fromstring(r.content)
    doc = {}
    # Fall back to the requested pmid when the result set has no PMID element
    doc['pmid'] = next(iter(root.xpath("//PMID/text()")), pmid)

    # Use the element string value so inline markup (<i>, <sub>, ...) does not
    # truncate the text at the first child element
    title_nodes = root.xpath("//ArticleTitle")
    doc['title'] = _pubmed_node_text(title_nodes[0]) if title_nodes else ''

    # Structured abstracts have several AbstractText sections - keep all of them
    doc['abstract'] = ' '.join(
        _pubmed_node_text(node) for node in root.xpath('//Abstract/AbstractText')
    )

    doc['authors'] = []
    for author in root.xpath('//Author'):
        last_name = next(iter(author.xpath('LastName/text()')), '')
        first_name = next(iter(author.xpath('ForeName/text()')), '')
        initials = next(iter(author.xpath('Initials/text()')), '')
        collective = next(iter(author.xpath('CollectiveName/text()')), '')
        if collective and not last_name:
            # Group authorship has no personal name parts
            doc['authors'].append(collective)
            continue
        if not first_name and initials:
            first_name = initials
        doc['authors'].append(f'{last_name}, {first_name}')

    pub_year = next(iter(root.xpath("//Journal/JournalIssue/PubDate/Year/text()")), None)
    pub_mon = next(iter(root.xpath("//Journal/JournalIssue/PubDate/Month/text()")), 'Jan')
    pub_day = next(iter(root.xpath("//Journal/JournalIssue/PubDate/Day/text()")), '01')

    doc['pub_date'] = process_pub_date(pub_year, pub_mon, pub_day)
    doc['journal_title'] = next(iter(root.xpath('//Journal/Title/text()')), '')

    iso_title = next(iter(root.xpath('//Journal/ISOAbbreviation/text()')), '')
    doc['journal_iso_title'] = iso_title
    # Misspelled key kept so existing consumers keep working
    doc['joural_iso_title'] = iso_title

    doc['doi'] = next(iter(root.xpath('//ArticleId[@IdType="doi"]/text()')), None)

    doc['compounds'] = []
    for chem in root.xpath("//ChemicalList/Chemical/NameOfSubstance"):
        chem_id = chem.get('UI')
        doc['compounds'].append({'id': f"MESH:{chem_id}", 'name': chem.text})

    compound_ids = {cmpd['id'] for cmpd in doc['compounds']}
    doc['mesh'] = []
    for mesh in root.xpath("//MeshHeading/DescriptorName"):
        mesh_id = f"MESH:{mesh.get('UI')}"
        if mesh_id in compound_ids:
            # Already listed as a compound
            continue
        doc['mesh'].append({'id': mesh_id, 'name': mesh.text})

    return doc
|
||
|
||
def enhance_pubmed_annotations(pubmed: Mapping[str, Any]) -> Mapping[str, Any]:
    """Enhance pubmed namespace IDs

    Add additional entity and annotation types to annotations
    Use preferred id for namespaces as needed
    Add strings from Title, Abstract matching Pubtator BioConcept spans

    NOTE - basically duplicated code with bel_api:api.services.pubmed

    Args:
        pubmed: pubmed dict with 'title', 'abstract' and 'annotations' keys;
            it is modified in place

    Returns:
        pubmed object with 'annotations' re-keyed by preferred term ID and
        each span given a 'text' entry
    """

    text = pubmed['title'] + pubmed['abstract']

    annotations = {}

    for nsarg, annotation in pubmed['annotations'].items():
        url = f'{config["bel_api"]["servers"]["api_url"]}/terms/{nsarg}'
        log.info(f'URL: {url}')
        r = bel.utils.get_url(url)
        log.info(f'Result: {r}')

        # Keep the original key unless the term lookup supplies a preferred one;
        # an empty default would merge every failed lookup under the key ''
        new_nsarg = nsarg
        if r and r.status_code == 200:
            # Imported where it is used: this module only imports bel.utils at
            # file level, so bel.lang.bel_utils may not be loaded yet
            from bel.lang.bel_utils import convert_nsarg

            term = r.json()
            new_nsarg = convert_nsarg(term['id'], decanonicalize=True) or nsarg

            annotation['name'] = term['name']
            annotation['label'] = term['label']
            # dict.fromkeys de-duplicates while keeping a stable order
            annotation['entity_types'] = list(
                dict.fromkeys(annotation['entity_types'] + term.get('entity_types', []))
            )
            annotation['annotation_types'] = list(
                dict.fromkeys(annotation['annotation_types'] + term.get('annotation_types', []))
            )

        annotations[new_nsarg] = copy.deepcopy(annotation)

    for annotation in annotations.values():
        for span in annotation['spans']:
            # NOTE(review): the -1 shift presumably compensates for a separator
            # between title and abstract in the Pubtator text - confirm.
            # Clamp so a span starting at 0 cannot wrap to a negative index.
            begin = max(span['begin'] - 1, 0)
            end = max(span['end'] - 1, begin)
            span['text'] = text[begin:end]

    pubmed['annotations'] = annotations

    return pubmed
|
||
|
||
def get_pubmed_for_beleditor(pmid: str) -> Mapping[str, Any]:
    """Get fully annotated pubmed doc with Pubtator and full entity/annotation_types

    Args:
        pmid: Pubmed PMID

    Returns:
        Mapping[str, Any]: pubmed dictionary; 'annotations' is empty when
        Pubtator data could not be retrieved
    """

    pubmed = get_pubmed(pmid)
    pubtator = get_pubtator(pmid)

    # get_pubtator returns None on a failed request - fall back to no annotations
    pubtator_annotations = pubtator.get('annotations', {}) if pubtator else {}
    pubmed['annotations'] = copy.deepcopy(pubtator_annotations)

    # Add entity types and annotation types to annotations
    pubmed = enhance_pubmed_annotations(pubmed)

    return pubmed
|
||
|
||
def main():
    """Fetch the annotated record for one hard-coded PMID and print it as JSON."""
    import json

    example_pmid = '19894120'
    annotated_doc = get_pubmed_for_beleditor(example_pmid)

    dumped = json.dumps(annotated_doc, indent=4)
    print('DumpVar:\n', dumped)
|
||
|
||
# The functions above use the module-level name `log`; it is bound here, in
# both branches, after all definitions.
if __name__ == '__main__':

    # Run as a script: apply the logging settings from config first, then
    # create the logger and run the demo entry point.
    logging.config.dictConfig(config['logging'])
    log = logging.getLogger(__name__)

    main()

else:
    # Imported as a module: only create the logger; logging configuration is
    # not touched here.
    log = logging.getLogger(__name__)