Retrieve Information for Reactome Failed Searches
---
---
### Purpose
This notebook processes the failed query terms to find the PMIDs of the articles in which each term appears.
Using these PMIDs, MESH terms and article metadata are extracted and written to a tab-separated file.
### How to Run
All code cells need to be run sequentially.


### Set Parameters

In [None]:
# Credentials for the MTI batch service; getMESH() passes them to
# GenericBatchNew.processor.
# Register at https://utslogin.nlm.nih.gov/cas/login
mti_email_id = "example@example.com"
mti_username = "username"
mti_password = "password"
# Passed to getPMID() as the maximum number of PubMed IDs requested per term.
pmid_threshold = 20

### Setup

In [None]:
%%capture
!pip install -r requirements.txt

In [None]:
# The JVM classpath has to be configured through jnius_config before `jnius`
# itself is imported, so the order of the next three statements matters.
import jnius_config
jnius_config.add_classpath("./lib/*")
from jnius import autoclass
# Java class used by getMESH() to submit abstracts for MTI batch processing
# (presumably provided by the jars under ./lib -- confirm).
GenericBatchNew = autoclass("GenericBatchNew")

import logging
# Configure the root logger and only let WARNING and above through.
logging.basicConfig()
logging.getLogger().setLevel(logging.WARNING)

### Get PMID from failed query terms
`getPMID(terms)`  
The method takes the list of terms and queries the PubMed database to get the list of PMIDs of the articles containing each term.  
`_extractListID(filecontent, term)`  
Extracts the list of PMIDs from the XML and saves it to a file for further processing.

In [None]:
# getPMID.py

import requests
import time
import json
import xml.etree.ElementTree as ET


def _extractListID(filecontent, term):
    """
    Append the PMIDs found in an ESearch XML response to ``pmid_list.txt``.

    One line is written per PMID in the form ``<pmid>~<term>~<count>``, where
    ``count`` is the text of the response's ``Count`` element.

    Parameters
    ----------
    filecontent : string
        XML body of the ESearch response
    term : string
        Query term the response belongs to

    Raises
    ------
    xml.etree.ElementTree.ParseError
        If ``filecontent`` is not well-formed XML
    """
    tree = ET.fromstring(filecontent, ET.XMLParser(encoding='utf-8'))
    # A well-formed reply without <Count> (e.g. an error payload) is treated
    # as "no hits" instead of failing on `None.text`.
    count = tree.findtext('./Count', default='0')
    # Ignore <Id> elements that carry no text; they cannot be concatenated.
    pmids = [elem.text for elem in tree.findall('./IdList/Id') if elem.text]

    # The file is opened even when there is nothing to write so that it
    # always exists after the first call.
    with open("pmid_list.txt", "a") as op_file:
        for pmid in pmids:
            print(pmid + "~" + term + "~" + count, file=op_file)


def getPMID(terms, pmid_threshold=20, max_retries=10):
    """
    Get PMID for the Query terms

    Every entry of ``terms`` is expected to look like ``<term>,<count>``; the
    part before the last comma is searched in PubMed and the resulting PMIDs
    are stored through ``_extractListID``.

    Parameters:
    terms: List of failed query terms
    pmid_threshold: Limit of Pubmed articles to process, default is 20
    max_retries: Number of attempts per term before the term is skipped
    """
    pmid_threshold = int(pmid_threshold)
    if pmid_threshold < 1:
        pmid_threshold = 20
    for term in terms:
        term = term.strip().rpartition(",")[0]
        if not term:
            # Nothing to search for (entry without a "<term>," prefix).
            continue
        last_error = None
        for _attempt in range(max_retries):
            try:
                # `params` lets requests do the URL encoding; the previous
                # string concatenation raised TypeError (str + int) on every
                # call, which the bare `except` turned into an endless loop.
                response = requests.get(
                    "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi",
                    params={
                        "db": "pubmed",
                        "retmax": pmid_threshold,
                        "term": "hasabstract AND " + term,
                    },
                    timeout=30)
                _extractListID(response.text, term)
                break
            except (requests.RequestException, ET.ParseError, AttributeError) as err:
                # Network failure, a non-XML body (e.g. a rate-limit message)
                # or XML that is not an ESearch result: wait and try again.
                last_error = err
                time.sleep(.5)
        else:
            print("Err: PMID: ", last_error, term)

### Extraction of MESH terms

`getAbstracts()`   
Reads the PMID list and fetches each abstract from PubMed. The output file is sent to the [Interactive Medical Text Indexer (MTI)](https://ii.nlm.nih.gov/Batch/index.shtml) for batch processing of these abstracts.

In [None]:
# getMESH.py
from xml.etree.ElementTree import parse
from urllib.request import urlopen
import time
import os
import requests


def getAbstracts(abstract_filepath, max_retries=10):
    """
    Get abstracts from PMID and generate input file for MESH Batch processing

    Reads the PMIDs from ``pmid_list.txt`` (first ``~``-separated field of
    every line), fetches each article from PubMed and writes one
    ``UI``/``TI``/``AB`` record per article that has an abstract.

    Parameters
    ----------
    abstract_filepath : string
        Path of the file the records are written to (overwritten)
    max_retries : integer, optional
        Number of fetch attempts per PMID before the PMID is skipped
    """
    from xml.etree.ElementTree import ParseError

    seen_pmids = set()
    # Text mode with errors='ignore' drops non-ASCII characters. The previous
    # binary handle rejected every `print` with a TypeError, so no record was
    # ever written.
    with open("pmid_list.txt") as pmid_file, \
            open(abstract_filepath, 'w', encoding='ascii', errors='ignore') as out_file:
        for line in pmid_file:
            pmid = line.strip().split("~")[0]
            # A PMID listed for several terms only needs to be indexed once.
            if not pmid or pmid in seen_pmids:
                continue
            seen_pmids.add(pmid)

            xmldoc = None
            last_error = None
            for _attempt in range(max_retries):
                try:
                    with urlopen(
                            f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&retmode=xml&id={pmid}',
                            timeout=30) as response:
                        xmldoc = parse(response)
                    break
                except (OSError, ParseError) as err:
                    # URLError/HTTPError are OSError subclasses.
                    last_error = err
                    time.sleep(.5)
            if xmldoc is None:
                print("Err: MESH: ", last_error)
                continue

            for item in xmldoc.iterfind('PubmedArticle'):
                title_elem = item.find('MedlineCitation/Article/ArticleTitle')
                title_parts = [] if title_elem is None else title_elem.itertext()
                # Collapse whitespace so a value never spans several lines.
                article_title = " ".join("".join(title_parts).split())
                # Structured abstracts have several AbstractText sections and
                # may contain inline markup; keep all of their text.
                abstract_text = " ".join(
                    " ".join("".join(section.itertext()).split())
                    for section in item.iterfind(
                        'MedlineCitation/Article/Abstract/AbstractText'))
                abstract_text = abstract_text.strip()
                if abstract_text:
                    out_file.write(
                        f"UI  - {pmid}\n"
                        f"TI  - {article_title}\n"
                        f"AB  - {abstract_text}\n"
                        "\n\n")
                else:
                    print("Err: MESH: ", "Undefined Abstract")


def getMESH():
    """
    Build ``abstract.txt`` with getAbstracts, run it through the MTI batch
    processor and store the processor's result in ``mesh.txt``.
    """
    medline_input = 'abstract.txt'
    getAbstracts(medline_input)

    # The MTI credentials are the notebook-level parameters. Reading them from
    # the environment instead was left disabled:
    #     mti_email_id = os.environ['MTI_EMAIL_ID']
    #     mti_username = os.environ['MTI_USERNAME']
    #     mti_password = os.environ['MTI_PASSWORD']
    mti_output = GenericBatchNew().processor(
        ["--email", mti_email_id, medline_input], mti_username, mti_password)

    with open("mesh.txt", "w") as mesh_file:
        mesh_file.write(mti_output)

### Extraction of Metadata from the PMIDs

The following details are retrieved from EUtils, INDRA and OpenCitations using the PMID:
`JOURNAL_TITLE`, `YEAR`, `PMCID`, `DOI`, `PMC_CITATION_COUNT`, `INDRA_STATEMENT_COUNT`, `OC_CITATION_COUNT`, `INDRA_QUERY_TERM_STATEMENT_COUNT`, `PMID_COUNT`

`getIndraQueryTermStmtCount`  
This method uses [Gilda](https://github.com/indralab/gilda) for grounding the failed query term

In [None]:
# getEUtilsInfo.py

import os
import gzip
import time
import sys
import csv
import requests
import json
import indra.literature.pubmed_client as parser
import xml.etree.ElementTree as ET
from indra.sources import indra_db_rest
from indra.assemblers.html.assembler import HtmlAssembler
from urllib.parse import urljoin
from indra.statements.statements import stmts_to_json


def citationCount(pmid, max_retries=10):
    """
    Gets Citation count for PMID

    The count is the number of ``Link`` entries returned by ELink for the
    ``pubmed_pmc_refs`` link name.

    Parameters
    ----------
    pmid : string
        PMID of the medical paper
    max_retries : integer, optional
        Number of request attempts before giving up

    Returns
    -------
    integer
        Citation Count, 0 when the request fails or the reply is not XML
    """
    response = None
    for _attempt in range(max_retries):
        try:
            response = requests.get(
                "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi",
                params={
                    "dbfrom": "pubmed",
                    "linkname": "pubmed_pmc_refs",
                    "id": pmid,
                },
                timeout=30)
            break
        except requests.RequestException:
            time.sleep(.5)
    if response is None:
        return 0

    try:
        tree = ET.fromstring(response.text, ET.XMLParser(encoding='utf-8'))
    except ET.ParseError:
        # Non-XML body, e.g. a rate-limit message.
        return 0
    return len(tree.findall('./LinkSet/LinkSetDb/Link'))


def getIndraQueryTermStmtCount(txt, source_apis=None):
    """
    Count the INDRA statements available for a query term.

    The term is grounded through the grounding web service; the first
    grounding result is used as the agent when querying the INDRA DB.

    Parameters
    ----------
        txt : string
            Query term to be processed
        source_apis : [], optional
            Only count statements having evidence from one of these source
            APIs; by default every statement is counted

    Returns
    ------
        integer
            Number of Indra Statements, 0 when the term cannot be grounded
    """
    ground_endpoint = urljoin('http://grounding.indra.bio/', 'ground')
    matches = requests.post(ground_endpoint, json={'text': txt}).json()
    if len(matches) == 0:
        return 0

    best_term = matches[0]['term']
    agent = best_term['id'] + '@' + best_term['db']

    statements = indra_db_rest.get_statements(agents=[agent]).statements
    statements_json = stmts_to_json(statements)
    if not source_apis:
        return len(statements)

    # Pair every statement with its JSON form and keep the statements that
    # have at least one evidence coming from a requested source API.
    matching = {
        statement
        for statement, as_json in zip(statements, statements_json)
        for evidence in as_json.get("evidence", [])
        if evidence["source_api"] in source_apis
    }
    return len(matching)


def extractFromXML(pmid,  term, total_pmid, max_retries=10):
    """
    Fetch the PubMed record of a PMID and append its metadata to
    ``eutils_output.tsv``

    One row is written per ``PubmedArticle`` of the EFetch response; the
    header row is written when the file is new or empty.

    Parameters
    ----------
    pmid : string
        PMID of the article
    term : string
        Reactome query term
    total_pmid :
        Number of articles where the term is seen
    max_retries : integer, optional
        Number of EFetch attempts before the PMID is skipped
    """
    response = None
    for _attempt in range(max_retries):
        try:
            response = requests.get(
                "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi",
                params={"db": "pubmed", "retmode": "xml", "id": pmid},
                timeout=30)
            break
        except requests.RequestException:
            time.sleep(.5)
    if response is None:
        print("Err: EUtils:", "could not fetch PMID", pmid)
        return

    # Parse before touching the output file so a bad reply leaves no trace.
    tree = ET.fromstring(response.text, ET.XMLParser(encoding='utf-8'))
    pm_articles = tree.findall('./PubmedArticle')
    citation_count = citationCount(pmid)
    # Depends on the term only; looked up once, on the first written row.
    term_stmt_count = None

    destFileName = "eutils_output.tsv"
    write_header = (not os.path.isfile(destFileName)
                    or os.path.getsize(destFileName) == 0)
    # `with` closes the file even when one of the lookups below raises.
    with open(destFileName, 'a', newline='') as destCSV:
        writer = csv.writer(destCSV, delimiter='\t', quoting=csv.QUOTE_MINIMAL)
        if write_header:
            writer.writerow(["PMID", "TERM", "JOURNAL_TITLE", "YEAR", "PMCID", "DOI", "PMC_CITATION_COUNT",
                             "INDRA_STATEMENT_COUNT", "OC_CITATION_COUNT", "INDRA_QUERY_TERM_STATEMENT_COUNT", "PMID_COUNT"])

        for pm_article in pm_articles:
            medline_citation = pm_article.find('./MedlineCitation')
            pubmed = pm_article.find('./PubmedData')
            try:
                history_pub_date = pubmed.find(
                    './History/PubMedPubDate[@PubStatus="pubmed"]')
                year = parser._find_elem_text(history_pub_date, 'Year')
                pub_year = None if (year is None) else int(year)
            except Exception as err:
                # Article without usable publication data: report and skip.
                print("Err: EUtils:", err)
                continue

            article_info = parser._get_article_info(
                medline_citation, pm_article.find('PubmedData'))
            journal_info = parser._get_journal_info(medline_citation, False)

            # Preparing results
            title = journal_info["journal_abbrev"] or ""
            DOI = article_info["doi"] or ""
            PMCID = article_info["pmcid"] or ""
            PMID = article_info["pmid"] or ""

            # The OpenCitations count is best effort: failures keep 0.
            OC_CITATION_COUNT = 0
            if DOI != "":
                try:
                    output = requests.get(
                        "https://opencitations.net/api/v1/metadata/" + DOI,
                        timeout=30).json()
                    if len(output) > 0:
                        OC_CITATION_COUNT = output[0]["citation_count"]
                except (requests.RequestException, ValueError, KeyError,
                        IndexError, TypeError) as err:
                    print("Err: EUtils:", "OpenCitations lookup failed:", err)

            stmt = indra_db_rest.get_statements_for_paper(
                [('pmid', PMID)]).statements
            indra_stmt_count = len(stmt)

            if term_stmt_count is None:
                term_stmt_count = getIndraQueryTermStmtCount(term)

            # storing in tsv file
            writer.writerow([PMID, term, title, pub_year, PMCID, DOI, citation_count,
                             indra_stmt_count, OC_CITATION_COUNT, term_stmt_count, total_pmid])


def getEUtilsInfo():
    """
    Generate a TSV containing meta details of PMID from EUtils

    Every line of ``pmid_list.txt`` has the form ``<pmid>~<term>~<count>``
    and is handed to ``extractFromXML``. A failure on one line is reported
    and does not stop the remaining lines from being processed.
    """
    with open("pmid_list.txt") as file:
        for raw_line in file:
            entry = raw_line.strip()
            if not entry:
                continue
            # The PMID is the first field and the count the last one, so a
            # term that itself contains "~" is still recovered intact.
            pmid, _, remainder = entry.partition("~")
            term, separator, total_pmid = remainder.rpartition("~")
            if not separator:
                print("Err: EUtils: ", "malformed line", entry)
                continue
            try:
                extractFromXML(pmid, term, total_pmid)
            except Exception as e:
                print("Err: EUtils: ", e, entry)

### Generate Output TSV
Merge output MESH terms from MTI Batch Processing and the metadata to generate a TSV output file.

In [None]:
# mergeOutputs.py

import csv
import json
import sys
import os


def mergeOutputs(path_eutils, path_mesh, path_output_dir):
    """
    Merge the metadata from EUtils with the MESH terms from MTI into
    ``output.tsv``

    MESH lines are ``|``-separated with the PMID in the first field and the
    MESH term in the second. Lines with fewer than two fields and PMIDs that
    have no metadata row are ignored.

    Parameters
    ----------
    path_eutils:
        Path to TSV file containing metadata from EUtils
    path_mesh:
        Path to MESH terms extracted by Web API
    path_output_dir:
        Path to Output directory, created when missing
    """

    details = {}

    with open(path_eutils, newline='') as f:
        reader = csv.DictReader(f, delimiter='\t')
        for row in reader:
            details.setdefault(row["PMID"], {})[row["TERM"]] = {
                "journal": row["JOURNAL_TITLE"],
                "year": row["YEAR"],
                "pmc": row["PMCID"],
                "doi": row["DOI"],
                "citation_count": row["PMC_CITATION_COUNT"],
                "indra_stmt_count": row["INDRA_STATEMENT_COUNT"],
                "oc_citation_count": row["OC_CITATION_COUNT"],
                "indra_query_term_stmt_count": row["INDRA_QUERY_TERM_STATEMENT_COUNT"],
                "pmid_count": row["PMID_COUNT"],
                "mesh": []
            }

    with open(path_mesh) as mesh:
        for line in mesh:
            fields = line.rstrip("\r\n").split("|")
            if len(fields) < 2:
                # Blank or malformed line.
                continue
            pmid = fields[0].strip()
            mesh_term = fields[1].strip()
            # A PMID unknown to the metadata simply has nothing to attach to.
            for record in details.get(pmid, {}).values():
                if mesh_term not in record["mesh"]:
                    record["mesh"].append(mesh_term)

    if path_output_dir:
        os.makedirs(path_output_dir, exist_ok=True)
    # newline='' is what the csv module expects; it avoids doubled carriage
    # returns on platforms that translate line endings.
    with open(os.path.join(path_output_dir, "output.tsv"), 'w', newline='') as csvfile:
        writer = csv.writer(csvfile, quoting=csv.QUOTE_MINIMAL, delimiter='\t')
        writer.writerow(["QUERY_TERM", "PMID", "JOURNAL_TITLE", "YEAR", "PMCID",
                         "DOI", "PMC_CITATION_COUNT", "INDRA_STATEMENT_COUNT",
                         "OC_CITATION_COUNT", "INDRA_QUERY_TERM_STATEMENT_COUNT",
                         "MESH_TERMS", "PMID_COUNT"])
        for pmid, by_term in details.items():
            for term, record in by_term.items():
                writer.writerow([
                    term, pmid, record["journal"], record["year"],
                    record["pmc"], record["doi"], record["citation_count"],
                    record["indra_stmt_count"], record["oc_citation_count"],
                    record["indra_query_term_stmt_count"],
                    "|".join(record["mesh"]), record["pmid_count"]])

### Driver Function

In [None]:
import requests
import multiprocessing
import os
import datetime

from tqdm import tqdm_notebook as tqdm


# File listing the terms that were already processed, one per line.
history_file_path = "./processor/history"


def saveInHistory(terms):
    """
    Save processed terms in file

    The terms are appended to ``history_file_path``, one per line; the
    directory of that file is created when it does not exist yet.

    Parameters
    ----------
    terms:  []
        List of processed terms
    """
    if not terms:
        # Nothing to record; avoids appending a lone blank line.
        return
    history_dir = os.path.dirname(history_file_path)
    if history_dir:
        os.makedirs(history_dir, exist_ok=True)
    with open(history_file_path, "a") as out_file:
        out_file.write('\n'.join(terms)+'\n')


import contextlib
import shutil
import tempfile


def _removeIntermediateFiles():
    """Delete the per-chunk working files, ignoring those that do not exist."""
    for path in ("eutils_output.tsv", "abstract.txt", "mesh.txt", "pmid_list.txt"):
        with contextlib.suppress(FileNotFoundError):
            os.remove(path)


def _appendChunkOutput(chunk_output_path, final_output_path):
    """Append the rows of one chunk's TSV to the cumulative TSV, keeping a single header."""
    write_header = (not os.path.isfile(final_output_path)
                    or os.path.getsize(final_output_path) == 0)
    with open(chunk_output_path, newline='') as chunk_file, \
            open(final_output_path, 'a', newline='') as final_file:
        header = chunk_file.readline()
        if write_header:
            final_file.write(header)
        shutil.copyfileobj(chunk_file, final_file)


# Driver: download the failed query terms, process the ones that were not
# handled before in chunks of 10 and accumulate the results in
# ./processor/output.tsv.
if __name__ == "__main__":
    output_dir = "./processor"
    os.makedirs(output_dir, exist_ok=True)

    history = set()

    if os.path.isfile(history_file_path):
        with open(history_file_path, "r") as history_file:
            for line in history_file:
                history.add(line.strip())

    terms_request = requests.get(
        "https://gist.githubusercontent.com/PritiShaw/03ce10747835390ec8a755fed9ea813d/raw/cc72cb5479f09b574e03ed22c8d4e3147e09aa0c/Reactome.csv",
        timeout=60)
    # Fail loudly instead of treating an error page as a list of terms.
    terms_request.raise_for_status()
    inp_terms = terms_request.text.splitlines()

    # Keep "<term>,<count>" lines with a count above 9 that are not in the
    # history; the first line is the CSV header.
    pending = []
    for term in inp_terms[1:]:
        term_parts = term.split(",")
        if (len(term_parts) == 2
                and term_parts[1].strip().isdigit()
                and int(term_parts[1]) > 9
                and term not in history):
            pending.append(term)

    # Slicing never yields an empty chunk, unlike appending a fresh list
    # whenever the current one reaches 10 entries.
    terms = [pending[start:start + 10] for start in range(0, len(pending), 10)]

    for chunk in tqdm(terms):
        # Leftovers of an interrupted run must not leak into this chunk.
        _removeIntermediateFiles()
        getPMID(chunk, pmid_threshold)

        # Without any PMID there is nothing to index or merge.
        if os.path.isfile("pmid_list.txt") and os.path.getsize("pmid_list.txt") > 0:
            process_mesh = multiprocessing.Process(target=getMESH)
            process_meta = multiprocessing.Process(target=getEUtilsInfo)

            process_meta.start()
            process_mesh.start()
            process_meta.join()
            process_mesh.join()

            # mergeOutputs rewrites its output.tsv on every call, so it gets
            # a scratch directory and the rows are appended to the real file.
            with tempfile.TemporaryDirectory() as chunk_dir:
                mergeOutputs("eutils_output.tsv", "mesh.txt", chunk_dir)
                _appendChunkOutput(os.path.join(chunk_dir, "output.tsv"),
                                   os.path.join(output_dir, "output.tsv"))

        history.update(chunk)
        saveInHistory(chunk)
        # Portable replacement for `rm`, which also named eutils_output.csv
        # instead of the eutils_output.tsv that is actually written.
        _removeIntermediateFiles()