# Reactome PMID Metadata Extraction

### Purpose
PMIDs fetched from Reactome are processed to retrieve publication metadata and MeSH terms. By default, the output is written to the `reactome_pmid_metadata.tsv` file. 

### How to Run
Enter your MTI credentials in the cell tagged "parameters".  
All code cells need to be run sequentially.

### Set Parameters

In [None]:
# MTI account credentials; these are placeholders to be replaced by the user.
# Register at https://utslogin.nlm.nih.gov/cas/login
mti = dict(
    email_id="example@example.com",
    username="username",
    password="password",
)

# Tab-separated file the driver cell writes its results to.
pmid_metadata_output_path = "reactome_pmid_metadata.tsv"

# Reactome download listing reactions together with their PMIDs.
reactome_pmid_url = "https://reactome.org/download/current/ReactionPMIDS.txt"

# Maximum number of PMID chunks to process (the driver builds chunks of 200
# PMIDs); the limit is only applied when it is greater than 0.
pmid_chunk_limit = 0

### Additional Steps to run in Google Colaboratory for Pyjnius

%%capture  
!pip install -r ./dependencies/requirements.txt

!mkdir -p /usr/lib/jvm/java-1.11.0-openjdk-amd64/jre/lib/amd64/server/  
!ln -s /usr/lib/jvm/java-1.11.0-openjdk-amd64/lib/server/libjvm.so /usr/lib/jvm/java-1.11.0-openjdk-amd64/jre/lib/amd64/server/libjvm.so

import os  
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"

## Setup

In [2]:
import os
import csv
import requests
import logging
import indra.literature.pubmed_client as pubmed_parser
import xml.etree.ElementTree as ET
from tqdm import tqdm

In [None]:
# Configure Pyjnius and load the Java class used for MTI batch processing.
import jnius_config
# Put every entry matched by ./lib/* on the Java classpath.
# NOTE(review): pyjnius is understood to require classpath configuration
# before `jnius` is first imported -- keep this statement order.
jnius_config.add_classpath("./lib/*")
from jnius import autoclass
# Python-side handle for the Java class `GenericBatchNew`
# (presumably provided by the jars under ./lib -- confirm).
GenericBatchNew = autoclass("GenericBatchNew")

## Driver Function

In [None]:
# Driver: download the Reactome reaction/PMID listing, fetch PubMed metadata
# for the PMIDs in chunks, send the abstracts of each chunk through the MTI
# batch processor to obtain MeSH terms, and write one TSV row per PMID.

pmid_chunk_size = 200  # number of PMIDs handled per metadata/MTI round trip

response = requests.get(reactome_pmid_url, timeout=60)
# Fail loudly rather than parsing an HTTP error page as if it were data.
response.raise_for_status()
source_file_body = response.text

# The PMID is the second tab-separated column. Blank or malformed lines are
# skipped instead of raising IndexError, and the unique PMIDs are sorted so
# the output row order is reproducible between runs.
unique_pmids = set()
for line in source_file_body.splitlines():
    columns = line.split('\t')
    if len(columns) > 1 and columns[1].strip():
        unique_pmids.add(columns[1].strip())
pmid_list = sorted(unique_pmids)

pmid_chunks = [
    pmid_list[pos:pos + pmid_chunk_size]
    for pos in range(0, len(pmid_list), pmid_chunk_size)
]

# A positive pmid_chunk_limit restricts the run to the first N chunks.
if pmid_chunk_limit > 0:
    pmid_chunks = pmid_chunks[:pmid_chunk_limit]

temp_abstract_file_path = "tmp_abstract.txt"
batch = GenericBatchNew()
try:
    # newline="" lets the csv module control line endings, as its
    # documentation requires for files passed to csv.writer.
    with open(pmid_metadata_output_path, "w", newline="", encoding="utf-8") as output_file:
        writer = csv.writer(output_file, delimiter='\t', quoting=csv.QUOTE_MINIMAL)
        writer.writerow(["PMID", "JOURNAL_TITLE", "YEAR", "PMCID", "MESH_TERMS"])

        for chunk in tqdm(pmid_chunks, ascii=True):

            metadata_list = pubmed_parser.get_metadata_for_ids(chunk, get_abstracts=True)

            # Write the chunk's abstracts to the temporary file as
            # "UI"/"AB" records; non-ASCII characters are replaced with "?".
            with open(temp_abstract_file_path, "wb") as abstract_file:
                for pmid, metadata in metadata_list.items():
                    abstract = metadata.get("abstract", None)
                    if abstract:
                        text = f"UI  -  {pmid}\nAB  -  {abstract}\n\n"
                        abstract_file.write(text.encode('ascii', 'replace'))
                        # Only records submitted to MTI get a "mesh" list.
                        metadata["mesh"] = []

            mti_process_output = batch.processor(
                ["--email", mti['email_id'], temp_abstract_file_path],
                mti['username'],
                mti['password'],
            )

            # Each output line is read as "<pmid>|<mesh term>|...".
            for line in mti_process_output.splitlines():
                if not line.strip():
                    continue
                fields = line.split("|")
                try:
                    pmid, mesh_term = fields[0], fields[1]
                    metadata_list[pmid]["mesh"].append(mesh_term)
                except (IndexError, KeyError) as err:
                    # Best effort: report lines that do not parse or that
                    # refer to a PMID we did not submit, then carry on.
                    logging.error(f"{err} \t {line}")

            for pmid, metadata in metadata_list.items():
                # .get() keeps one incomplete record from aborting the run.
                publication_date = metadata.get("publication_date") or {}
                writer.writerow([
                    pmid,
                    # NOTE(review): the header says JOURNAL_TITLE but the
                    # abbreviated title is written -- confirm this is intended.
                    metadata.get("journal_abbrev", ""),
                    publication_date.get("year", ""),
                    metadata.get("pmcid", ""),
                    "|".join(metadata.get("mesh", [])),
                ])
finally:
    # Portable cleanup that also runs when processing fails part-way; the
    # file does not exist if no chunk was processed.
    if os.path.exists(temp_abstract_file_path):
        os.remove(temp_abstract_file_path)