# Test Set Gathering

Note: This is for the production API of niad

In [37]:
import requests

import numpy as np
import pandas as pd

from tqdm.notebook import tqdm

In [38]:
results = requests.get("https://api.data.niaid.nih.gov/v1/query?=&q=*pmid*&extra_filter=&size=1000&from=0&sort=_score&use_metadata_score=true")

In [33]:
data = results.json()

In [39]:
pmids = []
for hits in data['hits']:
    if type(hits['citation']) is list:
        pmids.append(hits['citation'][0]['pmid'])

pmids = np.unique(pmids)

In [40]:
print(f"Gathered {len(pmids)} unique PMIDs")

Gathered 896 unique PMIDs


## Getting MeSH Terms

In [41]:
from Bio import Entrez

Entrez.email = "zqazi@scripps.edu"

dataset = pd.DataFrame(columns=['PMID', 'Abstract', 'MeSH Terms'])

def get_mesh_terms(pmid):

    try:
        # Fetch article details using the PMID
        handle = Entrez.efetch(db="pubmed", id=pmid, retmode="xml")
        article_data = Entrez.read(handle)
        handle.close()

        # Extract MeSH terms from the article data
        mesh_terms = []
        abstract = None
        if "PubmedArticle" in article_data:
            for article in article_data["PubmedArticle"]:
                if "MeshHeadingList" in article["MedlineCitation"]:
                    mesh_headings = article["MedlineCitation"]["MeshHeadingList"]
                    for heading in mesh_headings:
                        descriptor_name = heading["DescriptorName"]
                        mesh_terms.append(descriptor_name)
                    
                    # Get abstract if there are MeSH terms
                    if "Abstract" in article["MedlineCitation"]['Article']:
                        abstract = article["MedlineCitation"]['Article']["Abstract"]["AbstractText"][0]

        return abstract, mesh_terms

    except Exception as e:
        print(f"Error: {e}")
        return None


for idx, pmid_to_query in enumerate(tqdm(pmids)):

    # Get MeSH terms for the specified PMID
    abstract, mesh_terms = get_mesh_terms(pmid_to_query)

    if mesh_terms and abstract:
        # print(f"MeSH terms for PMID {pmid_to_query}:\n")
        # for term in mesh_terms_result:
        #     print(term)
        
        # {'PMID': pmid_to_query, 'Abstract': str(abstract), 'MeSH Terms': [str(term) for term in mesh_terms]}
        dataset.loc[len(dataset.index)] = [pmid_to_query, abstract, [str(term) for term in np.unique(mesh_terms)]]

    if not mesh_terms and not abstract:
        print(f"Failed to retrieve MeSH terms AND Abstract for PMID {pmid_to_query}.")
    elif not mesh_terms:
        print(f"Failed to retrieve MeSH terms for PMID {pmid_to_query}.")
    elif not abstract:
        print(f"Failed to retrieve Abstract for PMID {pmid_to_query}.")
    


  0%|          | 0/896 [00:00<?, ?it/s]

Failed to retrieve MeSH terms AND Abstract for PMID 18700954.
Failed to retrieve MeSH terms AND Abstract for PMID 19505295.
Failed to retrieve MeSH terms AND Abstract for PMID 20148099.
Failed to retrieve MeSH terms AND Abstract for PMID 23178508.
Failed to retrieve MeSH terms AND Abstract for PMID 28868355.
Failed to retrieve MeSH terms AND Abstract for PMID 29326929.
Failed to retrieve MeSH terms AND Abstract for PMID 29675033.
Failed to retrieve Abstract for PMID 29778662.
Failed to retrieve MeSH terms AND Abstract for PMID 29844930.
Failed to retrieve MeSH terms AND Abstract for PMID 30258415.
Failed to retrieve MeSH terms AND Abstract for PMID 30352973.
Failed to retrieve MeSH terms AND Abstract for PMID 30404194.
Failed to retrieve MeSH terms AND Abstract for PMID 30456215.
Failed to retrieve MeSH terms AND Abstract for PMID 30456384.
Failed to retrieve MeSH terms AND Abstract for PMID 30680064.
Failed to retrieve MeSH terms AND Abstract for PMID 30809157.
Failed to retrieve MeSH

In [42]:
dataset.head()

Unnamed: 0,PMID,Abstract,MeSH Terms
0,12524544,Nuclear export of mRNA is mediated by a comple...,"[Active Transport, Cell Nucleus, DNA-Binding P..."
1,14512514,"The synthesis of fatty acids and cholesterol, ...","[Animals, CCAAT-Enhancer-Binding Proteins, Cho..."
2,14981237,"RNase E, an essential endoribonuclease of Esch...","[Bacterial Proteins, Endoribonucleases, Escher..."
3,15054141,Diminished activity of peroxisome proliferator...,"[Animals, Aorta, Gene Expression Profiling, Ge..."
4,15129285,The flood of high-throughput biological data h...,"[Aerobiosis, Anaerobiosis, Computational Biolo..."


## Mapping MeSH terms to EDAM ontology

First, a proof of concept...

In [43]:
import text2term

* 'underscore_attrs_are_private' has been removed


In [44]:
edam_ontology = text2term.cache_ontology("https://data.bioontology.org/ontologies/EDAM/submissions/44/download?apikey=8b5b7825-538d-40e0-9e9e-5ab9274a9aeb", "EDAM")

2023-11-17 19:12:38 INFO [text2term.term_collector]: Loading ontology https://data.bioontology.org/ontologies/EDAM/submissions/44/download?apikey=8b5b7825-538d-40e0-9e9e-5ab9274a9aeb...
2023-11-17 19:12:42 INFO [text2term.term_collector]: ...done (ontology loading time: 3.81s)
2023-11-17 19:12:42 INFO [text2term.term_collector]: Collecting ontology term details...
2023-11-17 19:12:42 INFO [text2term.term_collector]: ...done: collected 3577 ontology terms (collection time: 0.11s)
2023-11-17 19:12:42 INFO [text2term.t2t]: Filtered ontology terms to those of type: OntologyTermType.ANY
2023-11-17 19:12:42 INFO [text2term.t2t]: Caching ontology https://data.bioontology.org/ontologies/EDAM/submissions/44/download?apikey=8b5b7825-538d-40e0-9e9e-5ab9274a9aeb to: cache/EDAM


In [45]:
mapped_terms = text2term.map_terms([str(term) for term in dataset.iloc[0]['MeSH Terms']], "EDAM", use_cache=True)
mapped_terms[['Source Term', 'Mapped Term Label']].head()

2023-11-17 19:12:42 INFO [text2term.t2t]: Loading cached ontology from: cache/EDAM/EDAM-term-details.pickle
2023-11-17 19:12:42 INFO [text2term.t2t]: Filtered ontology terms to those of type: OntologyTermType.CLASS
2023-11-17 19:12:42 INFO [text2term.t2t]: Mapping 20 source terms to EDAM
2023-11-17 19:12:46 INFO [text2term.t2t]: ...done (mapping time: 3.51s seconds)


Unnamed: 0,Source Term,Mapped Term Label
0,"Active Transport, Cell Nucleus",Cell line report
1,"Active Transport, Cell Nucleus",Nucleic acid report
2,"Active Transport, Cell Nucleus",Active site prediction
3,DNA-Binding Proteins,DNA binding sites
4,DNA-Binding Proteins,DNA binding site prediction


### Map All MeSH Terms

In [46]:
def convert_mesh_to_edam(mesh_terms):
    mapped_terms = text2term.map_terms(mesh_terms, "EDAM", use_cache=True)

    return mapped_terms['Mapped Term Label'].unique().tolist()

dataset['EDAM Topics'] = dataset['MeSH Terms'].apply(lambda terms: convert_mesh_to_edam(terms))

2023-11-17 19:12:46 INFO [text2term.t2t]: Loading cached ontology from: cache/EDAM/EDAM-term-details.pickle
2023-11-17 19:12:46 INFO [text2term.t2t]: Filtered ontology terms to those of type: OntologyTermType.CLASS
2023-11-17 19:12:46 INFO [text2term.t2t]: Mapping 20 source terms to EDAM
2023-11-17 19:12:46 INFO [text2term.t2t]: ...done (mapping time: 0.07s seconds)
2023-11-17 19:12:46 INFO [text2term.t2t]: Loading cached ontology from: cache/EDAM/EDAM-term-details.pickle
2023-11-17 19:12:46 INFO [text2term.t2t]: Filtered ontology terms to those of type: OntologyTermType.CLASS
2023-11-17 19:12:46 INFO [text2term.t2t]: Mapping 17 source terms to EDAM
2023-11-17 19:12:46 INFO [text2term.t2t]: ...done (mapping time: 0.07s seconds)
2023-11-17 19:12:46 INFO [text2term.t2t]: Loading cached ontology from: cache/EDAM/EDAM-term-details.pickle
2023-11-17 19:12:46 INFO [text2term.t2t]: Filtered ontology terms to those of type: OntologyTermType.CLASS
2023-11-17 19:12:46 INFO [text2term.t2t]: Mappi

In [47]:
dataset[['MeSH Terms', 'EDAM Topics']]

Unnamed: 0,MeSH Terms,EDAM Topics
0,"[Active Transport, Cell Nucleus, DNA-Binding P...","[Cell line report, Nucleic acid report, Active..."
1,"[Animals, CCAAT-Enhancer-Binding Proteins, Cho...","[Zoology, Laboratory animal science, Animal st..."
2,"[Bacterial Proteins, Endoribonucleases, Escher...","[Proteins, protein, Protein-protein interactio..."
3,"[Animals, Aorta, Gene Expression Profiling, Ge...","[Zoology, Laboratory animal science, Animal st..."
4,"[Aerobiosis, Anaerobiosis, Computational Biolo...","[Analysis, Protein analysis, Sequence analysis..."
...,...,...
810,"[Astrocytes, Humans, Proteomics, Sleep, Sleep ...","[Human biology, Human genetics, Human disease,..."
811,"[Cell Cycle Proteins, Chromatin, DNA Damage, D...","[Cell cycle, Proteins, protein, Immunoprecipit..."
812,"[Acetylation, Animals, Corpus Striatum, Diseas...","[ACE, DNA methylation, Translation initiation ..."
813,"[Antioxidants, Humans, Inflammation, NAD, NF-E...","[Plant biology, Protein variants, Human biolog..."


In [48]:
print("Any unmapped MeSH Terms?: ", dataset['EDAM Topics'].isna().any())

Any unmapped MeSH Terms?:  False


In [49]:
text2term.clear_cache("EDAM")

Cache has been cleared successfully


## Checking the validity of EDAM topics
Based on the OpenAI API input token limit, we can't pass in every EDAM term. 

In [50]:
with open('full_edam_list.txt', 'r') as edam_file:
    edam_topics = edam_file.readlines()

edam_topics = [topic.strip() for topic in edam_topics]

In [51]:
indices_true = dataset.loc[dataset['EDAM Topics'].apply(lambda edam_list: not all(term in edam_topics for term in edam_list))].index

In [52]:
for index in indices_true:
    edam_list = dataset.loc[index, 'EDAM Topics']
    terms_not_in_edam_topics = [term for term in edam_list if term not in edam_topics]
    
    print(f"Index {index}: Terms not in edam_topics: {terms_not_in_edam_topics}")

Index 350: Terms not in edam_topics: [locstr('Hidden Markov model', 'en')]


In [53]:
dataset.drop(index=indices_true, axis = 0, inplace=True)

## Save Testing Set

In [54]:
print("Check for N/A values:")

dataset.isna().any()

Check for N/A values:


PMID           False
Abstract       False
MeSH Terms     False
EDAM Topics    False
dtype: bool

In [55]:
print("Number of entries: ", len(dataset))

Number of entries:  814


In [56]:
dataset.to_csv(input("Enter file path and name: "))