# Test Set Gathering

Note: This notebook queries the production NIAID API (api.data.niaid.nih.gov).

In [None]:
import requests

import numpy as np
import pandas as pd

from tqdm.notebook import tqdm

In [None]:
# Query the production API for up to 1000 records matching "*pmid*".
# The timeout stops the notebook from hanging forever on a stalled connection,
# and raise_for_status() reports HTTP errors here instead of letting them
# surface as a confusing JSON/KeyError failure in a later cell.
results = requests.get(
    "https://api.data.niaid.nih.gov/v1/query?=&q=*pmid*&extra_filter=&size=1000&from=0&sort=_score&use_metadata_score=true",
    timeout=60,
)
results.raise_for_status()

In [None]:
# Decode the JSON response body; the next cell reads its 'hits' list.
data = results.json()

In [None]:
# Collect the PMID of the first citation of every hit whose 'citation' field
# is a list, then de-duplicate.
pmids = []
for hit in data['hits']:
    citations = hit.get('citation')
    # Skip hits with no citation, a non-list citation value, or an empty list
    # instead of failing the whole cell on one malformed record.
    if not isinstance(citations, list) or not citations:
        continue

    pmid = citations[0].get('pmid')
    if pmid:
        pmids.append(pmid)

pmids = np.unique(pmids)

In [None]:
# Report how many distinct PMIDs were collected above.
print(f"Gathered {len(pmids)} unique PMIDs")

## Getting MeSH Terms

In [None]:
from Bio import Entrez

# Contact address for Entrez requests (Biopython asks callers to set this
# before using NCBI E-utilities; see the Bio.Entrez documentation).
Entrez.email = "zqazi@scripps.edu"

# Filled by the gathering loop below: one row per PMID for which both an
# abstract and MeSH terms were retrieved.
dataset = pd.DataFrame(columns=['PMID', 'Abstract', 'MeSH Terms'])

def get_mesh_terms(pmid):
    """Fetch the abstract and MeSH descriptor names for one PubMed article.

    Args:
        pmid: PubMed identifier, passed as ``id`` to ``Entrez.efetch``.

    Returns:
        A ``(abstract, mesh_terms)`` tuple. ``mesh_terms`` is the list of
        ``DescriptorName`` values found in the article's ``MeshHeadingList``
        (empty when there is none). ``abstract`` is the first ``AbstractText``
        entry of an article that has MeSH headings, or ``None`` when no such
        abstract exists. If fetching or parsing fails, the error is printed
        and ``(None, [])`` is returned, so callers can always unpack the
        result into two names.
    """
    mesh_terms = []
    abstract = None

    try:
        # Fetch article details using the PMID
        handle = Entrez.efetch(db="pubmed", id=pmid, retmode="xml")
        try:
            article_data = Entrez.read(handle)
        finally:
            # Release the connection even when parsing raises
            handle.close()

        # Extract MeSH terms from the article data
        if "PubmedArticle" in article_data:
            for article in article_data["PubmedArticle"]:
                citation = article["MedlineCitation"]
                if "MeshHeadingList" not in citation:
                    continue

                mesh_terms.extend(
                    heading["DescriptorName"]
                    for heading in citation["MeshHeadingList"]
                )

                # Get the abstract only for articles that have MeSH terms
                if "Abstract" in citation["Article"]:
                    abstract_text = citation["Article"]["Abstract"]["AbstractText"]
                    # Guard against an empty AbstractText list
                    if abstract_text:
                        abstract = abstract_text[0]

    except Exception as e:
        # Best-effort lookup: report the failure and hand back an "empty"
        # result with the same shape as a successful one.
        print(f"Error: {e}")
        return None, []

    return abstract, mesh_terms


# Query PubMed for every gathered PMID and keep only the articles that come
# back with both an abstract and MeSH terms.
for pmid_to_query in tqdm(pmids):

    # get_mesh_terms may return None when the lookup raised; treat that the
    # same as "nothing retrieved" instead of failing on tuple unpacking.
    lookup = get_mesh_terms(pmid_to_query)
    abstract, mesh_terms = lookup if lookup is not None else (None, [])

    if mesh_terms and abstract:
        # Append a new row; MeSH terms are de-duplicated and stored as plain strings
        dataset.loc[len(dataset.index)] = [pmid_to_query, abstract, [str(term) for term in np.unique(mesh_terms)]]
    elif not mesh_terms and not abstract:
        print(f"Failed to retrieve MeSH terms AND Abstract for PMID {pmid_to_query}.")
    elif not mesh_terms:
        print(f"Failed to retrieve MeSH terms for PMID {pmid_to_query}.")
    else:
        print(f"Failed to retrieve Abstract for PMID {pmid_to_query}.")
    


In [None]:
# Preview the first rows of the gathered dataset.
dataset.head()

## Mapping MeSH terms to EDAM ontology

First, a proof of concept...

In [None]:
import text2term

In [None]:
# Cache EDAM (BioPortal submission 44) under the acronym "EDAM"; the
# map_terms calls below refer to it by that name with use_cache=True.
# NOTE(review): the BioPortal API key is hard-coded in this URL and therefore
# committed with the notebook - consider rotating it and loading it from the
# environment instead.
edam_ontology = text2term.cache_ontology("https://data.bioontology.org/ontologies/EDAM/submissions/44/download?apikey=8b5b7825-538d-40e0-9e9e-5ab9274a9aeb", "EDAM")

In [None]:
# Proof of concept: map only the first row's MeSH terms against the cached
# EDAM ontology and show which label each source term was mapped to.
# NOTE(review): dataset.iloc[0] assumes the dataset has at least one row.
mapped_terms = text2term.map_terms([str(term) for term in dataset.iloc[0]['MeSH Terms']], "EDAM", use_cache=True)
mapped_terms[['Source Term', 'Mapped Term Label']].head()

### Map All MeSH Terms

In [None]:
def convert_mesh_to_edam(mesh_terms):
    """Map MeSH terms to EDAM and return the distinct mapped term labels.

    Args:
        mesh_terms: MeSH term strings to map against the cached "EDAM" ontology.

    Returns:
        List of unique values from the mapping's 'Mapped Term Label' column.
    """
    mappings = text2term.map_terms(mesh_terms, "EDAM", use_cache=True)
    edam_labels = mappings['Mapped Term Label']

    return edam_labels.unique().tolist()

# One list of EDAM topic labels per dataset row
dataset['EDAM Topics'] = dataset['MeSH Terms'].apply(convert_mesh_to_edam)

In [None]:
# Show the original MeSH terms next to their mapped EDAM topics.
dataset[['MeSH Terms', 'EDAM Topics']]

In [None]:
# convert_mesh_to_edam always returns a list, so isna() on this column can
# never be True. Flag rows whose topic list is empty (or not a list at all)
# instead. This reports rows where no MeSH term was mapped; a row where only
# some terms were mapped is not detectable from this column.
print(
    "Any unmapped MeSH Terms?: ",
    dataset['EDAM Topics'].apply(
        lambda topics: not isinstance(topics, list) or not topics
    ).any(),
)

In [None]:
# Remove the cached "EDAM" ontology now that all rows have been mapped.
text2term.clear_cache("EDAM")

## Checking the validity of EDAM topics
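Because of the OpenAI API input token limit, we can't pass in every EDAM term. We therefore keep only the entries whose EDAM topics all appear in the reduced topic list stored in `EDAM/edam_topics.txt`.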

In [None]:
# Load the reduced EDAM topic list: one topic per line, with surrounding
# whitespace (including the trailing newline) removed.
with open('EDAM/edam_topics.txt', 'r') as edam_file:
    edam_topics = [line.strip() for line in edam_file]

In [None]:
# Index labels of rows that contain at least one EDAM topic missing from
# the reduced edam_topics list.
indices_true = dataset.loc[
    dataset['EDAM Topics'].apply(
        lambda topics: any(topic not in edam_topics for topic in topics)
    )
].index

In [None]:
# For every flagged row, list exactly which topics are outside edam_topics.
for index in indices_true:
    edam_list = dataset.loc[index, 'EDAM Topics']
    terms_not_in_edam_topics = list(
        filter(lambda term: term not in edam_topics, edam_list)
    )

    print(f"Index {index}: Terms not in edam_topics: {terms_not_in_edam_topics}")

In [None]:
# Remove, in place, the rows flagged above as containing unknown EDAM topics.
dataset.drop(index=indices_true, inplace=True)

## Save Testing Set

In [None]:
print("Check for N/A values:")

# Per-column flag: True where the column holds at least one missing value.
dataset.isna().any()

In [None]:
# Number of rows left after dropping entries with unknown EDAM topics.
print("Number of entries: ", len(dataset))

In [None]:
# Prompt for the destination path and write the dataset there as CSV.
# NOTE(review): no to_csv options are passed, so pandas defaults apply
# (e.g. the row index is written as a column) - confirm that is intended.
dataset.to_csv(input("Enter file path and name: "))