# Test Set Gathering

Note: This notebook queries the staging API of NIAID.

In [62]:
import requests

import numpy as np
import pandas as pd

from tqdm.notebook import tqdm

In [44]:
# Query the staging API with size=0 and read only the 'total' field of the response.
response = requests.get(
    "https://api-staging.data.niaid.nih.gov/v1/query?&q=_exists_:citation.pmid&extra_filter=-_exists_:date&facet_size=0&size=0",
    timeout=60,  # fail instead of hanging indefinitely if the API stops responding
)
# Surface HTTP errors here rather than as a confusing KeyError on 'total'
response.raise_for_status()
total_results = response.json()['total']

print('Total Number of Results with PMID:', total_results)

Total Number of Results with PMID: 99342


In [None]:
# Number of hits requested per API call (sent as the `size` query parameter)
PAGE_SIZE = 1000
# Number of pages requested; the fetch loop asks for PAGE_SIZE * NUM_PAGES hits in total
NUM_PAGES = 3

In [46]:
# Page through the staging API and collect, for every hit whose citation is a
# non-empty list with a PMID on its first entry, that PMID and the hit's description.
pmids = []
descriptions = []
for page in range(NUM_PAGES):
    results = requests.get(
        f"https://api-staging.data.niaid.nih.gov/v1/query?&q=_exists_:citation.pmid&extra_filter=-_exists_:date&facet_size=0&size={PAGE_SIZE}&from={page * PAGE_SIZE}",
        timeout=60,  # fail instead of hanging indefinitely if the API stops responding
    )
    # Stop on an HTTP error instead of failing later on a missing 'hits' key
    results.raise_for_status()
    data = results.json()

    for hit in data['hits']:
        citations = hit.get('citation')
        # Only list-valued citations are handled; an empty list has no first entry
        if not isinstance(citations, list) or not citations:
            continue

        first_citation = citations[0]
        # A hit without a PMID or a description cannot contribute a (PMID, description) pair
        if 'pmid' not in first_citation or 'description' not in hit:
            continue

        pmids.append(first_citation['pmid'])
        descriptions.append(hit['description'])

# Remove duplicate PMIDs, keeping the first description seen for each one
unique_set = set()
filtered_list = []
filtered_corresponding_list = []

for pmid, description in zip(pmids, descriptions):
    if pmid in unique_set:
        continue
    unique_set.add(pmid)
    filtered_list.append(pmid)
    filtered_corresponding_list.append(description)

# Both lengths are printed so a mismatch between the parallel lists is visible
print("Filtered List:", len(filtered_list))
print("Filtered Corresponding List:", len(filtered_corresponding_list))

pmids = filtered_list
descriptions = filtered_corresponding_list

Filtered List: 1337
Filtered Corresponding List: 1337


## Getting MeSH Terms

In [49]:
from Bio import Entrez, Medline

# Contact e-mail registered with Bio.Entrez before any efetch call is made
Entrez.email = "zqazi@scripps.edu"

# Empty frame that the PMID loop below fills one row at a time
dataset = pd.DataFrame(columns=['PMID', 'Description', 'Abstract', 'MeSH Terms'])

def get_mesh_terms(pmid):
    """Fetch the abstract and MeSH terms of a PubMed article.

    Args:
        pmid: PubMed identifier passed to ``Entrez.efetch`` as ``id``.

    Returns:
        A ``(abstract, mesh_terms)`` tuple. ``abstract`` is the record's 'AB'
        field or ``None`` when absent; ``mesh_terms`` is the record's 'MH'
        field or ``[]`` when absent. If the lookup fails for any reason the
        error is printed and ``(None, [])`` is returned, so callers can always
        unpack two values.
    """
    abstract = None
    mesh_terms = []

    try:
        handle = Entrez.efetch(db="pubmed", id=pmid, rettype='medline', retmode="text")
        try:
            # Medline.parse is consumed here, while the handle is still open.
            # If several records come back, the values of the last one win.
            for record in Medline.parse(handle):
                mesh_terms = record.get('MH', [])
                abstract = record.get('AB', None)
        finally:
            # Release the connection whether or not parsing succeeded
            handle.close()

    except Exception as e:
        # Best-effort lookup: report the problem and let the caller treat the
        # PMID as having neither an abstract nor MeSH terms.
        print(f"Error: {e}")
        return None, []

    return abstract, mesh_terms


for idx, pmid_to_query in enumerate(tqdm(pmids)):
    # Look up the abstract and MeSH terms recorded for this PMID
    abstract, mesh_terms = get_mesh_terms(pmid_to_query)

    # A row is stored only when both pieces were retrieved
    if mesh_terms and abstract:
        unique_terms = [str(term) for term in np.unique(mesh_terms)]
        dataset.loc[len(dataset.index)] = [pmid_to_query, descriptions[idx], abstract, unique_terms]
        continue

    # Otherwise report which piece is missing
    if mesh_terms:
        print(f"Failed to retrieve Abstract for PMID {pmid_to_query}.")
    elif abstract:
        print(f"Failed to retrieve MeSH terms for PMID {pmid_to_query}.")
    else:
        print(f"Failed to retrieve MeSH terms AND Abstract for PMID {pmid_to_query}.")
    


  0%|          | 0/1337 [00:00<?, ?it/s]

Failed to retrieve MeSH terms for PMID 29355227.
Failed to retrieve MeSH terms for PMID 34988373.
Failed to retrieve MeSH terms for PMID 27379119.
Failed to retrieve MeSH terms for PMID 31239396.
Failed to retrieve MeSH terms for PMID 34909793.
Failed to retrieve MeSH terms for PMID 32994287.
Failed to retrieve MeSH terms for PMID 27602409.
Failed to retrieve Abstract for PMID 29055911.
Failed to retrieve MeSH terms for PMID 21564833.
Failed to retrieve MeSH terms for PMID 28421042.
Failed to retrieve MeSH terms for PMID 29577086.
Failed to retrieve MeSH terms for PMID 24995004.
Failed to retrieve MeSH terms for PMID 26229982.
Failed to retrieve MeSH terms for PMID 26819854.
Failed to retrieve MeSH terms for PMID 25324835.
Failed to retrieve MeSH terms for PMID 30619992.
Failed to retrieve MeSH terms for PMID 27822543.
Failed to retrieve MeSH terms for PMID 30832316.
Failed to retrieve MeSH terms for PMID 25013457.
Failed to retrieve MeSH terms for PMID 25681390.
Failed to retrieve MeS

In [50]:
def split_strings(lst):
    """Normalise a list of MeSH term strings into sorted, unique parts.

    Every '*' is removed from each term. A term containing '/' is split on
    '/'; otherwise a term containing ',' is split on ','. Split parts are
    stripped of surrounding whitespace; a term with neither separator is
    kept as is. The result is the sorted list of distinct parts.
    """
    pieces = []
    for raw_term in lst:
        term = raw_term.replace('*', '')

        # '/' takes precedence: a term with both separators is split on '/' only
        if '/' in term:
            separator = '/'
        elif ',' in term:
            separator = ','
        else:
            pieces.append(term)
            continue

        pieces.extend(part.strip() for part in term.split(separator))

    return np.unique(pieces).tolist()

# Add a column holding each row's MeSH terms after normalisation by split_strings
dataset['Filtered MeSH Terms'] = dataset['MeSH Terms'].apply(split_strings)

In [53]:
# Preview the first rows of the assembled dataset
dataset.head()

Unnamed: 0,PMID,Description,Abstract,MeSH Terms,Filtered MeSH Terms
0,18270564,Comparative expression analysis on Plasmodium ...,A fundamental problem in systems biology and w...,"[*Gene Expression Profiling, *Genome, Protozoa...","[Animals, Computational Biology, Gene Expressi..."
1,31959989,The BONUS study that was conducted during regu...,Most infants with cystic fibrosis (CF) have pa...,"[Body Size, Case-Control Studies, Cystic Fibro...","[Body Size, Case-Control Studies, Cystic Fibro..."
2,31296739,Thirty-six members of a birth cohort with cons...,Characterizing the organization of the human g...,"[*Diet, *Germ-Free Life, Animals, Bacteria/*cl...","[Animal, Animals, Bacteria, Bangladesh, Bottle..."
3,29884786,The microbiome of 2684 fecal specimens collect...,The human gut microbiota plays a vital role in...,"[*Gastrointestinal Microbiome, Bacteria/classi...","[Bacteria, Bifidobacterium, Feces, Female, Fir..."
4,22699609,This HMP production phase includes untargeted ...,Studies of the human microbiome have revealed ...,"[*Biodiversity, *Health, *Metagenome, Adolesce...","[Adolescent, Adult, Bacteria, Biodiversity, Ec..."


## Mapping MeSH terms to EDAM ontology

First, a proof of concept...

In [1]:
import text2term

* 'underscore_attrs_are_private' has been removed


In [None]:
# Cache the ontology at this URL under the acronym "EDAM", the name the
# map_terms calls in this notebook refer to with use_cache=True.
# NOTE(review): the `apikey` value is embedded in the URL and so is committed with
# the notebook; consider reading it from an environment variable and rotating the key.
edam_ontology = text2term.cache_ontology("https://data.bioontology.org/ontologies/EDAM/submissions/44/download?apikey=8b5b7825-538d-40e0-9e9e-5ab9274a9aeb", "EDAM")

In [None]:
# Proof of concept: map only the first row's filtered MeSH terms against the cached "EDAM" ontology
mapped_terms = text2term.map_terms([str(term) for term in dataset.iloc[0]['Filtered MeSH Terms']], "EDAM", use_cache=True)
# Show each source term next to the label it was mapped to
mapped_terms[['Source Term', 'Mapped Term Label']].head()

### Map All MeSH Terms

In [None]:
def convert_mesh_to_edam(mesh_terms):
    """Return the unique EDAM topic labels that text2term maps `mesh_terms` to.

    The terms are mapped against the cached "EDAM" ontology; mappings whose
    IRI does not contain 'topic' are discarded before the labels are collected.
    """
    mappings = text2term.map_terms(mesh_terms, "EDAM", use_cache=True)

    # Keep only the rows whose mapped IRI marks them as a topic
    is_topic = mappings['Mapped Term IRI'].str.contains('topic')
    topic_labels = mappings.loc[is_topic, 'Mapped Term Label']

    return topic_labels.unique().tolist()

# Map each row's filtered MeSH terms to its list of EDAM topic labels
dataset['EDAM Topics'] = dataset['Filtered MeSH Terms'].apply(convert_mesh_to_edam)

In [None]:
# Show the original MeSH terms next to the EDAM topics derived from them
dataset[['MeSH Terms', 'EDAM Topics']]

In [None]:
# convert_mesh_to_edam always returns a list, so a row with no topic mapping holds an
# empty list rather than NaN; count both cases as unmapped.
edam_topic_lists = dataset['EDAM Topics']
has_no_topics = edam_topic_lists.isna() | (edam_topic_lists.str.len() == 0)
print("Any unmapped MeSH Terms?: ", has_no_topics.any())

In [None]:
# text2term.clear_cache("EDAM")

## Checking the validity of EDAM topics
Because of the OpenAI API input token limit, we can't pass in every EDAM term, so the mapped topics are checked against the subset of topics listed in `EDAM/edam_topics.txt`.

In [None]:
# Load the EDAM topic labels, one per line, without surrounding whitespace
with open('EDAM/edam_topics.txt', 'r') as edam_file:
    edam_topics = [line.strip() for line in edam_file]

In [None]:
# Build the lookup set once: membership is tested for every term of every row
known_topics = set(edam_topics)

# Rows whose topic list contains at least one label missing from edam_topics
has_unknown_topic = dataset['EDAM Topics'].apply(
    lambda edam_list: any(term not in known_topics for term in edam_list)
)
indices_true = dataset.loc[has_unknown_topic].index

# Report the offending labels row by row
for index in indices_true:
    edam_list = dataset.loc[index, 'EDAM Topics']
    terms_not_in_edam_topics = [term for term in edam_list if term not in known_topics]

    print(f"Index {index}: Terms not in edam_topics: {terms_not_in_edam_topics}")

In [None]:
## Uncomment and use this line to remove the rows if necessary (GPT.ipynb will filter out the terms from each list anyways)
# dataset.drop(index=indices_true, axis = 0, inplace=True)

## Save Testing Set

In [None]:
print("Check for N/A values:")

# Per-column flag: True if the column contains at least one missing value
dataset.isna().any()

In [None]:
# Count the rows whose description is an empty string
description_is_empty = dataset["Description"].str.len().eq(0)
print('Empty Descriptions:', len(dataset[description_is_empty]))

# Same check for the abstract column
abstract_is_empty = dataset["Abstract"].str.len().eq(0)
print('Empty Abstracts:', len(dataset[abstract_is_empty]))

print("\nNumber of entries: ", len(dataset))

In [None]:
# Drop the rows whose description begins with '<b>' or 'b>' (HTML markup left in the text).
# na=False treats a missing description as "no match", so the mask stays boolean
# and can be inverted with ~ even if the column contains missing values.
starts_with_html = dataset['Description'].str.startswith(('<b>', 'b>'), na=False)
dataset = dataset[~starts_with_html]

In [None]:
# Prompt for the output path, then write the dataset as CSV without the index column
dataset.to_csv(input("Enter file path and name: "), index=False)