# Test Set Gathering

Note: This notebook queries the production NIAID API (`api.data.niaid.nih.gov`).

In [1]:
import requests

import numpy as np
import pandas as pd

from tqdm.notebook import tqdm

In [2]:
# Count-only query: `size=0` and `facet_size=0` are sent so that, presumably, no hits
# or facets are returned and only the `total` field of the response is used.
# The query selects records where `citation.pmid` exists, with the extra filter
# `-_exists_:date` (NOTE(review): looks like this excludes records that have a
# `date` field - confirm against the API docs).
count_response = requests.get(
    "https://api.data.niaid.nih.gov/v1/query?&q=_exists_:citation.pmid&extra_filter=-_exists_:date&facet_size=0&size=0",
    timeout=60,  # do not hang the notebook forever on a stalled connection
)
# Surface HTTP errors here instead of as a confusing JSON/KeyError further down.
count_response.raise_for_status()
total_results = count_response.json()['total']

print('Total Number of Results with PMID:', total_results)

Total Number of Results with PMID: 120259


In [3]:
# Number of hits requested per API call (sent as the `size` query parameter).
PAGE_SIZE = 1000
# Number of consecutive pages to download, i.e. at most PAGE_SIZE * NUM_PAGES hits.
NUM_PAGES = 3

In [5]:
# Page through the API and collect the PMID of the first citation of every hit.
pmids = []
for i in range(NUM_PAGES):
    results = requests.get(
        f"https://api.data.niaid.nih.gov/v1/query?&q=_exists_:citation.pmid&extra_filter=-_exists_:date&facet_size=0&size={PAGE_SIZE}&from={i * PAGE_SIZE}",
        timeout=60,  # do not hang the notebook forever on a stalled connection
    )
    # Stop on an HTTP error rather than failing later on a missing 'hits' key.
    results.raise_for_status()
    data = results.json()

    for hit in data['hits']:
        citations = hit.get('citation')
        # Same selection rule as before: only list-valued citations are considered,
        # and only their first entry. The extra truthiness check skips an empty
        # list, which previously raised IndexError on `[0]`.
        if isinstance(citations, list) and citations and 'pmid' in citations[0]:
            pmids.append(citations[0]['pmid'])

# Remove duplicate PMIDs (np.unique also sorts them); .tolist() converts the NumPy
# scalars back to plain Python values before they are reused downstream.
pmids = np.unique(pmids).tolist()

# Print the result
print("Filtered List:", len(pmids))

Filtered List: 2884


## Getting MeSH Terms

In [6]:
from Bio import Entrez, Medline

# Contact e-mail attached to Biopython's Entrez module before any request is made.
Entrez.email = "zqazi@scripps.edu"

# Results table filled by the fetch loop below: one row per PMID for which both an
# abstract and MeSH terms were retrieved.
dataset = pd.DataFrame(columns=['PMID', 'Abstract', 'MeSH Terms'])

def get_mesh_terms(pmid):
    """Fetch the abstract and MeSH headings of one PubMed record.

    Parameters
    ----------
    pmid : str or int
        PubMed identifier, passed to ``Entrez.efetch`` as ``id``.

    Returns
    -------
    tuple
        ``(abstract, mesh_terms)``. ``abstract`` is the record's ``AB`` field
        (``None`` if absent) and ``mesh_terms`` is the value stored under ``MH``
        (``[]`` if absent). If the response contains several records, the values
        of the last one are returned. On any error the message is printed and
        ``(None, [])`` is returned, so callers can always unpack two values.
    """
    handle = None
    try:
        handle = Entrez.efetch(db="pubmed", id=pmid, rettype='medline', retmode="text")

        mesh_terms = []
        abstract = None
        # The parser is consumed here, while the handle is still open.
        for record in Medline.parse(handle):
            mesh_terms = record.get('MH', [])
            abstract = record.get('AB', None)

        return abstract, mesh_terms

    except Exception as e:
        print(f"Error: {e}")
        # Same shape as the success path: a bare `None` here made
        # `abstract, mesh_terms = get_mesh_terms(...)` raise TypeError.
        return None, []

    finally:
        # Release the network handle on both the success and the error path.
        if handle is not None:
            handle.close()


# Collect rows in a plain list and build the DataFrame once at the end. Growing the
# frame with `dataset.loc[len(dataset.index)] = ...` re-allocates on every insert,
# and re-running the cell used to append every PMID a second time.
records = []

for pmid_to_query in tqdm(pmids):

    # Get the abstract and MeSH terms for the specified PMID
    fetched = get_mesh_terms(pmid_to_query)
    # Tolerate a helper that signals failure with None instead of a 2-tuple.
    abstract, mesh_terms = fetched if fetched is not None else (None, [])

    if mesh_terms and abstract:
        # Keep the record only when both pieces are present; MeSH terms are
        # de-duplicated (np.unique also sorts them) and stored as plain strings.
        records.append({
            'PMID': pmid_to_query,
            'Abstract': abstract,
            'MeSH Terms': [str(term) for term in np.unique(mesh_terms)],
        })
    elif not mesh_terms and not abstract:
        print(f"Failed to retrieve MeSH terms AND Abstract for PMID {pmid_to_query}.")
    elif not mesh_terms:
        print(f"Failed to retrieve MeSH terms for PMID {pmid_to_query}.")
    else:
        print(f"Failed to retrieve Abstract for PMID {pmid_to_query}.")

# Explicit columns keep the expected schema even when no record was retrieved.
dataset = pd.DataFrame(records, columns=['PMID', 'Abstract', 'MeSH Terms'])
    


  0%|          | 0/2884 [00:00<?, ?it/s]

Failed to retrieve MeSH terms for PMID 16103920.
Failed to retrieve MeSH terms for PMID 18923939.
Failed to retrieve Abstract for PMID 19097959.
Failed to retrieve Abstract for PMID 19796614.
Failed to retrieve MeSH terms for PMID 20546595.
Failed to retrieve MeSH terms for PMID 21127740.
Failed to retrieve Abstract for PMID 21345093.
Failed to retrieve MeSH terms for PMID 22593731.
Failed to retrieve MeSH terms for PMID 23130016.
Failed to retrieve MeSH terms for PMID 23304554.
Failed to retrieve MeSH terms for PMID 23469351.
Failed to retrieve MeSH terms for PMID 23570311.
Failed to retrieve MeSH terms for PMID 23765928.
Failed to retrieve MeSH terms for PMID 24009881.
Failed to retrieve MeSH terms for PMID 24050303.
Failed to retrieve MeSH terms for PMID 24355397.
Failed to retrieve MeSH terms for PMID 24692423.
Failed to retrieve Abstract for PMID 24727770.
Failed to retrieve MeSH terms for PMID 24748042.
Failed to retrieve MeSH terms for PMID 24794133.
Failed to retrieve MeSH term

In [13]:
def split_strings(lst):
    """Break MeSH term strings into their individual parts.

    For every string in ``lst`` all ``*`` characters are removed first. A string
    containing ``/`` is then split on ``/``; otherwise a string containing ``,``
    is split on ``,``; the resulting parts are stripped of surrounding
    whitespace. A string with neither separator is kept as is (not stripped).

    Returns the unique parts as a sorted list (via ``np.unique``).
    """
    pieces = []
    for raw_term in lst:
        term = raw_term.replace('*', '')

        # '/' takes precedence: a term containing both separators is split on '/' only.
        if '/' in term:
            separator = '/'
        elif ',' in term:
            separator = ','
        else:
            separator = None

        if separator is None:
            pieces.append(term)
        else:
            pieces.extend(part.strip() for part in term.split(separator))

    return np.unique(pieces).tolist()

# Add a column holding, for every row, the result of split_strings applied to that
# row's list of MeSH terms.
dataset['Filtered MeSH Terms'] = dataset['MeSH Terms'].apply(split_strings)

In [18]:
# NOTE(review): the saved output of this cell already shows an 'EDAM Topics' column,
# which is only created further down (in the "Map All MeSH Terms" cell) - the cell
# was executed out of order; on a fresh top-to-bottom run that column is absent here.
dataset.head()

Unnamed: 0,PMID,Abstract,MeSH Terms,Filtered MeSH Terms,EDAM Topics
0,10827989,Cooperativity in contractile behavior of myofi...,"[*Models, Biological, Actin Cytoskeleton/*phys...","[Actin Cytoskeleton, Biological, Calcium, Chem...","[Cytometry, Biology, Chemical biology, Genetic..."
1,11082306,A mathematical model of the insulin-glucose fe...,"[*Models, Biological, Blood Glucose/*metabolis...","[Biological, Blood Glucose, Feedback, Humans, ...","[Biology, Human biology, Human genetics, Human..."
2,11124023,Human cytomegalovirus (HCMV) has been shown to...,"[*Gene Expression Profiling, *Transcription, G...","[Cell Cycle, Cells, Cultured, Cytomegalovirus,...","[Cell cycle, Carbon cycle, Cell culture collec..."
3,11182887,BACKGROUND: We have developed and tested a met...,"[Immunoassay/*methods/standards, Proteins/*ana...","[Immunoassay, Proteins, Reference Standards, R...","[Immunomics, Immunology, Proteins, Protein dom..."
4,11466441,Because multiple molecular signal transduction...,"[*Computer Simulation, *Models, Neurological, ...","[Cerebellum, Computer Simulation, Feedback, Ki...","[Computer science, Genetics, Biological system..."


## Mapping MeSH terms to EDAM ontology

First, a proof of concept...

In [15]:
import text2term

In [16]:
import os
from getpass import getpass

# The BioPortal API key must not be committed with the notebook: read it from the
# BIOPORTAL_API_KEY environment variable, or ask for it with a non-echoing prompt.
# (The key that used to be hard-coded here should be considered leaked and revoked.)
bioportal_api_key = os.environ.get("BIOPORTAL_API_KEY") or getpass("BioPortal API key: ")

# Download the EDAM ontology file and cache it locally under the acronym "EDAM".
# NOTE: text2term logs the full download URL, key included (see the saved output of
# this cell) - clear the cell output before sharing the notebook.
edam_ontology = text2term.cache_ontology(
    f"https://data.bioontology.org/ontologies/EDAM/submissions/44/download?apikey={bioportal_api_key}",
    "EDAM",
)

2023-12-18 13:42:49 INFO [text2term.term_collector]: Loading ontology https://data.bioontology.org/ontologies/EDAM/submissions/44/download?apikey=8b5b7825-538d-40e0-9e9e-5ab9274a9aeb...
2023-12-18 13:42:54 INFO [text2term.term_collector]: ...done (ontology loading time: 5.02s)
2023-12-18 13:42:54 INFO [text2term.term_collector]: Collecting ontology term details...
2023-12-18 13:42:54 INFO [text2term.term_collector]: ...done: collected 3577 ontology terms (collection time: 0.11s)
2023-12-18 13:42:54 INFO [text2term.t2t]: Filtered ontology terms to those of type: any
2023-12-18 13:42:54 INFO [text2term.t2t]: Caching ontology https://data.bioontology.org/ontologies/EDAM/submissions/44/download?apikey=8b5b7825-538d-40e0-9e9e-5ab9274a9aeb to: cache/EDAM


### Map All MeSH Terms

In [17]:
# Minimum mapping score a MeSH -> EDAM mapping needs in order to be kept.
threshold = 0.7


def convert_mesh_to_edam(mesh_terms):
    """Map a list of MeSH terms to the labels of matching EDAM topics.

    The terms are mapped against the cached "EDAM" ontology with
    ``text2term.map_terms``. Only mappings whose IRI contains ``'topic'`` and
    whose score is at least the module-level ``threshold`` are kept.

    Returns the distinct mapped labels as a list, in order of first appearance.
    """
    candidates = text2term.map_terms(mesh_terms, "EDAM", use_cache=True)

    # Step 1: discard every mapping whose IRI does not contain 'topic'.
    is_topic = candidates['Mapped Term IRI'].str.contains('topic')
    topic_rows = candidates[is_topic]

    # Step 2: discard the mappings scoring below the threshold.
    confident_rows = topic_rows[topic_rows['Mapping Score'] >= threshold]

    labels = confident_rows['Mapped Term Label']
    return labels.unique().tolist()

# Map every row's filtered MeSH terms to EDAM topic labels (one text2term call per row).
dataset['EDAM Topics'] = dataset['Filtered MeSH Terms'].apply(convert_mesh_to_edam)

2023-12-18 13:42:58 INFO [text2term.t2t]: Loading cached ontology from: cache/EDAM/EDAM-term-details.pickle
2023-12-18 13:42:58 INFO [text2term.t2t]: Filtered ontology terms to those of type: class
2023-12-18 13:42:58 INFO [text2term.t2t]: Mapping 10 source terms to EDAM
2023-12-18 13:42:59 INFO [text2term.t2t]: ...done (mapping time: 0.08s seconds)
2023-12-18 13:42:59 INFO [text2term.t2t]: Loading cached ontology from: cache/EDAM/EDAM-term-details.pickle
2023-12-18 13:42:59 INFO [text2term.t2t]: Filtered ontology terms to those of type: class
2023-12-18 13:42:59 INFO [text2term.t2t]: Mapping 12 source terms to EDAM
2023-12-18 13:42:59 INFO [text2term.t2t]: ...done (mapping time: 0.07s seconds)
2023-12-18 13:42:59 INFO [text2term.t2t]: Loading cached ontology from: cache/EDAM/EDAM-term-details.pickle
2023-12-18 13:42:59 INFO [text2term.t2t]: Filtered ontology terms to those of type: class
2023-12-18 13:42:59 INFO [text2term.t2t]: Mapping 19 source terms to EDAM
2023-12-18 13:42:59 INFO

In [19]:
# Side-by-side view of the original MeSH terms and the EDAM topics mapped from them.
dataset[['MeSH Terms', 'EDAM Topics']]

Unnamed: 0,MeSH Terms,EDAM Topics
0,"[*Models, Biological, Actin Cytoskeleton/*phys...","[Cytometry, Biology, Chemical biology, Genetic..."
1,"[*Models, Biological, Blood Glucose/*metabolis...","[Biology, Human biology, Human genetics, Human..."
2,"[*Gene Expression Profiling, *Transcription, G...","[Cell cycle, Carbon cycle, Cell culture collec..."
3,"[Immunoassay/*methods/standards, Proteins/*ana...","[Immunomics, Immunology, Proteins, Protein dom..."
4,"[*Computer Simulation, *Models, Neurological, ...","[Computer science, Genetics, Biological system..."
...,...,...
2729,"[Anti-Bacterial Agents/pharmacology, Corynebac...","[Human biology, Human genetics, Human disease,..."
2730,"[Actinomycetales/*classification, Base Composi...","[Pathology, Ribosomes, Ribosome Profiling, DNA..."
2731,"[*Chromosome Mapping, *Genetic Linkage, *Genom...","[Evolutionary biology, Genes and proteins reso..."
2732,"[*Membrane Transport Proteins, *Plasmids, *Pro...","[Zoology, Laboratory animal science, Animal st..."


In [20]:
# 'EDAM Topics' holds Python lists, so `.isna()` is False for every row even when
# nothing was mapped. A row without any mapping shows up as an empty list instead,
# which is what is checked here (non-list values are counted as unmapped too).
rows_without_topics = dataset['EDAM Topics'].apply(
    lambda topics: not isinstance(topics, list) or not topics
)
print("Any unmapped MeSH Terms?: ", rows_without_topics.any())

Any unmapped MeSH Terms?:  False


In [None]:
# text2term.clear_cache("EDAM")

## Checking the validity of EDAM topics
Because of the OpenAI API input token limit, we can't pass in every EDAM term, so the mapped topics are checked against the list in `EDAM/edam_topics.txt`.

In [21]:
# Read the EDAM topic labels, one per line, dropping surrounding whitespace and the
# trailing newline of every line.
with open('EDAM/edam_topics.txt', 'r') as edam_file:
    edam_topics = [line.strip() for line in edam_file]

In [22]:
# Index labels of the rows whose EDAM topic list contains at least one label that
# is not present in `edam_topics`.
has_unlisted_topic = dataset['EDAM Topics'].apply(
    lambda edam_list: any(term not in edam_topics for term in edam_list)
)
indices_true = dataset.loc[has_unlisted_topic].index

In [23]:
# For every flagged row, list the mapped topics that are missing from `edam_topics`.
for index in indices_true:
    edam_list = dataset.loc[index, 'EDAM Topics']
    terms_not_in_edam_topics = []
    for term in edam_list:
        if term not in edam_topics:
            terms_not_in_edam_topics.append(term)

    print(f"Index {index}: Terms not in edam_topics: {terms_not_in_edam_topics}")

Index 0: Terms not in edam_topics: ['Cytometry']
Index 1: Terms not in edam_topics: ['Human disease', 'G protein-coupled receptors (GPCR)', 'Metabolites']
Index 2: Terms not in edam_topics: ['Cell cycle', 'Cell and tissue culture', 'Cytometry', 'Human disease', 'PCR experiment', 'Transcription', 'Gene transcription', 'Metabolites']
Index 3: Terms not in edam_topics: ['Protein domains', 'Chromosome conformation capture', 'Laboratory techniques']
Index 4: Terms not in edam_topics: ['Biological system modelling', 'Phosphorylation sites', 'Protein databases', 'G protein-coupled receptors (GPCR)', 'Signal or transit peptide', 'Metabolites']
Index 5: Terms not in edam_topics: ['Classification']
Index 6: Terms not in edam_topics: ['Protein-protein interactions', 'Human disease', 'Signal or transit peptide', 'Chromosome conformation capture', 'Laboratory techniques', 'Pathogens']
Index 7: Terms not in edam_topics: ['Scents', 'Transcription', 'Gene transcription', 'Metabolites']
Index 8: Terms 

In [None]:
## GPT.ipynb will filter out the terms already, so dropping the rows is unnecessary
# dataset.drop(index=indices_true, axis = 0, inplace=True)

### Remove undesired EDAM topics from each list

In [None]:
# edam_topics.txt: one topic label per line; surrounding whitespace is stripped.
with open('EDAM/edam_topics.txt', 'r') as edam_file:
    full_edam_topics = [topic.strip() for topic in edam_file.readlines()]

In [None]:
# Row by row, keep only the mapped topics that appear in `full_edam_topics`
# (the original order within each list is preserved).
dataset['EDAM Topics'] = dataset['EDAM Topics'].apply(
    lambda topics: [topic for topic in topics if topic in full_edam_topics]
)

## Save Testing Set

In [24]:
print("Check for N/A values:")

# Per-column flag: True if the column contains at least one missing value.
dataset.isna().any()

Check for N/A values:


PMID                   False
Abstract               False
MeSH Terms             False
Filtered MeSH Terms    False
EDAM Topics            False
dtype: bool

In [25]:
# Rows whose abstract has length zero.
empty_abstract_rows = dataset[dataset["Abstract"].str.len() == 0]
print('Empty Abstracts:', len(empty_abstract_rows))

# Total number of rows in the test set.
entry_count = len(dataset)
print("\nNumber of entries: ", entry_count)

Empty Abstracts: 0

Number of entries:  2734


In [None]:
# Keep only the rows whose abstract has at least one character.
has_abstract = dataset["Abstract"].apply(len) > 0
dataset = dataset[has_abstract]

In [27]:
# Ask for the destination interactively, then write the test set without the index column.
output_path = input("Enter file path and name: ")
dataset.to_csv(output_path, index=False)