# Test Set Gathering

Note: This is for the staging API of niad

In [55]:
import requests

import numpy as np
import pandas as pd

from tqdm.notebook import tqdm

from Bio import Entrez, Medline
import text2term

from bs4 import BeautifulSoup
import re

In [9]:
total_results = requests.get("https://api-staging.data.niaid.nih.gov/v1/query?&q=_exists_:citation.pmid&extra_filter=-_exists_:date&facet_size=0&size=0").json()['total']

print('Total Number of Results with PMID:', total_results)

Total Number of Results with PMID: 99342


In [13]:
PAGE_SIZE = 1000 # MAX
NUM_PAGES = 5

In [15]:
pmids = []
descriptions = []
for i in range(NUM_PAGES):
    results = requests.get(f"https://api-staging.data.niaid.nih.gov/v1/query?&q=_exists_:citation.pmid&extra_filter=-_exists_:date&facet_size=0&size={PAGE_SIZE}&from={i * PAGE_SIZE}")
    data = results.json()

    for idx,hits in enumerate(data['hits']):
        if 'citation' in hits:
            if type(hits['citation']) is list and 'pmid' in hits['citation'][0]:
                pmids.append(hits['citation'][0]['pmid'])
                descriptions.append(hits['description']) if 'description' in hits else descriptions.append([])

# Remove duplicate PMIDs

# Use a set to track unique elements
unique_set = set()

# Create new lists without duplicates and corresponding indices
filtered_list = []
filtered_corresponding_list = []

for item, corresponding_item in zip(pmids, descriptions):
    if item not in unique_set:
        # Add the item to the set to track uniqueness
        unique_set.add(item)
        
        # Append to the new lists
        filtered_list.append(item)
        filtered_corresponding_list.append(corresponding_item)

# Print the result
print("Filtered List:", len(filtered_list))
print("Filtered Corresponding List:", len(filtered_corresponding_list))

pmids = filtered_list
descriptions = filtered_corresponding_list

Filtered List: 3225
Filtered Corresponding List: 3225


## Getting MeSH Terms

In [16]:
Entrez.email = "zqazi@scripps.edu"

dataset = pd.DataFrame(columns=['PMID', 'Description', 'Abstract', 'MeSH Terms'])

def get_mesh_terms(pmid):

    try:
        handle = Entrez.efetch(db="pubmed", id=pmid, rettype='medline', retmode="text")
        article_data = Medline.parse(handle)

        mesh_terms = []
        abstract = None
        for record in article_data:
            mesh_terms = record.get('MH', [])
            abstract = record.get('AB', None)

        return abstract, mesh_terms

    except Exception as e:
        print(f"Error: {e}")
        return None


for idx, pmid_to_query in enumerate(tqdm(pmids)):

    # Get MeSH terms for the specified PMID
    abstract, mesh_terms = get_mesh_terms(pmid_to_query)

    if mesh_terms and abstract:
        # print(f"MeSH terms for PMID {pmid_to_query}:\n")
        # for term in mesh_terms_result:
        #     print(term)
        
        # {'PMID': pmid_to_query, 'Abstract': str(abstract), 'MeSH Terms': [str(term) for term in mesh_terms]}
        dataset.loc[len(dataset.index)] = [pmid_to_query, descriptions[idx], abstract, [str(term) for term in np.unique(mesh_terms)]]

    if not mesh_terms and not abstract:
        print(f"Failed to retrieve MeSH terms AND Abstract for PMID {pmid_to_query}.")
    elif not mesh_terms:
        print(f"Failed to retrieve MeSH terms for PMID {pmid_to_query}.")
    elif not abstract:
        print(f"Failed to retrieve Abstract for PMID {pmid_to_query}.")
    


  0%|          | 0/3225 [00:00<?, ?it/s]

Failed to retrieve MeSH terms for PMID 29355227.
Failed to retrieve MeSH terms for PMID 34988373.
Failed to retrieve MeSH terms for PMID 27379119.
Failed to retrieve MeSH terms for PMID 31239396.
Failed to retrieve MeSH terms for PMID 34909793.
Failed to retrieve MeSH terms for PMID 32994287.
Failed to retrieve MeSH terms for PMID 27602409.
Failed to retrieve Abstract for PMID 29055911.
Failed to retrieve MeSH terms for PMID 21564833.
Failed to retrieve MeSH terms for PMID 28421042.
Failed to retrieve MeSH terms for PMID 29577086.
Failed to retrieve MeSH terms for PMID 24995004.
Failed to retrieve MeSH terms for PMID 26229982.
Failed to retrieve MeSH terms for PMID 26819854.
Failed to retrieve MeSH terms for PMID 25324835.
Failed to retrieve MeSH terms for PMID 30619992.
Failed to retrieve MeSH terms for PMID 27822543.
Failed to retrieve MeSH terms for PMID 30832316.
Failed to retrieve MeSH terms for PMID 25013457.
Failed to retrieve MeSH terms for PMID 25681390.
Failed to retrieve MeS

In [17]:
def split_strings(lst):
    new_list = []
    for string in lst:
        filtered_str = string.replace('*', '')
        if '/' in filtered_str:
            new_list.extend([str.strip() for str in filtered_str.split('/')])
        elif ',' in filtered_str:
            new_list.extend([str.strip() for str in filtered_str.split(',')])
        else:
            new_list.append(filtered_str)
    return np.unique(new_list).tolist()

dataset['Filtered MeSH Terms'] = dataset['MeSH Terms'].apply(split_strings)

In [18]:
dataset.head()

Unnamed: 0,PMID,Description,Abstract,MeSH Terms,Filtered MeSH Terms
0,18270564,Comparative expression analysis on Plasmodium ...,A fundamental problem in systems biology and w...,"[*Gene Expression Profiling, *Genome, Protozoa...","[Animals, Computational Biology, Gene Expressi..."
1,31959989,The BONUS study that was conducted during regu...,Most infants with cystic fibrosis (CF) have pa...,"[Body Size, Case-Control Studies, Cystic Fibro...","[Body Size, Case-Control Studies, Cystic Fibro..."
2,31296739,Thirty-six members of a birth cohort with cons...,Characterizing the organization of the human g...,"[*Diet, *Germ-Free Life, Animals, Bacteria/*cl...","[Animal, Animals, Bacteria, Bangladesh, Bottle..."
3,29884786,The microbiome of 2684 fecal specimens collect...,The human gut microbiota plays a vital role in...,"[*Gastrointestinal Microbiome, Bacteria/classi...","[Bacteria, Bifidobacterium, Feces, Female, Fir..."
4,22699609,This HMP production phase includes untargeted ...,Studies of the human microbiome have revealed ...,"[*Biodiversity, *Health, *Metagenome, Adolesce...","[Adolescent, Adult, Bacteria, Biodiversity, Ec..."


## Mapping MeSH terms to EDAM ontology

First, a proof of concept...

In [None]:
edam_ontology = text2term.cache_ontology("https://data.bioontology.org/ontologies/EDAM/submissions/44/download?apikey=8b5b7825-538d-40e0-9e9e-5ab9274a9aeb", "EDAM")

In [None]:
mapped_terms = text2term.map_terms([str(term) for term in dataset.iloc[0]['Filtered MeSH Terms']], "EDAM", use_cache=True)
mapped_terms[['Source Term', 'Mapped Term Label']].head()

### Map All MeSH Terms

In [71]:
# import ast 

# dataset = pd.read_csv('datasets/staging_test_set.csv')
# dataset['Filtered MeSH Terms'] = dataset['Filtered MeSH Terms'].apply(lambda lst: ast.literal_eval(lst))

In [72]:
threshold  = 0.7

def convert_mesh_to_edam(mesh_terms):
    mapped_terms = text2term.map_terms(mesh_terms, "EDAM", use_cache=True)

    # Remove any EDAM mappings that are not a 'TOPIC'
    mapped_terms = mapped_terms[mapped_terms['Mapped Term IRI'].str.contains('topic')]
    # Remove any mappings with scores under the threshold
    mapped_terms = mapped_terms[mapped_terms['Mapping Score'] >= threshold]

    return mapped_terms['Mapped Term Label'].unique().tolist()

dataset['EDAM Topics'] = dataset['Filtered MeSH Terms'].apply(lambda terms: convert_mesh_to_edam(terms))

2023-12-19 12:24:14 INFO [text2term.t2t]: Loading cached ontology from: cache/EDAM/EDAM-term-details.pickle
2023-12-19 12:24:14 INFO [text2term.t2t]: Filtered ontology terms to those of type: class
2023-12-19 12:24:14 INFO [text2term.t2t]: Mapping 13 source terms to EDAM
2023-12-19 12:24:14 INFO [text2term.t2t]: ...done (mapping time: 0.11s seconds)
2023-12-19 12:24:14 INFO [text2term.t2t]: Loading cached ontology from: cache/EDAM/EDAM-term-details.pickle
2023-12-19 12:24:14 INFO [text2term.t2t]: Filtered ontology terms to those of type: class
2023-12-19 12:24:14 INFO [text2term.t2t]: Mapping 23 source terms to EDAM
2023-12-19 12:24:15 INFO [text2term.t2t]: ...done (mapping time: 0.07s seconds)
2023-12-19 12:24:15 INFO [text2term.t2t]: Loading cached ontology from: cache/EDAM/EDAM-term-details.pickle
2023-12-19 12:24:15 INFO [text2term.t2t]: Filtered ontology terms to those of type: class
2023-12-19 12:24:15 INFO [text2term.t2t]: Mapping 24 source terms to EDAM
2023-12-19 12:24:15 INFO

In [73]:
dataset[['MeSH Terms', 'Filtered MeSH Terms', 'EDAM Topics']]

Unnamed: 0,MeSH Terms,Filtered MeSH Terms,EDAM Topics
0,"['*Gene Expression Profiling', '*Genome, Proto...","[Animals, Computational Biology, Gene Expressi...","[Zoology, Computational biology, Immunoinforma..."
1,"['Body Size', 'Case-Control Studies', 'Cystic ...","[Body Size, Case-Control Studies, Cystic Fibro...","[DNA, Human biology, DNA mutation, Sequence an..."
2,"['*Diet', '*Germ-Free Life', 'Animals', 'Bacte...","[Animal, Animals, Bacteria, Bangladesh, Bottle...","[Zoology, Human biology, Classification, Genet..."
3,"['*Gastrointestinal Microbiome', 'Bacteria/cla...","[Bacteria, Bifidobacterium, Feces, Female, Fir...","[Human biology, Chemistry, Biochemistry, Medic..."
4,"['*Biodiversity', '*Health', '*Metagenome', 'A...","[Adolescent, Adult, Bacteria, Biodiversity, Ec...","[Biodiversity, Biomedical science, Medical inf..."
...,...,...,...
3033,"['*CRISPR-Cas Systems', '*Evolution, Molecular...","[Bacterial Typing Techniques, Base Sequence, C...","[Evolutionary biology, Molecular evolution, Ge..."
3034,"['*Genome, Bacterial', 'Molecular Sequence Dat...","[Bacterial, Genome, Molecular Sequence Data, R...","[Classification, Genetics]"
3035,"['Animals', 'Animals, Genetically Modified/gen...","[Animal, Animals, Animals, Genetically Modifie...","[Zoology, Pathology, Genetics, Immunology, Imm..."
3036,"['*Mycobacterium tuberculosis', '*Tuberculosis...","[CD4-Positive T-Lymphocytes, HLA-DR Antigens, ...","[Human biology, Immunology, Immunogenetics, Im..."


In [74]:
print("Any unmapped MeSH Terms?: ", dataset['EDAM Topics'].isna().any())

Any unmapped MeSH Terms?:  False


In [None]:
# text2term.clear_cache("EDAM")

## Checking the validity of EDAM topics
Based on the OpenAI API input token limit, we can't pass in every EDAM term. 

In [75]:
with open('EDAM/edam_topics.txt', 'r') as edam_file:
    edam_topics = edam_file.readlines()

edam_topics = [topic.strip() for topic in edam_topics]

In [76]:
indices_true = dataset.loc[dataset['EDAM Topics'].apply(lambda edam_list: not all(term in edam_topics for term in edam_list))].index

for index in indices_true:
    edam_list = dataset.loc[index, 'EDAM Topics']
    terms_not_in_edam_topics = [term for term in edam_list if term not in edam_topics]
    
    print(f"Index {index}: Terms not in edam_topics: {terms_not_in_edam_topics}")

Index 2: Terms not in edam_topics: ['Classification']
Index 3: Terms not in edam_topics: ['Classification']
Index 4: Terms not in edam_topics: ['Metabolic pathways', 'Metagenomic sequencing', 'Classification']
Index 5: Terms not in edam_topics: ['Cell cycle']
Index 6: Terms not in edam_topics: ['Protein membrane regions', 'PCR experiment', 'Transcription', 'Gene transcription']
Index 7: Terms not in edam_topics: ['PCR experiment']
Index 8: Terms not in edam_topics: ['Cytometry', 'Proteome', 'Proteomics experiment']
Index 9: Terms not in edam_topics: ['Protein membrane regions', 'Light microscopy', 'Proteomics experiment', 'Sequence alignment']
Index 10: Terms not in edam_topics: ['Mitochondria', 'Proteome', 'Mass spectrometry']
Index 12: Terms not in edam_topics: ['Primers', 'Protein sequence analysis']
Index 13: Terms not in edam_topics: ['Protein databases', 'Sequence databases', 'Metabolic pathways']
Index 15: Terms not in edam_topics: ['Proteome', 'Proteomics experiment']
Index 17:

In [None]:
## Uncomment and use this line to remove the rows if necessary (GPT.ipynb will filter out the terms from each list anyways)
# dataset.drop(index=indices_true, axis = 0, inplace=True)

### Remove undesired EDAM topics from each list

In [77]:
# edam_topics.txt 
with open('EDAM/edam_topics.txt', 'r') as edam_file:
    full_edam_topics = edam_file.readlines()

full_edam_topics = [topic.strip() for topic in full_edam_topics]

In [78]:
dataset['EDAM Topics'] = dataset['EDAM Topics'].apply(lambda x: [item for item in x if item in full_edam_topics])

## Save Testing Set

In [79]:
print("Check for N/A values:")

dataset.isna().any()

Check for N/A values:


PMID                   False
Description            False
Abstract               False
MeSH Terms             False
Filtered MeSH Terms    False
EDAM Topics            False
dtype: bool

In [80]:
print('Empty Descriptions:', len(dataset[dataset["Description"].str.len() == 0]))
print('Empty Abstracts:', len(dataset[dataset["Abstract"].str.len() == 0]))

print("\nNumber of entries: ", len(dataset))

Empty Descriptions: 0
Empty Abstracts: 0

Number of entries:  3038


In [81]:
# Drop any rows with no description or abstract

dataset = dataset[dataset["Description"].apply(len) > 0]
dataset = dataset[dataset["Abstract"].apply(len) > 0]

In [82]:
# Remove HTML and escape characters
dataset["Description"] = dataset["Description"].apply(lambda desc: BeautifulSoup(desc, 'html.parser').get_text().strip().replace('\n', ' ').replace('\r', ''))

  dataset["Description"] = dataset["Description"].apply(lambda desc: BeautifulSoup(desc, 'html.parser').get_text().strip().replace('\n', ' ').replace('\r', ''))


In [83]:
dataset['Description'] = dataset['Description'].apply(lambda desc: re.sub('\s+', ' ', desc))

In [84]:
dataset.to_csv(input("Enter file path and name: "), index=False)