# Test Set Gathering

Note: This notebook uses the staging API of NIAID (`api-staging.data.niaid.nih.gov`).

In [1]:
import requests

import numpy as np
import pandas as pd

from tqdm.notebook import tqdm

In [2]:
# Fetch the first page of the q=*pmid* query (offset 0, up to 1000 hits).
# The timeout stops a stalled connection from hanging the kernel indefinitely,
# and raise_for_status() reports an HTTP error here instead of letting it
# surface later as a JSON-decoding or KeyError failure.
results = requests.get("https://api-staging.data.niaid.nih.gov/v1/query?=&q=*pmid*&extra_filter=&size=1000&from=0&sort=_score&use_metadata_score=true", timeout=120)
results.raise_for_status()

In [3]:
# Decode the JSON body of the response into Python objects.
data = results.json()

In [4]:
# Parallel accumulators: descriptions[i] is the description of the record
# whose first citation supplied pmids[i].
pmids = []
descriptions = []

In [5]:
# Keep only the records whose `citation` field is a non-empty list and whose
# first citation carries a PMID. Anything else (missing key, non-list value,
# empty list) is skipped. pmids and descriptions are appended together so they
# stay index-aligned.
for hit in data['hits']:
    citations = hit.get('citation')
    # The explicit emptiness check prevents an IndexError on `citations[0]`
    # when a record has an empty citation list.
    if type(citations) is list and citations and 'pmid' in citations[0]:
        pmids.append(citations[0]['pmid'])
        descriptions.append(hit['description'])

In [6]:
# Grabbing more data: the second page of the same query.
# `from` is a zero-based offset, so the page that follows from=0&size=1000
# starts at 1000. Starting at 1001 silently skipped the record at offset 1000.
# The timeout and raise_for_status() make a stalled or failed request fail here
# rather than in a later cell.
results = requests.get("https://api-staging.data.niaid.nih.gov/v1/query?=&q=*pmid*&extra_filter=&size=1000&from=1000&sort=_score&use_metadata_score=true", timeout=120)
results.raise_for_status()

In [7]:
# Decode the latest response and append its usable records to the
# index-aligned pmids / descriptions lists.
data = results.json()

for hit in data['hits']:
    citations = hit.get('citation')
    # Only list-valued citations are considered; the emptiness check prevents
    # an IndexError on `citations[0]` when the list has no entries.
    if type(citations) is list and citations and 'pmid' in citations[0]:
        pmids.append(citations[0]['pmid'])
        descriptions.append(hit['description'])

# Both counts must match because the two lists are appended to together.
print(len(pmids))
print(len(descriptions))

1573
1573


In [8]:
# Remove duplicate PMIDs, keeping the first description seen for each one.

# A dict preserves insertion order, so its keys end up in first-occurrence
# order and setdefault() leaves the first description untouched on repeats.
first_description_by_pmid = {}
for pmid, description in zip(pmids, descriptions):
    first_description_by_pmid.setdefault(pmid, description)

# Expose the results under the names the following cells expect.
unique_set = set(first_description_by_pmid)
filtered_list = list(first_description_by_pmid.keys())
filtered_corresponding_list = list(first_description_by_pmid.values())

# Report how many unique entries remain
print("Filtered List:", len(filtered_list))
print("Filtered Corresponding List:", len(filtered_corresponding_list))

Filtered List: 1528
Filtered Corresponding List: 1528


In [9]:
# From here on, pmids / descriptions refer to the de-duplicated lists.
pmids = filtered_list
descriptions = filtered_corresponding_list

## Getting MeSH Terms

In [10]:
from Bio import Entrez

# Contact e-mail configured on Biopython's Entrez module.
# NOTE(review): this is a hardcoded personal address — consider loading it from configuration.
Entrez.email = "zqazi@scripps.edu"

# Result table; rows are appended one at a time via dataset.loc[len(dataset.index)].
dataset = pd.DataFrame(columns=['PMID', 'Description', 'Abstract', 'MeSH Terms'])

def get_mesh_terms(pmid):
    """Fetch the abstract and MeSH descriptor names for one PubMed record.

    Parameters
    ----------
    pmid
        PubMed identifier, passed to ``Entrez.efetch`` as ``id``.

    Returns
    -------
    tuple
        ``(abstract, mesh_terms)``.

        * ``mesh_terms`` is a list of the ``DescriptorName`` values found in
          the record's ``MeshHeadingList`` (empty when there is none).
        * ``abstract`` is the first ``AbstractText`` element, or ``None``.
          It is only looked up for articles that have a MeSH heading list.

        If anything raises, the error is printed and ``(None, [])`` is
        returned, so callers can always unpack two values.
    """
    try:
        # Fetch article details using the PMID
        handle = Entrez.efetch(db="pubmed", id=pmid, retmode="xml")
        try:
            article_data = Entrez.read(handle)
        finally:
            # Release the handle even when parsing the response fails.
            handle.close()

        # Extract MeSH terms from the article data
        mesh_terms = []
        abstract = None
        if "PubmedArticle" in article_data:
            for article in article_data["PubmedArticle"]:
                citation = article["MedlineCitation"]
                if "MeshHeadingList" not in citation:
                    continue

                mesh_terms.extend(
                    heading["DescriptorName"] for heading in citation["MeshHeadingList"]
                )

                # Get abstract only if there are MeSH terms
                if "Abstract" in citation['Article']:
                    # NOTE(review): only the first AbstractText element is kept;
                    # confirm whether multi-part abstracts should be joined.
                    abstract = citation['Article']["Abstract"]["AbstractText"][0]

        return abstract, mesh_terms

    except Exception as e:
        print(f"Error: {e}")
        # Same shape as the success path: a bare None here would make the
        # caller's `abstract, mesh_terms = ...` unpacking raise TypeError.
        return None, []


for idx, pmid_to_query in enumerate(tqdm(pmids)):

    # Get the abstract and MeSH terms for this PMID. Guard against a bare None
    # result so that one failed lookup cannot abort the whole run with an
    # unpacking TypeError.
    lookup_result = get_mesh_terms(pmid_to_query)
    abstract, mesh_terms = lookup_result if lookup_result is not None else (None, [])

    if mesh_terms and abstract:
        # Only complete records are kept. np.unique() de-duplicates and sorts
        # the terms before they are stored as plain strings.
        dataset.loc[len(dataset.index)] = [pmid_to_query, descriptions[idx], abstract, [str(term) for term in np.unique(mesh_terms)]]
    elif not mesh_terms and not abstract:
        print(f"Failed to retrieve MeSH terms AND Abstract for PMID {pmid_to_query}.")
    elif not mesh_terms:
        print(f"Failed to retrieve MeSH terms for PMID {pmid_to_query}.")
    elif not abstract:
        print(f"Failed to retrieve Abstract for PMID {pmid_to_query}.")
    


  0%|          | 0/1528 [00:00<?, ?it/s]

Failed to retrieve MeSH terms AND Abstract for PMID 28935729.
Failed to retrieve MeSH terms AND Abstract for PMID 28694998.
Failed to retrieve MeSH terms AND Abstract for PMID 27822525.
Failed to retrieve MeSH terms AND Abstract for PMID 34124444.
Failed to retrieve MeSH terms AND Abstract for PMID 34359663.
Failed to retrieve MeSH terms AND Abstract for PMID 32785975.
Failed to retrieve MeSH terms AND Abstract for PMID 31551709.
Failed to retrieve MeSH terms AND Abstract for PMID 31500094.
Failed to retrieve MeSH terms AND Abstract for PMID 34109199.
Failed to retrieve MeSH terms AND Abstract for PMID 34955646.
Failed to retrieve MeSH terms AND Abstract for PMID 31186338.
Failed to retrieve MeSH terms AND Abstract for PMID 32260483.
Failed to retrieve MeSH terms AND Abstract for PMID 34131305.
Failed to retrieve MeSH terms AND Abstract for PMID 33352881.
Failed to retrieve Abstract for PMID 33502684.
Failed to retrieve MeSH terms AND Abstract for PMID 33863728.
Failed to retrieve MeSH

In [11]:
# Preview the first rows of the assembled table.
dataset.head()

Unnamed: 0,PMID,Description,Abstract,MeSH Terms
0,33333024,We performed genome-wide CRISPR KO screens in ...,The Coronaviridae are a family of viruses that...,"[A549 Cells, Animals, Biosynthetic Pathways, C..."
1,29116155,CRISPR-cas9 mutagenesis plasmid Tn916 oriT pla...,Clostridium difficile is a significant concern...,"[CRISPR-Associated Protein 9, CRISPR-Cas Syste..."
2,33199862,We developed Transcriptional Regulator Induced...,Transposon-based strategies provide a powerful...,"[Antitubercular Agents, Gene Expression Regula..."
3,28506317,Gut microbiota play a key role in maintaining ...,Gut microbiota play a key role in maintaining ...,"[Adult, Aged, Aged, 80 and over, Bacteria, Clo..."
4,24939885,"Within the last decade, C. difficile infection...",Clostridium difficile infection is one of the ...,"[Adult, Aged, Aged, 80 and over, Bacteria, Bio..."


## Mapping MeSH terms to EDAM ontology

First, a proof of concept...

In [12]:
import text2term

* 'underscore_attrs_are_private' has been removed


In [13]:
import os

# The BioPortal API key is read from the environment rather than being
# committed with the notebook.
# NOTE(review): the key that was previously hardcoded here should be revoked,
# and because text2term logs the full download URL (key included), this cell's
# output should be cleared before the notebook is shared.
bioportal_api_key = os.environ.get("BIOPORTAL_API_KEY")
if not bioportal_api_key:
    raise RuntimeError(
        "Set the BIOPORTAL_API_KEY environment variable before caching the EDAM ontology."
    )

# Download EDAM (BioPortal submission 44) and cache it under the acronym "EDAM".
edam_ontology = text2term.cache_ontology(
    f"https://data.bioontology.org/ontologies/EDAM/submissions/44/download?apikey={bioportal_api_key}",
    "EDAM",
)

2023-11-18 14:59:38 INFO [text2term.term_collector]: Loading ontology https://data.bioontology.org/ontologies/EDAM/submissions/44/download?apikey=8b5b7825-538d-40e0-9e9e-5ab9274a9aeb...
2023-11-18 14:59:42 INFO [text2term.term_collector]: ...done (ontology loading time: 3.20s)
2023-11-18 14:59:42 INFO [text2term.term_collector]: Collecting ontology term details...
2023-11-18 14:59:42 INFO [text2term.term_collector]: ...done: collected 3577 ontology terms (collection time: 0.11s)
2023-11-18 14:59:42 INFO [text2term.t2t]: Filtered ontology terms to those of type: OntologyTermType.ANY
2023-11-18 14:59:42 INFO [text2term.t2t]: Caching ontology https://data.bioontology.org/ontologies/EDAM/submissions/44/download?apikey=8b5b7825-538d-40e0-9e9e-5ab9274a9aeb to: cache/EDAM


In [14]:
# Proof of concept: map the MeSH terms of the first row onto the cached EDAM
# ontology and show each source term next to the label it was mapped to.
first_row_mesh_terms = list(map(str, dataset.iloc[0]['MeSH Terms']))
mapped_terms = text2term.map_terms(first_row_mesh_terms, "EDAM", use_cache=True)
mapped_terms.loc[:, ['Source Term', 'Mapped Term Label']].head()

2023-11-18 14:59:42 INFO [text2term.t2t]: Loading cached ontology from: cache/EDAM/EDAM-term-details.pickle
2023-11-18 14:59:42 INFO [text2term.t2t]: Filtered ontology terms to those of type: OntologyTermType.CLASS
2023-11-18 14:59:42 INFO [text2term.t2t]: Mapping 22 source terms to EDAM
2023-11-18 14:59:45 INFO [text2term.t2t]: ...done (mapping time: 3.44s seconds)


Unnamed: 0,Source Term,Mapped Term Label
0,A549 Cells,cel
1,A549 Cells,CellML
2,A549 Cells,Cell type accession
3,Animals,Zoology
4,Animals,Laboratory animal science


### Map All MeSH Terms

In [15]:
def convert_mesh_to_edam(mesh_terms):
    """Map MeSH terms onto the cached "EDAM" ontology.

    Parameters
    ----------
    mesh_terms
        Source terms handed to ``text2term.map_terms``.

    Returns
    -------
    list
        The distinct values of the ``'Mapped Term Label'`` column of the
        mapping result, in order of first appearance.
    """
    edam_mappings = text2term.map_terms(mesh_terms, "EDAM", use_cache=True)
    distinct_labels = edam_mappings['Mapped Term Label'].drop_duplicates()
    return distinct_labels.tolist()

# Add one list of EDAM labels per row, derived from that row's MeSH terms.
dataset['EDAM Topics'] = dataset['MeSH Terms'].apply(convert_mesh_to_edam)

2023-11-18 15:00:32 INFO [text2term.t2t]: Loading cached ontology from: cache/EDAM/EDAM-term-details.pickle
2023-11-18 15:00:32 INFO [text2term.t2t]: Filtered ontology terms to those of type: OntologyTermType.CLASS
2023-11-18 15:00:32 INFO [text2term.t2t]: Mapping 22 source terms to EDAM
2023-11-18 15:00:32 INFO [text2term.t2t]: ...done (mapping time: 0.07s seconds)
2023-11-18 15:00:32 INFO [text2term.t2t]: Loading cached ontology from: cache/EDAM/EDAM-term-details.pickle
2023-11-18 15:00:32 INFO [text2term.t2t]: Filtered ontology terms to those of type: OntologyTermType.CLASS
2023-11-18 15:00:32 INFO [text2term.t2t]: Mapping 9 source terms to EDAM
2023-11-18 15:00:32 INFO [text2term.t2t]: ...done (mapping time: 0.07s seconds)
2023-11-18 15:00:32 INFO [text2term.t2t]: Loading cached ontology from: cache/EDAM/EDAM-term-details.pickle
2023-11-18 15:00:32 INFO [text2term.t2t]: Filtered ontology terms to those of type: OntologyTermType.CLASS
2023-11-18 15:00:32 INFO [text2term.t2t]: Mappin

In [16]:
# Show the source MeSH terms side by side with the EDAM labels derived from them.
dataset[['MeSH Terms', 'EDAM Topics']]

Unnamed: 0,MeSH Terms,EDAM Topics
0,"[A549 Cells, Animals, Biosynthetic Pathways, C...","[cel, CellML, Cell type accession, Zoology, La..."
1,"[CRISPR-Associated Protein 9, CRISPR-Cas Syste...","[protein, Rate of association, Image metadata,..."
2,"[Antitubercular Agents, Gene Expression Regula...","[Gene expression, Regular expression, Gene reg..."
3,"[Adult, Aged, Aged, 80 and over, Bacteria, Clo...","[MGED, MGED concept ID, Protein signal peptide..."
4,"[Adult, Aged, Aged, 80 and over, Bacteria, Bio...","[MGED, MGED concept ID, Protein signal peptide..."
...,...,...
1398,"[Animals, Collagen Type VI, Female, Gene Expre...","[Zoology, Laboratory animal science, Animal st..."
1399,"[Animals, Cell Proliferation, Colitis, Colon, ...","[Zoology, Laboratory animal science, Animal st..."
1400,"[Contig Mapping, Genome, Fungal, Genotype, Phe...","[Mapping, ID mapping, Genome map, Genome repor..."
1401,"[Cell Movement, Down-Regulation, Gene Expressi...","[Cell type accession, Cell biology, Cell line ..."


In [17]:
# Every cell of 'EDAM Topics' is a list produced by convert_mesh_to_edam, so
# isna() alone can never report a problem. A row where nothing was mapped
# shows up as an *empty list*, which is checked for explicitly here.
edam_topics_missing = dataset['EDAM Topics'].isna() | (dataset['EDAM Topics'].str.len() == 0)
print("Any unmapped MeSH Terms?: ", edam_topics_missing.any())

Any unmapped MeSH Terms?:  False


In [18]:
# Clear the text2term cache registered under "EDAM" now that mapping is done.
text2term.clear_cache("EDAM")

Cache has been cleared successfully


## Checking the validity of EDAM topics
Based on the OpenAI API input token limit, we can't pass in every EDAM term. 

In [29]:
# Load the reference list of EDAM topics, one per line, with surrounding
# whitespace (including the trailing newline) removed from each entry.
with open('full_edam_list.txt', 'r') as edam_file:
    edam_topics = [raw_line.strip() for raw_line in edam_file]

In [30]:
# Flag rows where at least one mapped label is absent from edam_topics, then
# collect the index labels of those rows.
has_unlisted_topic = dataset['EDAM Topics'].apply(
    lambda edam_list: any(term not in edam_topics for term in edam_list)
)
indices_true = dataset.loc[has_unlisted_topic].index

In [31]:
# For every flagged row, list exactly which labels are missing from edam_topics.
for index in indices_true:
    row_topics = dataset.loc[index, 'EDAM Topics']
    terms_not_in_edam_topics = list(filter(lambda term: term not in edam_topics, row_topics))

    print(f"Index {index}: Terms not in edam_topics: {terms_not_in_edam_topics}")

In [32]:
# Remove the flagged rows from dataset in place.
# NOTE(review): this mutates the frame that earlier cells displayed.
dataset.drop(index=indices_true, axis = 0, inplace=True)

## Save Testing Set

In [33]:
# Report, per column, whether any value is missing.
print("Check for N/A values:")

dataset.isna().any()

Check for N/A values:


PMID           False
Description    False
Abstract       False
MeSH Terms     False
EDAM Topics    False
dtype: bool

In [41]:
# Count zero-length strings in the two free-text columns, then report the
# total number of rows.
empty_description_count = int((dataset["Description"].str.len() == 0).sum())
print('Empty Descriptions:', empty_description_count)
empty_abstract_count = int((dataset["Abstract"].str.len() == 0).sum())
print('Empty Abstracts:', empty_abstract_count)

print("\nNumber of entries: ", len(dataset))

Empty Descriptions: 0
Empty Abstracts: 0

Number of entries:  1400


In [53]:
# Drop rows whose description starts with '<b>' or 'b>'. This removes the
# whole row; it does not strip markup out of the description text.
dataset = dataset[~dataset['Description'].str.startswith(('<b>', 'b>'))]

In [55]:
# Prompt for the destination path and write the table as CSV without the
# index column.
dataset.to_csv(input("Enter file path and name: "), index=False)