In [58]:
import json
from Bio import Entrez
from tqdm import tqdm
import json
import os

In [59]:
class PMIDFetcher:
    def __init__(self, cache_filename="cached_abstracts.json", api_key=None):
        self.cache_filename = cache_filename
        Entrez.email = "REPLACE EMAIL"
        if api_key:
            Entrez.api_key = api_key

        try:
            with open(self.cache_filename, "r") as cache_file:
                self.cached_abstracts = json.load(cache_file)
        except FileNotFoundError:
            self.cached_abstracts = {}

    def fetch_abstracts(self, pmids):
        # First, post the list of PMIDs using epost
        post_handle = Entrez.epost("pubmed", id=",".join(pmids))
        post_results = Entrez.read(post_handle)
        post_handle.close()

        query_key = post_results["QueryKey"]
        webenv = post_results["WebEnv"]

        # Then, fetch the results using efetch
        fetch_handle = Entrez.efetch(db="pubmed", rettype="abstract", retmode="xml", webenv=webenv, query_key=query_key)
        fetch_results = Entrez.read(fetch_handle)
        fetch_handle.close()

        abstracts = {}
        no_abstract_pmids = []
        for article in fetch_results["PubmedArticle"]:
            pmid = article["MedlineCitation"]["PMID"]
            # Check if the 'Abstract' key is present
            if "Abstract" in article["MedlineCitation"]["Article"]:
                abstract_text = article["MedlineCitation"]["Article"]["Abstract"]["AbstractText"][0]
                abstracts[pmid] = abstract_text
            else:
                no_abstract_pmids.append(pmid)

        # Save PMIDs without abstracts to no_abstract.tsv
        with open("no_abstract.tsv", "a") as file:
            for pmid in no_abstract_pmids:
                file.write(pmid + '\n')

        return abstracts

    def retrieve(self, pmids, limit=None):
        pmids_to_fetch = [pmid for pmid in pmids if pmid not in self.cached_abstracts]
        
        if limit:
            pmids_to_fetch = pmids_to_fetch[:limit]

        new_abstracts = 0
        batch_size = 200
        for i in tqdm(range(0, len(pmids_to_fetch), batch_size), desc="Fetching abstracts"):
            batch = pmids_to_fetch[i:i + batch_size]
            abstracts = self.fetch_abstracts(batch)
            self.cached_abstracts.update(abstracts)
            new_abstracts += len(abstracts)

            # Save to cache periodically
            with open(self.cache_filename, "w") as cache_file:
                json.dump(self.cached_abstracts, cache_file)
        
        return new_abstracts
    

In [60]:
def read_pmids_from_file(filename="snp_mutation_data.tsv"):
    with open(filename, "r") as file:
        next(file)  # Skip header
        pmids = list(set(line.split("\t")[0] for line in file))  # Unique PMIDs
    return pmids


In [None]:
## FETCH HERE ##

In [53]:
if __name__ == "__main__":
    # Using API key
    fetcher = PMIDFetcher(api_key="REPLACE API KEY")
    pmids = read_pmids_from_file()
    limit = 10000  # Set limit
    new_abstracts_count = fetcher.retrieve(pmids, limit=limit)
    print(f"Retrieved {new_abstracts_count} new abstracts.")

Fetching abstracts: 100%|██████████████████████████████████████████████████████████| 391/391 [1:20:44<00:00, 12.39s/it]

Retrieved 62453 new abstracts.





In [61]:
## Stats ##

In [62]:
#Total PMID's & Skipped PMID's
with open("cached_abstracts.json", "r") as f:
    data = json.load(f)

with open("no_abstract.tsv", "r") as f:
    no_abstract_pmids = [line.strip() for line in f.readlines()]
    
total_pmids = len(data)    
total_no_abstract_pmids = len(no_abstract_pmids)

file_size = os.path.getsize("cached_abstracts.json")

In [63]:
print(f"Total PMIDs in the file ##FETCH-SUCESS##: {total_pmids}")
print(f"Total PMIDs in with no_abstracts ##FETCH-SKIPPED##: {total_no_abstract_pmids}")

print(f"Size of the file: {file_size / (1024 * 1024):.2f} MB")


Total PMIDs in the file ##FETCH-SUCESS##: 429985
Total PMIDs in with no_abstracts ##FETCH-SKIPPED##: 78364
Size of the file: 450.58 MB


In [64]:
#Read PubMed Abstract Fetcher Tool Summary.md