In [None]:
#Start

#### Code Summary

The code imports necessary libraries like Bio and tqdm for fetching PubMed abstracts. It defines a PMIDFetcher class to handle caching and retrieving abstracts for given PMIDs.

It loads a list of unique PMIDs from the filtered SNP data file generated in Part 2. Using the PMIDFetcher, it retrieves abstracts for these PMIDs by batch querying the NCBI API. Progress is shown using tqdm.

Retrieved abstracts are cached to file as JSON. Stats are calculated:

- Total PMIDs in the JSON file: 364,302 </br>
- Total PMIDs with no abstracts: 9,425 </br>
- Size of the JSON file: 383.99 MB </br>

This code provides a way to efficiently fetch PubMed abstracts for a large set of PMIDs using the NCBI API and a caching system. It stores the results as a JSON file and provides metrics on the total PMIDs, missing abstracts, and file size.

In [None]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
import json
from Bio import Entrez
from tqdm import tqdm
import json
import os

In [None]:
class PMIDFetcher:
    def __init__(self, cache_filename="snp_abstracts.json", api_key=None):
        self.cache_filename = cache_filename
        Entrez.email = "saisiva002@gmail.com"

        if api_key:
            Entrez.api_key = api_key

        try:
            with open(self.cache_filename, "r") as cache_file:
                self.cached_abstracts = json.load(cache_file)
        except FileNotFoundError:
            self.cached_abstracts = {}

    def fetch_abstracts(self, pmids):
        # post the list of PMIDs using epost
        post_handle = Entrez.epost("pubmed", id=",".join(pmids))
        post_results = Entrez.read(post_handle)
        post_handle.close()

        query_key = post_results["QueryKey"]
        webenv = post_results["WebEnv"]

        # fetch the results using efetch
        fetch_handle = Entrez.efetch(db="pubmed", rettype="abstract", retmode="xml", webenv=webenv, query_key=query_key)
        fetch_results = Entrez.read(fetch_handle)
        fetch_handle.close()

        abstracts = {}
        no_abstract_pmids = []
        for article in fetch_results["PubmedArticle"]:
            pmid = article["MedlineCitation"]["PMID"]
            # Check if the 'Abstract' key is present
            if "Abstract" in article["MedlineCitation"]["Article"]:
                abstract_text = article["MedlineCitation"]["Article"]["Abstract"]["AbstractText"][0]
                abstracts[pmid] = abstract_text
            else:
                no_abstract_pmids.append(pmid)

        # Save PMIDs without abstracts to no_abstracts.tsv
        with open("no_abstracts.tsv", "a") as file:
            for pmid in no_abstract_pmids:
                file.write(pmid + '\n')

        return abstracts

    def retrieve(self, pmids, limit=None):
        pmids_to_fetch = [pmid for pmid in pmids if pmid not in self.cached_abstracts]

        if limit:
            pmids_to_fetch = pmids_to_fetch[:limit]

        new_abstracts = 0
        batch_size = 200

        for i in tqdm(range(0, len(pmids_to_fetch), batch_size), desc="Fetching abstracts"):
            batch = pmids_to_fetch[i:i + batch_size]
            abstracts = self.fetch_abstracts(batch)
            self.cached_abstracts.update(abstracts)
            new_abstracts += len(abstracts)

            # Save to cache periodically
            with open(self.cache_filename, "w") as cache_file:
                json.dump(self.cached_abstracts, cache_file)

        return new_abstracts

In [None]:
#Input file from part 2
filename = "filtered_snp_data.tsv"

with open(filename, "r") as file:
    next(file)  # Skip header
    pmids = list(set(line.split("\t")[0] for line in file))  # Get Unique PMIDs

print(len(pmids))

373734


In [None]:
## Fetch PubMed Abstracts Here ##

In [8]:
print(f"Fetching {len(pmids)} abstracts from Natinal Library of Medicine.")

#Fill the NCBI API Key
ncbi_key = "8cf8451ca098c39731c8c0b6c45b920bb208"

if __name__ == "__main__":
    fetcher = PMIDFetcher(api_key=ncbi_key)
    fetch_limit = len(pmids)  # Set limit #total pmid's
    new_abstracts_count = fetcher.retrieve(pmids, limit=fetch_limit)
    print(f"Retrieved {new_abstracts_count} new abstracts.")

Fetching 373734 abstracts from Natinal Library of Medicine.


Fetching abstracts: 100%|██████████| 1869/1869 [2:14:29<00:00,  4.32s/it]

Retrieved 364302 new abstracts.





In [9]:
## Stats ##

In [11]:
#Total PMID's & Skipped PMID's
with open("snp_abstracts.json", "r") as f:
    data = json.load(f)

with open("no_abstracts.tsv", "r") as f:
    no_abstract_pmids = [line.strip() for line in f.readlines()]

total_pmids = len(data)
total_no_abstract_pmids = len(no_abstract_pmids)

file_size = os.path.getsize("snp_abstracts.json")

In [12]:
print(f"Total PMIDs in the file: {total_pmids}")
print(f"Total PMIDs in with no_abstracts: {total_no_abstract_pmids}")

print(f"Size of the file: {file_size / (1024 * 1024):.2f} MB")

Total PMIDs in the file: 364302
Total PMIDs in with no_abstracts: 9425
Size of the file: 383.99 MB


In [1]:
#END