In [1]:
import requests
import pickle
import xml.etree.ElementTree as ET
import csv
import sys
import math

In [2]:
# Function to retrieve articles from a specified database using a provided query string
# Query string can be a single word/phrase or a list of words/phrases separated by '_'
# Note that if a list of words/phrases is provided, this search will require every term
# to be present in any articles it retrieves (i.e., 'AND' operation for multiple-term lists)
# TODO: Please add your tool name and email ID in the base_url variable

def _esearch_page(url):
    """Fetch one ESearch result page and pull out the pieces db_extract needs.

    Returns a ``(count, ids)`` tuple: ``count`` is the text of the response's
    ``Count`` element (``None`` when the element is absent) and ``ids`` is the
    list of ``Id`` texts found under ``IdList``.

    Raises whatever ``response.raise_for_status()`` raises for an HTTP error
    status, and ``ValueError`` when the XML contains no ``IdList`` element.
    """
    response = requests.get(url)
    # Fail here with the HTTP error instead of a confusing XML/attribute error later.
    response.raise_for_status()
    root = ET.fromstring(response.content)
    id_list = root.find("IdList")
    if id_list is None:
        raise ValueError("ESearch response has no IdList element: {}".format(url))
    return root.findtext("Count"), [x.text for x in id_list.findall("Id")]


def db_extract(db, query):
    """Return the set of article IDs that ESearch reports for a MeSH-term query.

    Args:
        db: value placed in the ``db=`` URL parameter (the notebook uses
            "pmc" and "pubmed").
        query: a single word/phrase, or several separated by '_'. Each term is
            wrapped in URL-encoded quotes, tagged ``[MeSH Terms]`` and the terms
            are joined with '+', so every term must match.

    Returns:
        A set of ID strings collected from all retrieved result pages.

    Raises:
        ValueError: if a response lacks the ``Count`` or ``IdList`` element.
        Any exception raised by ``requests`` for a failed HTTP request.
    """
    # Used both as the retmax value in the URL and as the first retstart offset.
    # NOTE(review): current NCBI documentation appears to cap retmax well below
    # this value -- confirm that 100000 is still honoured by the service.
    page_size = 100000
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db={}&tool=TOOLNAME&email=EMAILID&retmax={}&term=".format(db, page_size)

    article_ids = set()
    query = query.split('_')
    query = '+'.join(["%22"+x.replace(" ", "%20")+"%22[MeSH Terms]" for x in query])
    print("Running query: {}".format(query))
    query_url = base_url + query

    count, ids = _esearch_page(query_url)
    if count is None:
        raise ValueError("ESearch response has no Count element: {}".format(query_url))
    total = int(count)
    article_ids.update(ids)
    print(len(article_ids))
    if total > page_size:
        cur = page_size
        while cur < total:
            new_query = base_url + query + "&retstart={}".format(cur)
            print("Running additional query: {}".format(query))
            _, ids = _esearch_page(new_query)
            if not ids:
                # An empty page would leave `cur` unchanged and loop forever,
                # so stop and report what was collected so far.
                print("No IDs returned at retstart={}; stopping early".format(cur))
                break
            cur += len(ids)
            article_ids.update(ids)
            print(len(article_ids))
        print('Retrieved {}/{} results'.format(cur, count))
    else:
        print('Retrieved {} results'.format(count))
    return article_ids

In [3]:
# Example querying procedure for mortality outcome
# TODO: Specify query terms according to your outcomes of interest

# Retrieve mortality related articles from the PMC database
pmc_ids = (
    db_extract("pmc", "hospital mortality")
    | db_extract("pmc", "mortality_risk factors_humans")
)

# Same two mortality queries, this time against the PubMed database
pubmed_ids = (
    db_extract("pubmed", "hospital mortality")
    | db_extract("pubmed", "mortality_risk factors_humans")
)

Running query: %22hospital%20mortality%22[MeSH Terms]
9614
Retrieved 9614 results
Running query: %22mortality%22[MeSH Terms]+%22risk%20factors%22[MeSH Terms]+%22humans%22[MeSH Terms]
12408
Retrieved 12408 results
Running query: %22hospital%20mortality%22[MeSH Terms]
47756
Retrieved 47756 results
Running query: %22mortality%22[MeSH Terms]+%22risk%20factors%22[MeSH Terms]+%22humans%22[MeSH Terms]
62261
Retrieved 62261 results


In [4]:
# Procedure to combine articles retrieved from both PMC and PubMed databases
# To do this combination, PMC article IDs need to be mapped to their corresponding PubMed IDs first
# to avoid double-counting of articles included in both databases
def combine_ids(pmc, pubmed, map_path='../data/PMC_id_map.csv'):
    """Merge PMC and PubMed article IDs into one de-duplicated set.

    Each PMC ID is replaced by the ID it maps to in the CSV at ``map_path``.
    PMC IDs with no mapping (absent from the file, or mapped to an empty
    string) are kept with a 'PMC' prefix so they stay distinguishable.

    Args:
        pmc: iterable of PMC ID strings without the 'PMC' prefix.
        pubmed: iterable of PubMed ID strings.
        map_path: CSV file with one header row. For every data row the
            fourth-from-last column (first three characters stripped) is the
            lookup key and the third-from-last column is the mapped ID.
            Presumably these are the PMCID and PMID columns -- verify against
            the actual file.

    Returns:
        A set holding the mapped/prefixed PMC IDs together with ``pubmed``.
    """
    id_dict = {}
    # Context manager closes the mapping file even if a row fails to parse.
    with open(map_path, newline='') as map_file:
        reader = csv.reader(map_file)
        next(reader, None)  # skip the header row
        for row in reader:
            if len(row) < 4:
                # Blank or truncated lines cannot hold both ID columns.
                continue
            id_dict[row[-4][3:]] = row[-3]
    correct_pmc = set()
    for pmc_id in pmc:
        mapped = id_dict.get(pmc_id, '')
        correct_pmc.add(mapped if mapped != '' else 'PMC' + pmc_id)
    return correct_pmc.union(pubmed)

In [6]:
# TODO: Specify filename
article_ids = combine_ids(pmc_ids, pubmed_ids)
print("Final collection for {} has {} articles".format("mortality", len(article_ids)))
# Write through a context manager so the pickle is flushed and the file
# handle is closed as soon as the dump finishes.
with open('../data/outcome-literature/FILENAME', 'wb') as outcome_file:
    pickle.dump(article_ids, outcome_file)

Final collection for mortality has 96282 articles


In [7]:
# Split abstracts according to database they are retrieved from
# This needs to be done to ensure that we are checking the correct database while retrieving text
def split_abstracts(abstracts):
    """Partition article IDs by the database they must be fetched from.

    IDs that start with 'PMC' go to the PMC list with that prefix removed;
    every other ID goes to the PubMed list unchanged. Input order is kept
    within each list.

    Returns:
        A ``(pubmed, pmc)`` tuple of lists.
    """
    prefix = 'PMC'
    by_source = {'pubmed': [], 'pmc': []}
    for article_id in abstracts:
        if not article_id.startswith(prefix):
            by_source['pubmed'].append(article_id)
            continue
        # The prefix only marked the source database; it is not needed past this point.
        by_source['pmc'].append(article_id[len(prefix):])
    return by_source['pubmed'], by_source['pmc']

In [8]:
# Function to retrieve complete data for a batch of abstract IDs from a provided database
# Results will be retrieved in XML format
# TODO: Please add your tool name and email ID in the base_url variable
def retrieve_abstract_batch(id_batch, database):
    """Fetch the EFetch XML payload for one batch of article IDs.

    Args:
        id_batch: iterable of article IDs; they are joined with ',' into the
            ``id=`` URL parameter.
        database: value placed in the ``db=`` URL parameter.

    Returns:
        The raw response body (``response.content``).

    Raises:
        Whatever ``response.raise_for_status()`` raises for an HTTP error
        status, so an error page is never handed to the XML parser.
    """
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db={}&id={}&retmode=xml&tool=TOOLNAME&email=EMAIL"
    # str() lets callers pass integer IDs as well as strings.
    query = base_url.format(database, ','.join(str(article_id) for article_id in id_batch))
    response = requests.get(query)
    response.raise_for_status()
    return response.content

In [9]:
# Function to parse out abstract elements from retrieved XMLs
def parse_abstract_xml(xml, database):
    """Return the per-article elements found directly under the XML root.

    Args:
        xml: XML document (string or bytes) as returned by the fetch step.
        database: 'pubmed' or 'pmc'; selects which child tag is treated as an
            article ('PubmedArticle' or 'article' respectively).

    Returns:
        A list of ``xml.etree.ElementTree.Element`` objects, one per article.

    Raises:
        KeyError: if ``database`` is neither 'pubmed' nor 'pmc'.
        xml.etree.ElementTree.ParseError: if ``xml`` is not well-formed.
    """
    top_tag = {'pubmed': 'PubmedArticle', 'pmc': 'article'}  # PubMed and PMC use different XML tags
    parsed_xml = ET.fromstring(xml)
    # The former debug dump of the whole PMC payload was removed: it printed
    # the full XML of every batch to the notebook output.
    return parsed_xml.findall(top_tag[database])

In [10]:
# Procedure that takes a large set of IDs, breaks it into manageable batches,
# queries the provided database and extracts abstracts from retrieved XMLs
# TODO: Change file path for storage if needed
def _extract_abstract_record(abstract):
    """Pull the ID, abstract text and publication year out of one article element.

    Returns ``(pmid, {'text': ..., 'year': ...})``. ``pmid`` is the text of the
    first 'PMID' element (later ones are ignored), or -99999 when none exists.
    ``year`` is taken from a 'Year' element under 'PubDate', or -1000 when no
    numeric year is found.
    NOTE(review): only the tags 'PMID', 'AbstractText' and 'PubDate' are
    recognised; PMC records may use different tag names -- confirm before
    relying on this for database='pmc'.
    """
    pmid = -99999
    abstract_text = ""
    year = -1000
    for element in abstract.iter():
        if element.tag == 'PMID':
            if pmid == -99999:
                pmid = element.text
        elif element.tag == 'AbstractText':
            # itertext() also collects text that follows inline child tags;
            # element.text alone stops at the first child element.
            section = ''.join(element.itertext())
            if section:
                abstract_text += section + '\n'
        elif element.tag == 'PubDate':
            for subelement in element.iter('Year'):
                # Skip empty or non-numeric years instead of crashing the whole run.
                if subelement.text and subelement.text.strip().isdigit():
                    year = int(subelement.text)
    return pmid, {'text': abstract_text, 'year': year}


def retrieve_all_abstracts(id_list, database):
    """Fetch abstracts for ``id_list`` in batches and checkpoint them to disk.

    For each batch of up to 200 IDs this calls ``retrieve_abstract_batch`` and
    ``parse_abstract_xml``, stores ``{'text', 'year'}`` per article ID, and
    re-writes ``../data/<database>_texts_and_dates.pkl`` with everything
    collected so far. When a batch yields a different number of articles than
    IDs requested, the batch's IDs are written to the module-level
    ``error_log`` file object, which must be open before this is called.

    Args:
        id_list: sliceable sequence of ID strings.
        database: database name passed through to the fetch/parse helpers and
            used in the output file name.

    Returns:
        None; results are only written to the pickle file.
    """
    max_query_size = 200  # PubMed only accepts 200 IDs at a time when retrieving abstract text
    print('Retrieval will require {} queries'.format(math.ceil(len(id_list)/float(max_query_size))))
    texts = {}
    output_path = '../data/{}_texts_and_dates.pkl'.format(database)
    for start in range(0, len(id_list), max_query_size):
        end = min(len(id_list), start+max_query_size)
        cur_ids = id_list[start:end]
        cur_abstracts = retrieve_abstract_batch(cur_ids, database)
        cur_parsed_abstracts = parse_abstract_xml(cur_abstracts, database)
        if len(cur_parsed_abstracts) != (end-start):
            error_log.write('Missing abstracts:\n')
            error_log.write(','.join(cur_ids)+'\n')
        # Only the current batch is parsed; earlier batches are already in
        # `texts`, so re-walking them would make the run quadratic.
        for abstract in cur_parsed_abstracts:
            pmid, record = _extract_abstract_record(abstract)
            texts[pmid] = record
        # Progress is keyed on the number of IDs requested so it keeps
        # reporting even after some abstracts have come back missing.
        if end % 1000 == 0 or end == len(id_list):
            print('Retrieved {} abstracts'.format(end))
        # Checkpoint after every batch; the context manager closes the handle.
        with open(output_path, 'wb') as checkpoint:
            pickle.dump(texts, checkpoint)
    return

In [None]:
# Running text retrieval for IDs retrieved by outcome-specific queries
# The context manager closes the error log even if retrieval raises part-way,
# so any "missing abstracts" entries written so far are flushed to disk.
# retrieve_all_abstracts reads the module-level name `error_log`.
with open('retrieval_errors.txt', 'w') as error_log:
    pubmed_abs, pmc_abs = split_abstracts(article_ids)
    print('{} abstracts will be scraped from PubMed'.format(len(pubmed_abs)))
    print('{} abstracts will be scraped from PMC'.format(len(pmc_abs)))
    retrieve_all_abstracts(pubmed_abs, 'pubmed')

95988 abstracts will be scraped from PubMed
294 abstracts will be scraped from PMC
Retrieval will require 480 queries
Retrieved 1000 abstracts
