In [1]:
import calendar
import os
from urllib.error import HTTPError

import pandas as pd
from Bio import Entrez
from tqdm import tqdm

Entrez.email = "example@example.com"

In [2]:
def get_pmids_from_one_day(topic, date, retmax=1000):
    """Return the PubMed IDs of papers on `topic` published on a single day.

    Runs an Entrez ``esearch`` against the ``pubmed`` database, matching
    `topic` as a MeSH term and restricting the publication date (``pdat``)
    to exactly `date`.

    Args:
        topic: MeSH term to search for; ``"[MeSH Terms]"`` is appended to it.
        date: Publication date string, used as both ``mindate`` and
            ``maxdate``.
        retmax: Maximum number of IDs to request. Defaults to 1000, the
            value that used to be hard-coded.

    Returns:
        The ``IdList`` entry of the parsed search result.
    """
    handle = Entrez.esearch(
        db="pubmed",
        term=topic + "[MeSH Terms]",
        retmax=retmax,
        datetype="pdat",
        mindate=date,
        maxdate=date,
    )
    try:
        record = Entrez.read(handle)
    finally:
        # Close the handle explicitly (as the Biopython tutorial does after
        # Entrez.read) so the connection is released even if parsing fails.
        handle.close()
    return record["IdList"]


class color:
    """ANSI escape sequences for styling text printed to a terminal.

    Prefix text with one or more of the codes and append ``END`` to reset.
    """

    # Text attributes and the reset code.
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
    END = '\033[0m'

    # Foreground colours.
    DARKCYAN = '\033[36m'
    RED = '\033[91m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    BLUE = '\033[94m'
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
# print(color.BOLD + 'Hello, World!' + color.END)

def print_err(text):
    """Print `text` in bold red and return the styled string.

    The message is wrapped in the ``color.BOLD`` and ``color.RED`` codes and
    terminated with ``color.END``; the wrapped string is both printed and
    returned.
    """
    prefix = color.BOLD + color.RED
    styled = prefix + text + color.END
    print(styled)
    return styled

def get_pmids_from_period(topic, span, retmax=1000):
    """Return the PubMed IDs of papers on `topic` published within a period.

    Runs an Entrez ``esearch`` against the ``pubmed`` database, matching
    `topic` as a MeSH term and restricting the publication date (``pdat``)
    to the range given by `span`.

    Args:
        topic: MeSH term to search for; ``"[MeSH Terms]"`` is appended to it.
        span: Indexable pair of date strings; ``span[0]`` is used as
            ``mindate`` and ``span[1]`` as ``maxdate``.
        retmax: Maximum number of IDs to request. Defaults to 1000, the
            value that used to be hard-coded.

    Returns:
        The ``IdList`` entry of the parsed search result.
    """
    handle = Entrez.esearch(
        db="pubmed",
        term=topic + "[MeSH Terms]",
        retmax=retmax,
        datetype="pdat",
        mindate=span[0],
        maxdate=span[1],
    )
    try:
        record = Entrez.read(handle)
    finally:
        # Close the handle explicitly (as the Biopython tutorial does after
        # Entrez.read) so the connection is released even if parsing fails.
        handle.close()
    return record["IdList"]


import time
def get_xml(id):
    """Download the PubMed record `id` as XML text.

    Calls Entrez ``efetch`` up to three times. A 5xx HTTP error is reported
    and retried after a 15 second pause; any other HTTP error is reported
    and aborts immediately.

    Args:
        id: PubMed ID, passed through as the ``id`` argument of ``efetch``.

    Returns:
        The XML document as a ``str``, or ``""`` when the download failed
        (a non-5xx HTTP error, or a 5xx error on every attempt).
    """
    max_attempts = 3
    retry_delay = 15  # seconds to wait before retrying after a server error

    for attempt in range(1, max_attempts + 1):
        try:
            handle = Entrez.efetch(db="pubmed",
                                   id=id,
                                   retmode="xml",
                                   rettype="abstract")
            try:
                xml = handle.read()
            finally:
                # Release the connection whether or not the read succeeded.
                handle.close()
        except HTTPError as err:
            if not 500 <= err.code <= 599:
                print_err(f">>>>> Failed to download {id}. Error: {err}")
                return ""
            print_err(f">>>>> Attempt {attempt}: Received error from server {err}")
            # No point in waiting once there is no further attempt to make.
            if attempt < max_attempts:
                time.sleep(retry_delay)
        else:
            # The handle may yield bytes (recent Biopython versions return XML
            # in binary mode); decode so the CSV stores the document itself
            # rather than a b'...' repr.
            if isinstance(xml, bytes):
                xml = xml.decode("utf-8")
            return xml

    # Every attempt hit a server error: report it and return the same empty
    # marker as the other failure path instead of an implicit None.
    print_err(f">>>>> Failed to download {id}. Error: giving up after {max_attempts} attempts")
    return ""
    
    # pp = pprint.PrettyPrinter(indent=4)
    # pp.pprint(json.dumps(xmltodict.parse(xml)))



def get_xmls_ids(ids):
    """Fetch the XML for every ID in `ids`, with a tqdm progress bar.

    Returns a list of ``(id, xml)`` tuples in the order of `ids`, where
    ``xml`` is whatever ``get_xml`` returned for that ID.
    """
    return [(pmid, get_xml(pmid)) for pmid in tqdm(ids)]


import csv
def save_span(span, list_of_pairs):
    """Write `list_of_pairs` to a CSV file named after the date span.

    The file name is built from ``span[0]`` and ``span[1]`` with ``/``
    replaced by ``_`` and joined by ``__``, plus a ``.csv`` suffix; it is
    created relative to the current working directory.

    Args:
        span: Indexable pair of date strings.
        list_of_pairs: Iterable of rows, one CSV row per item.

    Returns:
        The name of the file that was written.
    """
    file_name = span[0].replace("/", "_") + "__" + span[1].replace("/", "_") + ".csv"
    # newline="" is what the csv module asks for, so the writer alone controls
    # line endings (no platform translation inside multi-line fields); the
    # explicit encoding keeps non-ASCII text from depending on the locale.
    with open(file_name, "w", newline="", encoding="utf-8") as out:
        csv_out = csv.writer(out, dialect="unix")
        csv_out.writerows(list_of_pairs)
    return file_name


def save_date(date, list_of_pairs):
    """Write `list_of_pairs` to a CSV file named after `date`.

    The file name is `date` with ``/`` replaced by ``_`` plus a ``.csv``
    suffix; it is created relative to the current working directory. A
    one-line summary is printed once the rows have been written.

    Args:
        date: Date string used for the file name and the summary message.
        list_of_pairs: Sized collection of rows, one CSV row per item.

    Returns:
        The name of the file that was written.
    """
    file_name = date.replace("/", "_") + ".csv"
    # newline="" is what the csv module asks for, so the writer alone controls
    # line endings (no platform translation inside multi-line fields); the
    # explicit encoding keeps non-ASCII text from depending on the locale.
    with open(file_name, "w", newline="", encoding="utf-8") as out:
        csv_out = csv.writer(out, dialect="unix")
        csv_out.writerows(list_of_pairs)
    print(f"Saved {len(list_of_pairs)} papers published on {date}.")
    # Return the name, like save_span does, so callers can reuse it.
    return file_name

In [3]:
# Download every paper on TOPIC published in the given month, one CSV per day.
TOPIC = "breast cancer"
YEAR = 2022
month = '03'

# Use the real length of the month so no invalid dates (e.g. 02/30) are queried.
_, days_in_month = calendar.monthrange(YEAR, int(month))

for day in range(1, days_in_month + 1):
    date = f"{YEAR}/{month}/{day:02}"
    print(f"Starting with {date}")
    ids = get_pmids_from_one_day(TOPIC, date)
    if len(ids) == 0:
        # An empty day is skipped; it must not abort the rest of the month.
        continue
    print(f"Number of papers published on {date}: {len(ids)}")
    pairs = get_xmls_ids(ids)
    save_date(date, pairs)

Starting with 2022/03/01
Number of papers published on  2022/03/01': 719


  3%|▎         | 24/719 [00:24<20:31,  1.77s/it]

[1m[91m>>>>> Failed to download 35428464. Error: HTTP Error 400: Bad Request[0m


  8%|▊         | 59/719 [00:59<11:21,  1.03s/it]

In [None]:
# file_name = '2022_01_02__2022_01_10.csv'
# df = pd.read_csv(file_name, header=None)
# df