<a href="https://colab.research.google.com/github/angirov/pubmed_crawler/blob/main/save_bydate_saver.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!python3 --version
!pip install Bio

In [2]:
# Mount Google Drive at /drive; the cells below read and write their data under that mount point.
from google.colab import drive
drive.mount('/drive')

Mounted at /drive


In [3]:
import os

# Root folder on the mounted Drive under which the per-day CSV files are stored.
# NOTE: the name `dir` shadows the Python builtin dir() for the rest of the notebook.
dir = "/drive/MyDrive/dsr/pubmed_data/"
# Make it the working directory so the relative file names used later resolve inside it.
os.chdir(dir)
os.getcwd()

'/drive/MyDrive/dsr/pubmed_data'

In [4]:
import calendar
import csv
import os
import time
from pathlib import Path
from urllib.error import HTTPError

import pandas as pd
from Bio import Entrez
from tqdm import tqdm

Entrez.email = "example@example.com"

In [5]:
def get_pmids_from_one_day(topic, date):
    """Return the PubMed IDs of papers on a MeSH topic published on one day.

    Args:
        topic: MeSH term to search for; "[MeSH Terms]" is appended to it.
        date: Publication date string, used as both the lower and the upper
            bound of the search.

    Returns:
        The "IdList" entry of the parsed esearch response (at most 10000 IDs).
    """
    handle = Entrez.esearch(db="pubmed",
                            term=topic + "[MeSH Terms]",
                            retmax=10000,
                            datetype="pdat",
                            mindate=date,
                            maxdate=date)
    try:
        record = Entrez.read(handle)
    finally:
        # Release the connection even when parsing the response fails.
        handle.close()
    ids = record["IdList"]
    # The result list is capped at retmax; report it instead of silently losing papers.
    total = int(record.get("Count", len(ids)))
    if total > len(ids):
        print(f"Warning: {total} papers match on {date}, but only {len(ids)} IDs were returned.")
    return ids


class color:
    """ANSI escape sequences for styling text printed to a terminal or cell output."""

    # Foreground colours.
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'

    # Text attributes.
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # Resets every colour and attribute set before it.
    END = '\033[0m'
# print(color.BOLD + 'Hello, World!' + color.END)


def print_err(text):
    """Print ``text`` in bold red and return the decorated string.

    Args:
        text: Message to display; must be a string.

    Returns:
        ``text`` wrapped in the bold and red escape codes, followed by the reset code.
    """
    decorated = "".join([color.BOLD, color.RED, text, color.END])
    print(decorated)
    return decorated


def get_pmids_from_period(topic, span):
    """Return the PubMed IDs of papers on a MeSH topic published within a period.

    Args:
        topic: MeSH term to search for; "[MeSH Terms]" is appended to it.
        span: Pair ``(first_date, last_date)`` of publication date strings
            bounding the search.

    Returns:
        The "IdList" entry of the parsed esearch response (at most 1000 IDs).
    """
    handle = Entrez.esearch(db="pubmed",
                            term=topic + "[MeSH Terms]",
                            retmax=1000,
                            datetype="pdat",
                            mindate=span[0],
                            maxdate=span[1])
    try:
        record = Entrez.read(handle)
    finally:
        # Release the connection even when parsing the response fails.
        handle.close()
    ids = record["IdList"]
    # The result list is capped at retmax; report it instead of silently losing papers.
    total = int(record.get("Count", len(ids)))
    if total > len(ids):
        print(f"Warning: {total} papers match {span[0]} - {span[1]}, "
              f"but only {len(ids)} IDs were returned.")
    return ids


import time
def get_xml(id, max_attempts=5, delay=15):
    """Download the PubMed record for one ID as XML, retrying on HTTP errors.

    Args:
        id: PubMed ID of the record to fetch.
        max_attempts: Number of download attempts before giving up.
        delay: Seconds to wait between two attempts.

    Returns:
        Whatever ``handle.read()`` yields for the record, or "" when every
        attempt failed with an HTTPError.
    """
    last_err = None
    for attempt in range(1, max_attempts + 1):
        try:
            handle = Entrez.efetch(db="pubmed",
                                   id=id,
                                   retmode="xml",
                                   rettype="abstract")
            try:
                return handle.read()
            finally:
                # Always release the connection, also when read() fails.
                handle.close()
        except HTTPError as err:
            # Python unbinds `err` when the except block ends, so keep a
            # reference for the final failure message.
            last_err = err
            if 500 <= err.code <= 599:
                print_err(f">>>>> Attempt {attempt}: Received error from server {err}")
            else:
                print_err(f">>>>> Attempt {attempt}: Request failed {err}")
            # Waiting after the final attempt would only delay the failure report.
            if attempt < max_attempts:
                time.sleep(delay)
    print_err(f">>>>> Failed to download {id}. Error: {last_err}")
    return ""

    # pp = pprint.PrettyPrinter(indent=4)
    # pp.pprint(json.dumps(xmltodict.parse(xml)))


def get_xmls_ids(ids):
    """Download the XML for every given PubMed ID, showing a progress bar.

    Args:
        ids: Iterable of PubMed IDs.

    Returns:
        A list of ``(id, xml)`` tuples in the order of ``ids``.
    """
    return [(pmid, get_xml(pmid)) for pmid in tqdm(ids)]


def save_pair(csv_writer, id, xml):
    """Write one ``(id, xml)`` record as a single row through ``csv_writer``.

    Args:
        csv_writer: Object with a ``writerow`` method, e.g. a ``csv.writer``.
        id: PubMed ID stored in the first column.
        xml: Record content stored in the second column.
    """
    csv_writer.writerow((id, xml))


def save_xmls_ids(file_name, date, ids):
    """Download the given PubMed records and append them to a CSV file.

    Each record is written as soon as it is downloaded, so rows already
    written survive an interrupted run.

    Args:
        file_name: Path of the CSV file to append to.
        date: Publication date of the papers; only used in the summary message.
        ids: PubMed IDs to download.

    Returns:
        ``file_name``, unchanged.
    """
    saved = 0
    # newline='' leaves line endings to the csv module, as its documentation
    # requires; UTF-8 keeps the output independent of the platform default.
    with open(file_name, 'a', newline='', encoding='utf-8') as out:
        csv_writer = csv.writer(out, dialect='unix')
        for id in tqdm(ids):
            xml = get_xml(id)
            if not xml:
                # get_xml returns "" when the download failed. Writing that row
                # would make a later run treat the ID as already downloaded.
                continue
            save_pair(csv_writer, id, xml)
            saved += 1
    print(f"Saved {saved} of {len(ids)} papers published on {date}.")
    return file_name


import csv
def save_span(span, list_of_pairs):
    """Append ``(id, xml)`` rows to the CSV file named after a date span.

    The file name is built from both dates with "/" replaced by "_" and the
    two parts joined by "__"; it is relative to the current working directory.

    Args:
        span: Pair ``(first_date, last_date)`` of date strings.
        list_of_pairs: Iterable of rows to write.

    Returns:
        The name of the file that was written.
    """
    file_name = span[0].replace("/", "_") + "__" + span[1].replace("/", "_") + ".csv"
    # newline='' leaves line endings to the csv module, as its documentation
    # requires; UTF-8 keeps the output independent of the platform default.
    with open(file_name, 'a', newline='', encoding='utf-8') as out:
        csv.writer(out, dialect='unix').writerows(list_of_pairs)
    return file_name


def save_date(date, list_of_pairs):
    """Append ``(id, xml)`` rows to the CSV file that belongs to one date.

    Args:
        date: Date string passed on to ``date2filename``.
        list_of_pairs: Iterable of rows to write.

    Returns:
        The file name produced by ``date2filename``.
    """
    # date2filename takes (dir, date) and does not use its first argument;
    # calling it with the date alone raised a TypeError.
    file_name = date2filename(".", date)
    # The name contains sub-folders; open() does not create them.
    parent = os.path.dirname(file_name)
    if parent:
        os.makedirs(parent, exist_ok=True)
    # newline='' leaves line endings to the csv module, as its documentation
    # requires; UTF-8 keeps the output independent of the platform default.
    with open(file_name, 'a', newline='', encoding='utf-8') as out:
        csv.writer(out, dialect='unix').writerows(list_of_pairs)
    return file_name


def ymd2date(year, month, day):
    return f'{year}/{month:02}/{day:02}'


def date2ymd(date):
    """Split a "year/month/day" string into its three parts.

    Args:
        date: String with exactly two "/" separators.

    Returns:
        The tuple ``(year, month, day)`` of strings.

    Raises:
        ValueError: If ``date`` does not split into exactly three parts.
    """
    parts = str.split(date, "/")
    year, month, day = parts
    return (year, month, day)


def date2filename(dir, date):
    """Return the relative CSV path "<year>/<month>/<date with underscores>.csv".

    Args:
        dir: Not used by this function.
        date: Date string with "/" separators, split by ``date2ymd``.

    Returns:
        The relative file name; it is also printed.
    """
    year, month, _ = date2ymd(date)
    base_name = date.replace("/", "_") + ".csv"
    filename = "/".join([year, month, base_name])
    print(f"File name: {filename}")
    return filename


In [None]:
year = 2022
month = 12

# for month in range(10, 13):

# Iterate only over days that exist in this month; range(1, 32) would also
# produce dates such as 2022/02/30.
days_in_month = calendar.monthrange(year, month)[1]

for day in range(1, days_in_month + 1):
    date = ymd2date(year, month, day)

    ids = get_pmids_from_one_day("breast cancer", date)
    print(f"Number of papers published on  {date}': {len(ids)}")
    if len(ids) == 0:
        # A day without matching papers must not end the run for the whole month.
        continue

    file_name = Path(date2filename(dir, date))
    path = Path(dir) / f"{year}" / f"{month:02}"
    path.mkdir(parents=True, exist_ok=True)

    # Resume support: skip the IDs that are already in the day's CSV file.
    # A zero-byte file (left by an interrupted run) has nothing to resume
    # from and would make pd.read_csv raise.
    if file_name.absolute().is_file() and file_name.stat().st_size > 0:
        print(f"File {file_name} already exists.")
        df = pd.read_csv(file_name, header=None, names=['id', 'xml'])
        ids_done = df.id.astype(int).astype(str).to_list()
        done_lookup = set(ids_done)  # constant-time membership test
        ids = [pmid for pmid in ids if pmid not in done_lookup]
        print(f"Number of papers already saved': {len(ids_done)}. To save: {len(ids)}")

    save_xmls_ids(file_name.absolute(), date, ids)


Number of papers published on  2022/12/01': 826
File name: 2022/12/2022_12_01.csv


 27%|██▋       | 220/826 [01:39<03:47,  2.66it/s]

In [None]:
# from pathlib import Path

# year = "2022"
# month = '04'
# day = '01'

# date = ymd2date(year, month, day)
# file_name = Path(date2filename)
# if file_name.is_file():
#     df = pd.read_csv(file_name, header=None, names=['id', 'xml'])
#     ids_done = df.id.to_list()
#     ids = 
