<a href="https://colab.research.google.com/github/angirov/pubmed_crawler/blob/main/save_bydate_saver2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!python3 --version
!pip install Bio

In [1]:
# Mount Google Drive at /gdrive so the data folder used by the next cell is
# reachable; `google.colab` is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


In [3]:
import os

# Root data folder on the mounted drive. The download loop further down reads
# this global again to build the per-year/per-month output directories.
# NOTE(review): the name `dir` shadows the Python builtin of the same name.
dir = "/gdrive/MyDrive/dsr/pubmed_data/"
# Make it the working directory so relative file names resolve inside it.
os.chdir(dir)
os.getcwd()

'/gdrive/MyDrive/dsr/pubmed_data'

In [None]:
import os
from tqdm import tqdm
import pandas as pd
from Bio import Entrez
Entrez.email = "example@example.com"
from urllib.error import HTTPError
from pathlib import Path

In [None]:
def get_pmids_from_one_day(topic, date):
    """Return the PubMed IDs found for a MeSH topic on one publication date.

    Args:
        topic: Search term; ``"[MeSH Terms]"`` is appended to it.
        date: Publication date, used as both ``mindate`` and ``maxdate``
            (the notebook passes ``YYYY/MM/DD`` strings from ``ymd2date``).

    Returns:
        The ``"IdList"`` entry of the parsed ESearch record. At most
        ``retmax=10000`` ids are requested.
    """
    handle = Entrez.esearch(db="pubmed",
                            term=topic + "[MeSH Terms]",
                            retmax=10000,
                            datetype="pdat",
                            mindate=date,
                            maxdate=date)
    try:
        record = Entrez.read(handle)
    finally:
        # Release the HTTP response even if parsing raises.
        handle.close()
    return record["IdList"]


class color:
    """ANSI escape sequences for coloured / emphasised terminal output."""

    # Foreground colours
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'

    # Text attributes
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # Reset sequence that ends any of the styles above
    END = '\033[0m'
# print(color.BOLD + 'Hello, World!' + color.END)


def print_err(text):
    """Print ``text`` in bold red and return the decorated string.

    The message is wrapped in the ``color.BOLD`` and ``color.RED`` prefixes
    and terminated with ``color.END``.
    """
    pieces = (color.BOLD, color.RED, text, color.END)
    err_text = "".join(pieces)
    print(err_text)
    return err_text


def get_pmids_from_period(topic, span):
    """Return the PubMed IDs found for a MeSH topic within a date range.

    Args:
        topic: Search term; ``"[MeSH Terms]"`` is appended to it.
        span: Indexable pair; ``span[0]`` is sent as ``mindate`` and
            ``span[1]`` as ``maxdate`` of the publication-date filter.

    Returns:
        The ``"IdList"`` entry of the parsed ESearch record. At most
        ``retmax=1000`` ids are requested.
    """
    handle = Entrez.esearch(db="pubmed",
                            term=topic + "[MeSH Terms]",
                            retmax=1000,
                            datetype="pdat",
                            mindate=span[0],
                            maxdate=span[1])
    try:
        record = Entrez.read(handle)
    finally:
        # Release the HTTP response even if parsing raises.
        handle.close()
    return record["IdList"]


import time
def get_xml(id, max_attempts=5, wait_seconds=15):
    """Download the PubMed XML for one article, retrying on HTTP errors.

    Args:
        id: PubMed ID passed to ``Entrez.efetch``.
        max_attempts: How many times the request is tried before giving up.
        wait_seconds: Pause between two consecutive attempts.

    Returns:
        Whatever ``handle.read()`` yields for the fetched record, or ``""``
        when every attempt ended in an ``HTTPError``.
    """
    # Keep the last error in a separate name: the ``as err`` target is
    # unbound as soon as the except clause ends, so it cannot be used in the
    # final failure message.
    last_err = None
    for attempt in range(1, max_attempts + 1):
        try:
            handle = Entrez.efetch(db="pubmed",
                                   id=id,
                                   retmode="xml",
                                   rettype="abstract")
            try:
                return handle.read()
            finally:
                handle.close()
        except HTTPError as err:
            last_err = err
            if 500 <= err.code <= 599:
                print_err(f">>>>> Attempt {attempt}: Received error from server {err}")
            else:
                # Non-5xx failures are retried as well; report them too
                # instead of retrying silently.
                print_err(f">>>>> Attempt {attempt}: Request failed with {err}")
            if attempt < max_attempts:
                # No point in waiting after the final attempt.
                time.sleep(wait_seconds)
    print_err(f">>>>> Failed to download {id}. Error: {last_err}")
    return ""

    # pp = pprint.PrettyPrinter(indent=4)
    # pp.pprint(json.dumps(xmltodict.parse(xml)))


def get_xmls_ids(ids):
    """Fetch the XML for every id and return a list of ``(id, xml)`` tuples.

    Progress is shown with a ``tqdm`` bar; each record is retrieved through
    ``get_xml``.
    """
    return [(pmid, get_xml(pmid)) for pmid in tqdm(ids)]


def save_pair(csv_writer, id, xml):
    """Write one ``(id, xml)`` record as a single row through ``csv_writer``."""
    csv_writer.writerow((id, xml))


def save_xmls_ids(file_name, date, ids):
    """Download each article and append it to a CSV file as it arrives.

    Args:
        file_name: Path of the CSV file; it is opened in append mode.
        date: Publication date, used only in the summary message.
        ids: PubMed IDs to fetch with ``get_xml`` and store.

    Returns:
        ``file_name``, unchanged.
    """
    # newline='' is what the csv module expects for its file objects; the XML
    # fields contain embedded newlines that must reach the file untranslated.
    with open(file_name, 'a+', newline='') as out:
        csv_writer = csv.writer(out, dialect='unix')
        for id in tqdm(ids):
            xml = get_xml(id)
            save_pair(csv_writer, id, xml)
            # Push each row to disk right away so an interrupted run keeps
            # everything downloaded so far.
            out.flush()
    print(f"Saved {len(ids)} papers published on {date}.")
    return file_name


import csv
def save_span(span, list_of_pairs):
    """Append ``(id, xml)`` rows to a CSV file named after a date span.

    Args:
        span: Pair of date strings; ``/`` in each is replaced by ``_`` and
            the two parts are joined with ``__`` to form the file name.
        list_of_pairs: Rows to append.

    Returns:
        The file name that was written, relative to the working directory.
    """
    file_name = span[0].replace("/", "_") + "__" + span[1].replace("/", "_") + ".csv"
    # newline='' is what the csv module expects for its file objects, so
    # newlines embedded in quoted fields are written untranslated.
    with open(file_name, 'a+', newline='') as out:
        csv_out = csv.writer(out, dialect='unix')
        csv_out.writerows(list_of_pairs)
    return file_name


def save_date(date, list_of_pairs):
    """Append ``(id, xml)`` rows to the CSV file belonging to one date.

    Args:
        date: Date string in the ``YYYY/MM/DD`` form understood by
            ``date2filename``.
        list_of_pairs: Rows to append.

    Returns:
        The relative file name produced by ``date2filename``.
    """
    # date2filename takes (dir, date) and ignores its first argument; calling
    # it with a single argument raised TypeError.
    file_name = date2filename(None, date)
    # The name contains "<year>/<month>/" folders that may not exist yet.
    folder = os.path.dirname(file_name)
    if folder:
        os.makedirs(folder, exist_ok=True)
    # newline='' is what the csv module expects for its file objects.
    with open(file_name, 'a+', newline='') as out:
        csv_out = csv.writer(out, dialect='unix')
        csv_out.writerows(list_of_pairs)
    return file_name


def ymd2date(year, month, day):
    """Build a ``year/MM/DD`` string; month and day are padded to width 2."""
    parts = [f"{year}", format(month, "02"), format(day, "02")]
    return "/".join(parts)


def date2ymd(date):
    """Split a ``year/month/day`` string into its three string components."""
    pieces = str.split(date, sep="/")
    # Unpacking enforces that there are exactly three parts.
    year, month, day = pieces
    return (year, month, day)


def date2filename(dir, date):
    """Map a ``YYYY/MM/DD`` date to the relative path ``YYYY/MM/YYYY_MM_DD.csv``.

    The ``dir`` argument is accepted but not used; the returned path is
    relative. The chosen name is also printed.
    """
    year, month, _ = str.split(date, sep="/")
    base_name = date.replace("/", "_") + ".csv"
    filename = "/".join([year, month, base_name])
    print(f"File name: {filename}")
    return filename


In [None]:
year = 2022
month = 12

# for month in range(10, 13):

# Walk through the days of the month and download every paper found for the
# topic, one CSV file per day.
for day in range(1, 32):
    date = ymd2date(year, month, day)

    ids = get_pmids_from_one_day("breast cancer", date)
    print(f"Number of papers published on  {date}': {len(ids)}")
    # The first day without any hit ends the run.
    if len(ids) == 0:
        break

    file_name = Path(date2filename(dir, date))
    path = Path(dir + "/" + f"{year}" + "/" + f"{month:02}" + "/")
    path.mkdir(parents=True, exist_ok=True)

    # Resume support: skip ids that are already present in the day's file.
    if file_name.absolute().is_file():
        print(f"File {file_name} already exists.")
        df = pd.read_csv(file_name, header=None, names=['id', 'xml'])
        ids_done = df.id.astype(int).astype(str).to_list()
        # Set lookup keeps the filter linear instead of scanning the list of
        # saved ids once per candidate id.
        ids_done_lookup = set(ids_done)
        ids = [id for id in ids if id not in ids_done_lookup]
        print(f"Number of papers already saved': {len(ids_done)}. To save: {len(ids)}")

    save_xmls_ids(file_name.absolute(), date, ids)


Number of papers published on  2022/12/01': 826
File name: 2022/12/2022_12_01.csv


 27%|██▋       | 220/826 [01:39<03:47,  2.66it/s]

In [None]:
# from pathlib import Path

# year = "2022"
# month = '04'
# day = '01'

# date = ymd2date(year, month, day)
# file_name = Path(date2filename)
# if file_name.is_file():
#     df = pd.read_csv(file_name, header=None, names=['id', 'xml'])
#     ids_done = df.id.to_list()
#     ids = 


In [6]:
# Load one previously saved day; the file has no header row, so the two
# columns are named explicitly.
file_name = "/gdrive/MyDrive/dsr/pubmed_data/2021/12/2021_12_01.csv"
df = pd.read_csv(file_name, header=None, names=['id', 'xml'])

In [None]:
! pip install xmltodict
import xmltodict

In [61]:
def get_authors(dict):
    """Return the author names of a parsed PubMed record as a list of strings.

    Args:
        dict: Record as produced by ``xmltodict.parse`` (nested mappings
            rooted at ``'PubmedArticleSet'``).

    Returns:
        One ``"LastName ForeName"`` string per author. An author without a
        ``ForeName`` contributes only the last name; an author without a
        ``LastName`` contributes its ``CollectiveName`` when present. The
        list is empty when the record has no author list.
    """
    article = dict['PubmedArticleSet']['PubmedArticle']['MedlineCitation']['Article']
    author_list = article.get('AuthorList') or {}
    authors = author_list.get('Author') or []
    # xmltodict yields a bare mapping instead of a list when the element
    # occurs only once (single-author papers).
    if not isinstance(authors, list):
        authors = [authors]

    names = []
    for el in authors:
        last_name = el.get('LastName')
        if last_name:
            fore_name = el.get('ForeName')
            names.append(last_name + " " + fore_name if fore_name else last_name)
        elif el.get('CollectiveName'):
            names.append(el['CollectiveName'])
    return names

# Preview on the sample record. `dict` is the global assigned by the
# `dict = xmltodict.parse(...)` cell, which has to be executed first.
get_authors(dict)

['Punda Marija',
 'Petranović Ovčariček Petra',
 'Tabain Anita',
 'Koopmans Klaas Pieter',
 'Alfier Gabriela',
 'Jukić Tomislav',
 'Fröbe Ana']

In [64]:
def get_title(dict):
  """Return the ``ArticleTitle`` entry of a parsed PubMed record."""
  node = dict
  # Descend one level per key; a missing level raises KeyError.
  for key in ('PubmedArticleSet', 'PubmedArticle', 'MedlineCitation', 'Article', 'ArticleTitle'):
    node = node[key]
  return node

# Show the title of the sample record held in the global `dict`
# (assigned by the xmltodict.parse cell).
get_title(dict)

'PRIMARY HYPERPARATHYROIDISM AND SERUM CALCIUM IN BREAST CANCER PATIENTS EVALUATED FOR LOW BONE MASS - A SINGLE CENTER EXPERIENCE.'

In [79]:
from pathlib import Path
from datetime import datetime

def get_date(file_name, dict):
  """Derive the date from a file name of the form ``YYYY_MM_DD.<ext>``.

  Only the stem of ``file_name`` is parsed; the ``dict`` argument is not used.
  Returns a ``datetime`` at midnight of that day.
  """
  stem = Path(file_name).stem
  return datetime.strptime(stem, '%Y_%m_%d')

# Date recovered from the CSV file name loaded above, rendered as text.
str(get_date(file_name, dict))

'2021-12-01 00:00:00'

In [65]:
def get_abstract(dict):
  """Return the ``Abstract -> AbstractText`` entry of a parsed PubMed record."""
  article = dict['PubmedArticleSet']['PubmedArticle']['MedlineCitation']['Article']
  abstract = article['Abstract']
  return abstract['AbstractText']

# Abstract text of the sample record held in the global `dict`.
get_abstract(dict)


'The bone health guidelines for breast cancer (BC) patients recommend bone mineral density (BMD) testing. Patients with low BMD and elevated serum calcium levels (SCLs) are further evaluated for primary hyperparathyroidism (PHPT). We aimed to determine the prevalence of PHPT in treated BC patients with low BMD and analyze the association of SCLs with histopathologic tumor features and cancer treatment. This retrospective study included postmenopausal BC patients examined at Osteoporosis Clinic between 2013 and 2020. Clinical and BMD data were collected from patient medical records. Patients with biochemical suspicion of PHPT underwent standard parathyroid imaging procedures. Nine out of 137 (6.6%) patients were diagnosed with PHPT; 8/9 patients underwent parathyroidectomy and one patient was advised to follow-up. Among the rest of 128 non-PHPT patients, higher SCLs showed a trend of positive association with higher tumor grade and axillary lymph node involvement, and received immunothe

In [52]:
# Keyword strings of the sample record: each entry under KeywordList/Keyword
# is read through its '#text' key.
[el['#text'] for el in dict['PubmedArticleSet']['PubmedArticle']['MedlineCitation']['KeywordList']['Keyword']]

['Bone mineral density',
 'Breast cancer',
 'Cancer treatment',
 'Primary hyperparathyroidism',
 'Serum calcium']

In [118]:
from collections.abc import Mapping

def _as_list(node):
    """Normalise an xmltodict node: None -> [], lone item -> [item], list unchanged."""
    if node is None:
        return []
    return node if isinstance(node, list) else [node]


def get_cites(dict):
    """Collect the PubMed IDs cited in the reference list of a parsed record.

    Args:
        dict: Record as produced by ``xmltodict.parse`` (nested mappings
            rooted at ``'PubmedArticleSet'``).

    Returns:
        List of the ``'#text'`` values of all reference ids whose
        ``'@IdType'`` is ``"pubmed"``, in document order. The list is empty
        when the record has no reference list. Ids of any other type are
        printed and skipped.
    """
    pubmed_data = dict['PubmedArticleSet']['PubmedArticle']['PubmedData']
    cites = []
    # xmltodict gives a bare mapping instead of a list whenever an element
    # occurs only once, so every repeatable level is normalised first.
    # Otherwise iterating a lone ArticleId mapping yields its key strings and
    # the PubMed id of that reference is lost.
    for ref_list in _as_list(pubmed_data.get('ReferenceList')):
        if not isinstance(ref_list, Mapping):
            continue
        for ref in _as_list(ref_list.get('Reference')):
            if not isinstance(ref, Mapping):
                continue
            id_list = ref.get('ArticleIdList')
            if not isinstance(id_list, Mapping):
                # Reference without any ids.
                continue
            for article_id in _as_list(id_list.get('ArticleId')):
                is_pubmed = (isinstance(article_id, Mapping)
                             and article_id.get('@IdType') == "pubmed")
                if is_pubmed:
                    if article_id.get('#text') is not None:
                        cites.append(article_id['#text'])
                else:
                    print(article_id)

    return cites

# Cited PubMed ids of the sample record; ids of other types (doi, pmc, ...)
# are printed by the function while it runs.
get_cites(dict)

{'@IdType': 'doi', '#text': '10.1016/j.jbo.2017.03.001'}
{'@IdType': 'pmc', '#text': 'PMC5384888'}
{'@IdType': 'doi', '#text': '10.6004/jnccn.2020.0016'}
{'@IdType': 'doi', '#text': '10.1007/BF03343867'}
{'@IdType': 'doi', '#text': '10.1200/JCO.2008.20.2549'}
{'@IdType': 'doi', '#text': '10.1007/s12282-009-0158-0'}
{'@IdType': 'doi', '#text': '10.1503/cmaj.150638'}
{'@IdType': 'pmc', '#text': 'PMC4786403'}
{'@IdType': 'doi', '#text': '10.1056/NEJMcp042806'}
{'@IdType': 'doi', '#text': '10.1007/s12282-011-0253-x'}
{'@IdType': 'doi', '#text': '10.1155/2014/608585'}
{'@IdType': 'pmc', '#text': 'PMC4052557'}
{'@IdType': 'doi', '#text': '10.1200/JCO.2008.17.7451'}
{'@IdType': 'doi', '#text': '10.1200/JCO.2008.17.7451'}
{'@IdType': 'doi', '#text': '10.3275/8580'}
{'@IdType': 'doi', '#text': '10.1007/s10552-009-9456-2'}
{'@IdType': 'doi', '#text': '10.1002/1097-0142(19871001)60:7%3C1620:AID-CNCR2820600733%3E3.0.CO;2-3'}
{'@IdType': 'doi', '#text': '10.1002/1097-0142(19871001)60:7<1620::AID-CN

['28413771',
 '32259783',
 '11407650',
 '19546403',
 '19657710',
 '26504099',
 '15673803',
 '21290264',
 '24959365',
 '18955450',
 '22931931',
 '19856117',
 '3621132',
 '22406994',
 '19593149',
 '22931931',
 '19595349',
 '22207495',
 '28838807',
 '30240520',
 '8255296',
 '24824552',
 '23167346']

In [11]:
# Parse the XML of the first saved row into nested mappings. All get_* helper
# previews in this notebook read this global.
# NOTE(review): the name `dict` shadows the Python builtin, and the cells that
# use it appear above this one, so a top-to-bottom run reaches them before
# this assignment.
dict = xmltodict.parse(df["xml"][0])
dict

{'PubmedArticleSet': {'PubmedArticle': {'MedlineCitation': {'@Status': 'MEDLINE',
    '@Owner': 'NLM',
    '@IndexingMethod': 'Automated',
    'PMID': {'@Version': '1', '#text': '35734499'},
    'DateCompleted': {'Year': '2022', 'Month': '06', 'Day': '24'},
    'DateRevised': {'Year': '2022', 'Month': '07', 'Day': '16'},
    'Article': {'@PubModel': 'Print',
     'Journal': {'ISSN': {'@IssnType': 'Electronic', '#text': '1333-9451'},
      'JournalIssue': {'@CitedMedium': 'Internet',
       'Volume': '60',
       'Issue': '4',
       'PubDate': {'Year': '2021', 'Month': 'Dec'}},
      'Title': 'Acta clinica Croatica',
      'ISOAbbreviation': 'Acta Clin Croat'},
     'ArticleTitle': 'PRIMARY HYPERPARATHYROIDISM AND SERUM CALCIUM IN BREAST CANCER PATIENTS EVALUATED FOR LOW BONE MASS - A SINGLE CENTER EXPERIENCE.',
     'Pagination': {'StartPage': '617',
      'EndPage': '626',
      'MedlinePgn': '617-626'},
     'ELocationID': {'@EIdType': 'doi',
      '@ValidYN': 'Y',
      '#text': '1