<a href="https://colab.research.google.com/github/angirov/pubmed_crawler/blob/main/save_bydate_saver2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [103]:
# Colab-only setup: mount Google Drive at /gdrive so files under it can be read and written.
from google.colab import drive
drive.mount('/gdrive')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [104]:
import os

# Project data folder on the mounted Drive; all later paths are built from it.
proj_dir = "/gdrive/MyDrive/dsr/pubmed_data/"
os.chdir(proj_dir)
os.getcwd()  # displayed as the cell output to confirm the working directory

'/gdrive/MyDrive/dsr/pubmed_data'

In [105]:
import os
from tqdm import tqdm
import pandas as pd

from urllib.error import HTTPError
from pathlib import Path

In [106]:
from collections.abc import Mapping

def _author_name(author):
  """Format one parsed ``Author`` node as a display name.

  Personal authors become "LastName ForeName"; when ForeName is absent only
  the last name is used. Group authors fall back to CollectiveName.
  Returns None when the node carries no usable name.
  """
  if not isinstance(author, Mapping):
    return None
  last_name = author.get('LastName')
  if last_name:
    fore_name = author.get('ForeName')
    return last_name + " " + fore_name if fore_name else last_name
  return author.get('CollectiveName')


def get_authors(dict) -> list:
  """Return the author names of a parsed PubMed article.

  Args:
    dict: Nested mapping produced by parsing one PubMed XML record.

  Returns:
    A list of name strings in document order; ``[]`` when the record has no
    AuthorList/Author entry. Authors without any usable name are skipped
    instead of discarding the whole list.

  Raises:
    AssertionError: if the Author node is neither a list nor a mapping.
  """
  try:
    authors = dict['PubmedArticleSet']['PubmedArticle']['MedlineCitation']['Article']['AuthorList']['Author']
  except (KeyError, TypeError):
    # Missing key, or an empty element along the path parsed as None.
    return []
  if isinstance(authors, Mapping):
    # A single <Author> element is parsed as a mapping rather than a list.
    authors = [authors]
  elif not isinstance(authors, list):
    # Raised explicitly so the check is not stripped when running with -O.
    raise AssertionError(f"Unexpected Author type: {type(authors)}")
  names = (_author_name(author) for author in authors)
  return [name for name in names if name]

  


def get_title(dict) -> str:
  """Look up the ArticleTitle entry of a parsed PubMed record.

  The value is returned exactly as the parser produced it. A missing key
  anywhere along the path raises KeyError.
  """
  node = dict
  for key in ('PubmedArticleSet', 'PubmedArticle', 'MedlineCitation', 'Article', 'ArticleTitle'):
    node = node[key]
  return node


def _abstract_section(section) -> str:
  """Render one element of a multi-part AbstractText list as text.

  A mapping contributes its label, a newline and its body when it has an
  '@Label', and just the body otherwise. The body is '#text', falling back
  to 'i'. Plain strings are used verbatim; anything without usable text
  becomes "".
  """
  if isinstance(section, str):
    return section
  if not isinstance(section, Mapping):
    return ""
  body = next((section[key] for key in ("#text", "i") if isinstance(section.get(key), str)), "")
  label = section.get("@Label")
  return label + "\n" + body if isinstance(label, str) else body


def get_abstract(dict) -> str:
  """Return the abstract text of a parsed PubMed article.

  Args:
    dict: Nested mapping produced by parsing one PubMed XML record.

  Returns:
    - "" when the record has no Abstract/AbstractText or the element is empty.
    - The string itself for a plain-text abstract.
    - For a structured abstract (list), the sections joined with newlines,
      each rendered as label, newline, body.
    - For a single mapping, its '#text' value, or its 'i' value if '#text'
      is absent.

  Raises:
    AssertionError: for a single mapping with neither '#text' nor 'i', or
      for an AbstractText of an unexpected type.
  """
  try:
    abstract = dict['PubmedArticleSet']['PubmedArticle']['MedlineCitation']['Article']['Abstract']['AbstractText']
  except (KeyError, TypeError):
    # Missing key, or an empty element along the path parsed as None.
    return ""
  if abstract is None:
    return ""
  if isinstance(abstract, str):
    return abstract
  if isinstance(abstract, list):
    # Joining by position avoids dropping a separator when an earlier
    # section happens to compare equal to the last one.
    return "\n".join(_abstract_section(section) for section in abstract)
  if isinstance(abstract, Mapping):
    for key in ("#text", "i"):
      if key in abstract:
        return abstract[key]
    # Unknown layout: fail loudly so the record can be inspected.
    raise AssertionError(f"AbstractText has neither '#text' nor 'i': {list(abstract)}")
  raise AssertionError(f"Abstract type: {type(abstract)}")


def get_keywords(dict) -> list:
  """Return the keyword strings of a parsed PubMed article.

  Handles the shapes the XML-to-dict conversion can produce: one or several
  KeywordList nodes, each holding one Keyword (mapping or plain string) or a
  list of them. Entries without text are skipped individually.

  Args:
    dict: Nested mapping produced by parsing one PubMed XML record.

  Returns:
    A list of keyword strings in document order; ``[]`` when none exist.
  """
  try:
    keyword_lists = dict['PubmedArticleSet']['PubmedArticle']['MedlineCitation']['KeywordList']
  except (KeyError, TypeError):
    return []
  if not isinstance(keyword_lists, list):
    keyword_lists = [keyword_lists]

  keywords = []
  for keyword_list in keyword_lists:
    if not isinstance(keyword_list, Mapping):
      continue
    entries = keyword_list.get('Keyword')
    if entries is None:
      continue
    if not isinstance(entries, list):
      # A single <Keyword> is parsed as one node, not a one-element list.
      entries = [entries]
    for entry in entries:
      text = entry.get('#text') if isinstance(entry, Mapping) else entry
      if isinstance(text, str):
        keywords.append(text)
  return keywords


def _as_list(node) -> list:
  """Wrap a parsed XML node in a list unless it already is one (None -> [])."""
  if node is None:
    return []
  return node if isinstance(node, list) else [node]


def get_refs(dict) -> list:
  """Return the PubMed ids cited by a parsed PubMed article.

  Only ArticleId entries whose '@IdType' is "pubmed" are collected; other id
  types (doi, pmc, ...) and references without ids are skipped without
  discarding the ids already found. Single Reference / ArticleId nodes,
  which are parsed as mappings rather than lists, are handled the same way
  as lists.

  Side effect: increments the module-level ``global_special_ref_counter``
  whenever the record holds several ReferenceList nodes (creating the
  counter if it does not exist yet).

  Args:
    dict: Nested mapping produced by parsing one PubMed XML record.

  Returns:
    A list of PubMed id strings in document order; ``[]`` when the record
    has no reference list.
  """
  global global_special_ref_counter
  try:
    ref_lists = dict['PubmedArticleSet']['PubmedArticle']['PubmedData']['ReferenceList']
  except (KeyError, TypeError):
    return []

  if isinstance(ref_lists, list):
    # Several <ReferenceList> elements: keep counting these unusual records.
    try:
      global_special_ref_counter += 1
    except NameError:
      global_special_ref_counter = 1

  cites = []
  for ref_list in _as_list(ref_lists):
    if not isinstance(ref_list, Mapping):
      continue
    for ref in _as_list(ref_list.get('Reference')):
      if not isinstance(ref, Mapping):
        continue
      id_list = ref.get('ArticleIdList')
      if not isinstance(id_list, Mapping):
        continue
      for article_id in _as_list(id_list.get('ArticleId')):
        if (isinstance(article_id, Mapping)
            and article_id.get('@IdType') == "pubmed"
            and '#text' in article_id):
          cites.append(article_id['#text'])
  return cites

In [107]:
import csv

from pathlib import Path
from datetime import datetime

def save_row(csv_writer, file_name, paper_id, xml_str):
    """Build a row with ``raw2row`` and write it through ``csv_writer``.

    NOTE(review): ``raw2row`` is not defined in the visible notebook; unless
    it is provided elsewhere, calling this helper raises NameError.
    """
    csv_writer.writerow(raw2row(file_name, paper_id, xml_str))


def save_new_file(old_file_path, save_dir):
    """Extract text fields from every paper in one raw CSV and append them to a text CSV.

    Args:
        old_file_path: Path to a header-less CSV whose file stem is a
            ``%Y_%m_%d`` date, with the paper id in column 0 and the raw XML
            string in column 1.
        save_dir: Directory that receives ``<stem>_text.csv``.

    Each output row is
    ``[paper_id, date, title, authors, keywords, abstract, references]``.
    Rows are appended, so re-running on the same input adds the rows again.

    Raises:
        ValueError: if the file stem does not match ``%Y_%m_%d``.
        Exception: any error raised while parsing a paper is re-raised after
            the failing paper id has been printed.
    """
    df = pd.read_csv(old_file_path, header=None)
    length = len(df)
    source = Path(old_file_path)
    stem = source.stem
    date = datetime.strptime(stem, '%Y_%m_%d')
    new_file_path = os.path.join(save_dir, stem + "_text.csv")
    # newline='' leaves line-ending handling to the csv module, as its docs require.
    with open(new_file_path, 'a+', newline='') as out:
        csv_writer = csv.writer(out, dialect='unix')
        for paper_id, xml_str in zip(df[0], df[1]):
            try:
                # Pass the file name derived from the argument instead of
                # relying on a module-level variable set by the calling loop.
                title, authors, keywords, abstract, references = process_xml(source.name, paper_id, xml_str)
            except Exception:
                print(f"Failed to process {paper_id}")
                raise
            print(f"Finished to process {paper_id}")
            csv_writer.writerow([paper_id, date, title, authors, keywords, abstract, references])
    print(f"Processed {length} papers published on {stem}.")

In [110]:
# ! pip install xmltodict
import xmltodict

def process_xml(file_name, paper_id, xml_str):
    """Parse one XML record and extract its text fields.

    Args:
        file_name: Accepted for the caller's convenience; not used here.
        paper_id: Accepted for the caller's convenience; not used here.
        xml_str: XML document passed to ``xmltodict.parse``.

    Returns:
        ``[title, authors, keywords, abstract, references]`` where any field
        equal to an empty list has been replaced by ``""``.
    """
    parsed = xmltodict.parse(xml_str)
    # Extractors are called in this fixed order, so the first failing one
    # determines which error surfaces.
    authors = get_authors(parsed)
    title = get_title(parsed)
    abstract = get_abstract(parsed)
    keywords = get_keywords(parsed)
    references = get_refs(parsed)
    fields = (title, authors, keywords, abstract, references)
    return ["" if field == [] else field for field in fields]

In [112]:
# Convert every raw daily CSV of one year into a "<date>_text.csv" file.
year = "2021"
# Counter updated by get_refs for records with an unusual reference layout.
global_special_ref_counter = 0
for m in range(1, 13):
    month = f"{m:02}"
    month_dir = os.path.join(proj_dir, year, month)
    if not os.path.isdir(month_dir):
        # No raw data for this month: skip it instead of aborting the run.
        continue
    save_dir = Path(os.path.join(proj_dir, "text_csv_new2", year, month))
    save_dir.mkdir(parents=True, exist_ok=True)

    # `file_name` stays a module-level name in case other cells read it.
    # Sorting makes the processing order reproducible across runs.
    for file_name in sorted(os.listdir(month_dir)):
        if not file_name.endswith(".csv"):
            # Ignore stray entries (checkpoints, folders) that are not raw CSVs.
            continue
        save_new_file(os.path.join(month_dir, file_name), save_dir)
print(f"global_special_ref_counter: {global_special_ref_counter}")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Finished to process 34595602
Finished to process 34593458
Finished to process 34593453
Finished to process 34593449
Finished to process 34593419
Finished to process 34593169
Finished to process 34592188
Finished to process 34592187
Finished to process 34592179
Finished to process 34591648
Finished to process 34591612
Finished to process 34587475
Finished to process 34585062
Finished to process 34582066
Finished to process 34579909
Finished to process 34569446
Finished to process 34565179
Finished to process 34563921
Finished to process 34563835
Finished to process 34563833
Finished to process 34562826
Finished to process 34560477
Finished to process 34558774
Finished to process 34554605
Finished to process 34553296
Finished to process 34550927
Finished to process 34550926
Finished to process 34550924
Finished to process 34550922
Finished to process 34547183
Finished to process 34545187
Finished to process 34544554
Finishe

# Inspecting the errors

In [None]:
# df = pd.read_csv("/gdrive/MyDrive/dsr/pubmed_data/2021/08/2021_08_03.csv", header=None)
# problem_id = 34343058

# xml_str = df[df[0] == problem_id].iloc[0, 1]
# dict = xmltodict.parse(xml_str)
# # dict['PubmedArticleSet']['PubmedArticle']['MedlineCitation']['Article']['Abstract']['AbstractText']
# dict