<a href="https://colab.research.google.com/github/angirov/pubmed_crawler/blob/main/process_xml.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [121]:
# Mount Google Drive inside the Colab runtime at /gdrive so that later cells
# can read and write files stored there.
from google.colab import drive
drive.mount('/gdrive')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [122]:
import os

# Root folder of the project data on the mounted drive; later cells build the
# "xmls" and "text_csv" sub-paths from it.
proj_dir = "/gdrive/MyDrive/dsr/pubmed_data/"
# Make the project folder the working directory for the rest of the notebook.
os.chdir(proj_dir)
# Last expression of the cell: displays the working directory as a sanity check.
os.getcwd()

'/gdrive/MyDrive/dsr/pubmed_data'

In [123]:
import os
from tqdm import tqdm
import pandas as pd

from urllib.error import HTTPError
from pathlib import Path

In [144]:
class color:
    """Namespace of ANSI escape sequences for styling terminal/notebook output.

    Concatenate a style constant before the text and ``END`` after it, e.g.
    ``color.BOLD + 'text' + color.END``.
    """

    # Reset all styling back to the default.
    END = '\033[0m'

    # Text attributes.
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # Foreground colours.
    DARKCYAN = '\033[36m'
    RED = '\033[91m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    BLUE = '\033[94m'
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
# print(color.BOLD + 'Hello, World!' + color.END)

In [124]:
from collections.abc import Mapping

def _format_author(author):
  """Return the display name of one parsed ``Author`` entry, or None.

  Preference order: "LastName ForeName", then ``CollectiveName``, then a bare
  ``LastName`` (or ``ForeName``) when that is all the record provides.
  """
  if not isinstance(author, Mapping):
    return None
  last_name = author.get('LastName')
  fore_name = author.get('ForeName')
  if last_name and fore_name:
    return last_name + " " + fore_name
  collective = author.get('CollectiveName')
  if collective:
    return collective
  # Authors listed with only one name part are kept instead of being dropped.
  return last_name or fore_name or None


def get_authors(dict) -> list:
  """Extract the author names from a parsed PubMed record.

  Args:
    dict: nested mapping produced by parsing the record's XML (the parameter
      name shadows the builtin; it is kept for backward compatibility).

  Returns:
    A list of name strings ("LastName ForeName" or a collective name), in
    document order. Empty when the record has no author list. Entries without
    any usable name are skipped.

  Raises:
    AssertionError: if the ``Author`` node has an unexpected type.
  """
  try:
    authors = dict['PubmedArticleSet']['PubmedArticle']['MedlineCitation']['Article']['AuthorList']['Author']
  except (KeyError, TypeError):
    # Missing path, or an intermediate node that is empty (None) / not a mapping.
    return []

  if authors is None:
    return []
  if isinstance(authors, Mapping):
    # A single <Author> element is parsed as one mapping rather than a list.
    authors = [authors]
  elif not isinstance(authors, list):
    # Explicit raise: a bare `assert` would vanish under `python -O`.
    raise AssertionError(f"Unexpected Author type: {type(authors)}")

  # Previously one author without ForeName/CollectiveName raised KeyError
  # inside the handler and the whole list was silently replaced by [].
  names = (_format_author(author) for author in authors)
  return [name for name in names if name]

  


def get_title(dict) -> str:
  """Return the ``ArticleTitle`` node of a parsed PubMed record.

  No fallback is applied: a missing key anywhere on the path raises KeyError.
  """
  article = dict['PubmedArticleSet']['PubmedArticle']['MedlineCitation']['Article']
  return article['ArticleTitle']


def _abstract_node_text(node) -> str:
  """Return the text carried by one parsed ``AbstractText`` node.

  Handles plain strings, empty nodes (None), mappings (text under "#text",
  falling back to the content of an "i" child) and lists of inline values.
  A mapping with neither key yields an empty string.

  Raises:
    AssertionError: if the node has a type that is not handled.
  """
  if node is None:
    return ""
  if isinstance(node, str):
    return node
  if isinstance(node, Mapping):
    for key in ("#text", "i"):
      if key in node:
        return _abstract_node_text(node[key])
    return ""
  if isinstance(node, list):
    # Several inline children with the same tag are parsed as a list.
    return " ".join(_abstract_node_text(item) for item in node)
  # Explicit raise: a bare `assert` would vanish under `python -O`.
  raise AssertionError(f"Abstract type: {type(node)}")


def get_abstract(dict) -> str:
  """Extract the abstract of a parsed PubMed record as a single string.

  Args:
    dict: nested mapping produced by parsing the record's XML (the parameter
      name shadows the builtin; it is kept for backward compatibility).

  Returns:
    The abstract text, or "" when the record has no abstract. For a structured
    abstract (a list of sections) each section is rendered as
    "<label>\\n<text>" (just "<text>" when it has no "@Label") and the
    sections are joined with newlines.

  Raises:
    AssertionError: if an abstract node has an unexpected type.
  """
  try:
    abstract = dict['PubmedArticleSet']['PubmedArticle']['MedlineCitation']['Article']['Abstract']['AbstractText']
  except (KeyError, TypeError):
    # Missing path, or an intermediate node that is empty / not a mapping.
    return ""

  if not isinstance(abstract, list):
    return _abstract_node_text(abstract)

  sections = []
  for section in abstract:
    body = _abstract_node_text(section)
    # Sections may lack a label or be plain strings; both used to crash.
    label = section.get("@Label") if isinstance(section, Mapping) else None
    sections.append(label + "\n" + body if label else body)
  # join() places a separator between all sections; the old value-based
  # "is this the last element" check dropped it after a section equal to the last.
  return "\n".join(sections)


def get_keywords(dict) -> list:
  """Extract the keyword strings from a parsed PubMed record.

  Args:
    dict: nested mapping produced by parsing the record's XML (the parameter
      name shadows the builtin; it is kept for backward compatibility).

  Returns:
    A list of keyword strings in document order; empty when the record has no
    keywords. Entries without text are skipped.
  """
  try:
    keyword_lists = dict['PubmedArticleSet']['PubmedArticle']['MedlineCitation']['KeywordList']
  except (KeyError, TypeError):
    # Missing path, or an intermediate node that is empty / not a mapping.
    return []

  # Repeated elements are parsed as a list, a single element as one mapping;
  # normalise both levels. The old code iterated a single Keyword mapping key
  # by key and its bare `except` hid the resulting TypeError, losing the keyword.
  if not isinstance(keyword_lists, list):
    keyword_lists = [keyword_lists]

  keywords = []
  for keyword_list in keyword_lists:
    if not isinstance(keyword_list, Mapping):
      continue
    entries = keyword_list.get('Keyword')
    if entries is None:
      continue
    if not isinstance(entries, list):
      entries = [entries]
    for entry in entries:
      text = entry.get('#text') if isinstance(entry, Mapping) else entry
      if isinstance(text, str) and text:
        keywords.append(text)
  return keywords


def _as_list(value) -> list:
  """Normalise a parsed XML child to a list (None -> [], single item -> [item])."""
  if value is None:
    return []
  if isinstance(value, list):
    return value
  return [value]


def get_refs(dict) -> list:
  """Collect the PubMed ids cited in a parsed PubMed record's reference list.

  Args:
    dict: nested mapping produced by parsing the record's XML (the parameter
      name shadows the builtin; it is kept for backward compatibility).

  Returns:
    A list of PubMed id strings in document order. References without a
    PubMed id (for example only a DOI) are skipped. Empty when the record has
    no reference list.

  Side effects:
    Increments the module-level ``global_special_ref_counter`` once per call
    when the reference list has an unusual shape (several ``ReferenceList``
    elements, or a node that is not a mapping).
  """
  global global_special_ref_counter
  try:
    reference_lists = dict['PubmedArticleSet']['PubmedArticle']['PubmedData']['ReferenceList']
  except (KeyError, TypeError):
    # Missing path, or an intermediate node that is empty / not a mapping.
    return []

  # Several <ReferenceList> elements are parsed as a list. The old code hit a
  # TypeError here and returned the raw node instead of a list of ids.
  special = isinstance(reference_lists, list)

  cites = []
  for reference_list in _as_list(reference_lists):
    if not isinstance(reference_list, Mapping):
      special = True
      continue
    for reference in _as_list(reference_list.get('Reference')):
      if not isinstance(reference, Mapping):
        special = True
        continue
      id_list = reference.get('ArticleIdList')
      if not isinstance(id_list, Mapping):
        # Reference given only as free-text citation, without ids.
        continue
      # A single <ArticleId> is parsed as one mapping; iterating it directly
      # used to walk its keys and return [] for the whole record.
      for article_id in _as_list(id_list.get('ArticleId')):
        if not isinstance(article_id, Mapping):
          continue
        if article_id.get('@IdType') == "pubmed" and '#text' in article_id:
          cites.append(article_id['#text'])
        # Other id types (doi, pmc, ...) are skipped; they no longer discard
        # the ids collected so far.

  if special:
    try:
      global_special_ref_counter += 1
    except NameError:
      # Counter not initialised yet (e.g. function used on its own).
      global_special_ref_counter = 1
  return cites

In [154]:
import csv

from pathlib import Path
from datetime import datetime

def save_row(csv_writer, file_name, paper_id, xml_str):
    """Build one output row with ``raw2row`` and write it through ``csv_writer``.

    NOTE(review): ``raw2row`` is not defined in the visible part of this
    notebook; confirm it exists before calling this helper.
    """
    csv_writer.writerow(raw2row(file_name, paper_id, xml_str))



def save_new_file(old_file_path, save_dir):
    """Convert one raw CSV of (paper id, XML) rows into a CSV of extracted fields.

    The input file is read without a header; column 0 is the paper id and
    column 1 the XML string. Output rows are
    ``[paper_id, date, title, authors, keywords, abstract, references]`` and
    are appended to ``<save_dir>/<input stem>_text.csv``.

    Args:
        old_file_path: path of the input CSV; its stem must have the form
            ``YYYY_MM_DD`` and is parsed as the date written to every row.
        save_dir: directory that receives the output file.

    Side effects:
        Increments the module-level ``global_failure_counter`` for every paper
        whose processing raises ``KeyError``; such papers are reported and
        skipped.

    Raises:
        ValueError: if the file stem does not match ``%Y_%m_%d``.
        Exception: any non-``KeyError`` error from ``process_xml`` is re-raised
            after the offending paper has been reported.
    """
    global global_failure_counter
    df = pd.read_csv(old_file_path, header=None)
    length = len(df)
    source_path = Path(old_file_path)
    stem = source_path.stem
    date = datetime.strptime(stem, '%Y_%m_%d')
    new_file_path = os.path.join(save_dir, stem + "_text.csv")
    # NOTE(review): append mode means re-running the notebook adds the same
    # rows again to an existing output file - confirm this is intended.
    # newline='' is what the csv module asks for on files handed to a writer.
    with open(new_file_path, 'a+', newline='', encoding='utf-8') as out:
        csv_writer = csv.writer(out, dialect='unix')
        for paper_id, xml_str in zip(df[0], df[1]):
            try:
                # The file name comes from the argument; the old code read a
                # module-level `file_name` that only exists while the driver
                # loop is running.
                title, authors, keywords, abstract, references = process_xml(
                    source_path.name, paper_id, xml_str)
            except KeyError:
                global_failure_counter += 1
                print(color.BOLD + color.RED + f'>>>>>>>> Failed to process {paper_id} [{stem}] <<<<<<<<<' + color.END)
                # Skip the row: falling through used to write the previous
                # paper's fields under this id (or NameError on the first row).
                continue
            except Exception:
                print(f"Finished to process {paper_id} [{stem}]")
                # Re-raise the real error; `assert False` hid it.
                raise
            csv_writer.writerow([paper_id, date, title, authors, keywords, abstract, references])
    print(f"Processed {length} papers published on {stem}.")

In [147]:
# ! pip install xmltodict
import xmltodict

def process_xml(file_name, paper_id, xml_str):
    """Parse one PubMed XML string and extract the fields written to the CSV.

    ``file_name`` and ``paper_id`` are accepted but not used by the extraction.

    Returns:
        ``[title, authors, keywords, abstract, references]``; any field that
        is an empty list is replaced by an empty string.
    """
    parsed = xmltodict.parse(xml_str)

    # Extraction order is kept: it decides which error surfaces first.
    authors = get_authors(parsed)
    title = get_title(parsed)
    abstract = get_abstract(parsed)
    keywords = get_keywords(parsed)
    references = get_refs(parsed)

    fields = (title, authors, keywords, abstract, references)
    return ["" if field == [] else field for field in fields]

In [156]:
# Driver: convert every raw file found under xmls/<year>/<month>/ into a
# "_text.csv" file under text_csv/<year>/<month>/.
year = "2022"
# month = 12
# Counters updated through `global` statements in get_refs / save_new_file;
# they are reset here so each run of this cell starts from zero.
global_special_ref_counter = 0
global_failure_counter = 0
# Months 4 through 12 (the end of range() is exclusive).
for m in range(4, 13):

        # Zero-padded month ("04", ..., "12"), used as the directory name.
        month = f"{m:02}"
        xml_dir = os.path.join(proj_dir, "xmls", year, month)
        save_dir = Path(os.path.join(proj_dir, "text_csv", year, month))
        # Create the output directory (and any missing parents) if needed.
        save_dir.mkdir(parents=True, exist_ok=True)

        # `file_name` becomes a module-level name; keep it stable, since code
        # in other cells can reference it as a global.
        for file_name in os.listdir(xml_dir):
            save_new_file(os.path.join(xml_dir, file_name), save_dir)
# Summary of the diagnostics gathered during the run.
print(f"global_special_ref_counter: {global_special_ref_counter}")
print(f"global_failure_counter: {global_failure_counter}")

Processed 42 papers published on 2022_04_04.
Processed 8 papers published on 2022_04_03.
Processed 21 papers published on 2022_04_02.
[1m>>>>>>>> Failed to process 35247637 [2022_04_01] <<<<<<<<<[0m
Processed 732 papers published on 2022_04_01.
Processed 37 papers published on 2022_04_05.
Processed 34 papers published on 2022_04_29.
Processed 42 papers published on 2022_04_28.
Processed 45 papers published on 2022_04_27.
Processed 39 papers published on 2022_04_26.
Processed 38 papers published on 2022_04_25.
Processed 8 papers published on 2022_04_24.
Processed 16 papers published on 2022_04_23.
Processed 43 papers published on 2022_04_22.
Processed 30 papers published on 2022_04_21.
Processed 43 papers published on 2022_04_20.
Processed 42 papers published on 2022_04_19.
Processed 36 papers published on 2022_04_18.
Processed 6 papers published on 2022_04_17.
Processed 16 papers published on 2022_04_16.
Processed 48 papers published on 2022_04_15.
Processed 35 papers published on 20

# Inspecting the errors

In [143]:
# df = pd.read_csv("/gdrive/MyDrive/dsr/pubmed_data/xmls/2022/04/2022_04_01.csv", header=None)
# problem_id = 35247637

# xml_str = df[df[0] == problem_id].iloc[0, 1]
# dict = xmltodict.parse(xml_str)
# # dict['PubmedArticleSet']['PubmedArticle']['MedlineCitation']['Article']['Abstract']['AbstractText']
# dict

{'PubmedArticleSet': {'pubmed': None}}