In [1]:
import matplotlib.pyplot as plt
import matplotlib

%matplotlib notebook

import pandas as pd
import numpy as np
import scipy as sp
from scipy import sparse

import xml.etree.ElementTree as et
import os

import time
import pickle
import memory_profiler
%load_ext memory_profiler

In [2]:
import gzip


In [3]:
from lxml import etree, objectify
import random        
from io import BytesIO, StringIO
from pyarrow import Table, json as pajson, parquet
import pyarrow as pa
from pathlib import Path
import json

In [4]:

class Article():
  """
  A data structure to absorb parsed xml
  """
  def __init__(self):
    self.struct = {}
    self.hierarchy = []
    self.position = self.struct

  @property
  def flat(self):
    return flattened(self.struct)

  def set_target(self, name):
    self.hierarchy.append(name)
    if name in list_elements:
      if not name in self.position:
        self.position[name] = []
      self.position[name].append({})
      self.position = self.position[name][-1]
    else:
      if name in self.position:
        print(self.flat)
        raise KeyError(f"{name} already exists")
      self.position[name] = {}
      self.position = self.position[name]
  def pop_target(self, check_value = None):
    value = self.hierarchy.pop()
    if check_value is not None:
      assert value == check_value
    self.position = self.struct
    for i in range(len(self.hierarchy)):
      if isinstance(self.position, list):
        self.position = self.position[-1]
      self.position = self.position[self.hierarchy[i]]
    if isinstance(self.position, list):
      self.position = self.position[-1]

  def set_value(self, value):
    if 'text' in self.position:
      if isinstance(self.position['text'], list):
        self.position['text'].append(value)
      else:
        self.position['text'] = [self.position['text'], value]
    else:
      self.position['text'] = value


In [23]:
# Keys that can be repeated in the XML. By setting them explicitly, force all occurrences
# to be lists allowing coalescing into tables.
list_elements = { 
  'Author', 'sup', 'sub', "Title",
  'PublicationType', 'Language', 
  'Chemical', 'b', 'u', 
  'i', 'SupplMeshName', "PublicationType",
  "MeshHeading",
  'QualifierName', 'GeneSymbol', 'SpaceFlightMission',
  'PubMedPubDate', 'DataBank',
  'ArticleId', "Identifier",
  'Reference', 'OtherID', 
  'Grant', 'OtherAbstract', 
  'AbstractText', 'DispFormula',
  'AccessionNumber',
  'Investigator', 'GeneralNote', 'OtherId', 'Keyword', 'ELocationID',
  'PersonalNameSubject', 'CommentsCorrections', 'AffiliationInfo'
}

yuck = {"PubmedArticle", "PubmedArticleSet", 'AuthorList', "MeshHeadingList", "KeywordList", 'ReferenceList'}

text_fields = {
  # Force some fields to be just text.
  'MedlinePgn', 'AbstractText', 'OtherAbstract', 'ArticleTitle', 'Keyword', 'Country', 
  'Suffix', 'Citation', 'Agency'
}

def yield_articles(fname, limit = float("inf")):
  r = gzip.open(fname, "rb")
  yielded = 0
  for i, (event, elem) in enumerate(etree.iterparse(r, events=("start", "end"))):
    if event == "start":
      if not elem.tag in yuck and not elem.tag.startswith("{http://www.w3.org/1998/Math/MathML}"):
        current_article.set_target(elem.tag)
      if elem.tag == 'PubmedArticle':
        current_article = Article()
        continue
    if event == "end":
      if elem.tag == 'PubmedArticle':
        try:
          yield current_article.flat
        except RecursionError:
          print(current_article.struct)
          raise
        yielded += 1
        if yielded >= limit:
          return
        current_article = None
      if elem.text and elem.text.strip():
        current_article.set_value(elem.text.strip())
      if not elem.tag in yuck and not elem.tag.startswith("{http://www.w3.org/1998/Math/MathML}"):
        current_article.pop_target(elem.tag)

def just_the_text(struct) -> str:
  if isinstance(struct, list): return "\n".join([just_the_text(i) for i in struct])
  elif isinstance(struct, str): return struct
  elif isinstance(struct, dict):
    output = []
    for k, v in struct.items():
      if k != 'text':
        output.append(just_the_text(v))
      else:
        output.append(just_the_text(v))
    return " ".join(output)

def flattened(struct):
  if isinstance(struct, list):
    return [flattened(i) for i in struct]
  elif isinstance(struct, dict):
    if "text" in struct:
      return just_the_text(struct)
    output = {}
    for k, v in struct.items():
      if k in text_fields:
        output[k] = just_the_text(v)
      else:
        output[k] = flattened(v)
    return output
  else:
    raise

def export_file(fn, limit = float("inf"), how = "obj"):
    
  input = Path(fn)
  output = Path("parquet", Path(fn).with_suffix("").with_suffix(".parquet").name)
  if output.exists():
    return
  fakeout = BytesIO()
  if how == "obj":
    read = Table.from_pylist([*yield_articles(fn, limit)])
  elif how == "json":
    for i, article in enumerate(yield_articles(fn, limit)):
      fakeout.write(json.dumps(article).encode("utf-8") + b"\n")
      print(i, end = "\r")
    fakeout.seek(0)
    read = pa.json.read_json(fakeout)
  else:
    raise ValueError(f"{how} is not a valid value for how")
  parquet.write_table(read, output)
#  return Table.from_pylist([*yield_articles(fn, limit)])


files = [*Path("data").glob("*.gz")]
random.shuffle(files)

for file in files:
  print(file)
  export_file(file, how = "json")



data/pubmed22n0595.xml.gz
data/pubmed22n0557.xml.gz
data/pubmed22n0882.xml.gz
29999

ArrowInvalid: JSON parse error: Column(/MedlineCitation/Article/GrantList/Grant/[]/Agency) changed from string to object in row 127

In [49]:
counts.most_common(100)

[('MedlineCitation--MeshHeadingList--MeshHeading', 269975),
 ('MedlineCitation--MeshHeadingList--MeshHeading--DescriptorName', 269975),
 ('MedlineCitation--MeshHeadingList--MeshHeading--QualifierName', 130828),
 ('MedlineCitation--Article--AuthorList--Author', 90036),
 ('MedlineCitation--Article--AuthorList--Author--LastName', 90036),
 ('PubmedData--History--PubMedPubDate', 90024),
 ('PubmedData--History--PubMedPubDate--Year', 90024),
 ('PubmedData--History--PubMedPubDate--Month', 90024),
 ('PubmedData--History--PubMedPubDate--Day', 90024),
 ('MedlineCitation--Article--AuthorList--Author--ForeName', 89992),
 ('MedlineCitation--Article--AuthorList--Author--Initials', 89992),
 ('PubmedData--History--PubMedPubDate--Hour', 60003),
 ('PubmedData--History--PubMedPubDate--Minute', 60003),
 ('MedlineCitation--Article--PublicationTypeList--PublicationType', 57487),
 ('PubmedData--ArticleIdList--ArticleId', 51865),
 ('PubmedData--ReferenceList--Reference', 45141),
 ('PubmedData--ReferenceList--R

In [42]:
level = 0
in_meta = False

def article_yielder(fname, limit = float("inf")):
  """
  fname: a gzipped xml file from pubmed.
  limit: The number of **lines** to stop after. For debugging.
  """
  r = gzip.open(fname, "rb")
  current_accumulator = None
  for i, (event, elem) in enumerate(etree.iterparse(r, events=("end", "start"))):
    if event == "start":
      if elem.tag == "MedlineCitation":
        # Start of whole new citation.
        d = {
          'authors': [],
          'mesh': [],
        }
      if elem.tag in ['Author', :

    if i > limit:
      break
    if event == "end":
      for k in ['PMID', 'AbstractText', 'ISSN', 'Language', 'MedlineTA', 'ISSNLinking', 'ArticleTitle', 'Country', 'Title', 'NlmUniqueID']:
        if elem.tag == k:
          if not k in d: # Don't overwrite if we already have this key
            d[k] = elem.text      
      if elem.tag == "MedlineCitation":
        yield d

def export_file(fn):
  # Extract some relevant parts of a gzip file to an arrow table.
#  rows = []
#  for i, article in enumerate(article_yielder(fn)):
#    rows.append(article)
  return Table.from_pylist([*article_yielder(fn)])

files = [*Path("data").glob("*.gz")]
random.shuffle(files)

tb = export_file(files[0])

pyarrow.Table
PMID: string
ISSN: string
Title: string
ArticleTitle: string
AbstractText: string
Language: string
Country: string
MedlineTA: string
ISSNLinking: string
----
PMID: [["30927714","30927715","30927716","30927717","30927718",...,"30958281","30958282","30958283","30958284","30958285"]]
ISSN: [["1878-5905","1878-5905","1879-0046","1527-3326","1527-3326",...,"1512-0112","1512-0112","1512-0112","1512-0112","1512-0112"]]
Title: [["Biomaterials","Biomaterials","Drug and alcohol dependence","Solid state nuclear magnetic resonance","Solid state nuclear magnetic resonance",...,"Georgian medical news","Georgian medical news","Georgian medical news","Georgian medical news","Georgian medical news"]]
ArticleTitle: [["Ultrasmall Cu","miR-100-5p-abundant exosomes derived from infrapatellar fat pad MSCs protect articular cartilage and ameliorate gait abnormalities via inhibition of mTOR in osteoarthritis.","Detection of heroin intake in patients in substitution treatment using oral fluid as 

# Functions

## Function for importing each xml file

In [4]:
def xml_import(xml_file):
    """Parses the following elements of each paper stored in the input XML file, and stores them in a pandas 
    DataFrame.
    
    Elements of each paper:
    - PDMI (stored in <PDMI>).
    - Title (stored in <ArticleTitle>).
    - Abstract (stored in <AbstractText>).
    - Language (stored in <Language>).
    - Journal (stored in <Title>).
    - Date (stored in <PubDate>).
    
    
    
    Details about information extracion: 
    - PDMI: If there is no tag <PDMI>, it will add 'no tag'. If there is more than one <PDMI>,
    it will import only the first one. If <PDMI> contains no text, it will add '' (empty string).
    
    - Title: If there is no tag <ArticleTitle>, it will add 'no tag'. If there is more than one
    <ArticleTitle>, it will import only the first one. If <ArticleTitle> contains no text, it will add '' (empty string).
    
    - Abstract: If there is no tag <Abstract> (parent of <AbstractText>), it will add '' (empty string).
    If there is more than one <AbstractText> inside <Abstract>, it will combine them into one list.
    If <AbstractText> contains no text, it will add '' (empty string).
    If there is more than one <Abstract> or other tags containing <AbstractText>, like 
    <OtherAbstract>, it will not get text from them. 
    
    - Language: If there is no tag <Language>, it will add 'no tag'. If there is more than one
    <Language>, it will import only the first one. If <Language> contains no text, it will add '' (empty string).
    
    - Journal: If there is no tag <Title>, it will add 'no tag'. If there is more than one <Title>,
    it will import only the first one. If <Title> contains no text, it will add '' (empty string).
    
    - Date: If there is no tag <PubDate>, it will add 'no tag'. It will combine all the <PubDate>'s childs' texts
    into one (due to the assymetry of the date storage, sometimes with <Day>, <Month> and <Year>, other times 
    with <MedlineDate>). If <PubDate> contains no further childs, it will print ' '.
    """
    
    
    xtree = et.parse(xml_file)
    xroot = xtree.getroot()

    dicc={}

    #PMID 
    ros=[]
    for child1 in xroot:
        for child2 in child1:
            for element in child2.iter('MedlineCitation'):
                tag=element.find('PMID')
                if tag is None:
                    ros.append(['no tag'])
                else:
                    res=[]
                    if not tag.text :
                        res.append('')
                    else:
                        res.append(tag.text)
                    ros.append(res)


    ros=[' '.join(ele) for ele in ros]
    dicc['PMID']=ros

    
    #Title 
    ros=[]
    for child1 in xroot:
        for child2 in child1:
            for child3 in child2:
                for article in child3.iter('Article'):
                    tag=article.find('ArticleTitle')
                    if tag is None:
                        ros.append(['no tag'])
                    else:
                        res=[]
                        if not tag.text :
                            res.append('')
                        else:
                            res.append(tag.text)
                        ros.append(res)

    ros=[' '.join(ele) for ele in ros]
    dicc['Title']=ros


    #Abstract 
    ros=[]
    for child1 in xroot:
        for child2 in child1:
            for child3 in child2:
                for article in child3.iter('Article'):
                    tag=article.find('Abstract')
                    if tag is None:
                        ros.append([''])
                    else:
                        for child4 in child3:
                            for elem in child4.iter('Abstract'):
                                res=[]
                                for AbstractText in elem.iter('AbstractText'):
                                    if not AbstractText.text:
                                        res.append('')
                                    else:
                                        res.append(AbstractText.text)
                                ros.append(res)

    ros=[' '.join(ele) for ele in ros]
    dicc['AbstractText']=ros
    
    
    #Language 
    ros=[]
    for child1 in xroot:
        for child2 in child1:
            for child3 in child2:
                for article in child3.iter('Article'):
                    tag=article.find('Language')
                    if tag is None:
                        ros.append(['no tag'])
                    else:
                        res=[]
                        if not tag.text :
                            res.append('')
                        else:
                            res.append(tag.text)
                        ros.append(res)

    ros=[' '.join(ele) for ele in ros]
    dicc['Language']=ros

    
    #Journal 
    ros=[]
    for child1 in xroot:
        for child2 in child1:
            for child3 in child2:
                for child4 in child3:
                    for journal in child4.iter('Journal'):
                        tag=journal.find('Title')
                        if tag is None:
                            ros.append(['no tag'])
                        else:
                            res=[]
                            if not tag.text:
                                res.append('')
                            else:
                                res.append(tag.text)
                            ros.append(res)

    ros=[' '.join(ele) for ele in ros]
    dicc['Journal']=ros

    
    #Date
    ros=[]
    for child1 in xroot:
        for child2 in child1:
            for child3 in child2:
                for child4 in child3:
                    for child5 in child4:
                        for JI in child5.iter('JournalIssue'):
                            tag=JI.find('PubDate')
                            if tag is None:
                                ros.append(['no tag'])
                            else:
                                res=[]
                                for elem in tag:
                                    res.append(elem.text)
                                ros.append(res)
                                
    ros=[' '.join(ele) for ele in ros]
    dicc['Date']=ros
    
    out=pd.DataFrame.from_dict(dicc)
    return out, dicc

## Function for importing all files from a directory

In [7]:
def import_all_files(path, order_files=False):
    """
    Imports all xml files from a directory into a combined dataframe using the function xml_import created above.
    
    Input:
        - path : srt "" with the path of the directory with the files you want to import.
        - order_files=False : if it is True it will print the order in which files are being imported.
        
    Output:
        - final_df : Panda Dataframe with all the xml files from the directory imported and merged 
        together (concatenated in the order that they were in the directory (from up to down)).
    """
    # name_files has the names of both .xml files and .gz.md5 files
    name_files=os.listdir(path)
    
    # we select only the .xml files
    len_filenames_list=map(len, name_files)
    len_filenames=np.fromiter(len_filenames_list, dtype=np.int64,count=len(name_files))

    name_files_array=np.array(name_files)
    name_xml_files=name_files_array[len_filenames==17]
    
    # import
    frame_all_df=[]
    
    for i in range(0,len(name_xml_files)):
        path_file=path+name_xml_files[i]
        if order_files==True:
            print(name_xml_files[i])
        df,dic=xml_import(str(path_file))
        frame_all_df.append(df)

    final_df=pd.concat(frame_all_df,ignore_index=True)
    return final_df

# Import data

In [8]:
%%time

path="data"

all_files_df=import_all_files(path, order_files=True)

#save results
all_files_df.to_pickle("variables/all_files_df")

pubmed21n0038.xml
pubmed21n0722.xml
pubmed21n0814.xml
pubmed21n0171.xml
pubmed21n0124.xml
pubmed21n0864.xml
pubmed21n0876.xml
pubmed21n0133.xml
pubmed21n0539.xml
pubmed21n0119.xml
pubmed21n0056.xml
pubmed21n0497.xml
pubmed21n0821.xml
pubmed21n0241.xml
pubmed21n0986.xml
pubmed21n0769.xml
pubmed21n0857.xml
pubmed21n0073.xml
pubmed21n0479.xml
pubmed21n0044.xml
pubmed21n0208.xml
pubmed21n0071.xml
pubmed21n0014.xml
pubmed21n0795.xml
pubmed21n0074.xml
pubmed21n0662.xml
pubmed21n0543.xml
pubmed21n0555.xml
pubmed21n0048.xml
pubmed21n0512.xml
pubmed21n0152.xml
pubmed21n0078.xml
pubmed21n0093.xml
pubmed21n1051.xml
pubmed21n0304.xml
pubmed21n0172.xml
pubmed21n0361.xml
pubmed21n0831.xml
pubmed21n0376.xml
pubmed21n0203.xml
pubmed21n0885.xml
pubmed21n0740.xml
pubmed21n0154.xml
pubmed21n1005.xml
pubmed21n0398.xml
pubmed21n0079.xml
pubmed21n1011.xml
pubmed21n0049.xml
pubmed21n0929.xml
pubmed21n0404.xml
pubmed21n0887.xml
pubmed21n0911.xml
pubmed21n0668.xml
pubmed21n0231.xml
pubmed21n0013.xml
pubmed21n0

In [12]:
print('There are {} papers'.format(all_files_df.shape[0]))

There are 31820051 papers


# Filtering abstracts

#### First filtering by: 
- Empty abstracts
- Language
- Correction abstracts

In [13]:
%%time

# Eliminate empty abstracts
clean_df=all_files_df[all_files_df.AbstractText != '']

# Eliminate non-english papers
clean_df=clean_df[clean_df.Language == 'eng']

# Eliminate correction abstracts
weird_text='An amendment to this paper has been published and can be accessed via a link at the top of the paper.'
clean_df=clean_df[clean_df.AbstractText != weird_text]

# print size
print('After first cleaning, there are {} papers'.format(clean_df.shape[0]))


After first cleaning, there are 19511885 papers
--- 0.5490604281425476 minutes ---


## Threshold and cut off
Based on the abstract string length. We observed that papers with either too short or too long abstracts corresponded to non-meaningful abstracts. Examples of non-meaningful abstracts are texts containing only links, symbols, or simply empty. Those type of abstracts create artifacts in the embedding that hinder the quality of the visualization, therefore they were removed.

The values of cutoff and threshold were chosen trying to balance the fraction of papers filtered out and the artifacts observed in the embedding.

### Cut off = 4000

In [14]:
print('Before cut off, there are {} papers'.format(clean_df.shape[0]))
abstracts=clean_df['AbstractText'].tolist()
len_strings=map(len, abstracts)
len_abstracts=np.fromiter(len_strings, dtype=np.int64,count=len(abstracts))


cut_off=4000
clean_df=clean_df[len_abstracts<cut_off]
print('After cut off, there are {} papers'.format(clean_df.shape[0]))

Before cut off, there are 19511885 papers
After cut off, there are 19500414 papers


### Threshold = 250

In [15]:
threshold=250
len_short_abstracts=len_abstracts[len_abstracts<cut_off]
clean_df=clean_df[len_short_abstracts>threshold]
print('After threshold, there are {} papers'.format(clean_df.shape[0]))

After threshold, there are 19016308 papers


In [16]:
#save df
clean_df.to_pickle("variables/clean_df")