In [1]:
import matplotlib.pyplot as plt
import matplotlib

%matplotlib notebook

import pandas as pd
import numpy as np
import scipy as sp
from scipy import sparse

import xml.etree.ElementTree as et
import os

import time
import pickle
import memory_profiler
%load_ext memory_profiler

In [2]:
import gzip


In [3]:
from lxml import etree, objectify


In [40]:
level = 0
in_meta = False
def article_yielder(fname, limit = float("inf")):
  r = gzip.open(fname, "rb")
  for i, (event, elem) in enumerate(etree.iterparse(r, events=("end", "start"))):
    if event == "start" and elem.tag == "MedlineCitation":
      d = {}
    if i > limit:
      break
    if event == "end":
      for k in ['PMID', 'AbstractText', 'ISSN', 'Language', 'MedlineTA', 'ISSNLinking', 'ArticleTitle', 'Country', 'Title']:
        if elem.tag == k:
          d[k] = elem.text      
      if elem.tag == "MedlineCitation":
        yield d

import random        
from io import BytesIO
from pyarrow import Table
from pathlib import Path

def export_file(fn):
  rows = []
  for i, article in enumerate(article_yielder(fn)):
    rows.append(article)
  return Table.from_pylist(rows)
files = [*Path("data").glob("*.gz")]
random.shuffle(files)

export_file(files[0])

pyarrow.Table
PMID: string
ISSN: string
Title: string
ArticleTitle: string
AbstractText: string
Language: string
Country: string
MedlineTA: string
ISSNLinking: string
----
PMID: [["18222646","18222647","18222648","18222649","18222652",...,"18254138","18254137","18254139","18254140","18254136"]]
ISSN: [["0896-8411","0898-6568","0920-9964","0923-2508","0924-977X",...,"0269-3879","0960-7692","0269-3879","0269-3879","1520-7552"]]
Title: [["Journal of autoimmunity","Cellular signalling","Schizophrenia research","Research in microbiology","European neuropsychopharmacology : the journal of the European College of Neuropsychopharmacology",...,"Biomedical chromatography : BMC","Ultrasound in obstetrics & gynecology : the official journal of the International Society of Ultrasound in Obstetrics and Gynecology","Biomedical chromatography : BMC","Biomedical chromatography : BMC","Diabetes/metabolism research and reviews"]]
ArticleTitle: [["Will hematopoietic stem cell transplantation cure human au

# Functions

## Function for importing each xml file

In [4]:
def xml_import(xml_file):
    """Parses the following elements of each paper stored in the input XML file, and stores them in a pandas 
    DataFrame.
    
    Elements of each paper:
    - PDMI (stored in <PDMI>).
    - Title (stored in <ArticleTitle>).
    - Abstract (stored in <AbstractText>).
    - Language (stored in <Language>).
    - Journal (stored in <Title>).
    - Date (stored in <PubDate>).
    
    
    
    Details about information extracion: 
    - PDMI: If there is no tag <PDMI>, it will add 'no tag'. If there is more than one <PDMI>,
    it will import only the first one. If <PDMI> contains no text, it will add '' (empty string).
    
    - Title: If there is no tag <ArticleTitle>, it will add 'no tag'. If there is more than one
    <ArticleTitle>, it will import only the first one. If <ArticleTitle> contains no text, it will add '' (empty string).
    
    - Abstract: If there is no tag <Abstract> (parent of <AbstractText>), it will add '' (empty string).
    If there is more than one <AbstractText> inside <Abstract>, it will combine them into one list.
    If <AbstractText> contains no text, it will add '' (empty string).
    If there is more than one <Abstract> or other tags containing <AbstractText>, like 
    <OtherAbstract>, it will not get text from them. 
    
    - Language: If there is no tag <Language>, it will add 'no tag'. If there is more than one
    <Language>, it will import only the first one. If <Language> contains no text, it will add '' (empty string).
    
    - Journal: If there is no tag <Title>, it will add 'no tag'. If there is more than one <Title>,
    it will import only the first one. If <Title> contains no text, it will add '' (empty string).
    
    - Date: If there is no tag <PubDate>, it will add 'no tag'. It will combine all the <PubDate>'s childs' texts
    into one (due to the assymetry of the date storage, sometimes with <Day>, <Month> and <Year>, other times 
    with <MedlineDate>). If <PubDate> contains no further childs, it will print ' '.
    """
    
    
    xtree = et.parse(xml_file)
    xroot = xtree.getroot()

    dicc={}

    #PMID 
    ros=[]
    for child1 in xroot:
        for child2 in child1:
            for element in child2.iter('MedlineCitation'):
                tag=element.find('PMID')
                if tag is None:
                    ros.append(['no tag'])
                else:
                    res=[]
                    if not tag.text :
                        res.append('')
                    else:
                        res.append(tag.text)
                    ros.append(res)


    ros=[' '.join(ele) for ele in ros]
    dicc['PMID']=ros

    
    #Title 
    ros=[]
    for child1 in xroot:
        for child2 in child1:
            for child3 in child2:
                for article in child3.iter('Article'):
                    tag=article.find('ArticleTitle')
                    if tag is None:
                        ros.append(['no tag'])
                    else:
                        res=[]
                        if not tag.text :
                            res.append('')
                        else:
                            res.append(tag.text)
                        ros.append(res)

    ros=[' '.join(ele) for ele in ros]
    dicc['Title']=ros


    #Abstract 
    ros=[]
    for child1 in xroot:
        for child2 in child1:
            for child3 in child2:
                for article in child3.iter('Article'):
                    tag=article.find('Abstract')
                    if tag is None:
                        ros.append([''])
                    else:
                        for child4 in child3:
                            for elem in child4.iter('Abstract'):
                                res=[]
                                for AbstractText in elem.iter('AbstractText'):
                                    if not AbstractText.text:
                                        res.append('')
                                    else:
                                        res.append(AbstractText.text)
                                ros.append(res)

    ros=[' '.join(ele) for ele in ros]
    dicc['AbstractText']=ros
    
    
    #Language 
    ros=[]
    for child1 in xroot:
        for child2 in child1:
            for child3 in child2:
                for article in child3.iter('Article'):
                    tag=article.find('Language')
                    if tag is None:
                        ros.append(['no tag'])
                    else:
                        res=[]
                        if not tag.text :
                            res.append('')
                        else:
                            res.append(tag.text)
                        ros.append(res)

    ros=[' '.join(ele) for ele in ros]
    dicc['Language']=ros

    
    #Journal 
    ros=[]
    for child1 in xroot:
        for child2 in child1:
            for child3 in child2:
                for child4 in child3:
                    for journal in child4.iter('Journal'):
                        tag=journal.find('Title')
                        if tag is None:
                            ros.append(['no tag'])
                        else:
                            res=[]
                            if not tag.text:
                                res.append('')
                            else:
                                res.append(tag.text)
                            ros.append(res)

    ros=[' '.join(ele) for ele in ros]
    dicc['Journal']=ros

    
    #Date
    ros=[]
    for child1 in xroot:
        for child2 in child1:
            for child3 in child2:
                for child4 in child3:
                    for child5 in child4:
                        for JI in child5.iter('JournalIssue'):
                            tag=JI.find('PubDate')
                            if tag is None:
                                ros.append(['no tag'])
                            else:
                                res=[]
                                for elem in tag:
                                    res.append(elem.text)
                                ros.append(res)
                                
    ros=[' '.join(ele) for ele in ros]
    dicc['Date']=ros
    
    out=pd.DataFrame.from_dict(dicc)
    return out, dicc

## Function for importing all files from a directory

In [7]:
def import_all_files(path, order_files=False):
    """
    Imports all xml files from a directory into a combined dataframe using the function xml_import created above.
    
    Input:
        - path : srt "" with the path of the directory with the files you want to import.
        - order_files=False : if it is True it will print the order in which files are being imported.
        
    Output:
        - final_df : Panda Dataframe with all the xml files from the directory imported and merged 
        together (concatenated in the order that they were in the directory (from up to down)).
    """
    # name_files has the names of both .xml files and .gz.md5 files
    name_files=os.listdir(path)
    
    # we select only the .xml files
    len_filenames_list=map(len, name_files)
    len_filenames=np.fromiter(len_filenames_list, dtype=np.int64,count=len(name_files))

    name_files_array=np.array(name_files)
    name_xml_files=name_files_array[len_filenames==17]
    
    # import
    frame_all_df=[]
    
    for i in range(0,len(name_xml_files)):
        path_file=path+name_xml_files[i]
        if order_files==True:
            print(name_xml_files[i])
        df,dic=xml_import(str(path_file))
        frame_all_df.append(df)

    final_df=pd.concat(frame_all_df,ignore_index=True)
    return final_df

# Import data

In [8]:
%%time

path="data"

all_files_df=import_all_files(path, order_files=True)

#save results
all_files_df.to_pickle("variables/all_files_df")

pubmed21n0038.xml
pubmed21n0722.xml
pubmed21n0814.xml
pubmed21n0171.xml
pubmed21n0124.xml
pubmed21n0864.xml
pubmed21n0876.xml
pubmed21n0133.xml
pubmed21n0539.xml
pubmed21n0119.xml
pubmed21n0056.xml
pubmed21n0497.xml
pubmed21n0821.xml
pubmed21n0241.xml
pubmed21n0986.xml
pubmed21n0769.xml
pubmed21n0857.xml
pubmed21n0073.xml
pubmed21n0479.xml
pubmed21n0044.xml
pubmed21n0208.xml
pubmed21n0071.xml
pubmed21n0014.xml
pubmed21n0795.xml
pubmed21n0074.xml
pubmed21n0662.xml
pubmed21n0543.xml
pubmed21n0555.xml
pubmed21n0048.xml
pubmed21n0512.xml
pubmed21n0152.xml
pubmed21n0078.xml
pubmed21n0093.xml
pubmed21n1051.xml
pubmed21n0304.xml
pubmed21n0172.xml
pubmed21n0361.xml
pubmed21n0831.xml
pubmed21n0376.xml
pubmed21n0203.xml
pubmed21n0885.xml
pubmed21n0740.xml
pubmed21n0154.xml
pubmed21n1005.xml
pubmed21n0398.xml
pubmed21n0079.xml
pubmed21n1011.xml
pubmed21n0049.xml
pubmed21n0929.xml
pubmed21n0404.xml
pubmed21n0887.xml
pubmed21n0911.xml
pubmed21n0668.xml
pubmed21n0231.xml
pubmed21n0013.xml
pubmed21n0

In [12]:
print('There are {} papers'.format(all_files_df.shape[0]))

There are 31820051 papers


# Filtering abstracts

#### First filtering by: 
- Empty abstracts
- Language
- Correction abstracts

In [13]:
%%time

# Eliminate empty abstracts
clean_df=all_files_df[all_files_df.AbstractText != '']

# Eliminate non-english papers
clean_df=clean_df[clean_df.Language == 'eng']

# Eliminate correction abstracts
weird_text='An amendment to this paper has been published and can be accessed via a link at the top of the paper.'
clean_df=clean_df[clean_df.AbstractText != weird_text]

# print size
print('After first cleaning, there are {} papers'.format(clean_df.shape[0]))


After first cleaning, there are 19511885 papers
--- 0.5490604281425476 minutes ---


## Threshold and cut off
Based on the abstract string length. We observed that papers with either too short or too long abstracts corresponded to non-meaningful abstracts. Examples of non-meaningful abstracts are texts containing only links, symbols, or simply empty. Those type of abstracts create artifacts in the embedding that hinder the quality of the visualization, therefore they were removed.

The values of cutoff and threshold were chosen trying to balance the fraction of papers filtered out and the artifacts observed in the embedding.

### Cut off = 4000

In [14]:
print('Before cut off, there are {} papers'.format(clean_df.shape[0]))
abstracts=clean_df['AbstractText'].tolist()
len_strings=map(len, abstracts)
len_abstracts=np.fromiter(len_strings, dtype=np.int64,count=len(abstracts))


cut_off=4000
clean_df=clean_df[len_abstracts<cut_off]
print('After cut off, there are {} papers'.format(clean_df.shape[0]))

Before cut off, there are 19511885 papers
After cut off, there are 19500414 papers


### Threshold = 250

In [15]:
threshold=250
len_short_abstracts=len_abstracts[len_abstracts<cut_off]
clean_df=clean_df[len_short_abstracts>threshold]
print('After threshold, there are {} papers'.format(clean_df.shape[0]))

After threshold, there are 19016308 papers


In [16]:
#save df
clean_df.to_pickle("variables/clean_df")