# Load data from all_of_plos collection

Data is in JATS XML format
https://en.wikipedia.org/wiki/Journal_Article_Tag_Suite


GETTING STARTED:
Just download the entire collection of xml files (>10GB)

Alternatively, the official repo provides a pip package that is supposed to download the collection for you (but it didn't work for me):
official repo: https://github.com/PLOS/allofplos
Install All Of Plos:        pip install allofplos                         
                            python -m allofplos.update              (installs the >10GB of article files)
                            
Nice guide to usage (although the instructions did not work for me): http://www.thehackerwithin.org/berkeley/plos.html
This project could potentially also be helpful, from a collaborator: https://github.com/titipata/pubmed_parser
Another potentially useful repo: https://github.com/elifesciences/jats-scraper
               

In [215]:
import os
import numpy as np
import random
from bs4 import BeautifulSoup # using beautiful soup as an interface to the 'lxml' xml parser
                                # note - you could also use the lxml parser natively...
                                # performance will be better (e.g. if parsing becomes a bottleneck)
# directory containing the downloaded allofplos article XML files
# (machine-specific; change this to the location of your local copy)
path2directory = '/home/brch/Data/allofplos/'


In [146]:
# load all the xml filenames from the directory path into a list
#   os.listdir returns every entry of the directory in arbitrary order, so
#   keep only the .xml files and sort them: this keeps stray files out of the
#   list and makes a given document number refer to the same article on every run
#   (glob.glob(path2directory + '*.xml') is an equivalent alternative)

xml_list = sorted(name for name in os.listdir(path2directory)
                  if name.lower().endswith('.xml'))

L = len(xml_list)
print(L) # ~250,000 articles

254570


In [196]:
# pick a random article index in 0..L-1
#   random.randint(0, L) includes L itself, which would index one past the end of xml_list
ref_idx = random.randrange(L)
print('document number {}'.format(ref_idx))

document number 198116


In [216]:
# example of how to read from one of the xml files
#   os.path.join works whether or not path2directory ends with a separator
path = os.path.join(path2directory, xml_list[ref_idx])
print('PATH: {}'.format(path))
# open in binary mode so the parser decodes the file using the encoding declared
# in the XML itself instead of the platform's default text encoding
with open(path, 'rb') as f:
    xml_soup = BeautifulSoup(f,'xml')
    print('CONTENTS:')
    print(xml_soup.prettify()) # print the entire document

    
    
        

PATH: /home/brch/Data/allofplos/journal.pgen.1000389.xml
CONTENTS:
<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v3.0 20080202//EN" "http://dtd.nlm.nih.gov/publishing/3.0/journalpublishing3.dtd">
<article article-type="research-article" dtd-version="3.0" xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink">
 <front>
  <journal-meta>
   <journal-id journal-id-type="publisher-id">
    plos
   </journal-id>
   <journal-id journal-id-type="nlm-ta">
    PLoS Genet
   </journal-id>
   <journal-id journal-id-type="pmc">
    plosgen
   </journal-id>
   <!--===== Grouping journal title elements =====-->
   <journal-title-group>
    <journal-title>
     PLoS Genetics
    </journal-title>
   </journal-title-group>
   <issn pub-type="ppub">
    1553-7390
   </issn>
   <issn pub-type="epub">
    1553-7404
   </issn>
   <publisher>
    <publisher-name>
     Public Library of Science
    </publish

In [212]:
# functional form

def _node_text(node):
    """Return ``node.text``, or an empty string when the element was not found."""
    return '' if node is None else node.text


def extract_document_info(path, printPath=True, printXML=False, printVerbose=True):
    """Extract title, authors, abstract, DOI and publication date from one article file.

    Args:
        path: path to a single article XML file.
        printPath: if True, print the path before parsing.
        printXML: if True, print the prettified XML document.
        printVerbose: if True, print each extracted field.

    Returns:
        dict with string values under the keys 'Title', 'Authors', 'Abstract',
        'DOI' and 'Datestring'. A field whose element is absent from the
        document comes back as an empty string instead of raising.
    """
    if printPath:
        print('PATH: {}'.format(path))

    # read as bytes so the parser decodes using the encoding declared in the
    # XML itself rather than the platform's default text encoding; the whole
    # document is parsed here, so the file can be closed right away
    with open(path, 'rb') as f:
        xml_soup = BeautifulSoup(f, 'xml')

    if printXML:
        print('CONTENTS:')
        print(xml_soup.prettify())

    # the first article-title = the paper itself
    # subsequent article-titles = the references
    title = _node_text(xml_soup.find('article-title'))

    abstract = ' '.join(tag.text for tag in xml_soup.find_all('abstract'))

    doi = _node_text(xml_soup.find(attrs={'pub-id-type': 'doi'}))

    # only the first pub-date is used; 'day' is omitted for simplicity because
    # some entries don't have one, and month/year are each optional here too
    pub_date = xml_soup.find('pub-date')
    if pub_date is None:
        date_parts = []
    else:
        date_parts = [_node_text(pub_date.find('month')),
                      _node_text(pub_date.find('year'))]
    datestring = ' '.join(part for part in date_parts if part)

    # todo there's a lot more to do here
    #    e.g. look into the attribute name-style="western"
    #         pull out the institutional affiliations
    #         look for an author ID field to supplement the name alone
    # note: the contrib elements also contain affiliations and roles
    name_strings = []
    for tag in xml_soup.find_all(attrs={'contrib-type': 'author'}):
        surname = _node_text(tag.find('surname'))
        given_name = ' '.join(gn.text for gn in tag.find_all('given-names'))
        # lastname   firstname   middle initial (empty parts are left out)
        name_string = ' '.join(part for part in (surname, given_name) if part)
        if not name_string:
            # no personal name: presumably a group author stored under
            # <collab> -- TODO confirm against the corpus
            name_string = _node_text(tag.find('collab')).strip()
        if name_string:
            name_strings.append(name_string)
    authors = ', '.join(name_strings)  # comma separated

    if printVerbose:
        print('')
        print('Title: {}'.format(title))
        print('Authors: {}'.format(authors))
        print('Abstract: {}'.format(abstract))
        print('DOI: {}'.format(doi))
        print('Datestring: {}'.format(datestring))

    obj = {'Title': title,
           'Authors': authors,
           'Abstract': abstract,
           'DOI': doi,
           'Datestring': datestring}
    return obj


In [214]:
# build the path with os.path.join so it is correct whether or not
# path2directory ends with a separator
path = os.path.join(path2directory, xml_list[ref_idx])

# parse the selected article without the per-field printout, then show the result dict
test_obj = extract_document_info(path,printVerbose=False)
print(test_obj)

PATH: /home/brch/Data/allofplos/journal.pgen.1000389.xml
{'Title': 'Altered Hematopoiesis in Mice Lacking DNA Polymerase μ Is Due to Inefficient Double-Strand Break Repair', 'Authors': 'Lucas Daniel, Escudero Beatriz, Ligos José Manuel, Segovia Jose Carlos, Estrada Juan Camilo, Terrados Gloria, Blanco Luis, Samper Enrique, Bernad Antonio', 'Abstract': '\nPolymerase mu (Polμ) is an error-prone, DNA-directed DNA polymerase that participates in non-homologous end-joining (NHEJ) repair. In vivo, Polμ deficiency results in impaired Vκ-Jκ recombination and altered somatic hypermutation and centroblast development. In Polμ−/− mice, hematopoietic development was defective in several peripheral and bone marrow (BM) cell populations, with about a 40% decrease in BM cell number that affected several hematopoietic lineages. Hematopoietic progenitors were reduced both in number and in expansion potential. The observed phenotype correlates with a reduced efficiency in DNA double-strand break (DSB) r

In [47]:
# first steps:

# parse the title

# parse the abstract

# parse the publication year



# next steps:

# parse the authors

# parse the introduction

# grab the methods section

# grab the results

# grab the discussion

# grab the references