In [51]:
import os
import numpy as np
import random
import sqlite3
from bs4 import BeautifulSoup # using beautiful soup as an interface to the 'lxml' xml parser
                                # note - you could also use the lxml parser natively...
                                # performance will be better (e.g. if parsing becomes a bottleneck)
# Directory holding the article XML files that are listed and parsed below.
# The trailing slash is kept so that plain string concatenation with a filename
# still yields a valid path.
path2directory = '/home/brch/Data/allofplos/'
# Directory in which the SQLite database file is created (trailing slash kept
# for the same reason).
target_db_path = '/home/brch/Data/allofplos_sql/'
# Used both as the database file stem ('<name>.db') and as the name of the table
# created inside that database.
target_db_name = 'allofplos_proto_version' # sql database for parsed abstracts etc


In [52]:
# helper function to extract some basic info from the xml files
def extract_document_info(path, printPath=True, printXML=False, printVerbose=True):
    """Parse one article XML file and return its basic metadata.

    Args:
        path: Filesystem path of the XML file to parse.
        printPath: If true, print the path before parsing.
        printXML: If true, print the prettified parse tree.
        printVerbose: If true, print every extracted field.

    Returns:
        A dict with string values under the keys 'Title', 'Authors',
        'Abstract', 'DOI' and 'Datestring'.  'Authors' is a comma separated
        list of "surname given-names" strings; 'Datestring' currently holds
        only the year of the first <pub-date> element.

    Raises:
        OSError: if the file cannot be opened.
        IndexError: if the document has no <article-title>, no element with
            pub-id-type="doi", or no <pub-date>.
        AttributeError: if the first <pub-date> has no <year> child.
    """
    if printPath:
        print('PATH: {}'.format(path))

    # Open in binary mode so the XML parser decodes the bytes according to the
    # document's own encoding declaration instead of the locale's default text
    # encoding.  Only the parse needs the file handle; release it right after.
    with open(path, 'rb') as f:
        xml_soup = BeautifulSoup(f, 'xml')

    if printXML:
        print('CONTENTS:')
        print(xml_soup.prettify())

    titles = xml_soup.find_all('article-title')
    title = titles[0].text  # the first article-title = the paper itself
                            # subsequent article-titles = the references

    abstract_tags = xml_soup.find_all('abstract')
    abstract = ' '.join([tag.text for tag in abstract_tags])

    dois = xml_soup.find_all(attrs={'pub-id-type': 'doi'})
    doi = dois[0].text

    dates = xml_soup.find_all('pub-date')
    # some entries don't have a 'day' or a 'month', so only the year is kept
    year = dates[0].year.text
    datestring = "{}".format(year)

    # todo there's a lot more to do here
    #    e.g. look into the attribute name-style="western"
    #         pull out the institutional affiliations
    #         look for an author ID field to supplement the name alone
    # note: the contrib elements also contain affiliations and roles
    author_tags = xml_soup.find_all(attrs={'contrib-type': 'author'})
    name_strings = []
    for tag in author_tags:
        try:
            given_name = ' '.join([gn.text for gn in tag.find_all('given-names')])
            surname = tag.surname.text
        except AttributeError:
            # tag.surname is None when the contrib has no <surname> child
            # (e.g. a group/collaboration author); skip it but keep going.
            print('WARNING: skipping author with unexpected formatting') # todo handle this
            continue
        # lastname   firstname   middle initial
        name_strings.append(' '.join([surname, given_name]))
    authors = ', '.join(name_strings)  # comma separated

    if printVerbose:
        print('')
        print('Title: {}'.format(title))
        print('Authors: {}'.format(authors))
        print('Abstract: {}'.format(abstract))
        print('DOI: {}'.format(doi))
        print('Datestring: {}'.format(datestring))

    obj = {'Title': title,
           'Authors': authors,
           'Abstract': abstract,
           'DOI': doi,
           'Datestring': datestring}
    return obj

In [53]:
# initialize database
db = sqlite3.connect(os.path.join(target_db_path, target_db_name + '.db'))
cursor = db.cursor()
# Notes on the statement below:
#  * "PRIMARY KEY" must be two words; written as one word SQLite treats it as
#    part of the column's type name and no key constraint is created.
#  * IF NOT EXISTS lets this cell be re-run against an existing database file
#    (a table created by an earlier run keeps whatever schema it already has).
#  * A table name cannot be bound as a SQL parameter, hence str.format; the name
#    is a constant defined in this notebook, not external input.
cursor.execute('''CREATE TABLE IF NOT EXISTS {} (ID INTEGER PRIMARY KEY,
                Title TEXT, Authors TEXT, Abstract TEXT, Datestring TEXT, DOI TEXT)'''.format(target_db_name))
db.commit()

In [54]:
# load all the xml filenames from the directory path into a list
#   os.listdir returns every directory entry, in arbitrary order. Keep only the
#   *.xml names and sort them: the position in this list is later used as the
#   database ID, so it should be the same on every run.

xml_list = sorted(name for name in os.listdir(path2directory)
                  if name.lower().endswith('.xml'))

L = len(xml_list)
print(L) # ~250,000 articles

254571


In [None]:
# Parse every XML file and store its metadata in the database.
# The statement is built once; the table name cannot be a bound parameter.
# OR REPLACE only matters when ID carries a uniqueness constraint: re-running the
# cell then overwrites rows instead of failing on duplicate keys.
insert_sql = '''INSERT OR REPLACE INTO {}
                (ID, Title, Authors, Abstract, Datestring, DOI)
                VALUES(:ID, :Title, :Authors, :Abstract, :Datestring, :DOI)'''.format(target_db_name)
batch_size = 5000  # progress is printed and the transaction committed every batch_size files

try:
    for idx, xml_entry in enumerate(xml_list):
        path = os.path.join(path2directory, xml_entry)
        if idx % batch_size == 0:
            print(xml_entry)
            # committing per batch instead of per row avoids ~250,000 separate commits
            db.commit()

        try:
            article_data_obj = extract_document_info(path, printPath=False, printVerbose=False, printXML=False)
            article_data_obj['ID'] = idx # define a unique primary key
            # add article data object to a database
            cursor.execute(insert_sql, article_data_obj)
        except Exception as err:
            # Best effort: report the offending file and carry on.  Catching
            # Exception (not a bare except) keeps KeyboardInterrupt working, so
            # the long-running loop can still be interrupted.
            print('WARNING: parsing error. skipping xml entry {}: {!r}'.format(xml_entry, err))
finally:
    # persist whatever was inserted, even if the loop was interrupted
    db.commit()


journal.pone.0126066.xml
