<center><h1>Reading scholarly.html files</h1></center>

## Introduction

In this notebook I am going to attempt to write a parser to read scholarly.html files.

### Find an example file and load it

In [1]:
import os

#show which files are available in the results folder of the example paper
print(os.listdir('./results/PMC2241601'))

#path of the scholarly.html file for the example paper; later cells read this name
html_root = './results/PMC2241601/scholarly.html'

#load the data in
#the whole file is read into one string (raw_text) which the next cell parses
with open(html_root, 'r') as f:

    raw_text = f.read()
    


['svg', 'word.frequencies.snippets.xml', 'fulltext.pdf', 'pdfimages', 'search.country.snippets.xml', 'scholarly.html', 'eupmc_result.json', 'search.country.count.xml', 'results', 'fulltext.xml', 'word.frequencies.count.xml']


In [2]:
from bs4 import BeautifulSoup

#parse the html into soup
#uses the 'html.parser' backend on the raw_text string read in the previous cell
soup = BeautifulSoup(raw_text, 'html.parser')

#extract the body
#find() returns the first <body> tag; the following cells search inside it
body = soup.find('body')



### Try to make the parser work using just the abstract

In [3]:
#extract the front element
#this is the first <div class="front"> inside the body; it is the input to iterhtml below
front = body.find('div', {'class':'front'})

### Extract the paper metadata

In [4]:
from pprint import pprint

def iterdict(d):
    """Print every non-dict value of a (possibly nested) dictionary.

    The dictionary is walked in insertion order. A value that is itself a
    dictionary is descended into immediately; any other value is printed as
    ``key : value``. Nothing is returned.
    """
    for key, value in d.items():
        if not isinstance(value, dict):
            #leaf value: show it next to its key
            print(key, ":", value)
            continue

        #nested dictionary: print its leaves before moving to the next key
        iterdict(value)
        
def iterhtml(el):
    """Recursively convert a BeautifulSoup element into a nested dictionary.

    Only the direct children of ``el`` are inspected:

    * text nodes are ignored;
    * a ``<div>`` whose ``tagx`` attribute equals ``'title'`` is stored,
      stripped, under the key ``'title'`` and is not descended into;
    * any other ``<div>`` is converted recursively and stored under its first
      CSS class;
    * a ``<span>`` has its text stored under its first CSS class;
    * the text of every ``<p>`` is concatenated under the key ``'text'``.

    Children with any other tag name are skipped.

    Parameters
    ----------
    el : element exposing ``.children`` (e.g. a bs4 ``Tag``)

    Returns
    -------
    dict
        The extracted structure; empty when ``el`` has no usable children.

    Raises
    ------
    KeyError
        If a ``<div>`` (other than a title) or a ``<span>`` has no ``class``
        attribute, because the class is used as the dictionary key.
    """
    store_json = {}

    for item in el.children:

        #NavigableString and its subclasses derive from str; bare text between
        #tags carries no structure, so skip it
        if isinstance(item, str):
            continue

        if item.name == 'div':

            #a title div is a heading: keep its text and do not dive into it
            if item.has_attr('tagx') and item['tagx'] == 'title':
                store_json['title'] = item.text.strip()
                continue

            #do some recursion
            store_json[item['class'][0]] = iterhtml(item)

        #we have a span object which contains information we are interested in
        elif item.name == 'span':

            store_json[item['class'][0]] = item.text

        elif item.name == 'p':

            #append to the running text; this works even when the paragraph is
            #the first thing found (nothing needs to be in store_json yet)
            store_json['text'] = store_json.get('text', '') + item.text

    return store_json


#convert the front element into a nested dictionary
result = iterhtml(front)
    
#visual separator before the pretty-printed dictionary
print('++++++++')
    
pprint(result)


++++++++
{'article-meta': {'abstract': {'abstract-title': {},
                               'background': {'text': 'The key enzymes of '
                                                      'photosynthetic carbon '
                                                      'assimilation in C4 '
                                                      'plants have evolved '
                                                      'independently several '
                                                      'times from C3 isoforms '
                                                      'that were present in '
                                                      'the C3 ancestral '
                                                      'species. The C4 isoform '
                                                      'of phosphoenolpyruvate '
                                                      'carboxylase (PEPC), the '
                                                      'primary CO2-fixing '
 

In [5]:
#try to extract the body using this

#the first <div class="body"> inside the <body> tag
main = body.find('div', {'class':'body'})

#reuse the same recursive converter that was applied to the front element
body_result = iterhtml(main)

pprint(body_result)

{"authors'contributions": {'text': 'SE carried out the histochemical and '
                                   'quantitative GUS assays, the cloning of '
                                   'construct ppcA-PRFtΔIntron-DR(+)Ft, the '
                                   'sequence alignments and wrote the '
                                   'manuscript. CZ produced construct '
                                   'ppcA-PRFp-DR(+)Ft. MK, US and MS performed '
                                   'the transformation of F. bidentis. PW '
                                   'coordinated the design of this study and '
                                   'participated in drafting the manuscript. '
                                   'All authors read and approved the final '
                                   'manuscript.',
                           'title': "Authors' contributions"},
 'background': {'text': 'About 90% of terrestrial plant species, including '
                        'major crops such 

                                                                                                            'and '
                                                                                                            'expression '
                                                                                                            'analyses '
                                                                                                            'of '
                                                                                                            'additional '
                                                                                                            'DR-PR '
                                                                                                            'combinations '
                                                                                                            'from '
                                                  

                                                                                                                                                                'for '
                                                                                                                                                                'a '
                                                                                                                                                                'high '
                                                                                                                                                                'and '
                                                                                                                                                                'mesophyll-specific '
                                                                                                                                                       

### Define functions for reference extraction

In [6]:
#process a object that contains information about a person
def process_people_group(people_group_els, this_ref_dict):
    """Add every person found in a person-group element to a reference dict.

    Each tag yielded by iterating ``people_group_els`` is treated as one
    person. Each tag inside a person contributes one entry to that person's
    dictionary, keyed by the tag's first CSS class and holding its text.
    Text nodes at either level (for example whitespace between tags) are
    skipped.

    Parameters
    ----------
    people_group_els : iterable of person elements (e.g. a bs4 ``Tag``)
    this_ref_dict : dict
        Reference dictionary. The people are appended to its ``'people'``
        list, which is created if it is not there yet.

    Returns
    -------
    dict
        ``this_ref_dict`` itself, modified in place.
    """
    people = this_ref_dict.setdefault('people', [])

    #scroll through the people objects
    for pg_els in people_group_els:

        #text nodes derive from str and do not describe a person
        if isinstance(pg_els, str):
            continue

        #define the person dictionary
        person_dict = {}

        #run through the elements in this person object
        for span_el in pg_els:

            #skip text between the detail elements
            if isinstance(span_el, str):
                continue

            #store the person's details
            person_dict[span_el['class'][0]] = span_el.text

        #store this person
        people.append(person_dict)

    return this_ref_dict

#process an list element which is a reference
def process_reference_element(l):
    """Convert one reference list item into a dictionary.

    The direct children of ``l`` are handled as follows:

    * text nodes are ignored;
    * an ``<a>`` child is the label: its ``name`` attribute is stored as
      ``'label'`` and the integer after its first character as ``'ref_no'``;
    * for every other child, each of its own child tags is stored under that
      tag's first CSS class. Person groups (class containing
      ``'person-group'``) go through ``process_people_group``. Tags containing
      links are stored as ``{'link': <href of first link>, 'text': <tag text>}``.
      Everything else is stored as plain text.

    Parameters
    ----------
    l : reference element exposing ``.children`` (e.g. a bs4 ``<li>`` Tag)

    Returns
    -------
    dict
        Always contains a ``'people'`` list. ``'label'`` and ``'ref_no'`` are
        present only when the reference carries a label anchor.

    Raises
    ------
    ValueError
        If the label's ``name`` attribute is not one character followed by an
        integer.
    KeyError
        If a required attribute (``name``, ``class`` or ``href``) is missing.
    """
    #start with the people list so a reference without a label anchor still
    #yields a dictionary instead of failing on an unbound name
    ref_dict = {'people': []}

    #run through the elements of this reference element
    for t in l.children:

        #text nodes derive from str; skip them
        if isinstance(t, str):
            continue

        #if we have the label element grab it
        if t.name == 'a':
            ref_dict['label'] = t['name']
            ref_dict['ref_no'] = int(t['name'][1:])
            continue

        #go through the children of the reference to grab the reference's elements
        for u in t.children:

            #skip text between the elements
            if isinstance(u, str):
                continue

            field = u['class'][0]

            #if we have a person group then we want to use our extraction tool
            if 'person-group' in field:
                ref_dict = process_people_group(u, ref_dict)
                continue

            #find to see if there are links in here
            links = u.findAll('a')

            if links:
                #if we have a reference link we store the text as well as the link
                ref_dict[field] = {'link': links[0]['href'],
                                   'text': u.text}
            else:
                #we have a plain piece of information, lets just store it
                ref_dict[field] = u.text

    return ref_dict

#this is the function for extracting the references from the tail
def extract_references(tail):
    """Extract the list of references from the tail element of a paper.

    The references block is the first ``<div tag="ref-list">`` inside
    ``tail``, and the references are the tags directly inside its first
    ``<ul>``. Each one is converted with ``process_reference_element``.

    Parameters
    ----------
    tail : element exposing ``.find`` (e.g. a bs4 ``Tag``)

    Returns
    -------
    list of dict
        One dictionary per reference, in document order. The list is empty
        when there is no references block or it holds no ``<ul>``.
    """
    references = []

    #find the references block
    refs = tail.find('div', {'tag':'ref-list'})

    #no references block: there is nothing to extract
    if refs is None:
        return references

    #find the references list
    ref_list = refs.find('ul')

    if ref_list is None:
        return references

    #go through the list of references
    for ref_el in ref_list.children:

        #text nodes derive from str; skip them
        if isinstance(ref_el, str):
            continue

        references.append(process_reference_element(ref_el))

    return references

### Create a class which will read the objects in

In [36]:

#define a class to extract the acknowledgments
class acknowledgements_extractor(object):
    """Extract the acknowledgements section from the tail of a paper.

    The result is available as ``ack_dict``:

    * ``'text'``: the stripped text of every ``<p>`` found inside the first
      ``<div class="ack">``, each one preceded by a single space. It is an
      empty string when there is no such div or it holds no paragraphs.
    * ``'title'``: the stripped text of the div whose first class is
      ``'title'``. The key is present only when such a div is found.
    """

    def __init__(self, tail):
        """Locate the acknowledgements div inside ``tail`` and extract it.

        ``tail`` must expose ``.find`` (e.g. a bs4 ``Tag``).
        """

        #get the acknowledgements element
        acks = tail.find('div', {'class':'ack'})

        #get the ack_dict element
        self.ack_dict = {'text':''}

        #a paper without an acknowledgements section keeps the empty result
        if acks is not None:
            self.iteracks(acks)

    def iteracks(self, el):
        """Walk the children of ``el`` and collect the title and paragraphs.

        Divs other than the title are descended into recursively. Text nodes
        and tags other than ``<div>`` and ``<p>`` are ignored.
        """

        for child in el.children:

            #text nodes derive from str; skip them
            if isinstance(child, str):
                continue

            #check if we have an div, usually the acknowledgments heading
            if child.name == 'div':

                #a div without a class attribute cannot be the title
                classes = child.get('class') or ['']

                if classes[0] == 'title':

                    self.ack_dict['title'] = child.text.strip()

                    #the title holds nothing else of interest
                    continue

                #go one level down
                self.iteracks(child)

            #check if this is a text element
            elif child.name == 'p':

                #store the text of this paragraph only; using the parent's text
                #would repeat the whole block, title included, once per paragraph
                self.ack_dict['text'] += ' ' + child.text.strip()


#this is the object that will process the paper and in which the results will be stored
class paper_obj(object):
    """Parse a scholarly.html file and hold everything extracted from it.

    Parameters
    ----------
    html_file : str
        Path of the scholarly.html file to read.
    keep_fig_caps : bool, default True
        When true, figure titles and captions are also added to the running
        full text (see ``capture_text``).

    After construction the instance exposes:

    * ``meta``: nested dict built from the ``<div class="front">``
    * ``body``: nested dict built from the ``<div class="body">``
    * ``abstract_text``: text captured while processing the abstract
    * ``body_text``: text captured while processing the body
    * ``figures``: list of ``{'caption': ..., 'title': ...}`` dicts
    * ``authors``: list of author dicts
    * ``acks``: the acknowledgements dict
    * ``references``: list of reference dicts
    """

    def __init__(self, html_file, keep_fig_caps = True):

        self.abstract_text = ''
        self.body_text = ''
        self.figures = []

        self.authors = []

        #flags telling capture_text which running text is being filled
        self.abstract_el = False
        self.body_el = False

        #tell the tool to keep the figure captions in the fulltexts
        self.keep_fig_labels = keep_fig_caps

        #load the data in, from the file this object was asked to read
        with open(html_file, 'r') as f:
            raw_text = f.read()

        #parse the html into soup
        soup = BeautifulSoup(raw_text, 'html.parser')

        #extract the metadata
        front = soup.find('div', {'class':'front'})

        #extract the body
        body = soup.find('div', {'class':'body'})

        #extract the tail
        tail = soup.find('div', {'class':'back'})

        #extract the metadata into a json
        self.meta = self.iterhtml(front)

        #we are now going to work with the body so set the flag
        self.body_el = True

        #extract the body of the document
        self.body = self.iterhtml(body)

        #we are done with the body so we will reset the body to False
        self.body_el = False

        #extract the end notes
        self.extract_tail(tail)

    def iterhtml(self, el):
        """Recursively convert ``el`` into a nested dictionary.

        Only the direct children of ``el`` are inspected. Text nodes are
        skipped. Divs are dispatched in this order:

        * ``tagx="title"``: text stored under ``'title'`` and captured
        * first class ``'fig'``: handed to ``process_figure``
        * first class ``'abstract'``: handed to ``process_abstract``
        * first class ``'article-title'``: stripped text stored under that key
        * first class ``'contrib-group'``: handed to ``process_authors``
        * anything else: converted recursively and stored under its first class

        A ``<span>`` is stored under its first class, either as stripped text
        or, when it contains links, as ``{'text': ..., 'links': [...]}``. The
        text of every ``<p>`` is concatenated under ``'text'``. Title, span
        and paragraph text is also passed to ``capture_text``.

        Returns the dictionary. Only the recursive and inline cases add
        entries to it; figures, the abstract and authors are stored on the
        instance instead.
        """

        #set up the json we're going to be returning
        store_json = {}

        for item in el.children:

            #NavigableString and its subclasses derive from str; bare text
            #between tags carries no structure, so skip it
            if isinstance(item, str):
                continue

            if item.name == 'div':

                #a title div is a heading: store and capture its text, then
                #move on since there is nothing to dive into
                if item.has_attr('tagx') and item['tagx'] == 'title':
                    store_json['title'] = item.text.strip()
                    self.capture_text(item)
                    continue

                div_class = item['class'][0]

                if div_class == 'fig':
                    #not dived into afterwards since that would lead to double counts
                    self.process_figure(item)

                elif div_class == 'abstract':
                    #the diving in happens inside process_abstract
                    self.process_abstract(item)

                elif div_class == 'article-title':
                    #the title of the article is a unique class all of its own
                    store_json[div_class] = item.text.strip()

                elif div_class == 'contrib-group':
                    #extract the authors from our author element
                    self.process_authors(item)

                else:
                    #do some recursion
                    store_json[div_class] = self.iterhtml(item)

            #we have a span object which contains information we are interested in
            elif item.name == 'span':

                span_class = item['class'][0]

                #check if there is a link in this item
                links = item.findAll('a')

                if links:
                    store_json[span_class] = {
                        'text': item.text.strip(),
                        'links': [{'link': link['href'],
                                   'text': link.text.strip()} for link in links],
                    }
                else:
                    #just store the text from the item
                    store_json[span_class] = item.text.strip()

                #grab the text from this element
                self.capture_text(item)

            elif item.name == 'p':

                #append to the running text; this works even when the paragraph
                #is the first thing found (nothing needs to be stored yet)
                store_json['text'] = store_json.get('text', '') + item.text

                #grab the text from this element
                self.capture_text(item)

        return store_json

    def capture_text(self, el):
        """Append the text of ``el`` and a newline to the active running text.

        The body flag is checked first and the abstract flag second. When
        neither is set the text is dropped.
        """

        #check if we are working with a body
        if self.body_el:
            self.body_text += el.text + '\n'

        #if we have an abstract element we want to store it
        elif self.abstract_el:
            self.abstract_text += el.text + '\n'

    def process_figure(self, fig_el):
        """Store the title and caption of a figure element in ``self.figures``.

        The text of a child ``<div>`` becomes the title (a later div replaces
        an earlier one), and the text of every child ``<p>`` is joined with
        newlines into the caption. Both are stripped. A figure without a
        title div gets an empty title. When ``keep_fig_labels`` is true the
        same elements are passed to ``capture_text``.
        """

        fig_dict = {'caption':''}

        for child in fig_el.children:

            #text nodes derive from str; skip them
            if isinstance(child, str):
                continue

            if child.name == 'div':

                fig_dict['title'] = child.text

                if self.keep_fig_labels:
                    self.capture_text(child)

            elif child.name == 'p':

                if self.keep_fig_labels:
                    self.capture_text(child)

                fig_dict['caption'] += child.text + '\n'

        #tidy the title and caption; a figure without a title div gets ''
        fig_dict['title'] = fig_dict.get('title', '').strip()
        fig_dict['caption'] = fig_dict['caption'].strip()

        #store the details of this figure
        self.figures.append(fig_dict)


    def extract_tail(self, tail):
        """Extract the acknowledgements and the references from ``tail``.

        The results are stored in ``self.acks`` and ``self.references``.
        """

        ack_tool = acknowledgements_extractor(tail)

        self.acks = ack_tool.ack_dict

        self.references = extract_references(tail)


    def process_abstract(self, el):
        """Walk the abstract so that its text lands in ``self.abstract_text``.

        The abstract flag is raised for the duration of the walk. The
        dictionary returned by ``iterhtml`` is discarded.
        """

        self.abstract_el = True

        self.iterhtml(el)

        self.abstract_el = False

    #process the list of authors
    def process_authors(self, el):
        """Append one dictionary per author found in ``el`` to ``self.authors``.

        Authors are the ``<span class="contrib">`` elements inside ``el``.
        For each author:

        * an ``<a>`` child is stored under ``'link'`` as
          ``{'link': <href>, 'text': <stripped text>}``
        * a child whose first class is ``'name'`` contributes one entry per
          inner tag, keyed by that tag's first class and holding its text
        * any other child is stored, stripped, under its own first class

        Text nodes are skipped at both levels.
        """

        #extract a list of author elements
        raw_authors = el.findAll('span', {'class':'contrib'})

        #move through the authors and extract them
        for author in raw_authors:

            #extract the details of this author
            author_dict = {}

            for stat in author.children:

                #text between the detail elements carries no information
                if isinstance(stat, str):
                    continue

                #if we have the person's link
                if stat.name == 'a':

                    author_dict['link'] = {'link':stat['href'],
                                          'text':stat.text.strip()}

                    continue

                #extract the name information if we are looking at the name element
                if stat['class'][0] == 'name':

                    for nam in stat.children:

                        #skip text between the parts of the name
                        if isinstance(nam, str):
                            continue

                        author_dict[nam['class'][0]] = nam.text

                #extract the other information if we have a non-name piece of info, e.g. email address
                else:

                    author_dict[stat['class'][0]] = stat.text.strip()

            #store what we've extracted about this author
            self.authors.append(author_dict)
        
    
###############################
### End of class definition ###
###############################

#set the filename
filename = './results/PMC2241601/scholarly.html'

#create the paper object
#note: this rebinds the name `result`, which earlier held the dictionary returned by iterhtml(front)
result = paper_obj(filename)

#show the abstract text collected while parsing
print(result.abstract_text)


Background
The key enzymes of photosynthetic carbon assimilation in C4 plants have evolved independently several times from C3 isoforms that were present in the C3 ancestral species. The C4 isoform of phosphoenolpyruvate carboxylase (PEPC), the primary CO2-fixing enzyme of the C4 cycle, is specifically expressed at high levels in mesophyll cells of the leaves of C4 species. We are interested in understanding the molecular changes that are responsible for the evolution of this C4-characteristic PEPC expression pattern, and we are using the genus Flaveria (Asteraceae) as a model system. It is known that cis-regulatory sequences for mesophyll-specific expression of the ppcA1 gene of F. trinervia (C4) are located within a distal promoter region (DR).

Results
In this study we focus on the proximal region (PR) of the ppcA1 promoter of F. trinervia and present an analysis of its function in establishing a C4-specific expression pattern. We demonstrate that the PR harbours cis-regulatory det

In [37]:
#display the list of author dictionaries collected by paper_obj
result.authors

[{'surname': 'Engelmann',
  'given-names': 'Sascha',
  'link': {'link': '#I1', 'text': '1'},
  'email': 'engelmas@uni-duesseldorf.de'},
 {'surname': 'Zogel',
  'given-names': 'Corinna',
  'link': {'link': '#I2', 'text': '2'},
  'email': 'corinna.zogel@uni-due.de'},
 {'surname': 'Koczor',
  'given-names': 'Maria',
  'link': {'link': '#I1', 'text': '1'},
  'email': 'Maria.Koczor@uni-duesseldorf.de'},
 {'surname': 'Schlue',
  'given-names': 'Ute',
  'link': {'link': '#I1', 'text': '1'},
  'email': 'Ute.Schlue@uni-duesseldorf.de'},
 {'surname': 'Streubel',
  'given-names': 'Monika',
  'link': {'link': '#I1', 'text': '1'},
  'email': 'streubel@uni-duesseldorf.de'},
 {'surname': 'Westhoff',
  'given-names': 'Peter',
  'link': {'link': '#I1', 'text': '1'},
  'email': 'west@uni-duesseldorf.de'}]

In [38]:
#display the nested metadata dictionary built from the front element
result.meta

{'journal-meta': {'nlm-ta': 'BMC Plant Biol',
  'journal-title': 'BMC Plant Biology',
  'issn-epub': '1471-2229',
  'publisher': {'publisher-name': 'BioMed Central'}},
 'article-meta': {'publisher-id': '1471-2229-8-4',
  'pmid': {'text': 'pmid: 18208593',
   'links': [{'link': 'http://www.ncbi.nlm.nih.gov/pubmed/18208593',
     'text': '18208593'}]},
  'doi': {'text': 'doi: 10.1186/1471-2229-8-4',
   'links': [{'link': 'https://dx.doi.org/10.1186/1471-2229-8-4',
     'text': '10.1186/1471-2229-8-4'}]},
  'article-categories': {'subject': 'Research Article'},
  'title-group': {'article-title': 'Evolution of the C4 phosphoenolpyruvate carboxylase promoter of the C4 species Flaveria trinervia: the role of the proximal promoter region'},
  'citation_author_institution': '[2],',
  'pub-date-collection': 'collection: 2008',
  'pub-date-epub': 'epub: 2008-1-1',
  'volume': '8',
  'fpage': '4',
  'lpage': '4',
  'history': 'received: 2007-11-8accepted: 2008-1-21',
  'permissions': {'copyright'

In [32]:
#author_list is not defined by any cell of this notebook, so this cell would fail on a
#fresh kernel; bind it to the authors parsed by paper_obj, which is what the output shows
author_list = result.authors
author_list

[{'surname': 'Engelmann',
  'given-names': 'Sascha',
  'link': {'link': '#I1', 'text': '1'},
  'email': 'engelmas@uni-duesseldorf.de'},
 {'surname': 'Zogel',
  'given-names': 'Corinna',
  'link': {'link': '#I2', 'text': '2'},
  'email': 'corinna.zogel@uni-due.de'},
 {'surname': 'Koczor',
  'given-names': 'Maria',
  'link': {'link': '#I1', 'text': '1'},
  'email': 'Maria.Koczor@uni-duesseldorf.de'},
 {'surname': 'Schlue',
  'given-names': 'Ute',
  'link': {'link': '#I1', 'text': '1'},
  'email': 'Ute.Schlue@uni-duesseldorf.de'},
 {'surname': 'Streubel',
  'given-names': 'Monika',
  'link': {'link': '#I1', 'text': '1'},
  'email': 'streubel@uni-duesseldorf.de'},
 {'surname': 'Westhoff',
  'given-names': 'Peter',
  'link': {'link': '#I1', 'text': '1'},
  'email': 'west@uni-duesseldorf.de'}]