Parse the converted XML files to extract their word content:

In [1]:
import os
import glob
import utils
import re
from bs4 import BeautifulSoup
import nltk
import numpy as np

In [19]:
class Parser(object):
    '''
    Extracts plain-text word content from converted XML documents.

    Typical use is either parse() + cleanse() on a single file, or main() to
    convert every XML file in a directory into a saved token array.

    Attributes:
        soup: the BeautifulSoup tree of the document most recently parsed.
        parenthetical_citations: citations dropped from the text during the
            most recent parse() call.
        intext_citations: citations replaced by their reference label during
            the most recent parse() call.
    '''

    # Class-level defaults are kept so instances created without __init__
    # (or code reading Parser.<attr>) still see sensible values.
    soup = None
    parenthetical_citations = 0
    intext_citations = 0

    # Elements stripped from every section before text extraction, in the
    # order they are removed.
    _REMOVED_TAGS = (
        'title',
        'note',        # footnotes
        'tabular',     # tables
        'caption',
        'toccaption',
        'figure',
        'tags',        # per the original author's note: includes footnotes
        'Math',        # inline math is removed for now
        'ERROR',       # conversion errors, e.g. author miswrote \citep as \pcite
    )

    # Values of the cite element's class attribute that this parser handles.
    _PARENTHETICAL_CLASSES = ('ltx_citemacro_citep',)
    _INTEXT_CLASSES = ('ltx_citemacro_citet', 'ltx_citemacro_cite')

    # A reference label that is empty or only a (possibly parenthesised) number.
    _NUMERIC_REFNUM = re.compile(r'^[(]?\d*[)]?$')
    # Token = maximal run of word characters; number = token made of digits only.
    _TOKEN = re.compile(r'\w+')
    _NUMBER = re.compile(r'\d+')

    def __init__(self):
        self.soup = None
        self.parenthetical_citations = 0
        self.intext_citations = 0

    def remove_stuff(self, section):
        '''
        Deletes, in place, every descendant of `section` whose tag name is
        listed in _REMOVED_TAGS (titles, footnotes, tables, captions, figures,
        math and conversion errors).
        '''
        for tag_name in self._REMOVED_TAGS:
            # The search is repeated per tag name so that elements already
            # destroyed together with an earlier match are not looked up again.
            for element in section.find_all(tag_name, recursive=True):
                element.decompose()

    def render_authors(self, citation, bib_item):
        '''
        Replaces `citation` with the label of its bibliography entry.

        The label is the text of the first descendant of `bib_item` that has
        role="refnum". A textual label replaces the citation element and counts
        as an in-text citation. An empty or purely numeric label (e.g. "12" or
        "(12)") carries no words, so the citation is removed and counted as
        parenthetical. If no label exists, a message is printed and the
        citation is left untouched.
        '''
        refnum = bib_item.find(attrs={'role': 'refnum'}, recursive=True)
        if refnum is None or refnum.text is None:
            print('Authors not found')
            return

        label = refnum.text
        if self._NUMERIC_REFNUM.match(label.strip()):
            # Remove the element as well as counting it, so the counter really
            # reports *removed* citations like every other place that bumps it.
            self._drop_citation(citation)
        else:
            citation.replace_with(label)
            self.intext_citations += 1

    def _drop_citation(self, citation):
        '''Removes `citation` from the tree and counts it as parenthetical.'''
        citation.decompose()
        self.parenthetical_citations += 1

    def process_citations(self, section):
        '''
        Rewrites the cite elements inside `section` in place.

        Only cite elements that carry a class attribute are considered:
          * ltx_citemacro_citep: removed (parenthetical).
          * ltx_citemacro_citet / ltx_citemacro_cite: looked up in the
            bibliography of self.soup by the bibrefs attribute of the nested
            bibref element and handed to render_authors(); if the reference
            cannot be resolved the citation is removed instead.
        Any other class is left untouched.
        '''
        for citation in section.find_all('cite', {'class': True}):
            # NOTE(review): this guard appears to skip elements that were
            # already destroyed when an enclosing citation was decomposed
            # earlier in this loop - confirm against the bs4 version in use.
            if citation.name is None:
                continue

            cite_class = citation['class']
            if cite_class in self._PARENTHETICAL_CLASSES:
                self._drop_citation(citation)
            elif cite_class in self._INTEXT_CLASSES:
                bibref_tag = citation.bibref
                bibref = bibref_tag.get('bibrefs') if bibref_tag is not None else None
                if bibref is None:
                    # A citation without a usable key cannot be rendered;
                    # treat it like any other unresolved reference.
                    print('Could not find reference for ' + cite_class + ': (no bibrefs attribute)')
                    self._drop_citation(citation)
                    continue

                bib_item = self.soup.find('bibitem', attrs={'key': bibref})
                if bib_item:
                    self.render_authors(citation, bib_item)
                else:
                    # Typically a comma-separated list of keys, which usually
                    # indicates a parenthetical citation.
                    # TODO: the list is looked up as one key and therefore never
                    # matches; split it if multi-key citations should be rendered.
                    print('Could not find reference for ' + cite_class + ': ' + bibref)
                    self._drop_citation(citation)

    def parse(self, xml_path):
        '''
        Parses XML file at given path.

        The document is split into section elements, or into para elements
        when it has no sections. Citations are processed and unwanted elements
        removed in each of them, and the remaining text is concatenated.

        The citation counters are reset on every call, so the printed summary
        describes this file only. The parsed tree is kept in self.soup.

        Returns the extracted text as a single string ('' if the document has
        neither sections nor paragraphs).
        '''
        self.parenthetical_citations = 0
        self.intext_citations = 0

        # Read bytes so the parser honours the encoding of the XML itself
        # rather than the platform's default text encoding.
        with open(xml_path, 'rb') as xml:
            self.soup = BeautifulSoup(xml, 'xml')

        blocks = self.soup.find_all('section')
        print('Sections: ' + str(len(blocks)))
        if not blocks:
            blocks = self.soup.find_all('para')
            print('Paragraphs: ' + str(len(blocks)))

        pieces = []
        for block in blocks:
            self.process_citations(block)
            self.remove_stuff(block)
            pieces.append(block.get_text())
        fulltext = ''.join(pieces)

        print('Num parenthetical citations removed: ' + str(self.parenthetical_citations))
        print('Num in-text citations rendered: ' + str(self.intext_citations))
        return fulltext

    def cleanse(self, doc):
        '''
        Turns raw text into a list of normalised tokens.

        The text is lower-cased and split into runs of word characters
        (punctuation and whitespace are discarded); tokens consisting only of
        digits are replaced by the placeholder '<num>'.
        '''
        # Same tokens as nltk's RegexpTokenizer(r'\w+'), without needing nltk
        # for what is a plain regular-expression findall.
        words = self._TOKEN.findall(doc.lower())
        return ['<num>' if self._NUMBER.fullmatch(word) else word for word in words]

    def _find_xml_files(self, xml_dir):
        '''Returns the sorted paths of the files in `xml_dir` ending in ".xml".'''
        # The earlier pattern '*[.xml]' was a character class: it matched any
        # name whose last character is '.', 'x', 'm' or 'l' (e.g. 'foo.html').
        return sorted(glob.glob(os.path.join(glob.escape(xml_dir), '*.xml')))

    def main(self, xml_dir='xml', corpus_dir='corpus'):
        '''
        Parses every XML file in `xml_dir` and saves its tokens as
        <corpus_dir>/<file name without extension>.npy.

        Files whose output already exists are skipped, so the method can be
        re-run after an interruption.
        '''
        # utils is a project helper module; confirmDir presumably makes sure
        # the output directory exists - confirm against utils.
        utils.confirmDir(corpus_dir)
        for xf in self._find_xml_files(xml_dir):
            arxiv_id = os.path.splitext(os.path.basename(xf))[0]
            np_filename = os.path.join(corpus_dir, arxiv_id + '.npy')
            # If XML file has already been parsed, don't parse again
            if os.path.isfile(np_filename):
                print('{} has already been parsed.'.format(arxiv_id))
                continue

            print('Parsing {}...'.format(arxiv_id))
            tokens = self.cleanse(self.parse(xf))
            np.save(np_filename, tokens)
            print('Saved ' + np_filename)

In [22]:
# Ad-hoc run: parse a single converted paper and print the extracted text.
if __name__ == '__main__':
    p = Parser()
    # Alternative invocations kept for reference: a run over every file via
    # main(), or parsing a different sample file.
    #p.main()
    #fulltext = p.parse('xml/quant-ph0511222.xml')
    fulltext = p.parse('xml/physics0612062.xml')
    print(fulltext)

Sections: 0
Paragraphs: 14
Could not find reference for ltx_citemacro_cite: haas-etal1,anderson-etal,haas-etal2,haas,garcia-etal,marklund,Shukla-Eliasson,marklund-shukla
Could not find reference for ltx_citemacro_cite: Markowich-etal,Calvayrac-etal,Stenflo-etal
Could not find reference for ltx_citemacro_cite: exp1,exp2
Could not find reference for ltx_citemacro_cite: haas-etal2,haas,garcia-etal
Could not find reference for ltx_citemacro_cite: shukla-mamun,shukla
Could not find reference for ltx_citemacro_cite: halperin-hohenberg,blum,balatsky,rathe-etal,hu-keitel,arvieu-etal,aldana-roso,walser-keitel,qian-vignale,walser-etal,roman-etal,liboff,fuchs-etal
Could not find reference for ltx_citemacro_cite: rathe-etal,hu-keitel,arvieu-etal,aldana-roso,walser-keitel,walser-etal
Could not find reference for ltx_citemacro_cite: halperin-hohenberg,blum,balatsky
Could not find reference for ltx_citemacro_cite: melrose,harding-lai
Could not find reference for ltx_citemacro_cite: haas-etal1,anderso

In [7]:
# Number of entries in one saved .npy array (presumably its token count).
# NOTE(review): Parser.main() writes its arrays under 'corpus/', while this cell
# reads from 'cleaned_corpus/' and its recorded output is a FileNotFoundError -
# confirm which directory is intended.
len(np.load('cleaned_corpus/0707.2895.npy'))

FileNotFoundError: [Errno 2] No such file or directory: 'cleaned_corpus/0707.2895.npy'