Parse converted xml files for word content:

In [7]:
import os
from bs4 import BeautifulSoup

In [9]:
def parse(filepath):
    '''
    Checks whether an XML file in the 'xml' directory holds a document.

    Creates (or truncates) 'corpus/<name>.txt', where <name> is the base
    name of filepath without its extension, then parses 'xml/' + filepath
    with BeautifulSoup's 'xml' parser. A notice is printed when the parsed
    file has no <document> element. Nothing is written to the text file yet.
    '''

    stem, _ = os.path.splitext(os.path.basename(filepath))

    # The output directory has to exist before the text file can be opened
    if not os.path.isdir('corpus'):
        os.makedirs('corpus')

    # The text file is opened first, so it is created even when the XML file
    # turns out not to be a document
    with open('corpus/' + stem + '.txt', 'w+'), open('xml/' + filepath) as xml_file:
        soup = BeautifulSoup(xml_file, 'xml')
        if not soup.find('document'):
            print('not a document: ' + filepath)
            
def control_parsing():
    '''
    Runs parse() on every .xml file found in the 'xml' directory.

    Raises whatever os.listdir raises when the 'xml' directory is missing.
    '''
    for filename in os.listdir('xml'):
        # Match the full '.xml' extension: a bare 'xml' suffix would also
        # accept names that merely end in those letters (e.g. 'notesxml')
        if filename.endswith('.xml'):
            parse(filename)

# Run the parser over the files in the 'xml' directory
control_parsing()

not a document: 0706.2986.xml
not a document: 0902.1226.xml
not a document: 1003.2415.xml
not a document: 1011.6189.xml
not a document: 1012.0934.xml
not a document: 1108.6072.xml
not a document: 1108.6079.xml
not a document: 1203.1342.xml
not a document: 1303.0320.xml
not a document: 1303.1806.xml
not a document: 1303.2172.xml
not a document: 1303.2509.xml
not a document: 1303.2658.xml
not a document: 1303.3241.xml
not a document: 1303.3272.xml
not a document: 1303.3313.xml
not a document: 1312.1177.xml
not a document: 1505.02159.xml
not a document: astro-ph0412079.xml
not a document: astro-ph0412115.xml
not a document: astro-ph0412129.xml
not a document: astro-ph0412203.xml
not a document: astro-ph0412229.xml


In [None]:
def parse():
    """
    Parses converted .xml files for word content.

    Reads every .xml file in the 'xml' directory. For each file that
    contains a <document> element, the text returned by getFullText() is
    written to 'txt/<name>.txt', where <name> is the file name without its
    extension. Progress messages and the final number of full texts are
    printed.

    NOTE: abstract extraction and the combined full-text corpus are not
    produced at the moment, so no corpus file is opened or written here.
    """

    # Initialize tracker
    num_of_fulltexts = 0

    # Make sure the output directory exists; opening a file inside a missing
    # directory would fail
    if not os.path.isdir('txt'):
        os.makedirs('txt')

    # For each xml file
    for file in os.listdir("xml"):
        if not file.endswith('.xml'):
            continue

        print('\nParsing ' + file + '...')
        with open("xml/" + file) as f:
            soup = BeautifulSoup(f, "xml")

        # Only .xml files that represent an actual article (specified by the
        # document tag) are converted
        if not soup.find('document'):
            continue

        # Get fulltext from file
        fulltext = getFullText(soup)
        if fulltext is not None:
            print('Adding fulltext from ' + file)
            out_path = os.path.join('txt/', os.path.splitext(file)[0] + '.txt')
            # The with-block closes the file so the text is flushed to disk
            with open(out_path, 'w+') as converted_file:
                converted_file.write(fulltext)
            num_of_fulltexts += 1

    print('\n\nFull texts: ' + str(num_of_fulltexts))


def _render_inline_citation(soup, citation):
    """
    Replaces an inline citation tag with the author text of its bibliography entry.

    The citation is left untouched (and a message is printed) when it has no
    bibliography reference, when no matching bibitem exists in soup, or when
    the bibitem has no authors tag.
    """
    # Get bibliography reference; not every citation carries one
    bibref = citation.bibref
    citet = bibref.get('bibrefs') if bibref is not None else None
    if citet is None:
        print('Citation missed: no bibliography reference')
        return

    # Using reference, get authors from bibliography
    bib_item = soup.find('bibitem', attrs={'key': citet})
    if not bib_item:
        # TODO: account for a reference that lists several citations
        # (seen in aldering.xml)
        print('Citation missed: ' + citet)
        return

    authors = bib_item.find('bibtag', attrs={'role': 'authors'})
    if authors is not None and authors.text is not None:
        # Replace citation tag with in-text citation str
        citation.replace_with(authors.text)
    else:
        print('Authors not found')


def getFullText(soup):
    """
    Returns cleaned full body text from passed soup, a BeautifulSoup soup of a .xml file.

    For every <section> in soup:
      * <cite> tags with class 'ltx_citemacro_citet' (inline citations) are
        replaced by the author text of the matching bibliography entry;
      * every other <cite> tag that has a class is removed;
      * titles, notes, tables, captions, figures, math and conversion errors
        are removed.
    The remaining text of all sections is concatenated in document order.

    Args:
        soup: parsed document offering find_all() and find().

    Returns:
        The concatenated section text; an empty string when soup has no
        sections.
    """

    # Titles, footnotes, tables, captions, figures, inline math (for now) and
    # conversion errors (e.g. authors miswrote \citep as \pcite) are dropped
    removable_tags = ('title', 'note', 'tabular', 'caption', 'figure', 'Math', 'ERROR')

    sections = soup.find_all('section')
    print('sections: ' + str(len(sections)))

    section_texts = []
    for section in sections:
        # Process citations that have a class only
        for citation in section.find_all('cite', {'class': True}):
            # NOTE: the name check guards against a tag without a name that
            # was seen in 0076.xml; where it comes from is unknown
            if citation.name is not None and citation['class'] == 'ltx_citemacro_citet':
                _render_inline_citation(soup, citation)
            else:
                # If not inline citation, remove citation for now
                citation.decompose()

        # Search again for each tag name, in this order, so that every search
        # only sees tags that survived the previous removals
        for tag_name in removable_tags:
            for tag in section.find_all(tag_name):
                tag.decompose()

        section_texts.append(section.get_text())

    return "".join(section_texts)