Parse converted xml files for word content:

In [1]:
import os
import glob
import utils
import re
from bs4 import BeautifulSoup
import nltk
import numpy as np

In [6]:
class Parser(object):
    soup = None

    def __init__(self):
        pass
    
    def remove_stuff(self, section):
        titles = section.find_all('title')
        for title in titles:
            title.decompose()
        footnotes = section.find_all('note')
        for footnote in footnotes:
            footnote.decompose()
        tables = section.find_all('tabular')
        for table in tables:
            table.decompose()
        captions = section.find_all('caption')
        for caption in captions:
            caption.decompose()
        captions2 = section.find_all('toccaption')
        for caption2 in captions2:
            caption2.decompose()
        figures = section.find_all('figure')
        for figure in figures:
            figure.decompose()
        tags = section.find_all('tags') # includes footnotes
        for tag in tags:
            tag.decompose()
        # Remove inline math for now
        maths = section.find_all('Math')
        for math in maths:
            math.decompose()
        # Ignore errors in converting, e.g. author miswrote \citep as \pcite
        errors = section.find_all('ERROR')
        for error in errors:
            error.decompose()
            
    def process_citations(self, section):
        # Process citations that have a class only
        citations = section.find_all('cite', {'class': True})
        for citation in citations:
            # Render inline citations by fetching author info from bibs
            if citation.name != None:
                if citation['class'] == 'ltx_citemacro_citet' or citation['class'] == 'ltx_citemacro_cite':
                    bibref = citation.bibref['bibrefs']
                    bib_item = self.soup.find('bibitem', attrs={'key': bibref})
                    if bib_item: 
                        # Try to find authors in bibtag
                        #refnum = bib_item.find('bibtag', attrs={'role': 'refnum'}, recursive=True)
                        #if refnum == None:
                            # refnum = bib_item.find('tag', attrs={'role': 'refnum'})
                        authors = bib_item.find(attrs={'role': 'refnum'}, recursive=True)
                        if authors != None and authors.text != None:
                            # if the authors text is only numeric, don't use
                            regex = re.compile(r'^\\d*$')
                            if regex.match(authors.text):
                                print('Excluding numeric reference: {}'.format(authors))
                            else:
                                # Replace citation tag with in-text citation str
                                citation.replace_with(authors.text)
                        else:
                            print('Authors not found')
                    else:
                        print('Citation missed ' + bibref)
                elif citation['class'] == 'ltx_citemacro_citep':
                    bibrefs = citation.bibref['bibrefs'] # looks like 'Hinshaw+2006,Hivon+2002,McEwen+2007'
                    bibref_split = bibrefs.split(',')
                    for bibref in bibrefs: 
                        bib_item = self.soup.find('bibitem', attrs={'key': bibref})
            # Otherwise, remove citation from parsed text
            else:
                citation.decompose() 

    def parse(self, xml_path):
        '''
        Parses XML file at given path.
        '''

        fulltext = ''

        with open(xml_path) as xml:
            self.soup = BeautifulSoup(xml, 'xml')
            sections = self.soup.find_all('section')
            print('Sections: ' + str(len(sections)))
            
            if not sections: 
                paragraphs = self.soup.find_all('para')
                print('Paragraphs: ' + str(len(paragraphs)))
                if paragraphs:
                    for p in paragraphs:
                        self.process_citations(p)
                        fulltext += p.get_text()
            else:    
                for section in sections:
                    self.process_citations(section)
                    self.remove_stuff(section)
                    fulltext += section.get_text()

        return fulltext
    
    def cleanse(self, doc):
        # Convert to lowercase
        doc = doc.lower()
        # Tokenize alphanumeric characters only, removing punctuation, changing all numbers to <num> token
        tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
        num_pattern = re.compile(r'^\d*$')
        tokens = ['<num>' if num_pattern.match(x) else x for x in tokenizer.tokenize(doc)]
        return tokens
        

    def main(self):
        # For each xml file
        utils.confirmDir('corpus')
        xml_files = glob.glob('xml/*[.xml]')
        for xf in xml_files:
            arxiv_id = os.path.splitext(os.path.basename(xf))[0]
            np_filename = 'corpus/' + arxiv_id + '.npy'
            # If XML file has already been parsed, don't parse again
            if os.path.isfile(np_filename):
                print('{} has already been parsed.'.format(arxiv_id))
                continue
            # Otherwise parse it
            else:
                print('Parsing {}...'.format(arxiv_id))
                fulltext = self.parse(xf)
                print(fulltext)
                tokens = self.cleanse(fulltext) # array
                np.save(np_filename, tokens)
                print('Saved ' + np_filename)

In [7]:
if __name__ == '__main__':
    p = Parser()
    p.main()
    #p.parse('xml/quant-ph0606117.xml')

Parsing 0704.0009...
Sections: 12




The Serpens star-forming cloud is one of five such large clouds selected for observation as part of
The Spitzer Legacy project “From Molecular Cores
to Planet-forming Disks” (c2d) (
, 
). Previous papers in this series have
described the observational results in the Serpens Cloud as seen with IRAC (
, 
)(Paper I) and
MIPS (
, 
) as well as some of the other clouds (
, 
).
In this paper we examine how the combination of the IRAC and
MIPS data together with other published results on this region can be used to find and characterize
a highly reliable catalog of young stellar objects (YSO’s) in the surveyed area.
With the combination of broad wavelength coverage and amazing depth of Spitzer’s sensitivity
we are able to probe to both extremely low luminosity limits for YSO’s and to cover a very
wide range in dust emission, both in optical depth and in range of emitting temperatures.
The Spitzer wavelength region is particularly well tuned for sensitivit

Sections: 8
Citation missed kin99,bel02
Citation missed bel02,bel05
Citation missed hoo04,beu03




EX Hya is an Intermediate Polar (IP), a sub-class of magnetic Cataclysmic Variable Stars (mCVs) where a late-type main sequence star transfers material to the magnetic white dwarf star as the two stars orbit each other under the influence of their mutual gravitation. Unlike in Polars, another subclass of mCVs, where the white dwarf is in synchronous rotation with the binary rotation (), the white dwarf in an IP is in asynchronous rotation with the orbital motion of the system. EX Hya, however, is nearer synchronism than the majority of IPs as it has a spin period (67.03 min) which is about  its orbital period (98.26 min) (
, 
), and is one of only six out of thirty nine confirmed IPs with its orbital period below the 2-3 h CV period gap (
, 
). It has an inclination .


Recent studies have shown that EX Hya does not conform to the traditional IP model (
, 
). This system has a large  rat

Sections: 6
Citation missed Jaynes:2003,Gregory:2005,Gelmanetal:1997
Citation missed Umstaetter:2005,Stroeeretal:2006,Wickhametal:2006
Citation missed CornishCrowder:2005,Crowderetal:2006,CrowderCornish:2007
Citation missed glig-report1,mldc1-summary
Citation missed CornishCrowder:2005,Crowderetal:2006,CrowderCornish:2007
Citation missed MLDCLISA06a,MLDCLISA06b,MLDCdoc




The data obtained from LISA 1 will contain a large number of white dwarf binary systems across the whole observational window 2. At frequencies below  mHz the sources are so abundant that they produce a stochastic foreground whose intensity dominates the instrumental noise 3. The closer (and louder) sources will still be sufficiently bright to be individually resolvable. Above  mHz the sources become sufficiently sparse in parameter space (and in particular in the frequency domain) that the detectable sources become individually resolvable. The identification of white dwarfs in the LISA data set represents one of the

Sections: 4
Citation missed bakos2002,bakos2004
Citation missed bouchy2005,pont2005a,pont2005b,pont2006




Solving for the masses and radii of stars has traditionally been
accomplished through the analysis of double-lined eclipsing binaries,
where the light of both components is detected. Masses and radii
determined this way are fundamental and can be very accurate, because
they rely only on Newton’s laws and geometry for the analysis of the
spectroscopic orbit and light curve, and not on models of stellar
structure and evolution. In particular, analysis of the eclipse light
curve yields the orbital inclination, and when combined with the
double-lined spectroscopic orbit, this yields individual masses for
both stars.


There are dozens of double-lined eclipsing binaries with very
accurate mass and radius determinations (e.g. see 
, 
, for a
review), but only 10 M dwarfs (in 5 systems) with
accuracies better than 3 percent. In order of increasing mass, the 5
systems are: CM Draconis (


Sections: 4


KeyboardInterrupt: 

In [None]:
len(np.load('cleaned_corpus/0707.2895.npy'))