# Parse preprint full-texts

Contents:
1. Introduction
2. 

Check whether the downloaded .tex files contain the full text; some are just supplementary files for the submission. A file is treated as the full text if it contains \documentclass. 
Each submission folder should contain exactly one \documentclass file. 

In [1]:
import os, sys, re, requests, subprocess, glob, pathlib
from TexSoup import TexSoup

Each submission folder may contain 1 or more .tex files. We need to identify which .tex file is the full-text preprint.

In [84]:
# Walk latex/<tar folder>/<submission folder>/<file> and parse every regular
# file with TexSoup, counting the ones that contain a `document` environment.
base_path = 'latex'
submissions = texs = preprints = 0
type_errors = unicode_decode_errors = 0

for tar_folder in os.listdir(base_path):
    tar_path = '/'.join((base_path, tar_folder))
    if not os.path.isdir(tar_path):
        continue
    for submission_folder in os.listdir(tar_path):
        submission_path = '/'.join((tar_path, submission_folder))
        if not os.path.isdir(submission_path):
            continue
        submissions += 1
        for tex_file in os.listdir(submission_path):
            # Every directory entry is counted, whether or not it is a file.
            texs += 1
            file_path = '/'.join((submission_path, tex_file))
            if not os.path.isfile(file_path):
                continue
            with open(file_path) as handle:
                try:
                    soup = TexSoup(handle)
                    if soup.find('document'):
                        print('document: ' + file_path)
                        preprints += 1
                except TypeError:
                    print('TypeError: ' + file_path)
                    type_errors += 1
                except UnicodeDecodeError:
                    print('UnicodeDecodeError: ' + file_path)
                    unicode_decode_errors += 1

# Summary of the scan, one counter per line.
for label, count in (
    ('Submissions', submissions),
    ('Texs', texs),
    ('Preprints', preprints),
    ('TypeErrors', type_errors),
    ('UnicodeDecodeErrors', unicode_decode_errors),
):
    print(label + ': ' + str(count))


TypeError: latex/arXiv_src_1607_004/1607.01601/ms.tex
document: latex/arXiv_src_1607_004/1607.01698/SPHYNXcode.tex
document: latex/arXiv_src_1607_004/1607.01468/main_v1.tex
document: latex/arXiv_src_1607_004/1607.01403/he0435lensmodel.tex
document: latex/arXiv_src_1607_004/1607.01772/arxiv.tex
document: latex/arXiv_src_1607_004/1607.01788/wind_paper.tex
document: latex/arXiv_src_1607_004/1607.01585/ms_Jul23.tex
document: latex/arXiv_src_1607_004/1607.01380/effectsofstellarrotation.tex
document: latex/arXiv_src_1607_004/1607.01522/jitpol.tex
document: latex/arXiv_src_1607_004/1607.01780/rxsj1804_accepted.tex
document: latex/arXiv_src_1607_004/1607.01720/ScalarFields.tex
document: latex/arXiv_src_1607_004/1607.01386/PrecisionPredictionLogPower.tex
document: latex/arXiv_src_1607_004/1607.01395/CCdAM.tex
document: latex/arXiv_src_1607_004/1607.01392/Bell_oDAVs34_arXiv.tex
document: latex/arXiv_src_1607_004/1607.01769/iso_ksz.tex
document: latex/arXiv_src_1607_004/1607.01393/TILEY_COLDGASS.

KeyboardInterrupt: 

TOO SLOW.

Find which .tex files are documents by quickly scanning them (we don't want to convert or soupify EACH one yet): 

In [None]:
def convert_to_xml_v1(base_path='latex'):
    """Locate the main .tex file of every submission under ``base_path``.

    The expected layout is ``base_path/<tar folder>/<submission folder>/``.
    For each submission folder the directory listing is handed to
    ``getMainTex``; the outcome is printed per submission and summary
    counts are printed at the end.

    Args:
        base_path: Root directory holding the tar folders. Defaults to
            ``'latex'``.

    Returns:
        None. All results are reported with ``print``.
    """
    submissions = 0
    preprints = 0
    missing = []

    # Sorted listings keep the printed report in a stable order.
    for tar_folder in sorted(os.listdir(base_path)):
        tar_path = base_path + '/' + tar_folder
        if not os.path.isdir(tar_path):
            continue
        for submission in sorted(os.listdir(tar_path)):
            submission_path = tar_path + '/' + submission
            if not os.path.isdir(submission_path):
                continue
            submissions += 1
            submission_id = os.path.basename(submission_path)
            texs = os.listdir(submission_path)

            # A submission folder with no files at all: note it and move on.
            if not texs:
                missing.append(submission_id)
                print('Missing files: ' + submission_id)
                continue

            preprint_path = getMainTex(submission_path, texs)
            if preprint_path:
                print('Found preprint: ' + submission_path)
                # TODO: convert preprint_path to 'xml/<submission_id>.xml'
                # once the conversion step is enabled.
                preprints += 1
                # Keep going: every submission in the tar folder has to be
                # examined, not just the first one with a preprint.
            else:
                print('Failed to find preprint: ' + submission_path)

    print('Submissions: ' + str(submissions))
    print('Preprints: ' + str(preprints))
    print('Missing: ' + str(missing))
    
def getMainTex(submission_path, texs):
    """Return the path of the first listed file that declares a document.

    Each name in ``texs`` is joined onto ``submission_path``. Regular files
    are read as bytes and checked line by line for ``\\documentclass`` or
    ``\\documentstyle``; the first file with such a line wins.

    Args:
        submission_path: Directory of the submission.
        texs: File names inside ``submission_path``, checked in order.

    Returns:
        ``submission_path + '/' + name`` for the first matching file, or
        ``None`` when no file matches.
    """
    marker = re.compile(b'(.*\\\\documentclass.*)|(.*\\\\documentstyle.*)')

    for name in texs:
        candidate = submission_path + '/' + name
        # Directories and dangling entries cannot be the main file.
        if not os.path.isfile(candidate):
            continue
        with open(candidate, 'rb') as handle:
            lines = handle.readlines()
        if any(marker.match(line) for line in lines):
            return candidate

    return None

I also came up with a different function: 

In [165]:
def guess_extension_from_headers(h):
    """
    Map the headers of an arXiv e-print response to a file extension.

    ``h`` is any mapping with a ``get`` method (e.g. response headers).
    Returns the extension including the leading dot, or ``None`` when the
    content-type / content-encoding combination is not recognised.
    Based on: https://arxiv.org/help/mimetypes
    """
    content_type = h.get('content-type')
    gzipped = h.get('content-encoding') == 'x-gzip'

    # (content-type, must be gzip-encoded?, extension)
    # For application/x-eprint the encoding is ignored: the response claims
    # x-gzip but this appears to normally be a lie - it's just plain text.
    rules = (
        ('application/pdf', False, '.pdf'),
        ('application/postscript', True, '.ps.gz'),
        ('application/x-eprint-tar', True, '.tar.gz'),
        ('application/x-eprint', False, '.tex'),
        ('application/x-dvi', True, '.dvi.gz'),
    )
    for mime, needs_gzip, extension in rules:
        if content_type == mime and (gzipped or not needs_gzip):
            return extension
    return None

def arxiv_id_to_source_url(arxiv_id):
    """
    Return the arXiv /e-print/ URL for ``arxiv_id``.

    The /e-print/ URL is normally a tarball, but sometimes something else.
    arXiv also provides a /src/ URL which always serves up a tarball, but
    using it would mean untarring the file just to find out whether it is
    renderable. With /e-print/ that can be decided straight away.
    https://arxiv.org/help/mimetypes has more info.
    """
    endpoint = 'https://arxiv.org/e-print/'
    return endpoint + arxiv_id

class DownloadError(Exception):
    """Raised when a downloaded e-print cannot be mapped to a file type."""
    # NOTE(review): no other definition of DownloadError is visible in this
    # file; drop this class if one is provided elsewhere.


def download_source_file(arxiv_id, timeout=60):
    """
    Download the source of a paper and write it to the working directory.

    The file is named ``arxiv_id`` plus the extension guessed from the
    response headers (e.g. ``1010.3382.tar.gz``).

    Args:
        arxiv_id: arXiv identifier, e.g. ``'1010.3382'``.
        timeout: Seconds passed to ``requests.get`` so a stalled
            connection cannot block forever.

    Returns:
        None. The created file name is printed.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        DownloadError: If no file extension can be derived from the
            response headers.
    """
    source_url = arxiv_id_to_source_url(arxiv_id)
    res = requests.get(source_url, timeout=timeout)
    res.raise_for_status()
    extension = guess_extension_from_headers(res.headers)
    if not extension:
        raise DownloadError("Could not determine file extension from "
                            "headers: Content-Type: {}; "
                            "Content-Encoding: {}".format(
                                res.headers.get('content-type'),
                                res.headers.get('content-encoding')))
    file_name = arxiv_id + extension
    # Write-only binary mode is enough; the file is never read back here.
    with open(file_name, 'wb') as f:
        f.write(res.content)
    print('Created ' + file_name)

# Try the download on a single submission; the file is written to the
# current working directory.
download_source_file('1010.3382')

Created 1010.3382.tar.gz


This writes the downloaded source archive to disk. I probably should have downloaded everything onto disk in the first place: this shows that we need more files than just what we have downloaded. Next I need to see how arXiv Vanity renders its papers, especially 1010.3382, which is built from multiple files. I also don't have the .bbl files. 

Convert to XML:

In [64]:
def convert_to_xml(tex_path, xml_path=None):
    """Convert a .tex file to XML by running the ``latexml`` command.

    Nothing is run when ``xml_path`` already exists (a message is printed
    instead) or when ``tex_path`` does not end in ``.tex``.

    Args:
        tex_path: Path of the LaTeX source file.
        xml_path: Destination of the XML output. When omitted, the output
            is placed next to the source with the extension replaced by
            ``.xml``.

    Returns:
        None.
    """
    stem, file_extension = os.path.splitext(tex_path)
    if xml_path is None:
        # Callers that only know the source path get <source stem>.xml.
        xml_path = stem + '.xml'

    if os.path.exists(xml_path):
        print(os.path.basename(tex_path) + ' has already been converted.')
    elif file_extension == '.tex':
        subprocess.call(['latexml', '--dest=' + xml_path, tex_path])

Attempt using more cores:

In [18]:
import datetime as dt
from multiprocessing import Process, current_process
import sys, time

def worker(tex_path):
    """Convert one .tex file, then report which process handled it.

    Prints a line with the current timestamp, the file path and the name
    of the current process, and flushes stdout right away.
    """
    convert_to_xml(tex_path)
    finished_at = dt.datetime.now()
    process_name = current_process().name
    message = '{}: hello {} from {}'.format(finished_at, tex_path, process_name)
    print(message)
    sys.stdout.flush()

if __name__ == '__main__':
    # Time the conversion of a few sample files using separate processes.
    start = time.time()

    worker_count = 8  # upper bound on processes running at the same time
    texs = ['workers/1.tex', 'workers/2.tex', 'workers/4.tex']

    # Iterate over the files themselves, in batches of at most worker_count.
    # Indexing texs with range(worker_count) raises IndexError whenever
    # there are fewer files than workers.
    for first in range(0, len(texs), worker_count):
        worker_pool = [
            Process(target=worker, args=(tex_path,))
            for tex_path in texs[first:first + worker_count]
        ]
        for p in worker_pool:
            p.start()
        for p in worker_pool:
            p.join()  # Wait for all of the workers in this batch to finish.

    end = time.time()
    print('Finished in ' + str(end - start))

2019-02-13 17:46:21.878179: hello workers/2.tex from Process-42
2019-02-13 17:46:23.819732: hello workers/1.tex from Process-41
2019-02-13 17:46:24.994856: hello workers/4.tex from Process-43
Finished in 23.987058877944946


In [20]:
# Baseline: convert the same sample files one after another in this process.
start = time.time()
for tex_path in ('workers/1.tex', 'workers/2.tex', 'workers/4.tex'):
    convert_to_xml(tex_path)
end = time.time()
print('Finished in ' + str(end - start))

Finished in 49.3291289806366


It is faster to use all 4 cores. So I will do the XML conversion using all 4 cores. 

First I need to confirm the main file in each repository. 
- If it doesn't contain a .bbl file, I need to add it to the bbl_lack folder. Later. Set aside and skip.
- If it doesn't contain a file, I need to retrieve it again. Later. Set aside and skip. 

I will look at each submission folder, check xml to see if a file exists with its name. If not, I will go into the submission folder to check each file if it contains \\documentclass. If it does, grab it and convert it. Break out of loop. 

In [175]:
def convert_v3():
    """Scan ``latex/<tar folder>/<submission>/`` and report preprint counts.

    For every submission directory all ``.tex`` files are collected
    recursively. Submissions without any are recorded as empty; for the
    rest ``getPreprint`` picks the main file. Totals and the ids of
    non-empty submissions without a preprint are printed at the end.
    """

    def submission_paths(root):
        # Yield every directory found two levels below root.
        for tar_name in os.listdir(root):
            tar_dir = '/'.join((root, tar_name))
            if os.path.isdir(tar_dir):
                for entry in os.listdir(tar_dir):
                    candidate = '/'.join((tar_dir, entry))
                    if os.path.isdir(candidate):
                        yield candidate

    submissions = 0
    tex_count = 0
    empties, preprints, missed = [], [], []

    for submission_path in submission_paths('latex'):
        submissions += 1
        arxiv_id = os.path.basename(submission_path)
        found = glob.glob(submission_path + '/**/*.tex', recursive=True)
        tex_count += len(found)

        # Nothing to inspect: remember the id and go to the next submission.
        if not found:
            empties.append(arxiv_id)
            continue

        main_tex = getPreprint(submission_path, found)
        if main_tex:
            preprints.append(main_tex)
        else:
            missed.append(arxiv_id)

    report = (
        'Submissions: ' + str(submissions),
        'Preprints: ' + str(len(preprints)),
        'Non-empty submissions missing preprints: ' + str(len(missed)),
        'Empty submissions: ' + str(len(empties)),
        'Texs: ' + str(tex_count),
        str(missed),
    )
    for line in report:
        print(line)

def getPreprint(submission_path, texs):
    """Pick the main .tex file (the preprint) of a submission.

    Selection order:
      1. If there is only one candidate, it is the preprint.
      2. Otherwise a file named ``ms.tex``, then ``main.tex``, is preferred
         (first occurrence in ``texs``).
      3. Otherwise the first regular file containing ``\\documentclass`` or
         ``\\documentstyle`` is chosen.

    Args:
        submission_path: Directory of the submission.
        texs: Candidate .tex files. Entries that carry a directory
            component (such as the paths returned by ``glob.glob``) are
            used as they are; bare file names are taken relative to
            ``submission_path``.

    Returns:
        The path of the chosen file, or ``None`` if no candidate qualifies.
    """
    # glob results already start with submission_path; prefixing them a
    # second time would yield a path that does not exist.
    paths = [
        tex if os.path.dirname(tex) else submission_path + '/' + tex
        for tex in texs
    ]

    if not paths:
        return None
    if len(paths) == 1:
        return paths[0]

    # Compare on the file name so that full paths can match as well.
    by_name = {}
    for path in paths:
        by_name.setdefault(os.path.basename(path), path)
    for conventional in ('ms.tex', 'main.tex'):
        if conventional in by_name:
            return by_name[conventional]

    marker = re.compile(rb'\\document(?:class|style)')
    for path in paths:
        # '*.tex' can also match a directory; only regular files are read.
        if not os.path.isfile(path):
            continue
        with open(path, 'rb') as f:
            if any(marker.search(line) for line in f):
                return path

    return None

In [176]:
# Run the scan over everything under latex/ and print the summary counts.
convert_v3()

Submissions: 89908
Preprints: 89630
Non-empty submissions missing preprints: 7
Empty submissions: 271
Texs: 125484
['1607.01189', '1606.06791', '1409.3422', '1105.1087', '1304.7762', '1211.4277', '1308.6483']


arXiv Vanity is also unable to render these 7 papers. 

Now call the functions, with conversion process.

I have 8 cores: https://superuser.com/questions/1101311/how-many-cores-does-my-mac-have/1101314#1101314
4 physical and 4 logical. 