# Convert preprints 

Contents:
1. Introduction
2. Extract preprints
2. Identify preprints
3. Convert preprints
4. Appendix: Request a submission from arXiv API

## 1. Introduction 

In this notebook, we convert preprints from TEX to XML, a format that simplifies downstream parsing. 

Each submission in the ./latex/ folder should contain 1 or more .tex files. If the submission contains more than 1 .tex file, we identify the main file. The additional files are usually inserts for the main file. 

After collecting the filepaths for all submissions' main files, we convert them from .tex to .xml using the [latexml](https://dlmf.nist.gov/LaTeXML/) package, spreading the work across all CPU cores (4 on my machine).

All converted .xml files are stored in ./xml/.

## 2. Extract preprints

Import all dependencies:

In [232]:
import glob
import gzip
import multiprocessing
import os
import pathlib
import re
import shutil
import subprocess as sp
import tarfile
import time
import zipfile

import pandas as pd

In [216]:
# Read the astro-ph metadata table and collect the arXiv submission identifiers.
# The 'filename_parsed' column is forced to str (presumably so that dotted ids
# are not parsed as numbers -- confirm against the CSV).
metadata_df = pd.read_csv(
    '../data/2020_03_06_arxiv_metadata_astroph/arxiv_metadata_astroph.csv',
    dtype={'filename_parsed': str},
)
identifiers = [*metadata_df['filename_parsed']]
len(identifiers)

267794

704.0017

In [141]:
def work(in_file):
    '''
    Defines the work to be done in each multiprocessing worker:
    converts one TEX file to XML with latexml.

    latexml's stderr is written to logs/<name of the XML file>.txt.
    A conversion that runs longer than 240 seconds is abandoned.

    Expects get_outpath() to return the output path as a single string.

    Parameters
    ----------
    in_file : str
        Filepath of the TEX file to convert

    Returns
    -------
    int
        Always 0, whether or not the conversion finished.
    '''

    # Get paths for converted file & logfile
    out_file = get_outpath(in_file)
    logfile_path = 'logs/' + os.path.splitext(os.path.basename(out_file))[0] + '.txt'

    # Make sure the log folder exists (exist_ok copes with several workers racing)
    os.makedirs(os.path.dirname(logfile_path), exist_ok=True)

    # Try conversion, logging command line output
    print('{} is converting {}...'.format(multiprocessing.current_process(), in_file))
    try:
        with open(logfile_path, 'w') as logfile:
            sp.call(['latexml', '--dest=' + out_file, in_file], timeout=240, stderr=logfile)
        print('Writing logfile for ' + in_file)
    # If conversion has timed out, report it and move on.
    # (this usually happens if latexml hangs recursively, as with
    # latex/arXiv_src_1009_002/1009.1724/15727_eger.tex)
    # sp.call() kills the child process itself before re-raising, so there
    # is nothing left for us to kill here.
    except sp.TimeoutExpired:
        print('---x Conversion failed: {}'.format(in_file))
    return 0
 
if __name__ == '__main__':
    # Specify files that need work
    tasks = get_preprints_to_convert()

    # Set up the parallel task pool to use all available processors.
    # The module is imported under its full name, so use that name here.
    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())

    # Run the jobs
    try:
        pool.map(work, tasks)
    except KeyboardInterrupt:
        print('\nYou interrupted the script!')
        pool.terminate()
        exit(1)
    except Exception as e:
        # str() is required: concatenating str and an exception raises TypeError
        print('\nSomething unknown went wrong: ' + str(e))
        pool.terminate()
        exit(1)
    pool.close()
    pool.join()
    
    

In [264]:
def get_outpath(inpath):
    '''
    Builds the destination XML filepath for an extracted submission.

    The fifth component of inpath (index 4) is taken as the file name;
    its extension is dropped to obtain the submission id.

    Returns a tuple: (XML filepath, submission id).
    '''
    xml_dir = '../data/2020_03_09_extract_and_convert_submissions/converted_xml/'
    file_name = pathlib.Path(inpath).parts[4]
    submission_id, _extension = os.path.splitext(file_name)
    return xml_dir + submission_id + '.xml', submission_id

def convert(submission_path):
    '''
    Converts file at passed filepath to XML,
    using LaTeXMLc.

    Parameters
    ----------
    submission_path : str
        Filepath of the extracted submission (.zip or .tex)

    Returns
    -------
    tuple of (str, str)
        The text latexmlc wrote to stderr (empty if it could not be run),
        and 'yes' if the latexmlc process was run to completion or 'no'
        if running it raised an exception. Note that 'yes' does not look
        at latexmlc's exit status.
    '''
    outpath, _ = get_outpath(submission_path)
    err = b''  # stays empty if latexmlc cannot be started
    try:
        print('Converting ' + submission_path + '...')
        proc = sp.Popen(['latexmlc', '--timeout=240', '--dest=' + outpath, submission_path], stderr=sp.PIPE)
        _, err = proc.communicate()
        converted = 'yes'
    except Exception as e:
        # str() is required: concatenating str and an exception raises TypeError
        print('Something went wrong in convert(): ' + str(e))
        converted = 'no'
    # Undecodable bytes in the log must not abort the whole run
    return err.decode("utf-8", errors="replace"), converted

In [None]:
tar_dirs = ['../data/2020_03_03_original_arxiv_tars/', '../data/2020_03_07_update_tars/']
log = []

# Locations shared by every submission
extraction_path = '../data/2020_03_09_extract_and_convert_submissions/temp/'
converted_path = '../data/2020_03_09_extract_and_convert_submissions/converted_xml/'
log_csv_path = '../data/2020_03_09_extract_and_convert_submissions/conversion_log.csv'
log_columns = ['submission',
               'tarfile',
               'type',
               'extracted',
               'extracted_suffix',
               'converted',
               'conversion_result']

# A set makes the per-submission membership test O(1) instead of a list scan
known_identifiers = set(identifiers)

# Number of rows of `log` that have already been written to log_csv_path
rows_saved = 0

# For each tar directory,
for tar_dir in tar_dirs:
    files = os.listdir(tar_dir)

    # For each tar file,
    for file in files:
        filepath = tar_dir + file
        if not (os.path.isfile(filepath) and tarfile.is_tarfile(filepath)):
            continue

        # Open it as read-only
        print('Opening ' + file + ',')
        extracted_tar_path = None  # folder that tar.extract() creates for single-file submissions

        with tarfile.open(filepath) as tar:
            tar_name = os.path.splitext(os.path.basename(tar.name))[0]

            # Iterate over its .gz files (which are each an article submission),
            for submission in tar.getmembers():

                # Only look at submissions (.gz and .pdf)
                if not submission.name.endswith(('.gz', '.pdf')):
                    continue

                # Only look at the submissions that are from astrophysics
                # and that have not already been converted
                submission_id = os.path.splitext(os.path.basename(submission.name))[0]
                if submission_id not in known_identifiers:
                    continue
                elif os.path.isfile(converted_path + submission_id + '.xml'):
                    print('Already converted ' + submission_id + '!')
                    continue

                print('Processing submission: ' + submission_id + '...')
                submission_type = os.path.splitext(os.path.basename(submission.name))[1]
                extracted = 'no'
                suffix = None

                # If .pdf, skip, we cannot extract and will not convert it here
                if submission.name.endswith('.pdf'):
                    continue

                # Extract the .gz submission
                extracted_tar_path = extraction_path + submission.name.split('/')[0]

                try:
                    # Repack the gzipped tar as a .zip next to the other temp files
                    suffix = '.zip'
                    gz_obj = tar.extractfile(submission)
                    # mode='w' so a leftover archive from an interrupted run is not appended to
                    with tarfile.open(fileobj=gz_obj, mode='r|gz') as gz, \
                            zipfile.ZipFile(file=extraction_path + submission_id + suffix,
                                            mode='w',
                                            compression=zipfile.ZIP_DEFLATED) as zipf:
                        for m in gz:
                            # Directories, links etc. have no content to copy
                            if not m.isfile():
                                continue
                            zipf.writestr(m.name, gz.extractfile(m).read())
                    extracted = 'yes'

                except tarfile.ReadError:
                    # These submissions contain a single .tex file with no extension,
                    # so we need to treat them differently
                    suffix = '.tex'
                    tar.extract(submission, extraction_path)
                    with gzip.open(extraction_path + submission.name, 'rb') as f_in:
                        with open(extraction_path + submission_id + suffix, 'wb+') as f_out:
                            shutil.copyfileobj(f_in, f_out)

                extracted_file = extraction_path + submission_id + suffix
                result, converted = convert(extracted_file)

                # Log submission extraction & conversion info, remove the temp
                # .zip/.tex that was handed to convert()
                log.append([submission_id, tar_name, submission_type, extracted, suffix, converted, result])
                os.remove(extracted_file)

        # Remove the folder that appears for the tar during ReadErrors
        # (it only exists if tar.extract() was actually called)
        if extracted_tar_path and os.path.isdir(extracted_tar_path):
            shutil.rmtree(extracted_tar_path)

        # Save log so we have something if it fails.
        # Only the rows added since the last save are appended, and the header
        # is written only when the CSV is new, so the file holds each row once.
        df = pd.DataFrame(log, columns=log_columns)
        if len(log) > rows_saved:
            write_header = not os.path.isfile(log_csv_path) or os.path.getsize(log_csv_path) == 0
            df.iloc[rows_saved:].to_csv(log_csv_path, mode='a', header=write_header)
            rows_saved = len(log)

Opening arXiv_src_0001_001.tar,
Processing submission: astro-ph0001001...
Converting ../data/2020_03_09_extract_and_convert_submissions/temp/astro-ph0001001.zip...
Processing submission: astro-ph0001002...
Converting ../data/2020_03_09_extract_and_convert_submissions/temp/astro-ph0001002.zip...
Processing submission: astro-ph0001003...
Converting ../data/2020_03_09_extract_and_convert_submissions/temp/astro-ph0001003.zip...
Processing submission: astro-ph0001004...
Converting ../data/2020_03_09_extract_and_convert_submissions/temp/astro-ph0001004.zip...
Processing submission: astro-ph0001005...
Converting ../data/2020_03_09_extract_and_convert_submissions/temp/astro-ph0001005.zip...
Processing submission: astro-ph0001006...
Converting ../data/2020_03_09_extract_and_convert_submissions/temp/astro-ph0001006.zip...
Processing submission: astro-ph0001007...
Converting ../data/2020_03_09_extract_and_convert_submissions/temp/astro-ph0001007.zip...
Processing submission: astro-ph0001008...
Co

Processing submission: astro-ph0001064...
Converting ../data/2020_03_09_extract_and_convert_submissions/temp/astro-ph0001064.zip...
Processing submission: astro-ph0001065...
Converting ../data/2020_03_09_extract_and_convert_submissions/temp/astro-ph0001065.zip...
Processing submission: astro-ph0001066...
Converting ../data/2020_03_09_extract_and_convert_submissions/temp/astro-ph0001066.zip...
Processing submission: astro-ph0001067...
Converting ../data/2020_03_09_extract_and_convert_submissions/temp/astro-ph0001067.zip...
Processing submission: astro-ph0001068...
Converting ../data/2020_03_09_extract_and_convert_submissions/temp/astro-ph0001068.zip...
Processing submission: astro-ph0001069...
Converting ../data/2020_03_09_extract_and_convert_submissions/temp/astro-ph0001069.zip...
Processing submission: astro-ph0001070...
Converting ../data/2020_03_09_extract_and_convert_submissions/temp/astro-ph0001070.zip...
Processing submission: astro-ph0001071...
Converting ../data/2020_03_09_extr

Processing submission: astro-ph0001127...
Converting ../data/2020_03_09_extract_and_convert_submissions/temp/astro-ph0001127.zip...
Processing submission: astro-ph0001128...
Converting ../data/2020_03_09_extract_and_convert_submissions/temp/astro-ph0001128.zip...
Processing submission: astro-ph0001129...
Converting ../data/2020_03_09_extract_and_convert_submissions/temp/astro-ph0001129.zip...
Processing submission: astro-ph0001130...
Converting ../data/2020_03_09_extract_and_convert_submissions/temp/astro-ph0001130.zip...
Processing submission: astro-ph0001131...
Converting ../data/2020_03_09_extract_and_convert_submissions/temp/astro-ph0001131.tex...
Processing submission: astro-ph0001132...
Converting ../data/2020_03_09_extract_and_convert_submissions/temp/astro-ph0001132.tex...
Processing submission: astro-ph0001133...
Converting ../data/2020_03_09_extract_and_convert_submissions/temp/astro-ph0001133.zip...
Processing submission: astro-ph0001134...
Converting ../data/2020_03_09_extr

Processing submission: astro-ph0001190...
Converting ../data/2020_03_09_extract_and_convert_submissions/temp/astro-ph0001190.zip...
Processing submission: astro-ph0001191...
Converting ../data/2020_03_09_extract_and_convert_submissions/temp/astro-ph0001191.zip...
Processing submission: astro-ph0001192...
Converting ../data/2020_03_09_extract_and_convert_submissions/temp/astro-ph0001192.tex...
Processing submission: astro-ph0001193...
Converting ../data/2020_03_09_extract_and_convert_submissions/temp/astro-ph0001193.zip...
Processing submission: astro-ph0001194...
Converting ../data/2020_03_09_extract_and_convert_submissions/temp/astro-ph0001194.zip...
Processing submission: astro-ph0001195...
Converting ../data/2020_03_09_extract_and_convert_submissions/temp/astro-ph0001195.zip...
Processing submission: astro-ph0001196...
Converting ../data/2020_03_09_extract_and_convert_submissions/temp/astro-ph0001196.zip...
Processing submission: astro-ph0001197...
Converting ../data/2020_03_09_extr

Processing submission: astro-ph0001253...
Converting ../data/2020_03_09_extract_and_convert_submissions/temp/astro-ph0001253.zip...
Processing submission: astro-ph0001254...
Converting ../data/2020_03_09_extract_and_convert_submissions/temp/astro-ph0001254.zip...
Processing submission: astro-ph0001255...
Converting ../data/2020_03_09_extract_and_convert_submissions/temp/astro-ph0001255.zip...
Processing submission: astro-ph0001256...
Converting ../data/2020_03_09_extract_and_convert_submissions/temp/astro-ph0001256.zip...
Processing submission: astro-ph0001257...
Converting ../data/2020_03_09_extract_and_convert_submissions/temp/astro-ph0001257.zip...
Processing submission: astro-ph0001258...
Converting ../data/2020_03_09_extract_and_convert_submissions/temp/astro-ph0001258.tex...
Processing submission: astro-ph0001259...
Converting ../data/2020_03_09_extract_and_convert_submissions/temp/astro-ph0001259.zip...
Processing submission: astro-ph0001260...
Converting ../data/2020_03_09_extr

Converting ../data/2020_03_09_extract_and_convert_submissions/temp/astro-ph0001315.zip...
Processing submission: astro-ph0001316...
Converting ../data/2020_03_09_extract_and_convert_submissions/temp/astro-ph0001316.zip...
Processing submission: astro-ph0001317...
Converting ../data/2020_03_09_extract_and_convert_submissions/temp/astro-ph0001317.tex...
Processing submission: astro-ph0001318...
Converting ../data/2020_03_09_extract_and_convert_submissions/temp/astro-ph0001318.zip...
Processing submission: astro-ph0001319...
Converting ../data/2020_03_09_extract_and_convert_submissions/temp/astro-ph0001319.zip...
Processing submission: astro-ph0001320...
Converting ../data/2020_03_09_extract_and_convert_submissions/temp/astro-ph0001320.zip...
Processing submission: astro-ph0001321...
Converting ../data/2020_03_09_extract_and_convert_submissions/temp/astro-ph0001321.zip...
Processing submission: astro-ph0001322...
Converting ../data/2020_03_09_extract_and_convert_submissions/temp/astro-ph0

Processing submission: astro-ph0001379...
Converting ../data/2020_03_09_extract_and_convert_submissions/temp/astro-ph0001379.zip...
Processing submission: astro-ph0001380...
Converting ../data/2020_03_09_extract_and_convert_submissions/temp/astro-ph0001380.zip...
Processing submission: astro-ph0001381...
Converting ../data/2020_03_09_extract_and_convert_submissions/temp/astro-ph0001381.zip...
Processing submission: astro-ph0001382...
Converting ../data/2020_03_09_extract_and_convert_submissions/temp/astro-ph0001382.zip...
Processing submission: astro-ph0001383...
Converting ../data/2020_03_09_extract_and_convert_submissions/temp/astro-ph0001383.zip...
Processing submission: astro-ph0001384...
Converting ../data/2020_03_09_extract_and_convert_submissions/temp/astro-ph0001384.zip...
Processing submission: astro-ph0001385...
Converting ../data/2020_03_09_extract_and_convert_submissions/temp/astro-ph0001385.zip...
Processing submission: astro-ph0001386...
Converting ../data/2020_03_09_extr

Processing submission: astro-ph0001442...
Converting ../data/2020_03_09_extract_and_convert_submissions/temp/astro-ph0001442.zip...
Processing submission: astro-ph0001443...
Converting ../data/2020_03_09_extract_and_convert_submissions/temp/astro-ph0001443.zip...
Processing submission: astro-ph0001444...
Converting ../data/2020_03_09_extract_and_convert_submissions/temp/astro-ph0001444.zip...
Processing submission: astro-ph0001445...
Converting ../data/2020_03_09_extract_and_convert_submissions/temp/astro-ph0001445.zip...
Processing submission: astro-ph0001446...
Converting ../data/2020_03_09_extract_and_convert_submissions/temp/astro-ph0001446.zip...
Processing submission: astro-ph0001447...
Converting ../data/2020_03_09_extract_and_convert_submissions/temp/astro-ph0001447.zip...
Processing submission: astro-ph0001448...
Converting ../data/2020_03_09_extract_and_convert_submissions/temp/astro-ph0001448.zip...
Processing submission: astro-ph0001449...
Converting ../data/2020_03_09_extr

Processing submission: astro-ph0001505...
Converting ../data/2020_03_09_extract_and_convert_submissions/temp/astro-ph0001505.zip...
Processing submission: astro-ph0001506...
Converting ../data/2020_03_09_extract_and_convert_submissions/temp/astro-ph0001506.zip...
Processing submission: astro-ph0001507...
Converting ../data/2020_03_09_extract_and_convert_submissions/temp/astro-ph0001507.zip...
Processing submission: astro-ph0001508...
Converting ../data/2020_03_09_extract_and_convert_submissions/temp/astro-ph0001508.zip...
Processing submission: astro-ph0001509...
Converting ../data/2020_03_09_extract_and_convert_submissions/temp/astro-ph0001509.zip...
Processing submission: astro-ph0001510...
Converting ../data/2020_03_09_extract_and_convert_submissions/temp/astro-ph0001510.zip...
Processing submission: astro-ph0001511...
Converting ../data/2020_03_09_extract_and_convert_submissions/temp/astro-ph0001511.zip...
Processing submission: astro-ph0001512...
Converting ../data/2020_03_09_extr

In [267]:
# Turn log for both tar dirs into a single data frame (APPEND)
log_csv_path = '../data/2020_03_09_extract_and_convert_submissions/conversion_log.csv'
df = pd.DataFrame(log, columns=['submission',
                                'tarfile',
                                'type',
                                'extracted',
                                'extracted_suffix',
                                'converted',
                                'conversion_result'])

# header must be a real boolean: the string 'False' is truthy, so it made
# pandas write the column names again on every append. Write them only when
# the CSV is new or empty.
# NOTE(review): the extraction loop also appends to this file -- confirm the
# same rows are not being written twice.
write_header = not os.path.isfile(log_csv_path) or os.path.getsize(log_csv_path) == 0
df.to_csv(log_csv_path, mode='a', header=write_header)

Unnamed: 0,submission,tarfile,type,extracted,extracted_suffix,converted,conversion_result
0,astro-ph0001005,arXiv_src_0001_001,.gz,yes,.zip,yes,\n(Loading /opt/local/lib/perl5/vendor_perl/5....
1,astro-ph0001008,arXiv_src_0001_001,.gz,no,.tex,yes,\n(Loading /opt/local/lib/perl5/vendor_perl/5....
2,astro-ph0001009,arXiv_src_0001_001,.gz,yes,.zip,yes,\n(Loading /opt/local/lib/perl5/vendor_perl/5....
3,astro-ph0001010,arXiv_src_0001_001,.gz,yes,.zip,yes,\n(Loading /opt/local/lib/perl5/vendor_perl/5....
4,astro-ph0001011,arXiv_src_0001_001,.gz,yes,.zip,yes,\n(Loading /opt/local/lib/perl5/vendor_perl/5....


In [253]:
# Print element 1 of the 'conversion_result' value stored in the first row.
# NOTE(review): log_df is not defined in the cells shown here -- presumably the
# conversion log loaded back into a data frame; confirm.
print(log_df.iloc[0]['conversion_result'][1])


(Loading /opt/local/lib/perl5/vendor_perl/5.28/LaTeXML/Package/TeX.pool.ltxml...
(Loading /opt/local/lib/perl5/vendor_perl/5.28/LaTeXML/Package/eTeX.pool.ltxml... 0.01 sec)
(Loading /opt/local/lib/perl5/vendor_perl/5.28/LaTeXML/Package/pdfTeX.pool.ltxml... 0.04 sec) 0.34 sec)

latexmlc (LaTeXML version 0.8.4)
processing started Thu Mar 12 02:57:03 2020

(Digesting TeX procl...
(Processing content /var/folders/0n/1kjgmjg11ts77y0b7gzgm1fr0000gn/T/ykzaeDLdgT/procl.tex...
(Loading /opt/local/lib/perl5/vendor_perl/5.28/LaTeXML/Package/LaTeX.pool.ltxml... 0.22 sec)
Info:unexpected:\documentstyle Entering LaTeX 2.09 Compatibility mode
	at procl.tex; line 5 col 0 - line 5 col 30

(Loading /opt/local/lib/perl5/vendor_perl/5.28/LaTeXML/Package/article.cls.ltxml... 0.01 sec)
	at procl.tex; line 6 col 0 - line 6 col 1
	Anticipate undefined macros or environments
	search paths are /var/folders/0n/1kjgmjg11ts77y0b7gzgm1fr0000gn/T/ykzaeDLdgT, /Volumes/BRIENNAKH/Thesis/notebooks, /Volumes/BRIENNAKH/T

Collect preprint filepaths:

In [2]:
# These are the three module-level lists that collect_all_preprints() assigns.
# NOTE(review): a `global` statement at module level has no effect; the
# declarations inside the function are the ones that matter.
global empty_submissions
global preprints 
global corrupt_submissions 

def collect_all_preprints(base_path='latex'):
    '''
    Collects the filepaths for all preprints within
    the ./latex/ folder (or another base folder).

    The folder is expected to be laid out as
    <base_path>/<tar folder>/<submission folder>/**/*.tex.

    Nothing is returned. The results are stored in three
    module-level lists, which are reset on every call:

    preprints : list of str
        Path of the main TEX file of each submission
    empty_submissions : list of str
        Names of submission folders without any .tex file
    corrupt_submissions : list of str
        Names of submission folders in which no main
        TEX file could be identified

    Summary counts are printed at the end.

    Parameters
    ----------
    base_path : str
        Folder that holds one sub-folder per tar file
    '''

    # Initialize variables
    global empty_submissions, preprints, corrupt_submissions
    empty_submissions = []
    preprints = []
    corrupt_submissions = []
    submission_count = 0
    texfile_count = 0

    # Walk through tar directories (sorted so that the order is reproducible)
    for tar_folder in sorted(os.listdir(base_path)):

        # If current path isn't a directory, skip
        tar_path = base_path + '/' + tar_folder
        if not os.path.isdir(tar_path):
            continue

        # Walk through each submission directory
        for submission in sorted(os.listdir(tar_path)):

            # If current path isn't a directory, skip
            submission_path = tar_path + '/' + submission
            if not os.path.isdir(submission_path):
                continue
            submission_count += 1

            arxiv_id = submission  # used to note empty or corrupt submissions

            # If submission is empty, note & skip.
            # glob.escape keeps special characters in the folder names literal.
            texs = glob.glob(glob.escape(submission_path) + '/**/*.tex', recursive=True)
            texfile_count += len(texs)
            if not texs:
                empty_submissions.append(arxiv_id)
                continue

            # Otherwise get the preprint
            preprint_path = identify_preprint(submission_path, texs)
            if preprint_path:
                preprints.append(preprint_path)
            else:
                corrupt_submissions.append(arxiv_id)

    print('TEX files: ' + str(texfile_count))
    print('Submissions: ' + str(submission_count))
    print('Preprints: ' + str(len(preprints)))
    print('Empty submissions: ' + str(len(empty_submissions)))
    print('Potentially corrupt submissions: ' + str(len(corrupt_submissions)))
    

def identify_preprint(submission_path, texs):
    '''
    Identifies the preprint within a given submission.

    Parameters
    ----------
    submission_path : str
        Filepath to submission directory
    texs : list of str
        Filepaths to all TEX files within submission directory

    Returns
    -------
    str or None
        Filepath of the main TEX file, or None if none of the
        files looks like a main file.
    '''
    # If submission contains only one file, this is the preprint
    if len(texs) == 1:
        return texs[0]

    # If submission contains ms.tex or main.tex, this is the preprint.
    # texs holds filepaths, so the file names are compared rather than
    # the whole entries (which would never equal 'ms.tex').
    for main_name in ('ms.tex', 'main.tex'):
        for tex_path in texs:
            if os.path.basename(tex_path) != main_name:
                continue
            if tex_path == main_name:
                # A bare file name is resolved against the submission directory
                return submission_path + '/' + main_name
            return tex_path

    # Otherwise, iterate through each .tex looking for \documentclass or \documentstyle.
    # Files are read as bytes because their encoding is unknown, and line by
    # line so that scanning stops at the first hit.
    marker = re.compile(rb'\\documentclass|\\documentstyle')
    for tex_path in texs:
        with open(tex_path, 'rb') as f:
            if any(marker.search(line) for line in f):
                return tex_path

    return None

In [3]:
# Fill the module-level lists (preprints, empty_submissions,
# corrupt_submissions) and print the summary counts
collect_all_preprints()

TEX files: 125484
Submissions: 89908
Preprints: 89630
Empty submissions: 271
Potentially corrupt submissions: 7


View arXiv ids for the potentially corrupt submissions: 

In [4]:
# Submissions for which identify_preprint() found no main TEX file
corrupt_submissions

['1105.1087',
 '1211.4277',
 '1304.7762',
 '1308.6483',
 '1409.3422',
 '1606.06791',
 '1607.01189']

The website [arXiv Vanity](https://www.arxiv-vanity.com/) is unable to render these corrupt preprints either, which suggests that there is something wrong with their TEX structure. Since there are only a few and I don't want to bother with their PDFs, we will skip these preprints for now. 

## 3. Convert each preprint:

In [4]:
def get_outpath(tex_path):
    '''
    Maps a TEX filepath to the filepath of its XML counterpart.

    The third component of tex_path (index 2) is used as the
    arXiv id; the result is xml/<arXiv id>.xml.
    '''
    arxiv_id = pathlib.Path(tex_path).parts[2]
    return 'xml/{}.xml'.format(arxiv_id)

def get_preprints_to_convert():
    '''
    Returns a list of strings. Each string
    is a path to a TEX file from the module-level
    `preprints` list that has not yet been converted to XML.

    The result is also stored in the module-level
    `preprints_to_convert`, and its length is printed.
    '''

    global preprints_to_convert

    # Only the XML output decides whether a preprint still needs work;
    # log files are not consulted.
    preprints_to_convert = [
        tex_path for tex_path in preprints
        if not os.path.isfile(get_outpath(tex_path))
    ]

    print(str(len(preprints_to_convert)) + ' preprints to be converted...')
    return preprints_to_convert

76056 preprints to be converted...
<ForkProcess(ForkPoolWorker-9, started daemon)> is converting latex/arXiv_src_1008_005/1008.4260/7740.tex...
<ForkProcess(ForkPoolWorker-10, started daemon)> is converting latex/arXiv_src_1201_003/1201.1934/paper.tex...
<ForkProcess(ForkPoolWorker-11, started daemon)> is converting latex/arXiv_src_1205_007/1205.4846/19253.tex...
<ForkProcess(ForkPoolWorker-12, started daemon)> is converting latex/arXiv_src_1210_002/1210.1228/paper.tex...
Writing logfile for latex/arXiv_src_1008_005/1008.4260/7740.tex
<ForkProcess(ForkPoolWorker-9, started daemon)> is converting latex/arXiv_src_1008_006/1008.4948/XLikeParPaper-astroph.tex...
Writing logfile for latex/arXiv_src_1008_006/1008.4948/XLikeParPaper-astroph.tex
<ForkProcess(ForkPoolWorker-9, started daemon)> is converting latex/arXiv_src_1009_001/1009.0024/ions6.tex...
Writing logfile for latex/arXiv_src_1205_007/1205.4846/19253.tex
<ForkProcess(ForkPoolWorker-11, started daemon)> is converting latex/arXiv_sr

Writing logfile for latex/arXiv_src_1201_003/1201.2111/klpbeijing.tex
<ForkProcess(ForkPoolWorker-10, started daemon)> is converting latex/arXiv_src_1201_003/1201.2113/errata.tex...
Writing logfile for latex/arXiv_src_1205_007/1205.4897/moustakidis.tex
<ForkProcess(ForkPoolWorker-11, started daemon)> is converting latex/arXiv_src_1205_007/1205.4901/hh30_paper.tex...
Writing logfile for latex/arXiv_src_1201_003/1201.2113/errata.tex
<ForkProcess(ForkPoolWorker-10, started daemon)> is converting latex/arXiv_src_1201_003/1201.2119/ARA_Review_final_revised.tex...
Writing logfile for latex/arXiv_src_1210_002/1210.1346/lofar-virgo.tex
<ForkProcess(ForkPoolWorker-12, started daemon)> is converting latex/arXiv_src_1210_002/1210.1359/VelaX.tex...
Writing logfile for latex/arXiv_src_1201_003/1201.2119/ARA_Review_final_revised.tex
<ForkProcess(ForkPoolWorker-10, started daemon)> is converting latex/arXiv_src_1201_003/1201.2122/Tunka133-archiv.tex...
Writing logfile for latex/arXiv_src_1302_003/130

Writing logfile for latex/arXiv_src_1205_007/1205.4989/abbrev.tex
<ForkProcess(ForkPoolWorker-11, started daemon)> is converting latex/arXiv_src_1205_007/1205.5006/ms.tex...
Writing logfile for latex/arXiv_src_1201_003/1201.2172/msastro.tex
<ForkProcess(ForkPoolWorker-10, started daemon)> is converting latex/arXiv_src_1201_003/1201.2174/ms.tex...
Writing logfile for latex/arXiv_src_1205_007/1205.5006/ms.tex
<ForkProcess(ForkPoolWorker-11, started daemon)> is converting latex/arXiv_src_1205_007/1205.5007/Thesis.tex...
<ForkProcess(ForkPoolWorker-12, started daemon)> is converting latex/arXiv_src_1306_007/1306.3958/nicsmg.revised.v2.tex...
Writing logfile for latex/arXiv_src_1201_003/1201.2174/ms.tex
<ForkProcess(ForkPoolWorker-10, started daemon)> is converting latex/arXiv_src_1201_003/1201.2175/forastroph.tex...
Writing logfile for latex/arXiv_src_1302_004/1302.2147/mdot-13_04_27.tex
<ForkProcess(ForkPoolWorker-9, started daemon)> is converting latex/arXiv_src_1302_004/1302.2148/uv7.te

This helped: https://jarrodmcclean.com/simple-bash-parallel-commands-in-python/

If this is started after many files have already been converted, it will take a while to get going: every preprint is first checked for an existing XML file before the ones that still need converting are reached.

Afterwards, compare the logs and xml folders: any submission that has a log file but no XML file failed to convert.

https://github.com/hopper-project/hoptex/search?q=generate_sanitized_document&unscoped_q=generate_sanitized_document

First I need to confirm the main file in each repository. 
- If it doesn't contain a .bbl file, I need to add it to the bbl_lack folder. Later. Set aside and skip.
- If it doesn't contain a file, I need to retrieve it again. Later. Set aside and skip. 

I will look at each submission folder, check xml to see if a file exists with its name. If not, I will go into the submission folder to check each file if it contains \\documentclass. If it does, grab it and convert it. Break out of loop. 

In [165]:
def guess_extension_from_headers(h):
    """
    Map the headers of an ArXiV e-print response to a file extension.

    Looks at the 'content-type' and 'content-encoding' entries of h and
    returns '.pdf', '.tex', '.ps.gz', '.tar.gz' or '.dvi.gz'; returns None
    when the combination is not recognised.
    Based on: https://arxiv.org/help/mimetypes
    """
    content_type = h.get('content-type')
    is_gzipped = h.get('content-encoding') == 'x-gzip'

    if content_type == 'application/pdf':
        return '.pdf'

    # content-encoding is x-gzip but this appears to normally be a lie - it's
    # just plain text, so the encoding is not consulted for this type
    if content_type == 'application/x-eprint':
        return '.tex'

    # The remaining types only count when served gzipped
    if is_gzipped:
        gzipped_types = (
            ('application/postscript', '.ps.gz'),
            ('application/x-eprint-tar', '.tar.gz'),
            ('application/x-dvi', '.dvi.gz'),
        )
        for known_type, extension in gzipped_types:
            if content_type == known_type:
                return extension

    return None

def arxiv_id_to_source_url(arxiv_id):
    """
    Return the e-print URL for the given arXiv id.

    This URL is normally a tarball, but sometimes something else.
    ArXiV provides a /src/ URL which always serves up a tarball,
    but if we used this, we'd have to untar the file to figure out
    whether it's renderable or not. By using the /e-print/ endpoint
    we can figure out straight away whether we should bother rendering
    it or not.
    https://arxiv.org/help/mimetypes has more info
    """
    e_print_endpoint = 'https://arxiv.org/e-print/'
    source_url = e_print_endpoint + arxiv_id
    return source_url

def download_source_file(arxiv_id):
    """
    Download the source of this paper and save it in the working directory.

    The file is written as <arxiv_id><extension>, where the extension is
    guessed from the response headers, and a confirmation line is printed.
    Nothing is returned.

    An unsuccessful HTTP status is raised by raise_for_status(), and a
    DownloadError is raised when no extension can be determined.
    """
    response = requests.get(arxiv_id_to_source_url(arxiv_id))
    response.raise_for_status()

    headers = response.headers
    extension = guess_extension_from_headers(headers)
    if not extension:
        message = ("Could not determine file extension from "
                   "headers: Content-Type: {}; "
                   "Content-Encoding: {}")
        raise DownloadError(message.format(headers.get('content-type'),
                                           headers.get('content-encoding')))

    target_name = arxiv_id + extension
    with open(target_name, 'wb+') as target:
        target.write(response.content)
        print('Created ' + arxiv_id + extension)

# Example: fetch the source of arXiv:1010.3382 into the working directory
download_source_file('1010.3382')

Created 1010.3382.tar.gz
