# Parse preprint full-texts

Contents:
1. Introduction
2. 

Check whether each downloaded .tex file contains the full text; some are just supplementary files for the submission. A file is treated as the full text if it contains \documentclass. 
Each submission folder should contain one \documentclass file. 

In [2]:
import os, sys, re, requests, subprocess, glob, multiprocessing, time, pathlib

Each submission folder may contain 1 or more .tex files. We need to identify which .tex file is the full-text preprint.

In [3]:
def get_preprints():
    """
    Collect the main .tex file of every submission found under ``latex/``.

    The expected layout is ``latex/<tar folder>/<submission folder>/...``.
    Entries that are not directories are ignored at both levels. For each
    submission folder, every ``.tex`` file below it (at any depth) is
    gathered and handed to ``getPreprint`` to pick the full-text file.

    A short summary (submission count, preprints found, submissions with
    .tex files but no identifiable preprint, submissions without any .tex
    file, total .tex count, and the ids of the unresolved submissions) is
    printed before returning.

    Returns:
        list: paths of the .tex files identified as preprints.
    """
    root = 'latex'
    n_submissions = 0
    n_texs = 0
    without_tex = []
    found = []
    unresolved = []

    for tar_name in os.listdir(root):
        tar_dir = root + '/' + tar_name
        if not os.path.isdir(tar_dir):
            continue

        for entry in os.listdir(tar_dir):
            submission_dir = tar_dir + '/' + entry
            if not os.path.isdir(submission_dir):
                continue

            # Only real submission folders are counted.
            n_submissions += 1
            arxiv_id = os.path.basename(submission_dir)
            tex_files = glob.glob(submission_dir + '/**/*.tex', recursive=True)
            n_texs += len(tex_files)

            # Nothing to parse in this submission: remember it and move on.
            if len(tex_files) == 0:
                without_tex.append(arxiv_id)
                continue

            chosen = getPreprint(submission_dir, tex_files)
            if chosen:
                found.append(chosen)
            else:
                unresolved.append(arxiv_id)

    summary = (
        ('Submissions: ', n_submissions),
        ('Preprints: ', len(found)),
        ('Non-empty submissions missing preprints: ', len(unresolved)),
        ('Empty submissions: ', len(without_tex)),
        ('Texs: ', n_texs),
    )
    for label, value in summary:
        print(label + str(value))
    print(str(unresolved))

    return found

def getPreprint(submission_path, texs):
    """
    Pick the full-text .tex file of a submission.

    Selection order:
      1. If there is exactly one .tex file, it is the preprint.
      2. A file named ``ms.tex`` (preferred) or ``main.tex``.
      3. The first file containing ``\\documentclass`` or ``\\documentstyle``.

    Args:
        submission_path: path of the submission folder.
        texs: list of .tex paths belonging to the submission (as returned by
            ``glob.glob``, i.e. including the folder part).

    Returns:
        The path of the chosen file, or None when no candidate is found.
    """
    if len(texs) == 1:
        return texs[0]

    # The entries of `texs` are paths, not bare file names, so the well-known
    # names have to be compared against each path's basename. Comparing the
    # bare name against the list itself never matched.
    first_by_name = {}
    for tex_path in texs:
        first_by_name.setdefault(os.path.basename(tex_path), tex_path)

    for main_name in ('ms.tex', 'main.tex'):
        if main_name in texs:
            # Caller passed bare file names: keep the historical result.
            return submission_path + '/' + main_name
        if main_name in first_by_name:
            return first_by_name[main_name]

    # Otherwise scan each file for a document preamble. Files are read as
    # bytes so that unknown encodings cannot raise, and line by line so that
    # the scan stops at the first hit.
    for tex_path in texs:
        with open(tex_path, 'rb') as f:
            for line in f:
                if b'\\documentclass' in line or b'\\documentstyle' in line:
                    return tex_path

    return None

In [4]:
# Scan every submission folder under latex/ and keep the paths of the main .tex files.
preprints = get_preprints()

Submissions: 89908
Preprints: 89630
Non-empty submissions missing preprints: 7
Empty submissions: 271
Texs: 125484
['1105.1087', '1211.4277', '1304.7762', '1308.6483', '1409.3422', '1606.06791', '1607.01189']


In [5]:
# Number of preprint paths collected by get_preprints().
len(preprints)

89630

arxiv-vanity is unable to render these 7 papers either, so I'll just leave them for now.

Convert each preprint via multiprocessing.

Before you can begin multiprocessing, you need to pick which sections of code to multiprocess. These sections of code must meet the following criteria:

1. Must not be reliant on previous outcomes. True.

2. Does not need to be executed in a particular order. True.

3. Does not return anything that would need to be accessed later in the code. True.

In [None]:
def convert_to_xml(tex_path):
    '''
    Convert one TEX file to XML by running ``latexml``.

    The output is written to ``xml/<third path component>.xml``. Nothing is
    converted when that file already exists or when the input does not have
    a ``.tex`` extension. Progress messages are printed; nothing is returned.
    '''
    time.sleep(1)
    print('--> Conversion beginning for {} to XML'.format(tex_path))

    tex_name = os.path.basename(tex_path)
    extension = os.path.splitext(tex_name)[1]
    # For latex/<tar>/<submission>/..., component 2 is the submission folder.
    xml_path = 'xml/' + pathlib.Path(tex_path).parts[2] + '.xml'

    if os.path.exists(xml_path):
        print(tex_name + ' has already been converted.')
        return
    if extension != '.tex':
        return

    subprocess.call(['latexml', '--dest=' + xml_path, tex_path])
    print('--> Conversion complete for {} to XML'.format(tex_path))

def start_conversion():
    """
    Convert every path in the module-level ``preprints`` list with a pool of
    three worker processes, printing one progress line per finished task.

    The value printed at the start of each progress line is whatever
    ``convert_to_xml`` returned for that task.
    """
    starttime = time.time()
    # maxtasksperchild=1 replaces each worker after a single task to prevent
    # memory blow out.
    pool = multiprocessing.Pool(processes=3, maxtasksperchild=1)
    try:
        # imap_unordered with chunksize=1 hands out one path at a time and
        # yields results as soon as they are ready, instead of building the
        # whole result list in memory.
        for result in pool.imap_unordered(convert_to_xml, preprints, chunksize=1):
            print("{} (Time elapsed: {}s)".format(result, int(time.time() - starttime)))
    except BaseException:
        # Interrupted or failed: stop the workers instead of waiting for the
        # remaining tasks.
        pool.terminate()
        raise
    else:
        # close() must come before join(); join() on a running pool raises.
        pool.close()
    finally:
        pool.join()

# Run the conversion over every path in `preprints`.
start_conversion()

--> Conversion beginning for latex/arXiv_src_1008_005/1008.3750/Lundqvist_pwn0540_corrected.tex to XML
--> Conversion beginning for latex/arXiv_src_1008_005/1008.3737/stats_ms4_pdf.tex to XML
--> Conversion beginning for latex/arXiv_src_1008_005/1008.3740/nstarB-f2.tex to XML
stats_ms4_pdf.tex has already been converted.
nstarB-f2.tex has already been converted.
Lundqvist_pwn0540_corrected.tex has already been converted.
None (Time elapsed: 1s)
None (Time elapsed: 1s)
None (Time elapsed: 1s)
--> Conversion beginning for latex/arXiv_src_1008_005/1008.3764/ms.tex to XML
--> Conversion beginning for latex/arXiv_src_1008_005/1008.3790/buerzleetal.tex to XML
ms.tex has already been converted.
buerzleetal.tex has already been converted.
--> Conversion beginning for latex/arXiv_src_1008_005/1008.3791/hdisc_grl.tex to XML
hdisc_grl.tex has already been converted.
None (Time elapsed: 2s)
None (Time elapsed: 2s)
None (Time elapsed: 2s)
--> Conversion beginning for latex/arXiv_src_1008_005/1008.3

None (Time elapsed: 19s)
None (Time elapsed: 19s)
None (Time elapsed: 19s)
--> Conversion beginning for latex/arXiv_src_1008_005/1008.4058/ngc4631.tex to XML
ngc4631.tex has already been converted.
--> Conversion beginning for latex/arXiv_src_1008_005/1008.4061/LE.tex to XML
--> Conversion beginning for latex/arXiv_src_1008_005/1008.4082/CBASS_SPIE_arxiv.tex to XML
LE.tex has already been converted.
CBASS_SPIE_arxiv.tex has already been converted.
None (Time elapsed: 20s)
None (Time elapsed: 20s)
None (Time elapsed: 20s)
--> Conversion beginning for latex/arXiv_src_1008_005/1008.4086/EvolutionaryEquationsI.tex to XML
--> Conversion beginning for latex/arXiv_src_1008_005/1008.4089/sgr0501_egogus.tex to XML
EvolutionaryEquationsI.tex has already been converted.
sgr0501_egogus.tex has already been converted.
None (Time elapsed: 21s)
None (Time elapsed: 21s)
--> Conversion beginning for latex/arXiv_src_1008_005/1008.4090/ms_rev2.tex to XML
ms_rev2.tex has already been converted.
None (Time

accretion.tex has already been converted.
None (Time elapsed: 40s)
--> Conversion beginning for latex/arXiv_src_1008_005/1008.4335/eisner_spie2010.tex to XML
eisner_spie2010.tex has already been converted.
None (Time elapsed: 41s)
--> Conversion beginning for latex/arXiv_src_1008_005/1008.4340/ms2.tex to XML
ms2.tex has already been converted.
None (Time elapsed: 41s)
--> Conversion beginning for latex/arXiv_src_1008_005/1008.4343/10yrsCha_ptesta.tex to XML
10yrsCha_ptesta.tex has already been converted.
None (Time elapsed: 42s)
--> Conversion beginning for latex/arXiv_src_1008_005/1008.4344/chalonge2.tex to XML
chalonge2.tex has already been converted.
None (Time elapsed: 42s)
--> Conversion complete for latex/arXiv_src_1008_005/1008.4260/7740.tex to XML
None (Time elapsed: 42s)
--> Conversion beginning for latex/arXiv_src_1008_005/1008.4347/ms.tex to XML
ms.tex has already been converted.
None (Time elapsed: 43s)
--> Conversion beginning for latex/arXiv_src_1008_005/1008.4352/14074.t

None (Time elapsed: 60s)
--> Conversion beginning for latex/arXiv_src_1008_005/1008.4576/akari-mdwarfs-astroph.tex to XML
akari-mdwarfs-astroph.tex has already been converted.
None (Time elapsed: 60s)
--> Conversion beginning for latex/arXiv_src_1008_005/1008.4578/WBMsmallN.tex to XML
WBMsmallN.tex has already been converted.
None (Time elapsed: 60s)
--> Conversion beginning for latex/arXiv_src_1008_005/1008.4581/paper.tex to XML
paper.tex has already been converted.
None (Time elapsed: 61s)
--> Conversion beginning for latex/arXiv_src_1008_005/1008.4582/hr8799_astroph.tex to XML
hr8799_astroph.tex has already been converted.
None (Time elapsed: 61s)
--> Conversion beginning for latex/arXiv_src_1008_005/1008.4583/paper_halo_model_january_31.tex to XML
paper_halo_model_january_31.tex has already been converted.
None (Time elapsed: 61s)
--> Conversion beginning for latex/arXiv_src_1008_005/1008.4584/Hayward_C.tex to XML
Hayward_C.tex has already been converted.
None (Time elapsed: 62s)
-

Accepted_MS_arXive.tex has already been converted.
None (Time elapsed: 78s)
--> Conversion beginning for latex/arXiv_src_1008_006/1008.4842/SXDS_LAE_Accepted_arXive.tex to XML
SXDS_LAE_Accepted_arXive.tex has already been converted.
None (Time elapsed: 79s)
--> Conversion beginning for latex/arXiv_src_1008_006/1008.4848/1008.4848.tex to XML
1008.4848.tex has already been converted.
None (Time elapsed: 79s)
--> Conversion beginning for latex/arXiv_src_1008_006/1008.4863/cme.tex to XML
cme.tex has already been converted.
None (Time elapsed: 79s)
--> Conversion beginning for latex/arXiv_src_1008_006/1008.4866/cmm_2010_submitted.tex to XML
cmm_2010_submitted.tex has already been converted.
None (Time elapsed: 80s)
--> Conversion beginning for latex/arXiv_src_1008_006/1008.4867/MgRadCoeff.tex to XML
MgRadCoeff.tex has already been converted.
None (Time elapsed: 80s)
--> Conversion beginning for latex/arXiv_src_1008_006/1008.4882/paperCesar-resub2.tex to XML
paperCesar-resub2.tex has already

ms.tex has already been converted.
None (Time elapsed: 98s)
--> Conversion beginning for latex/arXiv_src_1008_006/1008.5152/IAUS271_GKetal2010.tex to XML
IAUS271_GKetal2010.tex has already been converted.
None (Time elapsed: 99s)
--> Conversion beginning for latex/arXiv_src_1008_006/1008.5157/erinflation.tex to XML
erinflation.tex has already been converted.
None (Time elapsed: 99s)
--> Conversion beginning for latex/arXiv_src_1008_006/1008.5159/1008.5159.tex to XML
1008.5159.tex has already been converted.
None (Time elapsed: 100s)
--> Conversion beginning for latex/arXiv_src_1008_006/1008.5164/Fermi_NGC1068_NGC4945.tex to XML
Fermi_NGC1068_NGC4945.tex has already been converted.
None (Time elapsed: 100s)
--> Conversion beginning for latex/arXiv_src_1008_006/1008.5174/ms_rev2_astroph.tex to XML
ms_rev2_astroph.tex has already been converted.
None (Time elapsed: 100s)
--> Conversion beginning for latex/arXiv_src_1008_006/1008.5175/BoehmSilkEnsslin.tex to XML
BoehmSilkEnsslin.tex has al

None (Time elapsed: 117s)
--> Conversion beginning for latex/arXiv_src_1009_001/1009.0027/Peroux_SINFONI_II.tex to XML
Peroux_SINFONI_II.tex has already been converted.
None (Time elapsed: 118s)
--> Conversion beginning for latex/arXiv_src_1009_001/1009.0030/paper_v13.tex to XML
paper_v13.tex has already been converted.
None (Time elapsed: 119s)
--> Conversion beginning for latex/arXiv_src_1009_001/1009.0038/ms_ApJ.tex to XML
ms_ApJ.tex has already been converted.
None (Time elapsed: 119s)
--> Conversion beginning for latex/arXiv_src_1009_001/1009.0039/Proceedings.tex to XML
Proceedings.tex has already been converted.
None (Time elapsed: 120s)
--> Conversion beginning for latex/arXiv_src_1009_001/1009.0049/ms_astroph.tex to XML
ms_astroph.tex has already been converted.
None (Time elapsed: 120s)
--> Conversion beginning for latex/arXiv_src_1009_001/1009.0054/Nitrogen_A-F_in_SG_FINAL_ED.tex to XML
Nitrogen_A-F_in_SG_FINAL_ED.tex has already been converted.
None (Time elapsed: 121s)
--> 

--> Conversion beginning for latex/arXiv_src_1009_001/1009.0334/paper.tex to XML
paper.tex has already been converted.
--> Conversion beginning for latex/arXiv_src_1009_001/1009.0335/ms.tex to XML
ms.tex has already been converted.
None (Time elapsed: 139s)
None (Time elapsed: 139s)
--> Conversion beginning for latex/arXiv_src_1009_001/1009.0338/ms.tex to XML
ms.tex has already been converted.
None (Time elapsed: 140s)
--> Conversion beginning for latex/arXiv_src_1009_001/1009.0341/morii_submitted_for_arxiv.tex to XML
morii_submitted_for_arxiv.tex has already been converted.
--> Conversion beginning for latex/arXiv_src_1009_001/1009.0344/ms.tex to XML
ms.tex has already been converted.
None (Time elapsed: 140s)
None (Time elapsed: 140s)
--> Conversion beginning for latex/arXiv_src_1009_001/1009.0345/arena2010lopes.tex to XML
arena2010lopes.tex has already been converted.
None (Time elapsed: 141s)
--> Conversion beginning for latex/arXiv_src_1009_001/1009.0346/arena2010reasmgmr.tex to X

moustakidis.tex has already been converted.
None (Time elapsed: 158s)
--> Conversion beginning for latex/arXiv_src_1009_001/1009.0632/fnl_constraints_casaponsa_1st_revision.tex to XML
--> Conversion beginning for latex/arXiv_src_1009_001/1009.0645/ediscs_sigma_sissa.tex to XML
ediscs_sigma_sissa.tex has already been converted.
None (Time elapsed: 158s)
--> Conversion beginning for latex/arXiv_src_1009_001/1009.0646/pap_REV2.tex to XML
pap_REV2.tex has already been converted.
None (Time elapsed: 159s)
--> Conversion beginning for latex/arXiv_src_1009_001/1009.0649/ms_alt.tex to XML
ms_alt.tex has already been converted.
None (Time elapsed: 159s)
--> Conversion beginning for latex/arXiv_src_1009_001/1009.0658/struve.tex to XML
struve.tex has already been converted.
None (Time elapsed: 160s)
--> Conversion beginning for latex/arXiv_src_1009_001/1009.0662/ms_v9.tex to XML
ms_v9.tex has already been converted.
None (Time elapsed: 160s)
--> Conversion beginning for latex/arXiv_src_1009_001/1

None (Time elapsed: 178s)
--> Conversion beginning for latex/arXiv_src_1009_002/1009.0882/NewVersion_v4.tex to XML
NewVersion_v4.tex has already been converted.
None (Time elapsed: 178s)
--> Conversion beginning for latex/arXiv_src_1009_002/1009.0894/ms.tex to XML
ms.tex has already been converted.
None (Time elapsed: 178s)
--> Conversion beginning for latex/arXiv_src_1009_002/1009.0902/1009.0902.tex to XML
1009.0902.tex has already been converted.
None (Time elapsed: 179s)
--> Conversion beginning for latex/arXiv_src_1009_002/1009.0903/grazina2.tex to XML
grazina2.tex has already been converted.
None (Time elapsed: 179s)
--> Conversion beginning for latex/arXiv_src_1009_002/1009.0916/wimpfvrev.tex to XML
wimpfvrev.tex has already been converted.
None (Time elapsed: 179s)
--> Conversion beginning for latex/arXiv_src_1009_002/1009.0920/report.tex to XML
report.tex has already been converted.
None (Time elapsed: 180s)
--> Conversion beginning for latex/arXiv_src_1009_002/1009.0924/ms.tex

OApapereffnew.tex has already been converted.
None (Time elapsed: 198s)
--> Conversion beginning for latex/arXiv_src_1009_002/1009.1154/paper_final.tex to XML
paper_final.tex has already been converted.
None (Time elapsed: 198s)
--> Conversion beginning for latex/arXiv_src_1009_002/1009.1158/time_resolved_astroph.tex to XML
time_resolved_astroph.tex has already been converted.
None (Time elapsed: 199s)
--> Conversion beginning for latex/arXiv_src_1009_002/1009.1163/paper.v5.tex to XML
paper.v5.tex has already been converted.
None (Time elapsed: 199s)
--> Conversion beginning for latex/arXiv_src_1009_002/1009.1165/RedLBVinM33.tex to XML
RedLBVinM33.tex has already been converted.
None (Time elapsed: 199s)
--> Conversion beginning for latex/arXiv_src_1009_002/1009.1167/Sepinsky_Mykonos.tex to XML
Sepinsky_Mykonos.tex has already been converted.
None (Time elapsed: 200s)
--> Conversion beginning for latex/arXiv_src_1009_002/1009.1171/iipeg2.tex to XML
iipeg2.tex has already been converted

cosmonest_modelaveraging_resubmit.tex has already been converted.
None (Time elapsed: 218s)
--> Conversion beginning for latex/arXiv_src_1009_002/1009.1395/DLAs_rev_v1.tex to XML
DLAs_rev_v1.tex has already been converted.
None (Time elapsed: 218s)
--> Conversion beginning for latex/arXiv_src_1009_002/1009.1396/varying_alpha2.tex to XML
varying_alpha2.tex has already been converted.
None (Time elapsed: 219s)
--> Conversion beginning for latex/arXiv_src_1009_002/1009.1397/binaries2.tex to XML
binaries2.tex has already been converted.
None (Time elapsed: 219s)
--> Conversion beginning for latex/arXiv_src_1009_002/1009.1399/ms.tex to XML
ms.tex has already been converted.
None (Time elapsed: 219s)
--> Conversion beginning for latex/arXiv_src_1009_002/1009.1400/STNbody.tex to XML
STNbody.tex has already been converted.
None (Time elapsed: 220s)
--> Conversion beginning for latex/arXiv_src_1009_002/1009.1401/figures.tex to XML
figures.tex has already been converted.
None (Time elapsed: 220s

FLMKP_final.tex has already been converted.
None (Time elapsed: 238s)
--> Conversion beginning for latex/arXiv_src_1009_002/1009.1633/ms.tex to XML
ms.tex has already been converted.
None (Time elapsed: 238s)
--> Conversion beginning for latex/arXiv_src_1009_002/1009.1634/aipcheck.tex to XML
aipcheck.tex has already been converted.
None (Time elapsed: 239s)
--> Conversion beginning for latex/arXiv_src_1009_002/1009.1636/Pilia.tex to XML
Pilia.tex has already been converted.
None (Time elapsed: 239s)
--> Conversion beginning for latex/arXiv_src_1009_002/1009.1642/ms_emapj.tex to XML
ms_emapj.tex has already been converted.
None (Time elapsed: 239s)
--> Conversion beginning for latex/arXiv_src_1009_002/1009.1644/monit1.tex to XML
monit1.tex has already been converted.
None (Time elapsed: 240s)
--> Conversion beginning for latex/arXiv_src_1009_002/1009.1650/Investigation_rev23.tex to XML
Investigation_rev23.tex has already been converted.
None (Time elapsed: 240s)
--> Conversion beginning

ngc604eldridge.tex has already been converted.
None (Time elapsed: 262s)
--> Conversion beginning for latex/arXiv_src_1009_003/1009.1872/Planet_search_HST.tex to XML
Planet_search_HST.tex has already been converted.
None (Time elapsed: 263s)
--> Conversion beginning for latex/arXiv_src_1009_003/1009.1873/15729.tex to XML
15729.tex has already been converted.
None (Time elapsed: 263s)
--> Conversion beginning for latex/arXiv_src_1009_003/1009.1874/revision.tex to XML
revision.tex has already been converted.
None (Time elapsed: 264s)
--> Conversion beginning for latex/arXiv_src_1009_003/1009.1875/murphy_retreat_arXiv_v2.tex to XML
murphy_retreat_arXiv_v2.tex has already been converted.
None (Time elapsed: 264s)
--> Conversion beginning for latex/arXiv_src_1009_003/1009.1879/ms4small.tex to XML
ms4small.tex has already been converted.
None (Time elapsed: 265s)
--> Conversion beginning for latex/arXiv_src_1009_003/1009.1882/ms.tex to XML
ms.tex has already been converted.
None (Time elapse

--> Conversion beginning for latex/arXiv_src_1009_003/1009.2082/pipe_apj_v2.tex to XML
pipe_apj_v2.tex has already been converted.
None (Time elapsed: 299s)
--> Conversion beginning for latex/arXiv_src_1009_003/1009.2089/1009.2089.tex to XML
1009.2089.tex has already been converted.
None (Time elapsed: 300s)
--> Conversion beginning for latex/arXiv_src_1009_003/1009.2091/ms.tex to XML
ms.tex has already been converted.
None (Time elapsed: 300s)
--> Conversion beginning for latex/arXiv_src_1009_003/1009.2093/EFT_of_multifield_inflation_arXive_resubm.tex to XML
EFT_of_multifield_inflation_arXive_resubm.tex has already been converted.
None (Time elapsed: 301s)
--> Conversion beginning for latex/arXiv_src_1009_003/1009.2094/mag-energy.tex to XML
mag-energy.tex has already been converted.
None (Time elapsed: 301s)
--> Conversion beginning for latex/arXiv_src_1009_003/1009.2098/Skinner_2010_06.tex to XML
Skinner_2010_06.tex has already been converted.
None (Time elapsed: 302s)
--> Conversion

muth_etal_IAU271.tex has already been converted.
None (Time elapsed: 335s)
--> Conversion beginning for latex/arXiv_src_1009_003/1009.2410/Crockett_area.tex to XML
Crockett_area.tex has already been converted.
None (Time elapsed: 337s)
--> Conversion beginning for latex/arXiv_src_1009_003/1009.2427/Anisotropic_Imbalanced_Elsasser_Modes_arXiv_2.tex to XML
Anisotropic_Imbalanced_Elsasser_Modes_arXiv_2.tex has already been converted.
None (Time elapsed: 338s)
--> Conversion beginning for latex/arXiv_src_1009_003/1009.2430/Bell_NGC7213_submission_draftV3.tex to XML
Bell_NGC7213_submission_draftV3.tex has already been converted.
None (Time elapsed: 339s)
--> Conversion beginning for latex/arXiv_src_1009_003/1009.2431/seabroke_elsa_proc_2010.tex to XML
seabroke_elsa_proc_2010.tex has already been converted.
None (Time elapsed: 340s)
--> Conversion beginning for latex/arXiv_src_1009_003/1009.2448/1009.2448.tex to XML
1009.2448.tex has already been converted.
None (Time elapsed: 341s)
--> Conv

In [None]:
# THIS ONE

def get_outpath(tex_path):
    """
    Return the XML output path for a .tex file.

    The third component of ``tex_path`` is used as the file stem, so for
    ``latex/<tar folder>/<submission folder>/...`` the result is
    ``xml/<submission folder>.xml``. Raises IndexError when the path has
    fewer than three components.
    """
    submission_id = pathlib.Path(tex_path).parts[2]
    return ''.join(('xml/', submission_id, '.xml'))

def convert_to_xml(inpath):
    """
    Convert one .tex file to XML with ``latexml``.

    The destination comes from ``get_outpath``; if that file already exists
    the conversion is skipped. The ``latexml`` process is given 240 seconds
    and is killed when it exceeds that. Progress and failures are reported
    with ``print``; the function always returns None and does not re-raise
    ordinary exceptions, so one bad file does not stop a batch.

    Args:
        inpath: path of the .tex file to convert.
    """
    time.sleep(0.1) # need the pause for KeyboardInterrupt to work...idk why
    outpath = get_outpath(inpath)

    if os.path.isfile(outpath):
        return

    proc = None
    try:
        print('Converting {}...'.format(inpath))
        proc = subprocess.Popen(['latexml', '--dest=' + outpath, inpath],
                                stderr=subprocess.PIPE,
                                stdout=subprocess.PIPE)
        # timeout catches some files that hang during conversion such as
        # latex/arXiv_src_1009_002/1009.1724/15727_eger.tex
        proc.communicate(timeout=240)
    except subprocess.TimeoutExpired:
        # communicate() does not kill the child on timeout; kill it and then
        # collect it so that no zombie process or open pipe is left behind.
        proc.kill()
        proc.communicate()
        print("--X Conversion failed - timeout: {}".format(inpath))
    except Exception as error:
        # E.g. latexml is not installed (Popen raises before `proc` is set).
        if proc is not None and proc.poll() is None:
            proc.kill()
            proc.communicate()
        print("--X Conversion failed - error: {}".format(inpath))
        print("error: {!r}".format(error))
    else:
        if proc.returncode != 0:
            print("--X Conversion failed - latexml exit code {}: {}".format(
                proc.returncode, inpath))
        else:
            print('{}: Converted!'.format(inpath))

def start():
    """
    Convert every path in the module-level ``preprints`` list using one
    worker process per CPU.

    Ctrl-C stops the workers and exits the interpreter with status 1. Any
    other exception also stops the workers and is then re-raised.
    """
    worker_count = multiprocessing.cpu_count()
    # important to set maxtasksperchild, cuz each task takes up considerable memory
    pool = multiprocessing.Pool(processes=worker_count,
                                maxtasksperchild=1)
    print('Initialized {} workers'.format(worker_count))
    print('Beginning conversion...')

    try:
        # Results are not needed; iterating just drives the pool and lets
        # KeyboardInterrupt reach this process between tasks.
        for _ in pool.imap_unordered(convert_to_xml, preprints):
            pass
    except KeyboardInterrupt:
        pool.terminate()
        pool.join()
        # sys.exit instead of the site-provided exit(), which is meant for
        # the interactive shell and is not always available.
        sys.exit(1)
    except Exception:
        pool.terminate()
        pool.join()
        raise

    pool.close()
    pool.join()
    
# Start the conversion only when this file is run as the main module.
if __name__ == '__main__':
    start()

Initialized 4 workers
Beginning conversion...
Converting latex/arXiv_src_1008_005/1008.4260/7740.tex...
latex/arXiv_src_1008_005/1008.4260/7740.tex: Converted!
Converting latex/arXiv_src_1008_006/1008.4948/XLikeParPaper-astroph.tex...
latex/arXiv_src_1008_006/1008.4948/XLikeParPaper-astroph.tex: Converted!
Converting latex/arXiv_src_1009_001/1009.0024/ions6.tex...
Converting latex/arXiv_src_1009_001/1009.0632/fnl_constraints_casaponsa_1st_revision.tex...
latex/arXiv_src_1009_001/1009.0024/ions6.tex: Converted!
Converting latex/arXiv_src_1009_002/1009.0966/gibsonsj.tex...
latex/arXiv_src_1009_001/1009.0632/fnl_constraints_casaponsa_1st_revision.tex: Converted!
latex/arXiv_src_1009_002/1009.0966/gibsonsj.tex: Converted!
Converting latex/arXiv_src_1009_002/1009.1296/aquila_paper_arxiv.tex...
Converting latex/arXiv_src_1009_002/1009.1352/exotides.tex...
latex/arXiv_src_1009_002/1009.1296/aquila_paper_arxiv.tex: Converted!
latex/arXiv_src_1009_002/1009.1352/exotides.tex: Converted!
Converti

NOTE:
If I use `pool.map(convert_to_xml, preprints)`, I cannot kill this with a keyboard interrupt. So this is why I iterate over `pool.imap_unordered()` instead, with a timeout specified on each `latexml` subprocess. 
https://stackoverflow.com/questions/1408356/keyboard-interrupts-with-pythons-multiprocessing-pool

If I ever use pool.map accidentally - this is how to kill the stupid Medusa-spawning workers: https://stackoverflow.com/questions/25415104/kill-python-multiprocessing-pool


In [16]:
import subprocess

# Smoke test: launch latexml for a handful of files at the same time, without
# a pool, and print each process's exit code.
test = ['latex/arXiv_src_1008_005/1008.3737/stats_ms4_pdf.tex',
 'latex/arXiv_src_1008_005/1008.3740/nstarB-f2.tex',
 'latex/arXiv_src_1008_005/1008.3750/Lundqvist_pwn0540_corrected.tex',
 'latex/arXiv_src_1008_005/1008.3764/ms.tex',
 'latex/arXiv_src_1008_005/1008.3790/buerzleetal.tex']

processes = []
for inpath in test:
    outpath = get_outpath(inpath)
    # Pass the arguments as a list (no shell) so that paths containing
    # spaces or shell metacharacters are handed to latexml unchanged.
    processes.append(subprocess.Popen(['latexml', '--dest=' + outpath, inpath]))

# All processes are started before any is waited on, so they run concurrently.
exitcodes = [p.wait() for p in processes]
for exitcode in exitcodes:
    print(exitcode)


# ext = os.path.splitext(path_parts[len(path_parts)])

0
0
0
0
0


How to choose between multiprocessing, subprocess, and threading: https://stackoverflow.com/questions/2629680/deciding-among-subprocess-multiprocessing-and-thread-in-python

Attempt using more cores:

It is faster to use all 8 cores. So I will do the XML conversion using all 8 cores. 

First I need to confirm the main file in each repository. 
- If it doesn't contain a .bbl file, I need to add it to the bbl_lack folder. Later. Set aside and skip.
- If it doesn't contain a file, I need to retrieve it again. Later. Set aside and skip. 

I will look at each submission folder and check the xml folder for a file with the submission's name. If there is none, I will go into the submission folder and check each file for \\documentclass. The first file that contains it is grabbed and converted, and the loop stops. 

Now call the functions, with conversion process.

I have 8 cores: https://superuser.com/questions/1101311/how-many-cores-does-my-mac-have/1101314#1101314
4 physical and 4 logical. 

In [165]:
def guess_extension_from_headers(h):
    """
    Map the headers of an ArXiV e-print response to a file extension.

    `h` is any mapping with a ``get`` method; only the ``content-type`` and
    ``content-encoding`` entries are consulted. Returns the extension
    (including the leading dot) or None when the combination is not
    recognised.
    Based on: https://arxiv.org/help/mimetypes
    """
    mime = h.get('content-type')
    gzipped = h.get('content-encoding') == 'x-gzip'

    # These two types are decided by content-type alone.
    if mime == 'application/pdf':
        return '.pdf'
    # content-encoding is x-gzip but this appears to normally be a lie - it's
    # just plain text
    if mime == 'application/x-eprint':
        return '.tex'

    # The remaining types only count when the response is gzip-encoded.
    if gzipped:
        gzip_extensions = (
            ('application/postscript', '.ps.gz'),
            ('application/x-eprint-tar', '.tar.gz'),
            ('application/x-dvi', '.dvi.gz'),
        )
        for known_mime, extension in gzip_extensions:
            if mime == known_mime:
                return extension

    return None

def arxiv_id_to_source_url(arxiv_id):
    """
    Return the arxiv.org /e-print/ URL for the given arXiv id (a string).
    """
    # Author's note on the choice of endpoint: /e-print/ normally serves a
    # tarball but sometimes something else, whereas /src/ always serves a
    # tarball. With /src/ the file would have to be untarred before we could
    # tell whether it is renderable; with /e-print/ that can be decided
    # straight away.
    # https://arxiv.org/help/mimetypes has more info
    return ''.join(('https://arxiv.org/e-print/', arxiv_id))

class DownloadError(Exception):
    """Raised when a downloaded e-print cannot be handled."""


def download_source_file(arxiv_id):
    """
    Download the source of this paper and save it in the current directory.

    The file is written as ``<arxiv_id><extension>``, where the extension is
    derived from the response headers by ``guess_extension_from_headers``.
    The name of the created file is printed; nothing is returned.

    Raises:
        requests.HTTPError: the server answered with an error status.
        DownloadError: the response headers do not map to a known extension.
    """
    source_url = arxiv_id_to_source_url(arxiv_id)
    # Without a timeout a stalled connection would block forever.
    res = requests.get(source_url, timeout=60)
    res.raise_for_status()
    extension = guess_extension_from_headers(res.headers)
    if not extension:
        raise DownloadError("Could not determine file extension from "
                            "headers: Content-Type: {}; "
                            "Content-Encoding: {}".format(
                                res.headers.get('content-type'),
                                res.headers.get('content-encoding')))
    filename = arxiv_id + extension
    # The file is only written, never read back, so plain 'wb' is enough.
    with open(filename, 'wb') as f:
        f.write(res.content)
    print('Created ' + filename)

# Example: fetch the source of arXiv:1010.3382 into the current directory.
download_source_file('1010.3382')

Created 1010.3382.tar.gz
