# Convert preprints 

Contents:
1. Introduction
2. Extract and convert preprints
3. Analyze
4. Appendix: Retrieve individual preprints from arXiv or from the associated tar file

## 1. Introduction 

In this notebook, we convert preprints from TeX to XML, a format that simplifies downstream parsing.

Each submission in the ./latex/ folder should contain 1 or more .tex files. If the submission contains more than 1 .tex file, we identify the main file. The additional files are usually inserts for the main file. 

After collecting the filepaths for all submissions' main files, we convert them from .tex to .xml using the [latexml](https://dlmf.nist.gov/LaTeXML/) package, spreading the work across all CPU cores (4 on my machine).

All converted .xml files are stored in ./xml/.

## 2. Extract and convert preprints

Import all dependencies:

In [127]:
import os, re, subprocess, glob, multiprocessing, time, tarfile, gzip, shutil, zipfile
import pandas as pd
import numpy as np

Grab arXiv submission identifiers from the metadata, and load conversion log:

In [128]:
# Load the astro-ph metadata table. `filename_parsed` is read as str so the
# identifiers stay textual instead of being type-inferred by pandas.
metadata_df = pd.read_csv('../data/2020_03_06_arxiv_metadata_astroph/arxiv_metadata_astroph.csv', 
                         dtype={'filename_parsed': str})
# One identifier per metadata row; do_not_convert() checks membership in this list.
identifiers = list(metadata_df['filename_parsed'])
# Conversion log appended to by save_log(); do_not_convert() uses its
# 'submission' column to skip submissions that already have a row.
# NOTE(review): the log file must already exist for this read to succeed -- confirm
# how the very first run is bootstrapped.
log_df = pd.read_csv('../data/2020_03_09_extract_and_convert_submissions/conversion_log.csv')
# Show the log as the cell output.
log_df

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,submission,tarfile,type,extracted,extracted_suffix,converted,conversion_result
0,astro-ph0001001,arXiv_src_0001_001,.gz,yes,.zip,yes,\n(Loading /opt/local/lib/perl5/vendor_perl/5....
1,astro-ph0001002,arXiv_src_0001_001,.gz,yes,.zip,yes,\n(Loading /opt/local/lib/perl5/vendor_perl/5....
2,astro-ph0001003,arXiv_src_0001_001,.gz,yes,.zip,yes,\n(Loading /opt/local/lib/perl5/vendor_perl/5....
3,astro-ph0001004,arXiv_src_0001_001,.gz,yes,.zip,yes,\n(Loading /opt/local/lib/perl5/vendor_perl/5....
4,astro-ph0001005,arXiv_src_0001_001,.gz,yes,.zip,no,\n(Loading /opt/local/lib/perl5/vendor_perl/5....
...,...,...,...,...,...,...,...
15670,nlin0203056,arXiv_src_0203_001,.gz,yes,.zip,yes,\n(Loading /opt/local/lib/perl5/vendor_perl/5....
15671,nucl-th0203011,arXiv_src_0203_001,.gz,yes,.zip,yes,\n(Loading /opt/local/lib/perl5/vendor_perl/5....
15672,nucl-th0203061,arXiv_src_0203_001,.gz,yes,.zip,yes,\n(Loading /opt/local/lib/perl5/vendor_perl/5....
15673,nucl-th0203071,arXiv_src_0203_001,.gz,yes,.zip,yes,\n(Loading /opt/local/lib/perl5/vendor_perl/5....


Define utility functions:

In [129]:
# Scratch directory for extracted submissions, and the CSV that save_log() appends to.
# NOTE(review): the name `csv` would shadow the stdlib `csv` module if it were ever imported.
extraction_path = '../data/2020_03_09_extract_and_convert_submissions/temp_extractions/'
csv = '../data/2020_03_09_extract_and_convert_submissions/conversion_log.csv'

print('Astrophysics submissions: ' + str(len(identifiers)))

# NOTE(review): this `tar_dir` assignment is overwritten by the loop variable below.
tar_dir = '../data/2020_03_03_original_arxiv_tars/'
tar_dirs = ['../data/2020_03_03_original_arxiv_tars/', '../data/2020_03_07_update_tars/']
print('Tar directories: ' + str(len(tar_dirs)))

# One task per .tar file found directly inside each tar directory.
tasks = []
for tar_dir in tar_dirs: 
    tasks += [tar_dir + x for x in os.listdir(tar_dir) if x.endswith('.tar')]
print('Tars/tasks: ' + str(len(tasks)))

# Number of rows already present in the conversion log.
print('Submissions processed: ' + str(len(log_df)))

Astrophysics submissions: 267794
Tar directories: 2
Tars/tasks: 2561
Submissions processed: 15675


In [44]:
# Split the task list into 3 roughly equal parts.
# NOTE(review): `chunks` is not read by any later cell shown in this notebook;
# the main loop iterates over `tasks` directly -- confirm whether it is still needed.
chunks = np.array_split(tasks, 3) # do this in both notebooks

854

In [130]:
def get_outpath(submission_path):
    '''
    Builds the destination path of the converted XML file.

    The submission id is the basename of `submission_path` with its last
    extension stripped; the result is that id plus '.xml' inside the
    converted_xml directory.
    '''
    filename = os.path.basename(submission_path)
    stem, _extension = os.path.splitext(filename)
    xml_dir = '../data/2020_03_09_extract_and_convert_submissions/converted_xml/'
    return xml_dir + stem + '.xml'

def open_tar(tar_file):
    '''
    Opens `tar_file` if it is an existing file that tarfile recognises.

    Returns a tuple (tar, tar_name): the open TarFile and the archive's
    file name without its extension. Returns None when the path is not a
    file or is not a tar archive, so callers must check before unpacking.
    '''
    if not os.path.isfile(tar_file):
        return None
    if not tarfile.is_tarfile(tar_file):
        return None

    archive = tarfile.open(tar_file)
    archive_filename = os.path.basename(archive.name)
    stem = os.path.splitext(archive_filename)[0]
    return archive, stem


def extract(tar, submission, extracted_gz_path):
    '''
    Extracts the given submission (a TarInfo member of `tar`) to disk.

    A submission that is a gzipped tarball has its regular files repacked
    into `extracted_gz_path + '.zip'`. A submission that is not a tarball
    (a single gzipped TeX file) is decompressed to
    `extracted_gz_path + '.tex'`.

    tar -- open TarFile that contains `submission`
    submission -- TarInfo of the .gz member to extract
    extracted_gz_path -- output path without extension

    Returns a tuple (extracted, suffix): `extracted` is 'yes' and `suffix`
    is the extension ('.zip' or '.tex') that was appended to
    `extracted_gz_path`. Errors other than tarfile.ReadError propagate.
    '''
    zip_path = extracted_gz_path + '.zip'

    try:
        suffix = '.zip'
        # Mode 'w' (not 'a'): start from a fresh archive so a leftover zip
        # from an interrupted run is not merged into this one.
        with tar.extractfile(submission) as gz_obj, \
                tarfile.open(fileobj=gz_obj, mode='r|gz') as gz, \
                zipfile.ZipFile(file=zip_path, mode='w',
                                compression=zipfile.ZIP_DEFLATED) as zipf:
            for m in gz:
                # Only regular files have content; directories, links and
                # special members cannot be read as file objects here.
                if not m.isfile():
                    continue
                zipf.writestr(m.name, gz.extractfile(m).read())
    except tarfile.ReadError:
        # Not a tarball: the member is a single gzipped .tex file with no
        # extension, so decompress it directly.
        suffix = '.tex'
        # Drop a stale or partially written zip for this submission.
        if os.path.exists(zip_path):
            os.remove(zip_path)
        # Write next to where the zip would have gone, so the caller finds
        # the file at `extracted_gz_path + suffix` in both cases.
        with tar.extractfile(submission) as gz_obj, \
                gzip.GzipFile(fileobj=gz_obj, mode='rb') as f_in, \
                open(extracted_gz_path + suffix, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)

    return 'yes', suffix


def do_not_convert(submission_id):
    '''
    Tells the caller whether the given submission should be skipped.

    A submission is skipped when its id is not in the astrophysics
    `identifiers` list, or when it already has a row in the conversion
    log (`log_df`), so that it is not processed a second time.

    Returns True to skip the submission, False to process it.
    '''
    in_astro_archive = submission_id in identifiers
    in_log = submission_id in log_df['submission'].values
    return in_log or not in_astro_archive

        
def convert(submission_path):
    '''
    Converts file at passed filepath to XML,
    using LaTeXMLc.

    submission_path -- path of the extracted submission to convert

    Returns a tuple (err, converted). `err` is latexmlc's stderr decoded
    as text, or the exception message when latexmlc could not be run at
    all. `converted` is 'yes' unless stderr reports that no file was
    written or an exception occurred, in which case it is 'no'.
    '''
    outpath = get_outpath(submission_path)
    print('Converting to ' + outpath)
    # Bound before the try so the return below is valid even when Popen
    # itself raises (e.g. latexmlc is not installed).
    err = ''
    try:
        proc = subprocess.Popen(['latexmlc', '--timeout=240', '--dest=' + outpath, submission_path], stderr=subprocess.PIPE)
        _, raw_err = proc.communicate()
        # errors='replace': stderr may echo bytes that are not valid UTF-8;
        # a decode failure must not turn a finished conversion into 'no'.
        err = raw_err.decode('utf-8', errors='replace')
        # Check if file was converted successfully
        if 'Error! Did not write file' in err:
            converted = 'no'
        else:
            converted = 'yes'
    except Exception as e:
        print('Something went wrong in convert(): ' + str(e))
        # Keep the reason in the log when there is no stderr to report.
        err = err or str(e)
        converted = 'no'
    return err, converted

def save_log(log):
    '''
    Appends the given log rows to the conversion log CSV.

    log -- list of rows, each ordered like the columns below

    The header row is written only when the CSV does not exist yet.
    Saving after each tar means earlier results survive a later failure.
    '''
    column_names = [
        'submission',
        'tarfile',
        'type',
        'extracted',
        'extracted_suffix',
        'converted',
        'conversion_result',
    ]
    rows_df = pd.DataFrame(log, columns=column_names)
    needs_header = not os.path.exists(csv)
    rows_df.to_csv(csv, mode='a', header=needs_header, index=False)

The main code:

In [None]:
def work(task):
    '''
    Defines work to be done for each tar: extract, convert and log every
    eligible submission, then move the tar to its 'processed' directory.
    task — filepath to tar

    Returns None. Rows collected so far are saved to the conversion log
    even if a submission raises; in that case the tar is not moved and
    the exception propagates.
    '''

    # Open it as read-only
    opened = open_tar(task)
    if opened is None:
        # open_tar() returns None for paths that are not readable tars.
        print('\nSkipping ' + str(task) + ': not a tar file')
        return
    tar, tar_name = opened
    print('\nOpening ' + tar_name + ',')

    log = []
    try:
        # For each submission (.gz or .pdf)
        for submission in tar.getmembers():
            if not submission.name.endswith(('.gz', '.pdf')):
                continue
            submission_id, submission_type = os.path.splitext(os.path.basename(submission.name))
            if do_not_convert(submission_id):
                continue
            print('Working on submission: ' + submission_id + '...')

            # Per-submission defaults, so a .pdf (or a failed extraction)
            # never logs values left over from the previous submission.
            result = None
            suffix = None
            extracted = 'no'
            converted = 'no'

            # If .pdf, skip, we cannot extract and will not convert here
            if submission_type != '.pdf':
                # Extract submission
                extracted, suffix = extract(tar, submission, extraction_path + submission_id)
                extracted_file = extraction_path + submission_id + suffix

                # Convert submission
                if extracted == 'yes':
                    result, converted = convert(extracted_file)

                # Remove whatever the extraction left behind: the extracted
                # file itself and/or a folder named after the member's
                # first path component inside the scratch directory.
                first_component = submission.name.split('/')[0]
                if '/' in submission.name and first_component not in ('', '.', '..'):
                    extracted_tar_path = extraction_path + first_component + '/'
                    if os.path.isdir(extracted_tar_path):
                        shutil.rmtree(extracted_tar_path)
                if os.path.isfile(extracted_file):
                    os.remove(extracted_file)

            # Log submission extraction & conversion info
            log.append([submission_id, tar_name, submission_type, extracted, suffix, converted, result])
    finally:
        tar.close()
        # Save even on failure so finished submissions are not lost.
        save_log(log)

    # After finishing tarfile, move it to 'processed' directory
    processed_dir = os.path.dirname(task) + '/processed/'
    os.makedirs(processed_dir, exist_ok=True)
    os.rename(task, processed_dir + tar_name + '.tar')

In [None]:
# Process the tars one at a time, in list order. work() logs each tar's
# submissions and then moves the tar into its 'processed' sub-directory.
for task in tasks:
    work(task)


Opening arXiv_src_0204_001,
Working on submission: astro-ph0204001...
Submission path: ../data/2020_03_03_original_arxiv_tars/arXiv_src_0204_001/astro-ph0204001
extracted
Converting to ../data/2020_03_09_extract_and_convert_submissions/converted_xml/astro-ph0204001.xml
converted
Working on submission: astro-ph0204002...
Submission path: ../data/2020_03_03_original_arxiv_tars/arXiv_src_0204_001/astro-ph0204002
TEX FILE!!!! EXTRACTING TO ../data/2020_03_09_extract_and_convert_submissions/temp_extractions/
OPENING ../data/2020_03_09_extract_and_convert_submissions/temp_extractions/0204/astro-ph0204002.gz
extracted tex: ../data/2020_03_09_extract_and_convert_submissions/temp_extractions/0204/astro-ph0204002.gz.tex
extracted
Converting to ../data/2020_03_09_extract_and_convert_submissions/converted_xml/astro-ph0204002.xml
converted
Working on submission: astro-ph0204003...
Submission path: ../data/2020_03_03_original_arxiv_tars/arXiv_src_0204_001/astro-ph0204003
extracted
Converting to ../

  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_w

extracted
Converting to ../data/2020_03_09_extract_and_convert_submissions/converted_xml/astro-ph0204006.xml
converted
Working on submission: astro-ph0204007...
Submission path: ../data/2020_03_03_original_arxiv_tars/arXiv_src_0204_001/astro-ph0204007
extracted
Converting to ../data/2020_03_09_extract_and_convert_submissions/converted_xml/astro-ph0204007.xml
converted
Working on submission: astro-ph0204008...
Submission path: ../data/2020_03_03_original_arxiv_tars/arXiv_src_0204_001/astro-ph0204008
extracted
Converting to ../data/2020_03_09_extract_and_convert_submissions/converted_xml/astro-ph0204008.xml
converted
Working on submission: astro-ph0204009...
Submission path: ../data/2020_03_03_original_arxiv_tars/arXiv_src_0204_001/astro-ph0204009
extracted
Converting to ../data/2020_03_09_extract_and_convert_submissions/converted_xml/astro-ph0204009.xml
converted
Working on submission: astro-ph0204010...
Submission path: ../data/2020_03_03_original_arxiv_tars/arXiv_src_0204_001/astro-ph

converted
Working on submission: astro-ph0204038...
Submission path: ../data/2020_03_03_original_arxiv_tars/arXiv_src_0204_001/astro-ph0204038
extracted
Converting to ../data/2020_03_09_extract_and_convert_submissions/converted_xml/astro-ph0204038.xml
converted
Working on submission: astro-ph0204039...
Submission path: ../data/2020_03_03_original_arxiv_tars/arXiv_src_0204_001/astro-ph0204039
extracted
Converting to ../data/2020_03_09_extract_and_convert_submissions/converted_xml/astro-ph0204039.xml
converted
Working on submission: astro-ph0204040...
Submission path: ../data/2020_03_03_original_arxiv_tars/arXiv_src_0204_001/astro-ph0204040
extracted
Converting to ../data/2020_03_09_extract_and_convert_submissions/converted_xml/astro-ph0204040.xml
converted
Working on submission: astro-ph0204041...
Submission path: ../data/2020_03_03_original_arxiv_tars/arXiv_src_0204_001/astro-ph0204041
extracted
Converting to ../data/2020_03_09_extract_and_convert_submissions/converted_xml/astro-ph0204

converted
Working on submission: astro-ph0204069...
Submission path: ../data/2020_03_03_original_arxiv_tars/arXiv_src_0204_001/astro-ph0204069
extracted
Converting to ../data/2020_03_09_extract_and_convert_submissions/converted_xml/astro-ph0204069.xml
converted
Working on submission: astro-ph0204070...
Submission path: ../data/2020_03_03_original_arxiv_tars/arXiv_src_0204_001/astro-ph0204070
extracted
Converting to ../data/2020_03_09_extract_and_convert_submissions/converted_xml/astro-ph0204070.xml
converted
Working on submission: astro-ph0204071...
Submission path: ../data/2020_03_03_original_arxiv_tars/arXiv_src_0204_001/astro-ph0204071
extracted
Converting to ../data/2020_03_09_extract_and_convert_submissions/converted_xml/astro-ph0204071.xml
converted
Working on submission: astro-ph0204072...
Submission path: ../data/2020_03_03_original_arxiv_tars/arXiv_src_0204_001/astro-ph0204072
extracted
Converting to ../data/2020_03_09_extract_and_convert_submissions/converted_xml/astro-ph0204

converted
Working on submission: astro-ph0204101...
Submission path: ../data/2020_03_03_original_arxiv_tars/arXiv_src_0204_001/astro-ph0204101
extracted
Converting to ../data/2020_03_09_extract_and_convert_submissions/converted_xml/astro-ph0204101.xml
converted
Working on submission: astro-ph0204102...
Submission path: ../data/2020_03_03_original_arxiv_tars/arXiv_src_0204_001/astro-ph0204102
extracted
Converting to ../data/2020_03_09_extract_and_convert_submissions/converted_xml/astro-ph0204102.xml
converted
Working on submission: astro-ph0204103...
Submission path: ../data/2020_03_03_original_arxiv_tars/arXiv_src_0204_001/astro-ph0204103
extracted
Converting to ../data/2020_03_09_extract_and_convert_submissions/converted_xml/astro-ph0204103.xml
converted
Working on submission: astro-ph0204104...
Submission path: ../data/2020_03_03_original_arxiv_tars/arXiv_src_0204_001/astro-ph0204104
extracted
Converting to ../data/2020_03_09_extract_and_convert_submissions/converted_xml/astro-ph0204

converted
Working on submission: astro-ph0204134...
Submission path: ../data/2020_03_03_original_arxiv_tars/arXiv_src_0204_001/astro-ph0204134
extracted
Converting to ../data/2020_03_09_extract_and_convert_submissions/converted_xml/astro-ph0204134.xml
converted
Working on submission: astro-ph0204135...
Submission path: ../data/2020_03_03_original_arxiv_tars/arXiv_src_0204_001/astro-ph0204135
extracted
Converting to ../data/2020_03_09_extract_and_convert_submissions/converted_xml/astro-ph0204135.xml
converted
Working on submission: astro-ph0204136...
Submission path: ../data/2020_03_03_original_arxiv_tars/arXiv_src_0204_001/astro-ph0204136
extracted
Converting to ../data/2020_03_09_extract_and_convert_submissions/converted_xml/astro-ph0204136.xml
converted
Working on submission: astro-ph0204137...
Submission path: ../data/2020_03_03_original_arxiv_tars/arXiv_src_0204_001/astro-ph0204137
extracted
Converting to ../data/2020_03_09_extract_and_convert_submissions/converted_xml/astro-ph0204

converted
Working on submission: astro-ph0204165...
Submission path: ../data/2020_03_03_original_arxiv_tars/arXiv_src_0204_001/astro-ph0204165
extracted
Converting to ../data/2020_03_09_extract_and_convert_submissions/converted_xml/astro-ph0204165.xml
converted
Working on submission: astro-ph0204166...
Submission path: ../data/2020_03_03_original_arxiv_tars/arXiv_src_0204_001/astro-ph0204166
extracted
Converting to ../data/2020_03_09_extract_and_convert_submissions/converted_xml/astro-ph0204166.xml
converted
Working on submission: astro-ph0204167...
Submission path: ../data/2020_03_03_original_arxiv_tars/arXiv_src_0204_001/astro-ph0204167
extracted
Converting to ../data/2020_03_09_extract_and_convert_submissions/converted_xml/astro-ph0204167.xml
converted
Working on submission: astro-ph0204168...
Submission path: ../data/2020_03_03_original_arxiv_tars/arXiv_src_0204_001/astro-ph0204168
extracted
Converting to ../data/2020_03_09_extract_and_convert_submissions/converted_xml/astro-ph0204

converted
Working on submission: astro-ph0204197...
Submission path: ../data/2020_03_03_original_arxiv_tars/arXiv_src_0204_001/astro-ph0204197
extracted
Converting to ../data/2020_03_09_extract_and_convert_submissions/converted_xml/astro-ph0204197.xml
converted
Working on submission: astro-ph0204198...
Submission path: ../data/2020_03_03_original_arxiv_tars/arXiv_src_0204_001/astro-ph0204198
extracted
Converting to ../data/2020_03_09_extract_and_convert_submissions/converted_xml/astro-ph0204198.xml
converted
Working on submission: astro-ph0204199...
Submission path: ../data/2020_03_03_original_arxiv_tars/arXiv_src_0204_001/astro-ph0204199
extracted
Converting to ../data/2020_03_09_extract_and_convert_submissions/converted_xml/astro-ph0204199.xml
converted
Working on submission: astro-ph0204200...
Submission path: ../data/2020_03_03_original_arxiv_tars/arXiv_src_0204_001/astro-ph0204200
extracted
Converting to ../data/2020_03_09_extract_and_convert_submissions/converted_xml/astro-ph0204

converted
Working on submission: astro-ph0204229...
Submission path: ../data/2020_03_03_original_arxiv_tars/arXiv_src_0204_001/astro-ph0204229
extracted
Converting to ../data/2020_03_09_extract_and_convert_submissions/converted_xml/astro-ph0204229.xml
converted
Working on submission: astro-ph0204230...
Submission path: ../data/2020_03_03_original_arxiv_tars/arXiv_src_0204_001/astro-ph0204230
TEX FILE!!!! EXTRACTING TO ../data/2020_03_09_extract_and_convert_submissions/temp_extractions/
OPENING ../data/2020_03_09_extract_and_convert_submissions/temp_extractions/0204/astro-ph0204230.gz
extracted tex: ../data/2020_03_09_extract_and_convert_submissions/temp_extractions/0204/astro-ph0204230.gz.tex
extracted
Converting to ../data/2020_03_09_extract_and_convert_submissions/converted_xml/astro-ph0204230.xml
converted
Working on submission: astro-ph0204231...
Submission path: ../data/2020_03_03_original_arxiv_tars/arXiv_src_0204_001/astro-ph0204231
extracted
Converting to ../data/2020_03_09_ext

extracted
Converting to ../data/2020_03_09_extract_and_convert_submissions/converted_xml/astro-ph0204259.xml
converted
Working on submission: astro-ph0204260...
Submission path: ../data/2020_03_03_original_arxiv_tars/arXiv_src_0204_001/astro-ph0204260
extracted
Converting to ../data/2020_03_09_extract_and_convert_submissions/converted_xml/astro-ph0204260.xml
converted
Working on submission: astro-ph0204261...
Submission path: ../data/2020_03_03_original_arxiv_tars/arXiv_src_0204_001/astro-ph0204261
extracted
Converting to ../data/2020_03_09_extract_and_convert_submissions/converted_xml/astro-ph0204261.xml
converted
Working on submission: astro-ph0204262...
Submission path: ../data/2020_03_03_original_arxiv_tars/arXiv_src_0204_001/astro-ph0204262
extracted
Converting to ../data/2020_03_09_extract_and_convert_submissions/converted_xml/astro-ph0204262.xml
converted
Working on submission: astro-ph0204263...
Submission path: ../data/2020_03_03_original_arxiv_tars/arXiv_src_0204_001/astro-ph

converted
Working on submission: astro-ph0204293...
Submission path: ../data/2020_03_03_original_arxiv_tars/arXiv_src_0204_001/astro-ph0204293
extracted
Converting to ../data/2020_03_09_extract_and_convert_submissions/converted_xml/astro-ph0204293.xml
converted
Working on submission: astro-ph0204294...
Submission path: ../data/2020_03_03_original_arxiv_tars/arXiv_src_0204_001/astro-ph0204294
extracted
Converting to ../data/2020_03_09_extract_and_convert_submissions/converted_xml/astro-ph0204294.xml
converted
Working on submission: astro-ph0204295...
Submission path: ../data/2020_03_03_original_arxiv_tars/arXiv_src_0204_001/astro-ph0204295
extracted
Converting to ../data/2020_03_09_extract_and_convert_submissions/converted_xml/astro-ph0204295.xml
converted
Working on submission: astro-ph0204296...
Submission path: ../data/2020_03_03_original_arxiv_tars/arXiv_src_0204_001/astro-ph0204296
extracted
Converting to ../data/2020_03_09_extract_and_convert_submissions/converted_xml/astro-ph0204

converted
Working on submission: astro-ph0204325...
Submission path: ../data/2020_03_03_original_arxiv_tars/arXiv_src_0204_001/astro-ph0204325
extracted
Converting to ../data/2020_03_09_extract_and_convert_submissions/converted_xml/astro-ph0204325.xml
converted
Working on submission: astro-ph0204326...
Submission path: ../data/2020_03_03_original_arxiv_tars/arXiv_src_0204_001/astro-ph0204326
extracted
Converting to ../data/2020_03_09_extract_and_convert_submissions/converted_xml/astro-ph0204326.xml
converted
Working on submission: astro-ph0204327...
Submission path: ../data/2020_03_03_original_arxiv_tars/arXiv_src_0204_001/astro-ph0204327
extracted
Converting to ../data/2020_03_09_extract_and_convert_submissions/converted_xml/astro-ph0204327.xml
converted
Working on submission: astro-ph0204328...
Submission path: ../data/2020_03_03_original_arxiv_tars/arXiv_src_0204_001/astro-ph0204328
extracted
Converting to ../data/2020_03_09_extract_and_convert_submissions/converted_xml/astro-ph0204

converted
Working on submission: astro-ph0204357...
Submission path: ../data/2020_03_03_original_arxiv_tars/arXiv_src_0204_001/astro-ph0204357
extracted
Converting to ../data/2020_03_09_extract_and_convert_submissions/converted_xml/astro-ph0204357.xml
converted
Working on submission: astro-ph0204358...
Submission path: ../data/2020_03_03_original_arxiv_tars/arXiv_src_0204_001/astro-ph0204358
extracted
Converting to ../data/2020_03_09_extract_and_convert_submissions/converted_xml/astro-ph0204358.xml
converted
Working on submission: astro-ph0204359...
Submission path: ../data/2020_03_03_original_arxiv_tars/arXiv_src_0204_001/astro-ph0204359
extracted
Converting to ../data/2020_03_09_extract_and_convert_submissions/converted_xml/astro-ph0204359.xml
converted
Working on submission: astro-ph0204360...
Submission path: ../data/2020_03_03_original_arxiv_tars/arXiv_src_0204_001/astro-ph0204360
extracted
Converting to ../data/2020_03_09_extract_and_convert_submissions/converted_xml/astro-ph0204

converted
Working on submission: astro-ph0204389...
Submission path: ../data/2020_03_03_original_arxiv_tars/arXiv_src_0204_001/astro-ph0204389
extracted
Converting to ../data/2020_03_09_extract_and_convert_submissions/converted_xml/astro-ph0204389.xml
converted
Working on submission: astro-ph0204390...
Submission path: ../data/2020_03_03_original_arxiv_tars/arXiv_src_0204_001/astro-ph0204390
extracted
Converting to ../data/2020_03_09_extract_and_convert_submissions/converted_xml/astro-ph0204390.xml
converted
Working on submission: astro-ph0204391...
Submission path: ../data/2020_03_03_original_arxiv_tars/arXiv_src_0204_001/astro-ph0204391
extracted
Converting to ../data/2020_03_09_extract_and_convert_submissions/converted_xml/astro-ph0204391.xml
converted
Working on submission: astro-ph0204392...
Submission path: ../data/2020_03_03_original_arxiv_tars/arXiv_src_0204_001/astro-ph0204392
extracted
Converting to ../data/2020_03_09_extract_and_convert_submissions/converted_xml/astro-ph0204

converted
Working on submission: astro-ph0204422...
Submission path: ../data/2020_03_03_original_arxiv_tars/arXiv_src_0204_001/astro-ph0204422
Working on submission: astro-ph0204423...
Submission path: ../data/2020_03_03_original_arxiv_tars/arXiv_src_0204_001/astro-ph0204423
extracted
Converting to ../data/2020_03_09_extract_and_convert_submissions/converted_xml/astro-ph0204423.xml
converted
Working on submission: astro-ph0204424...
Submission path: ../data/2020_03_03_original_arxiv_tars/arXiv_src_0204_001/astro-ph0204424
extracted
Converting to ../data/2020_03_09_extract_and_convert_submissions/converted_xml/astro-ph0204424.xml
converted
Working on submission: astro-ph0204425...
Submission path: ../data/2020_03_03_original_arxiv_tars/arXiv_src_0204_001/astro-ph0204425
extracted
Converting to ../data/2020_03_09_extract_and_convert_submissions/converted_xml/astro-ph0204425.xml
converted
Working on submission: astro-ph0204426...
Submission path: ../data/2020_03_03_original_arxiv_tars/arX

converted
Working on submission: astro-ph0204456...
Submission path: ../data/2020_03_03_original_arxiv_tars/arXiv_src_0204_001/astro-ph0204456
extracted
Converting to ../data/2020_03_09_extract_and_convert_submissions/converted_xml/astro-ph0204456.xml
converted
Working on submission: astro-ph0204457...
Submission path: ../data/2020_03_03_original_arxiv_tars/arXiv_src_0204_001/astro-ph0204457
extracted
Converting to ../data/2020_03_09_extract_and_convert_submissions/converted_xml/astro-ph0204457.xml
converted
Working on submission: astro-ph0204458...
Submission path: ../data/2020_03_03_original_arxiv_tars/arXiv_src_0204_001/astro-ph0204458
extracted
Converting to ../data/2020_03_09_extract_and_convert_submissions/converted_xml/astro-ph0204458.xml
converted
Working on submission: astro-ph0204459...
Submission path: ../data/2020_03_03_original_arxiv_tars/arXiv_src_0204_001/astro-ph0204459
extracted
Converting to ../data/2020_03_09_extract_and_convert_submissions/converted_xml/astro-ph0204

converted
Working on submission: astro-ph0204485...
Submission path: ../data/2020_03_03_original_arxiv_tars/arXiv_src_0204_001/astro-ph0204485
TEX FILE!!!! EXTRACTING TO ../data/2020_03_09_extract_and_convert_submissions/temp_extractions/
OPENING ../data/2020_03_09_extract_and_convert_submissions/temp_extractions/0204/astro-ph0204485.gz
extracted tex: ../data/2020_03_09_extract_and_convert_submissions/temp_extractions/0204/astro-ph0204485.gz.tex
extracted
Converting to ../data/2020_03_09_extract_and_convert_submissions/converted_xml/astro-ph0204485.xml
converted
Working on submission: astro-ph0204486...
Submission path: ../data/2020_03_03_original_arxiv_tars/arXiv_src_0204_001/astro-ph0204486
extracted
Converting to ../data/2020_03_09_extract_and_convert_submissions/converted_xml/astro-ph0204486.xml
converted
Working on submission: astro-ph0204487...
Submission path: ../data/2020_03_03_original_arxiv_tars/arXiv_src_0204_001/astro-ph0204487
extracted
Converting to ../data/2020_03_09_ext

converted
Working on submission: astro-ph0204516...
Submission path: ../data/2020_03_03_original_arxiv_tars/arXiv_src_0204_001/astro-ph0204516
extracted
Converting to ../data/2020_03_09_extract_and_convert_submissions/converted_xml/astro-ph0204516.xml
converted
Working on submission: astro-ph0204517...
Submission path: ../data/2020_03_03_original_arxiv_tars/arXiv_src_0204_001/astro-ph0204517
TEX FILE!!!! EXTRACTING TO ../data/2020_03_09_extract_and_convert_submissions/temp_extractions/
OPENING ../data/2020_03_09_extract_and_convert_submissions/temp_extractions/0204/astro-ph0204517.gz
extracted tex: ../data/2020_03_09_extract_and_convert_submissions/temp_extractions/0204/astro-ph0204517.gz.tex
extracted
Converting to ../data/2020_03_09_extract_and_convert_submissions/converted_xml/astro-ph0204517.xml
converted
Working on submission: astro-ph0204518...
Submission path: ../data/2020_03_03_original_arxiv_tars/arXiv_src_0204_001/astro-ph0204518
extracted
Converting to ../data/2020_03_09_ext

converted
Working on submission: gr-qc0204065...
Submission path: ../data/2020_03_03_original_arxiv_tars/arXiv_src_0204_001/gr-qc0204065
extracted
Converting to ../data/2020_03_09_extract_and_convert_submissions/converted_xml/gr-qc0204065.xml
converted
Working on submission: gr-qc0204073...
Submission path: ../data/2020_03_03_original_arxiv_tars/arXiv_src_0204_001/gr-qc0204073
extracted
Converting to ../data/2020_03_09_extract_and_convert_submissions/converted_xml/gr-qc0204073.xml
converted
Working on submission: gr-qc0204074...
Submission path: ../data/2020_03_03_original_arxiv_tars/arXiv_src_0204_001/gr-qc0204074
TEX FILE!!!! EXTRACTING TO ../data/2020_03_09_extract_and_convert_submissions/temp_extractions/
OPENING ../data/2020_03_09_extract_and_convert_submissions/temp_extractions/0204/gr-qc0204074.gz
extracted tex: ../data/2020_03_09_extract_and_convert_submissions/temp_extractions/0204/gr-qc0204074.gz.tex
extracted
Converting to ../data/2020_03_09_extract_and_convert_submissions/c

converted
Working on submission: hep-ph0204257...
Submission path: ../data/2020_03_03_original_arxiv_tars/arXiv_src_0204_001/hep-ph0204257
TEX FILE!!!! EXTRACTING TO ../data/2020_03_09_extract_and_convert_submissions/temp_extractions/
OPENING ../data/2020_03_09_extract_and_convert_submissions/temp_extractions/0204/hep-ph0204257.gz
extracted tex: ../data/2020_03_09_extract_and_convert_submissions/temp_extractions/0204/hep-ph0204257.gz.tex
extracted
Converting to ../data/2020_03_09_extract_and_convert_submissions/converted_xml/hep-ph0204257.xml
converted
Working on submission: hep-ph0204258...
Submission path: ../data/2020_03_03_original_arxiv_tars/arXiv_src_0204_001/hep-ph0204258
TEX FILE!!!! EXTRACTING TO ../data/2020_03_09_extract_and_convert_submissions/temp_extractions/
OPENING ../data/2020_03_09_extract_and_convert_submissions/temp_extractions/0204/hep-ph0204258.gz
extracted tex: ../data/2020_03_09_extract_and_convert_submissions/temp_extractions/0204/hep-ph0204258.gz.tex
extracted

## 3. Analyze

**TODO:** Move the analysis code here from Untitled.ipynb, then delete Untitled.ipynb.

## 4. Appendix: Retrieve individual preprints from arXiv or from the associated tar file

This helped: https://jarrodmcclean.com/simple-bash-parallel-commands-in-python/

First, I need to confirm the main file in each submission folder.
- If a submission doesn't contain a .bbl file, I need to add it to the bbl_lack folder. For now, I set it aside and skip it.
- If a submission doesn't contain any file, I need to retrieve it again. For now, I set it aside and skip it.

For each submission folder, I check the xml folder for a file with the submission's name. If there is none, I go through the files in the submission folder and check whether each one contains `\documentclass`. The first file that does is taken as the main file and converted, and the loop stops there.

In [165]:
def guess_extension_from_headers(h):
    """
    Map the headers of an arXiv e-print response to a file extension.

    `h` is a mapping with a `.get` method (e.g. the response headers).
    Returns the extension as a string, or None when the combination of
    content-type and content-encoding is not recognised.
    Based on: https://arxiv.org/help/mimetypes
    """
    content_type = h.get('content-type')
    declared_gzip = h.get('content-encoding') == 'x-gzip'

    # Types recognised whatever the declared encoding is. For x-eprint the
    # content-encoding is x-gzip but this appears to normally be a lie -
    # it's just plain text.
    encoding_independent = {
        'application/pdf': '.pdf',
        'application/x-eprint': '.tex',
    }
    # Types recognised only when the response declares x-gzip encoding.
    gzip_only = {
        'application/postscript': '.ps.gz',
        'application/x-eprint-tar': '.tar.gz',
        'application/x-dvi': '.dvi.gz',
    }

    if content_type in encoding_independent:
        return encoding_independent[content_type]
    if declared_gzip:
        return gzip_only.get(content_type)
    return None

def arxiv_id_to_source_url(arxiv_id):
    """
    Return the arXiv /e-print/ URL for the given identifier.

    This URL is normally a tarball, but sometimes something else. arXiv
    also provides a /src/ URL which always serves up a tarball, but with
    that one the file would have to be untarred to find out whether it is
    renderable. With /e-print/ we can tell straight away whether it is
    worth rendering. https://arxiv.org/help/mimetypes has more info.
    """
    eprint_base = 'https://arxiv.org/e-print/'
    return eprint_base + arxiv_id

def download_source_file(arxiv_id):
    """
    Download the source of this paper from arXiv and write it to disk.

    The response body is saved, relative to the current working
    directory, as `arxiv_id` plus the extension guessed from the response
    headers (e.g. '1010.3382.tar.gz'). Nothing is returned.

    Raises whatever `raise_for_status()` raises for a failed request, and
    DownloadError when no extension can be guessed from the headers.
    """
    # NOTE(review): `requests` and `DownloadError` must already be in scope;
    # neither appears in the import cell at the top of this notebook -- confirm.
    source_url = arxiv_id_to_source_url(arxiv_id)
    res = requests.get(source_url)
    res.raise_for_status()
    extension = guess_extension_from_headers(res.headers)
    if not extension:
        raise DownloadError("Could not determine file extension from "
                            "headers: Content-Type: {}; "
                            "Content-Encoding: {}".format(
                                res.headers.get('content-type'),
                                res.headers.get('content-encoding')))
    out_path = arxiv_id + extension
    # An id that contains a slash (presumably old-style ids such as
    # 'astro-ph/0204001') puts the file in a sub-directory; create it so
    # open() does not fail on a missing folder.
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(out_path, 'wb') as f:
        f.write(res.content)
    print('Created ' + out_path)

# Example: fetch the source of arXiv:1010.3382 into the working directory.
download_source_file('1010.3382')

Created 1010.3382.tar.gz
