In [1]:
import requests
import time
import sys
import pandas as pd
from grobid_client.grobid_client import GrobidClient

# Download papers

A script containing this exact code is also available (`download.py`) if you need to run this in the background (e.g. using tmux)

In [14]:
# CSV with papers to download
csv_path = 'math_papers.csv'

# path to folder where the papers should be saved (!!! must add '/' at the end)
download_path = './papers/math/'

# number of papers to be downloaded
# !!! this does not reflect the actual number of papers that will be downloaded, as each paper has more than one version (at least 2 * download_size will be downloaded)
download_size = 4000

# a CSV with the papers that the script attempted to downloaded will be saved here
# this may be useful if the script fails and you are not sure what it was trying to download to begin with
to_download_csv_path = './papers/to_download_math_papers.csv'

# a CSV with the papers that were downloaded will be saved here
downloaded_csv_path = './papers/downloaded_math_papers.csv'

In [15]:
papers = pd.read_csv(csv_path, parse_dates = ['date'], dtype = {'id': str})

In [16]:
def download(paper):
    # download all versions of `paper`
    for i in range(1, paper['total_versions'] + 1):
        # some papers have '/' in their id
        # this does not work well with paths, so we replace '/' with '_'
        idx = paper['id'].replace('/', '_')
        
        # the papers are saved as a Pdf that may look like `1811.11745v1.pdf`
        name = idx + 'v' + str(i) + '.pdf'
        
        # the paper is requested from `export.arxiv.org` which is a domain dedicated to bots
        url = 'https://export.arxiv.org/pdf/' + name
        
        response = requests.get(url)
        with open(download_path + name, 'wb') as f:
            f.write(response.content)
            
        # a hard time limit between downloads is imposed so as to not put too much pressure on the Arxiv servers
        time.sleep(0.5) 
    downloaded_papers_id.append(paper['id'])
    
    # for debugging purposes
    if len(downloaded_papers_id) % 10 == 0:
        print('Downloaded: ', len(downloaded_papers_id))

In [19]:
downloaded_papers_id = []
to_download_papers = papers.drop_duplicates('id').sample(frac = 1).head(download_size)
to_download_papers.to_csv(to_download_csv_path, index=False)

# download papers
to_download_papers.apply(download, axis = 1)

papers.query('id in @downloaded_papers_id').reset_index(drop=True).to_csv(downloaded_csv_path, index=False)

Downloaded:  10
Downloaded:  20
Downloaded:  30
Downloaded:  40
Downloaded:  50
Downloaded:  60
Downloaded:  70
Downloaded:  80
Downloaded:  90
Downloaded:  100


305714    None
51197     None
342155    None
532613    None
374550    None
          ... 
517657    None
506922    None
218654    None
378231    None
447170    None
Length: 100, dtype: object

# Process papers using Grobid

A script containing this exact code is also available (`process_to_tei.py`) if you need to run this in the background (e.g. using tmux)

In [None]:
# folder with input papers (!!! must add '/' at the end)
input_path = './papers/math/'
# folder where the output XML files will be saved (!!! must add '/' at the end)
output_path = './papers/math_tei/'

In [None]:
# you may need to change the config based on where you host Grobid, the number of threads available etc. (https://grobid.readthedocs.io/en/latest/) 
client = GrobidClient(config_path='./grobid_python_config.json')

In [None]:
# !!! this will reprocess any file that is already in `output_path` (bacause of `force=True`)
# you may want to change `n` based on the number of threads available on the server running Grobid (https://grobid.readthedocs.io/en/latest/) 
client.process('processFulltextDocument', input_path, output=output_path, consolidate_citations=False, tei_coordinates=False, force=True, n=7)

In [None]:
# after running the previoud block, you may want to run this as well
# if some papers where nor processed by Grobid, this will try to do so again (without rewriting already existing XML files)
# you may want to change `n` based on the number of threads available on the server running Grobid (https://grobid.readthedocs.io/en/latest/) 
client.process('processFulltextDocument', input_path, output=output_path, consolidate_citations=False, tei_coordinates=False, force=False, n=7)