# Building a dataset of pdf research papers
### Part 1. Collecting urls of pdf files from Arvix

In [1]:
import sys
import io
import os
import time
import urllib
import pickle
import csv
import numpy as np
from multiprocessing.dummy import Pool as ThreadPool

nof_pdf_per_query = 30


In [2]:
print ('reading arxiv categories ...')

catlist = []
with io.open('ArxivSubjectCategory.csv', newline='\n') as csvfile:
    catreader = csv.reader(csvfile, delimiter=',', quotechar='"')
    catlist = [row[0] for row in catreader]

print (len(catlist), 'categories read.')
print ('sample:', np.random.choice(catlist), ',', np.random.choice(catlist), ',', np.random.choice(catlist))

reading arxiv categories ...
153 categories read.
sample: stat.CO , math.MG , math.ST


In [3]:
print ('generating queries ...')
q = 'http://export.arxiv.org/api/query?search_query=cat:{cat}&all:{yr}&start=0&max_results={n}'
yrlist = list(range(2010, 2019)) # from 2010 to 2018
qrylist = []

for yr in yrlist:
    for cat in catlist:
        qry = q.format(cat=cat, yr=yr, n=nof_pdf_per_query)
        qrylist.append(qry)
        
print (len(yrlist), 'years, ', len(catlist), 'categories, ', 'total', len(qrylist), 'queries.')
print ('sample:', np.random.choice(qrylist))

generating queries ...
9 years,  153 categories,  total 1377 queries.
sample: http://export.arxiv.org/api/query?search_query=cat:math.OA&all:2015&start=0&max_results=30


In [4]:
# for reporting times
def t_print(t, msg):
    te = time.time()-t
    print (msg, 'in %.2fs' % te)
    return time.time()

# make 1 query, return an empty {} if error
def make_query(query, data={}):
    try:
        data[query] = urllib.request.urlopen(query).read()
    except:
        print ('urlopen error, returning empty{}')
    return data

# read data{} from pickle, otherwise create a new dict
def load_pickle(path):
    try:
        with open(path, 'rb') as f:
            data = pickle.load(f)
    except FileNotFoundError:
        print ('file not found, returning empty{}')
        data = {}
    return data

# dump data{} into pickle
def dump_pickle(data, path):
    pickle.dump(data, open(path, 'wb'))

# dump data{} into pickle
def check_dump_pickle(data, path, force=False):
    if not os.path.isfile(path) or force:
        pickle.dump(data, open(path, 'wb'))

# load or make 1 query, then save to pickle
def query_and_store(query, pickle_fpath='data.pickle', force=False):
    data = load_pickle(pickle_fpath)

    if query not in data:
        data[query] = make_query(query)
        dump_pickle(data, pickle_fpath)
    
    return data[query]

# run a function in multi-threads, reporting progress
def run_multi_thread(function, iterables, nof_thread=10):
    pool = ThreadPool(nof_thread)
    results = pool.imap(function, iterables)
    
    output = []
    for i, result in enumerate(results):
        output.append(result)
        sys.stdout.write('\rprogress: {0}/{1}'.format(i, len(iterables)))
    print ('')
    
    pool.close()
    pool.join()
    return output
    
# load or make a list of queries, then save to pickle
def qrylist_and_store(qrylist, pickle_fpath='data.pickle', force=False, nof_thread=10):
    
    #init a timer
    t = time.time()
    
    # read data(dict) from pickle, otherwise create a new dict
    data = load_pickle(pickle_fpath)
    t = t_print(t, 'finish load_pickle')
    
    # find query which is not in data
    todo_qrylist = list(set(qrylist) - set(data)) if not force  else qrylist.copy()
    
    # make query for those which are not found
    if todo_qrylist:
        
        # make query in multi-threads
        results = run_multi_thread(make_query, todo_qrylist, nof_thread=nof_thread)
        t = t_print(t, 'finish http requests')
        
        for i, result in enumerate(results):
            q = todo_qrylist[i]
            data[q] = result[q]
        if len(results) == 0: print ('ERROR: len(results) == 0')
        
        # save data to pickle
        dump_pickle(data, pickle_fpath)
        t = t_print(t, 'finish dump_pickle')
        
        print ('%d query(s) finished.' % len(todo_qrylist))
        
    else:
        print ('data found in "%s", no query is needed.' % pickle_fpath)
        
    return {qry : data[qry] for qry in qrylist}


In [5]:
print ('making queries thru arvix api ...')

data = qrylist_and_store(qrylist, pickle_fpath='xmls.pickle', nof_thread=100)
xmllist = list(data.values())

print ('sample:', np.random.choice(xmllist)[:1000], '...')

making queries thru arvix api ...
finish load_pickle in 0.13s
data found in "xmls.pickle", no query is needed.
sample: b'<?xml version="1.0" encoding="UTF-8"?>\n<feed xmlns="http://www.w3.org/2005/Atom">\n  <link href="http://arxiv.org/api/query?search_query%3Dcat%3Acs.SY%26id_list%3D%26start%3D0%26max_results%3D30" rel="self" type="application/atom+xml"/>\n  <title type="html">ArXiv Query: search_query=cat:cs.SY&amp;id_list=&amp;start=0&amp;max_results=30</title>\n  <id>http://arxiv.org/api/+Y8zcWCKl+jPl+pMStoevWOjxz4</id>\n  <updated>2018-10-29T00:00:00-04:00</updated>\n  <opensearch:totalResults xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">7325</opensearch:totalResults>\n  <opensearch:startIndex xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">0</opensearch:startIndex>\n  <opensearch:itemsPerPage xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">30</opensearch:itemsPerPage>\n  <entry>\n    <id>http://arxiv.org/abs/1008.3222v1</id>\n    <updated>2010-08-19T06:

### Part 2. Retrieving pdf files

In [6]:
import feedparser

required_field = ['id', 'title', 'authors', 'link', 'updated'] # these fields will be keys of the papers[i]{}


In [7]:
print ('parsing xmls (queried results) ...')

force = False
feeds = load_pickle('feeds.pickle')
if not(feeds) or force:
    feeds = run_multi_thread(feedparser.parse, xmllist, nof_thread=100)
    dump_pickle(feeds, 'feeds.pickle')

print ('sample:', np.random.choice(feeds).feed, '...')

parsing xmls (queried results) ...
sample: {'links': [{'href': 'http://arxiv.org/api/query?search_query%3Dcat%3Acs.CG%26id_list%3D%26start%3D0%26max_results%3D30', 'rel': 'self', 'type': 'application/atom+xml'}], 'title': 'ArXiv Query: search_query=cat:cs.CG&id_list=&start=0&max_results=30', 'title_detail': {'type': 'text/html', 'language': None, 'base': '', 'value': 'ArXiv Query: search_query=cat:cs.CG&id_list=&start=0&max_results=30'}, 'id': 'http://arxiv.org/api/pm+iSSbgjZh4m3YpDS+SVzlMyt8', 'guidislink': True, 'link': 'http://arxiv.org/api/pm+iSSbgjZh4m3YpDS+SVzlMyt8', 'updated': '2018-10-29T00:00:00-04:00', 'updated_parsed': time.struct_time(tm_year=2018, tm_mon=10, tm_mday=29, tm_hour=4, tm_min=0, tm_sec=0, tm_wday=0, tm_yday=302, tm_isdst=0), 'opensearch_totalresults': '3580', 'opensearch_startindex': '0', 'opensearch_itemsperpage': '30'} ...


In [8]:
print ('converting feeds into papers(a list of dict containing required fields) ...')

def id_to_pdfurl(arxiv_id):
    return arxiv_id.replace('abs', 'pdf') + '.pdf'

def id_to_filename(arxiv_id, ext='pdf'):
    parts = arxiv_id.split('/')
    dstdir = 'dataset/'
    return dstdir + '_'.join(parts[parts.index('abs')+1:]) + '.' + ext

papers = {}
for feed in feeds:
    for entry in feed.entries:
        paper = {}
        for i, k in enumerate(entry):
            if k in required_field:
                paper[k] = entry[k]
                
        paper['filename'] = id_to_filename(paper['id'])
        paper['xml'] = id_to_filename(paper['id'], 'xml')
        paper['info'] = id_to_filename(paper['id'], 'info.pickle')
        paper['pdfurl'] = id_to_pdfurl(paper['id'])
        
        if paper['id'] not in papers:
            papers[paper['id']] = paper
            
papers = list(papers.values())

print (len(papers), 'papers read.')
i = np.random.randint(1, len(papers)) - 1
print ('sample:', papers[i])
print ('pdf url: ', papers[i]['pdfurl'])
print ('local filename: ', papers[i]['filename'])
print ('local xml: ', papers[i]['xml'])
print ('local info.pickle:', papers[i]['info'])

converting feeds into papers(a list of dict containing required fields) ...
4500 papers read.
sample: {'id': 'http://arxiv.org/abs/cs/0404008v2', 'link': 'http://arxiv.org/abs/cs/0404008v2', 'updated': '2004-04-19T15:08:11Z', 'title': 'Efficient dot product over word-size finite fields', 'authors': [{'name': 'Jean-Guillaume Dumas'}], 'filename': 'dataset/cs_0404008v2.pdf', 'xml': 'dataset/cs_0404008v2.xml', 'info': 'dataset/cs_0404008v2.info.pickle', 'pdfurl': 'http://arxiv.org/pdf/cs/0404008v2.pdf'}
pdf url:  http://arxiv.org/pdf/cs/0404008v2.pdf
local filename:  dataset/cs_0404008v2.pdf
local xml:  dataset/cs_0404008v2.xml
local info.pickle: dataset/cs_0404008v2.info.pickle


In [43]:
from shutil import copyfile

def try_urlretrieve(url, local_fpath, retry_after_s=0, retry_times=3):
    for i in range(retry_times):
        try:
            return urllib.request.urlretrieve (url, local_fpath)
        except urllib.error.HTTPError as e:
            print ('error in retrieving %s . re-trying %d ...' % (url, i))
            time.sleep(retry_after_s)
            continue
    return False, set()

def download_file(url_localpath_tuple, retry_after_s=0, retry_times=5):
    url, local_fpath = url_localpath_tuple
    # print ('downloading %s to %s ...' % (url, local_fpath))
    # saved_fpath, header = urllib.request.urlretrieve (url, local_fpath)
    saved_fpath, header = try_urlretrieve (url, local_fpath, retry_after_s=retry_after_s, retry_times=retry_times)
    
    while ('text/html' in header['content-type']) and (retry_times >= 1):
        f = open(saved_fpath, 'r')
        fstr = f.read()
        if 'reload this URL' in fstr:
            # print ('re-try after %d s ...' % retry_after_s)
            # saved_fpath, header = urllib.request.urlretrieve (url, local_fpath)
            time.sleep(retry_after_s)
            saved_fpath, header = try_urlretrieve (url, local_fpath, retry_after_s=retry_after_s, retry_times=retry_times)
            retry_times -= 1
        else:
            if 'No PDF' in fstr: copyfile(local_fpath, local_fpath+'.no')
            retry_times = 0
        f.close()
                
    return ('application/pdf' in header['content-type'])


In [46]:
print ('downloading pdfs ...')

url_localpath_tuples = [(p['pdfurl'], p['filename']) for p in papers if not os.path.isfile(p['filename'])]
print ('files to be downloaded: ', len(url_localpath_tuples))
results = run_multi_thread(download_file, url_localpath_tuples, nof_thread=10)
print ('finished')

downloading pdfs ...
files to be downloaded:  3362
progress: 3/3362error in retrieving http://arxiv.org/pdf/cs/0312004v1.pdf . re-trying 0 ...
progress: 163/3362error in retrieving http://arxiv.org/pdf/cs/0703119v2.pdf . re-trying 0 ...
progress: 171/3362error in retrieving http://arxiv.org/pdf/0806.4286v1.pdf . re-trying 0 ...
progress: 173/3362error in retrieving http://arxiv.org/pdf/cs/0402049v1.pdf . re-trying 0 ...
error in retrieving http://arxiv.org/pdf/cs/0402050v1.pdf . re-trying 0 ...
progress: 256/3362error in retrieving http://arxiv.org/pdf/cs/0207019v1.pdf . re-trying 0 ...
progress: 292/3362error in retrieving http://arxiv.org/pdf/cs/0103014v1.pdf . re-trying 0 ...
error in retrieving http://arxiv.org/pdf/cs/0205062v1.pdf . re-trying 0 ...
progress: 298/3362error in retrieving http://arxiv.org/pdf/cs/0205062v1.pdf . re-trying 1 ...
progress: 314/3362error in retrieving http://arxiv.org/pdf/0805.3897v1.pdf . re-trying 0 ...
progress: 340/3362error in retrieving http://arxi

TypeError: 'set' object is not subscriptable

error in retrieving http://arxiv.org/pdf/hep-ex/9405011v1.pdf . re-trying 1 ...
error in retrieving http://arxiv.org/pdf/hep-ex/9405006v2.pdf . re-trying 2 ...
error in retrieving http://arxiv.org/pdf/hep-ex/9405008v1.pdf . re-trying 2 ...
error in retrieving http://arxiv.org/pdf/gr-qc/9211001v2.pdf . re-trying 3 ...
error in retrieving http://arxiv.org/pdf/gr-qc/9210020v3.pdf . re-trying 4 ...
error in retrieving http://arxiv.org/pdf/gr-qc/9211002v1.pdf . re-trying 3 ...
error in retrieving http://arxiv.org/pdf/gr-qc/9211005v1.pdf . re-trying 3 ...
error in retrieving http://arxiv.org/pdf/hep-ex/9405002v1.pdf . re-trying 3 ...
error in retrieving http://arxiv.org/pdf/hep-ex/9405003v1.pdf . re-trying 3 ...
error in retrieving http://arxiv.org/pdf/hep-ex/9406001v1.pdf . re-trying 0 ...
error in retrieving http://arxiv.org/pdf/hep-ex/9405006v2.pdf . re-trying 3 ...
error in retrieving http://arxiv.org/pdf/hep-ex/9405011v1.pdf . re-trying 2 ...
error in retrieving http://arxiv.org/pdf/hep

error in retrieving http://arxiv.org/pdf/hep-ex/9408003v2.pdf . re-trying 4 ...
error in retrieving http://arxiv.org/pdf/hep-ex/9409001v1.pdf . re-trying 3 ...
error in retrieving http://arxiv.org/pdf/hep-ex/9409002v1.pdf . re-trying 3 ...
error in retrieving http://arxiv.org/pdf/hep-ex/9409004v1.pdf . re-trying 3 ...
error in retrieving http://arxiv.org/pdf/hep-ex/9409003v1.pdf . re-trying 3 ...
error in retrieving http://arxiv.org/pdf/hep-ex/9409005v1.pdf . re-trying 3 ...
error in retrieving http://arxiv.org/pdf/hep-ex/9410002v1.pdf . re-trying 0 ...
error in retrieving http://arxiv.org/pdf/hep-ex/9409006v1.pdf . re-trying 3 ...
error in retrieving http://arxiv.org/pdf/hep-ex/9409008v2.pdf . re-trying 2 ...
error in retrieving http://arxiv.org/pdf/hep-ex/9409007v1.pdf . re-trying 3 ...
error in retrieving http://arxiv.org/pdf/hep-ex/9409001v1.pdf . re-trying 4 ...
error in retrieving http://arxiv.org/pdf/hep-ex/9410003v3.pdf . re-trying 0 ...
error in retrieving http://arxiv.org/pdf

error in retrieving http://arxiv.org/pdf/hep-lat/9201006v1.pdf . re-trying 3 ...
error in retrieving http://arxiv.org/pdf/hep-lat/9201005v1.pdf . re-trying 3 ...
error in retrieving http://arxiv.org/pdf/hep-lat/9201007v1.pdf . re-trying 3 ...
error in retrieving http://arxiv.org/pdf/hep-lat/9203002v1.pdf . re-trying 0 ...
error in retrieving http://arxiv.org/pdf/hep-lat/9202001v1.pdf . re-trying 3 ...
error in retrieving http://arxiv.org/pdf/hep-lat/9202004v1.pdf . re-trying 2 ...error in retrieving http://arxiv.org/pdf/hep-lat/9202002v1.pdf . re-trying 3 ...

error in retrieving http://arxiv.org/pdf/hep-lat/9202003v1.pdf . re-trying 3 ...
error in retrieving http://arxiv.org/pdf/hep-lat/9204001v1.pdf . re-trying 0 ...
error in retrieving http://arxiv.org/pdf/hep-lat/9201004v1.pdf . re-trying 4 ...
error in retrieving http://arxiv.org/pdf/hep-lat/9201006v1.pdf . re-trying 4 ...
error in retrieving http://arxiv.org/pdf/hep-lat/9201005v1.pdf . re-trying 4 ...
error in retrieving http://a

error in retrieving http://arxiv.org/pdf/hep-ph/9203201v1.pdf . re-trying 3 ...
error in retrieving http://arxiv.org/pdf/hep-ph/9203205v1.pdf . re-trying 2 ...
error in retrieving http://arxiv.org/pdf/hep-ph/9203203v1.pdf . re-trying 3 ...
error in retrieving http://arxiv.org/pdf/hep-ph/9203202v1.pdf . re-trying 3 ...
error in retrieving http://arxiv.org/pdf/hep-ph/9203206v1.pdf . re-trying 0 ...
error in retrieving http://arxiv.org/pdf/hep-lat/9204012v1.pdf . re-trying 4 ...
error in retrieving http://arxiv.org/pdf/hep-ph/9203204v1.pdf . re-trying 3 ...
error in retrieving http://arxiv.org/pdf/hep-ph/9203207v1.pdf . re-trying 0 ...
error in retrieving http://arxiv.org/pdf/hep-lat/9204013v1.pdf . re-trying 4 ...
error in retrieving http://arxiv.org/pdf/hep-lat/9204014v1.pdf . re-trying 4 ...
error in retrieving http://arxiv.org/pdf/hep-ph/9203201v1.pdf . re-trying 4 ...
error in retrieving http://arxiv.org/pdf/hep-ph/9203205v1.pdf . re-trying 3 ...
error in retrieving http://arxiv.org/

error in retrieving http://arxiv.org/pdf/hep-ph/9203218v2.pdf . re-trying 4 ...
error in retrieving http://arxiv.org/pdf/hep-ph/9203225v1.pdf . re-trying 3 ...
error in retrieving http://arxiv.org/pdf/hep-ph/9204202v1.pdf . re-trying 0 ...
error in retrieving http://arxiv.org/pdf/hep-ph/9203219v1.pdf . re-trying 4 ...
error in retrieving http://arxiv.org/pdf/hep-ph/9203220v1.pdf . re-trying 4 ...
error in retrieving http://arxiv.org/pdf/hep-ph/9203222v1.pdf . re-trying 4 ...
error in retrieving http://arxiv.org/pdf/hep-ph/9203223v1.pdf . re-trying 4 ...
error in retrieving http://arxiv.org/pdf/hep-ph/9204201v1.pdf . re-trying 1 ...
error in retrieving http://arxiv.org/pdf/hep-ph/9203224v1.pdf . re-trying 4 ...
error in retrieving http://arxiv.org/pdf/hep-ph/9203226v2.pdf . re-trying 3 ...
error in retrieving http://arxiv.org/pdf/hep-ph/9204203v1.pdf . re-trying 0 ...
error in retrieving http://arxiv.org/pdf/hep-ph/9203225v1.pdf . re-trying 4 ...
error in retrieving http://arxiv.org/pdf

error in retrieving http://arxiv.org/pdf/hep-th/9108009v1.pdf . re-trying 4 ...
error in retrieving http://arxiv.org/pdf/hep-th/9108010v1.pdf . re-trying 4 ...
error in retrieving http://arxiv.org/pdf/hep-th/9108011v1.pdf . re-trying 4 ...
error in retrieving http://arxiv.org/pdf/hep-th/9108016v1.pdf . re-trying 1 ...
error in retrieving http://arxiv.org/pdf/hep-th/9108012v1.pdf . re-trying 4 ...
error in retrieving http://arxiv.org/pdf/hep-th/9108015v1.pdf . re-trying 3 ...
error in retrieving http://arxiv.org/pdf/hep-th/9108013v1.pdf . re-trying 4 ...
error in retrieving http://arxiv.org/pdf/hep-th/9108014v1.pdf . re-trying 4 ...
error in retrieving http://arxiv.org/pdf/hep-th/9108018v1.pdf . re-trying 0 ...
error in retrieving http://arxiv.org/pdf/hep-th/9108017v1.pdf . re-trying 1 ...
error in retrieving http://arxiv.org/pdf/hep-th/9108019v2.pdf . re-trying 0 ...
error in retrieving http://arxiv.org/pdf/hep-th/9108020v1.pdf . re-trying 0 ...
error in retrieving http://arxiv.org/pdf

error in retrieving http://arxiv.org/pdf/math/9412210v1.pdf . re-trying 1 ...
error in retrieving http://arxiv.org/pdf/math/9403204v1.pdf . re-trying 4 ...
error in retrieving http://arxiv.org/pdf/math/9411209v1.pdf . re-trying 3 ...
error in retrieving http://arxiv.org/pdf/math/9409208v1.pdf . re-trying 4 ...
error in retrieving http://arxiv.org/pdf/math/9406208v1.pdf . re-trying 4 ...
error in retrieving http://arxiv.org/pdf/math/9804052v2.pdf . re-trying 0 ...
error in retrieving http://arxiv.org/pdf/math/9504203v1.pdf . re-trying 1 ...
error in retrieving http://arxiv.org/pdf/math/9812112v1.pdf . re-trying 0 ...
error in retrieving http://arxiv.org/pdf/math/9901119v1.pdf . re-trying 0 ...
error in retrieving http://arxiv.org/pdf/math/9904143v2.pdf . re-trying 0 ...
error in retrieving http://arxiv.org/pdf/math/9412210v1.pdf . re-trying 2 ...
error in retrieving http://arxiv.org/pdf/math/9905125v1.pdf . re-trying 0 ...
error in retrieving http://arxiv.org/pdf/math/0003097v1.pdf . re

error in retrieving http://arxiv.org/pdf/math/0104175v1.pdf . re-trying 1 ...
error in retrieving http://arxiv.org/pdf/math/0109207v2.pdf . re-trying 0 ...
error in retrieving http://arxiv.org/pdf/math/0110089v1.pdf . re-trying 0 ...
error in retrieving http://arxiv.org/pdf/math/9204225v1.pdf . re-trying 0 ...
error in retrieving http://arxiv.org/pdf/math/0103099v1.pdf . re-trying 2 ...
error in retrieving http://arxiv.org/pdf/math/9204230v1.pdf . re-trying 0 ...
error in retrieving http://arxiv.org/pdf/math/0101122v1.pdf . re-trying 4 ...
error in retrieving http://arxiv.org/pdf/math/9304212v1.pdf . re-trying 0 ...
error in retrieving http://arxiv.org/pdf/math/9410219v1.pdf . re-trying 0 ...
error in retrieving http://arxiv.org/pdf/math/0106226v2.pdf . re-trying 1 ...
error in retrieving http://arxiv.org/pdf/math/0104175v1.pdf . re-trying 2 ...
error in retrieving http://arxiv.org/pdf/math/0109207v2.pdf . re-trying 1 ...
error in retrieving http://arxiv.org/pdf/math/0110089v1.pdf . re

In [None]:
print ('generating xmls ...')

import os
import subprocess

fnames = [f for f in os.listdir('dataset/') if os.path.isfile(f) and f.endswith('.pdf')]
ifnames = ['pdf/' + f for f in fnames]
ofnames = ['xml/' + f + '.xml' for f in fnames]

for i, fname in enumerate(fnames):
    subprocess.run(['pdftohtml', '-i', '-xml', ifnames[i], ofnames[i]], stdout=subprocess.PIPE)

In [35]:
a = {'a':'asdf', 'z':'zxcv', 's':'sdfg'}
b = {'a':'asdf', 'z':'zxcv', 'q':'qwer'}
c = {}

In [65]:
print (set(b)-set(a))
if a: print ('a')
if b: print ('b')
if c: print ('c')

{'q'}
a
b


In [100]:
arxiv_ids = ['http://arxiv.org/abs/1411.2167v2',
             'http://arxiv.org/abs/physics/0310161v1',
             'http://arxiv.org/abs/hep-ph/9203210v1'
            ]
for arxiv_id in arxiv_ids:
    parts = arxiv_id.split('/')
    print ('_'.join(parts[parts.index('abs')+1:]))

1411.2167v2
physics_0310161v1
hep-ph_9203210v1


In [147]:
a, b = (1, 2)
a
b

2