# Building a dataset of pdf research papers
### Part 1. Collecting urls of pdf files from Arvix

In [1]:
import sys
import io
import os
import time
import urllib
import pickle
import csv
import numpy as np
from multiprocessing.dummy import Pool as ThreadPool

nof_pdf_per_query = 30


In [2]:
print ('reading arxiv categories ...')

catlist = []
with io.open('ArxivSubjectCategory.csv', newline='\n') as csvfile:
    catreader = csv.reader(csvfile, delimiter=',', quotechar='"')
    catlist = [row[0] for row in catreader]

print (len(catlist), 'categories read.')
print ('sample:', np.random.choice(catlist), ',', np.random.choice(catlist), ',', np.random.choice(catlist))

reading arxiv categories ...
153 categories read.
sample: hep-lat , cs.LG , cs.SC


In [3]:
print ('generating queries ...')
q = 'http://export.arxiv.org/api/query?search_query=cat:{cat}&all:{yr}&start=0&max_results={n}'
yrlist = list(range(2010, 2019)) # from 2010 to 2018
qrylist = []

for yr in yrlist:
    for cat in catlist:
        qry = q.format(cat=cat, yr=yr, n=nof_pdf_per_query)
        qrylist.append(qry)
        
print (len(yrlist), 'years, ', len(catlist), 'categories, ', 'total', len(qrylist), 'queries.')
print ('sample:', np.random.choice(qrylist))

generating queries ...
9 years,  153 categories,  total 1377 queries.
sample: http://export.arxiv.org/api/query?search_query=cat:cs.NA&all:2010&start=0&max_results=30


In [4]:
# for reporting times
def t_print(t, msg):
    te = time.time()-t
    print (msg, 'in %.2fs' % te)
    return time.time()

# make 1 query, return an empty {} if error
def make_query(query, data={}):
    try:
        data[query] = urllib.request.urlopen(query).read()
    except:
        print ('urlopen error, returning empty{}')
    return data

# read data{} from pickle, otherwise create a new dict
def load_pickle(path):
    try:
        with open(path, 'rb') as f:
            data = pickle.load(f)
    except FileNotFoundError:
        print ('file not found, returning empty{}')
        data = {}
    return data

# dump data{} into pickle
def dump_pickle(data, path):
    pickle.dump(data, open(path, 'wb'))

# dump data{} into pickle if the file is not there
def check_dump_pickle(data, path, force=False):
    if not os.path.isfile(path) or force:
        pickle.dump(data, open(path, 'wb'))

# load or make 1 query, then save to pickle
def query_and_store(query, pickle_fpath='data.pickle', force=False):
    data = load_pickle(pickle_fpath)

    if query not in data:
        data[query] = make_query(query)
        dump_pickle(data, pickle_fpath)
    
    return data[query]

# run a function in multi-threads, reporting progress
def run_multi_thread(function, iterables, nof_thread=10):
    pool = ThreadPool(nof_thread)
    results = pool.imap(function, iterables)
    
    output = []
    for i, result in enumerate(results):
        output.append(result)
        sys.stdout.write('\rprogress: {0}/{1}'.format(i, len(iterables)))
    print ('')
    
    pool.close()
    pool.join()
    return output
    
# load or make a list of queries, then save to pickle
def qrylist_and_store(qrylist, pickle_fpath='data.pickle', force=False, nof_thread=10):
    
    #init a timer
    t = time.time()
    
    # read data(dict) from pickle, otherwise create a new dict
    data = load_pickle(pickle_fpath)
    t = t_print(t, 'finish load_pickle')
    
    # find query which is not in data
    todo_qrylist = list(set(qrylist) - set(data)) if not force  else qrylist.copy()
    
    # make query for those which are not found
    if todo_qrylist:
        
        # make query in multi-threads
        results = run_multi_thread(make_query, todo_qrylist, nof_thread=nof_thread)
        t = t_print(t, 'finish http requests')
        
        for i, result in enumerate(results):
            q = todo_qrylist[i]
            data[q] = result[q]
        if len(results) == 0: print ('ERROR: len(results) == 0')
        
        # save data to pickle
        dump_pickle(data, pickle_fpath)
        t = t_print(t, 'finish dump_pickle')
        
        print ('%d query(s) finished.' % len(todo_qrylist))
        
    else:
        print ('data found in "%s", no query is needed.' % pickle_fpath)
        
    return {qry : data[qry] for qry in qrylist}


In [5]:
print ('making queries thru arvix api ...')

data = qrylist_and_store(qrylist, pickle_fpath='xmls.pickle', nof_thread=100)
xmllist = list(data.values())

print ('sample:', np.random.choice(xmllist)[:1000], '...')

making queries thru arvix api ...
finish load_pickle in 1.44s
data found in "xmls.pickle", no query is needed.
sample: b'<?xml version="1.0" encoding="UTF-8"?>\n<feed xmlns="http://www.w3.org/2005/Atom">\n  <link href="http://arxiv.org/api/query?search_query%3Dcat%3Acs.FL%26id_list%3D%26start%3D0%26max_results%3D30" rel="self" type="application/atom+xml"/>\n  <title type="html">ArXiv Query: search_query=cat:cs.FL&amp;id_list=&amp;start=0&amp;max_results=30</title>\n  <id>http://arxiv.org/api/kR51HcppIHuzzWwG+SVLlcxFQBM</id>\n  <updated>2018-10-29T00:00:00-04:00</updated>\n  <opensearch:totalResults xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">2462</opensearch:totalResults>\n  <opensearch:startIndex xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">0</opensearch:startIndex>\n  <opensearch:itemsPerPage xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">30</opensearch:itemsPerPage>\n  <entry>\n    <id>http://arxiv.org/abs/0903.2554v2</id>\n    <updated>2013-07-31T16:

### Part 2. Retrieving pdf files

In [6]:
import feedparser

required_field = ['id', 'title', 'authors', 'link', 'updated'] # these fields will be keys of the papers[i]{}


In [7]:
print ('parsing xmls (queried results) ...')

force = False
feeds = load_pickle('feeds.pickle')
if not(feeds) or force:
    feeds = run_multi_thread(feedparser.parse, xmllist, nof_thread=100)
    dump_pickle(feeds, 'feeds.pickle')

print ('sample:', np.random.choice(feeds).feed, '...')

parsing xmls (queried results) ...
sample: {'links': [{'href': 'http://arxiv.org/api/query?search_query%3Dcat%3Acs.DB%26id_list%3D%26start%3D0%26max_results%3D30', 'rel': 'self', 'type': 'application/atom+xml'}], 'title': 'ArXiv Query: search_query=cat:cs.DB&id_list=&start=0&max_results=30', 'title_detail': {'type': 'text/html', 'language': None, 'base': '', 'value': 'ArXiv Query: search_query=cat:cs.DB&id_list=&start=0&max_results=30'}, 'id': 'http://arxiv.org/api/63/IcBsQBzDvn01GqFFrQrNQed0', 'guidislink': True, 'link': 'http://arxiv.org/api/63/IcBsQBzDvn01GqFFrQrNQed0', 'updated': '2018-10-29T00:00:00-04:00', 'updated_parsed': time.struct_time(tm_year=2018, tm_mon=10, tm_mday=29, tm_hour=4, tm_min=0, tm_sec=0, tm_wday=0, tm_yday=302, tm_isdst=0), 'opensearch_totalresults': '3917', 'opensearch_startindex': '0', 'opensearch_itemsperpage': '30'} ...


In [8]:
print ('converting feeds into papers(a list of dict containing required fields) ...')

def id_to_pdfurl(arxiv_id):
    return arxiv_id.replace('abs', 'pdf') + '.pdf'

def id_to_filename(arxiv_id, ext='pdf'):
    parts = arxiv_id.split('/')
    dstdir = 'dataset/'
    if ext[0] != '.':
        ext = '.' + ext
    return dstdir + '_'.join(parts[parts.index('abs')+1:]) + ext

papers = {}
for feed in feeds:
    for entry in feed.entries:
        paper = {}
        for i, k in enumerate(entry):
            if k in required_field:
                paper[k] = entry[k]
                
        paper['filename'] = id_to_filename(paper['id'])
        paper['xml'] = id_to_filename(paper['id'], 'xml')
        paper['info'] = id_to_filename(paper['id'], 'info.pickle')
        paper['pdfurl'] = id_to_pdfurl(paper['id'])
        
        if paper['id'] not in papers:
            papers[paper['id']] = paper
            
papers = list(papers.values())

print (len(papers), 'papers read.')
i = np.random.randint(1, len(papers)) - 1
print ('sample:', papers[i])
print ('pdf url: ', papers[i]['pdfurl'])
print ('local filename: ', papers[i]['filename'])
print ('local xml: ', papers[i]['xml'])
print ('local info.pickle:', papers[i]['info'])

converting feeds into papers(a list of dict containing required fields) ...
4500 papers read.
sample: {'id': 'http://arxiv.org/abs/hep-ph/9203217v1', 'link': 'http://arxiv.org/abs/hep-ph/9203217v1', 'updated': '1992-03-18T20:13:19Z', 'title': 'Multiple photon effects in fermion-(anti)fermion scattering at SSC\n  energies', 'authors': [{'name': 'D. B. DeLaney'}, {'name': 'S. Jadach'}, {'name': 'Ch. Shio'}, {'name': 'G. Siopsis'}, {'name': 'B. F. L. Ward'}], 'filename': 'dataset/hep-ph_9203217v1.pdf', 'xml': 'dataset/hep-ph_9203217v1.xml', 'info': 'dataset/hep-ph_9203217v1.info.pickle', 'pdfurl': 'http://arxiv.org/pdf/hep-ph/9203217v1.pdf'}
pdf url:  http://arxiv.org/pdf/hep-ph/9203217v1.pdf
local filename:  dataset/hep-ph_9203217v1.pdf
local xml:  dataset/hep-ph_9203217v1.xml
local info.pickle: dataset/hep-ph_9203217v1.info.pickle


In [9]:
from shutil import copyfile

def download_file(url_localpath_tuple, nof_trial=5):
    url, local_fpath = url_localpath_tuple

    for i in range(nof_trial):
        try:
            saved_fpath, header = urllib.request.urlretrieve (url, local_fpath)
            
            if 'application/pdf' in header['content-type']:
                print ('saved %s to %s.' % (url, local_fpath))
                return True
            
            elif 'text/html' in header['content-type']:
                with open(saved_fpath, 'r') as f:
                    fcontent = f.read()
                if 'No PDF' in fcontent:
                    copyfile(local_fpath, local_fpath+'.no')
                    print ('failed downloading %s to %s.' % (url, local_fpath))
                    return False
                elif 'reload this URL' in fcontent:
#                     print ('Reload needed: re-trying %s %d time%s ... ' % (url, i+1, '' if i == 0 else 's'))
                    pass
                else:
#                     print ('Unknown HTML: re-trying %s %d time%s ... ' % (url, i+1, '' if i == 0 else 's'))
                    pass
            else:
#                 print ('Unknown content: re-trying %s %d time%s ... ' % (url, i+1, '' if i == 0 else 's'))
                pass    
        except:
#             print ('Connection error: re-trying %s %d time%s ... ' % (url, i+1, '' if i == 0 else 's'))
            pass
    
    print ('failed downloading %s to %s.' % (url, local_fpath))
    return False
#     print ('Failed')


In [10]:
print ('downloading pdfs ...')

url_localpath_tuples = [(p['pdfurl'], p['filename']) for p in papers if not os.path.isfile(p['filename'])]
print ('files to be downloaded: ', len(url_localpath_tuples))
results = run_multi_thread(download_file, url_localpath_tuples, nof_thread=20)
print ('finished')

downloading pdfs ...
files to be downloaded:  0

finished


In [11]:
def list_dataset(directory = 'dataset/', ext = '.pdf'):
    if ext[0] != '.':
        ext = '.' + ext
    return [f for f in os.listdir(directory) if os.path.isfile(directory) and f.endswith(ext)]


In [None]:
print ('writing info files ...')

pdfs = list_dataset('dataset/', '.pdf')
papersset = {id_to_filename(p['id'], ext='pdf') for p in papers}
for pdf in pdfs:
    check_dump_pickle(data, path, force=False)
