# Building a dataset of pdf research papers
### Part 1. Collecting urls of pdf files from Arvix

In [1]:
import sys
import io
import os
import time
import urllib
import pickle
import csv
import numpy as np
from multiprocessing.dummy import Pool as ThreadPool

nof_pdf_per_query = 3


In [2]:
print ('reading arxiv categories ...')

catlist = []
with io.open('ArxivSubjectCategory.csv', newline='\n') as csvfile:
    catreader = csv.reader(csvfile, delimiter=',', quotechar='"')
    catlist = [row[0] for row in catreader]

print (len(catlist), 'categories read.')
print ('sample:', np.random.choice(catlist), ',', np.random.choice(catlist), ',', np.random.choice(catlist))

reading arxiv categories ...
153 categories read.
sample: math.LO , astro-ph , cs.DS


In [3]:
print ('generating queries ...')
q = 'http://export.arxiv.org/api/query?search_query=cat:{cat}&all:{yr}&start=0&max_results={n}'
yrlist = list(range(2001, 2019)) # from 2001 to 2018
qrylist = []

for yr in yrlist:
    for cat in catlist:
        qry = q.format(cat=cat, yr=yr, n=nof_pdf_per_query)
        qrylist.append(qry)
        
print (len(yrlist), 'years, ', len(catlist), 'categories, ', 'total', len(qrylist), 'queries.')
print ('sample:', np.random.choice(qrylist))

generating queries ...
18 years,  153 categories,  total 2754 queries.
sample: http://export.arxiv.org/api/query?search_query=cat:hep-th&all:2009&start=0&max_results=3


In [4]:
# for reporting times
def t_print(t, msg):
    te = time.time()-t
    print (msg, 'in %.2fs' % te)
    return time.time()

# make 1 query, return an empty {} if error
def make_query(query, data={}):
    try:
        data[query] = urllib.request.urlopen(query).read()
    except:
        print ('urlopen error, returning empty{}')
    return data

# read data{} from pickle, otherwise create a new dict
def load_pickle(path):
    try:
        with open(path, 'rb') as f:
            data = pickle.load(f)
    except FileNotFoundError:
        print ('file not found, returning empty{}')
        data = {}
    return data

# dump data{} into pickle
def dump_pickle(data, path):
    pickle.dump(data, open(path, 'wb'))

# load or make 1 query, then save to pickle
def query_and_store(query, pickle_fpath='data.pickle', force=False):
    data = load_pickle(pickle_fpath)

    if query not in data:
        data[query] = make_query(query)
        dump_pickle(data, pickle_fpath)
    
    return data[query]

# run a function in multi-threads, reporting progress
def run_multi_thread(function, iterables, nof_thread=10):
    pool = ThreadPool(nof_thread)
    results = pool.imap(function, iterables)
    
    output = []
    for i, result in enumerate(results):
        output.append(result)
        sys.stdout.write('\rprogress: {0}/{1}'.format(i, len(iterables)))
    print ('')
    
    pool.close()
    pool.join()
    return output
    
# load or make a list of queries, then save to pickle
def qrylist_and_store(qrylist, pickle_fpath='data.pickle', force=False, nof_thread=10):
    
    #init a timer
    t = time.time()
    
    # read data(dict) from pickle, otherwise create a new dict
    data = load_pickle(pickle_fpath)
    t = t_print(t, 'finish load_pickle')
    
    # find query which is not in data
    todo_qrylist = list(set(qrylist) - set(data)) if not force  else qrylist.copy()
    
    # make query for those which are not found
    if todo_qrylist:
        
        # make query in multi-threads
        results = run_multi_thread(make_query, todo_qrylist, nof_thread=nof_thread)
        t = t_print(t, 'finish http requests')
        
        for i, result in enumerate(results):
            q = todo_qrylist[i]
            data[q] = result[q]
        if len(results) == 0: print ('ERROR: len(results) == 0')
        
        # save data to pickle
        dump_pickle(data, pickle_fpath)
        t = t_print(t, 'finish dump_pickle')
        
        print ('%d query(s) finished.' % len(todo_qrylist))
        
    else:
        print ('data found in "%s", no query is needed.' % pickle_fpath)
        
    return {qry : data[qry] for qry in qrylist}


In [None]:
print ('making queries thru arvix api ...')

data = qrylist_and_store(qrylist, pickle_fpath='xmls.pickle', nof_thread=100)
xmllist = list(data.values())

print ('sample:', np.random.choice(xmllist)[:1000], '...')

making queries thru arvix api ...
finish load_pickle in 0.58s
data found in "xmls.pickle", no query is needed.
sample: b'<?xml version="1.0" encoding="UTF-8"?>\n<feed xmlns="http://www.w3.org/2005/Atom">\n  <link href="http://arxiv.org/api/query?search_query%3Dcat%3Acs.MA%26id_list%3D%26start%3D0%26max_results%3D3" rel="self" type="application/atom+xml"/>\n  <title type="html">ArXiv Query: search_query=cat:cs.MA&amp;id_list=&amp;start=0&amp;max_results=3</title>\n  <id>http://arxiv.org/api/L0oxE8y1b/7nYmp5IOazKhJLIms</id>\n  <updated>2018-10-24T00:00:00-04:00</updated>\n  <opensearch:totalResults xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">2104</opensearch:totalResults>\n  <opensearch:startIndex xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">0</opensearch:startIndex>\n  <opensearch:itemsPerPage xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">3</opensearch:itemsPerPage>\n  <entry>\n    <id>http://arxiv.org/abs/cs/0306119v1</id>\n    <updated>2003-06-20T15:12

### Part 2. Retrieving pdf files

In [None]:
import feedparser

required_field = ['id', 'title', 'authors', 'link', 'updated'] # these fields will be keys of the papers[i]{}


In [None]:
print ('parsing xmls (queried results) ...')

force = False
feedlist = load_pickle('feeds.pickle')
if not(feedlist) or force:
    feedlist = run_multi_thread(feedparser.parse, xmllist, nof_thread=100)
    dump_pickle(feedlist, 'feeds.pickle')

print ('sample:', np.random.choice(feedlist).feed, '...')

parsing xmls (queried results) ...
sample: {'links': [{'href': 'http://arxiv.org/api/query?search_query%3Dcat%3Aphysics.bio-ph%26id_list%3D%26start%3D0%26max_results%3D3', 'rel': 'self', 'type': 'application/atom+xml'}], 'title': 'ArXiv Query: search_query=cat:physics.bio-ph&id_list=&start=0&max_results=3', 'title_detail': {'type': 'text/html', 'language': None, 'base': '', 'value': 'ArXiv Query: search_query=cat:physics.bio-ph&id_list=&start=0&max_results=3'}, 'id': 'http://arxiv.org/api/MLmwnW++wM0PRedEVrmAQOLJTZY', 'guidislink': True, 'link': 'http://arxiv.org/api/MLmwnW++wM0PRedEVrmAQOLJTZY', 'updated': '2018-10-24T00:00:00-04:00', 'updated_parsed': time.struct_time(tm_year=2018, tm_mon=10, tm_mday=24, tm_hour=4, tm_min=0, tm_sec=0, tm_wday=2, tm_yday=297, tm_isdst=0), 'opensearch_totalresults': '9268', 'opensearch_startindex': '0', 'opensearch_itemsperpage': '3'} ...


In [None]:
print ('converting feeds into a list of dict ...')

paperlist = []
for feed in feedlist:
    for entry in feed.entries:
        paper = {}
        for i, k in enumerate(entry):
            if k in required_field:
                paper[k] = entry[k]
        paperlist.append(paper)

print (len(paperlist), 'papers read.')
print ('sample:', np.random.choice(paperlist))

converting feeds into a list of dict ...
8260 papers read.
sample: {'id': 'http://arxiv.org/abs/math/9201273v1', 'link': 'http://arxiv.org/abs/math/9201273v1', 'updated': '1990-05-12T00:00:00Z', 'title': 'Remarks on iterated cubic maps', 'authors': [{'name': 'John W. Milnor'}]}


In [None]:
def download_file(url, local_fpath, force=False):
    if not os.path.isfile(local_fpath) or force:
        print ('downloading ' + local_fpath + '...', end='')
        urllib.request.urlretrieve (url, local_fpath)
        print ('done')
    else:
        print ('found ' + local_fpath)

def download_arxivpdfs(url_list, force=False):
    for url in url_list:
        local_fpath = 'pdf/' + url.split('/')[-1]
        download_file(url, local_fpath)


In [None]:
directory = '' #r'./pdf/'
for paper in reversed(paperlist):
    url = paper['link'].replace('abs', 'pdf') + '.pdf'
    name = paper['id'].split('/')[-1] + '.pdf'
    fpath = directory + name
    download_file(url, fpath)

downloading 0111152v1.pdf...

In [35]:
a = {'a':'asdf', 'z':'zxcv', 's':'sdfg'}
b = {'a':'asdf', 'z':'zxcv', 'q':'qwer'}
c = {}

In [65]:
print (set(b)-set(a))
if a: print ('a')
if b: print ('b')
if c: print ('c')

{'q'}
a
b
