In [1]:
import sys
import io
import os
import time
import urllib
import pickle
import csv
import numpy as np
from multiprocessing.dummy import Pool as ThreadPool

nof_pdf_per_query = 3


In [2]:
print ('reading arxiv categories ...')

catlist = []
with io.open('ArxivSubjectCategory.csv', newline='\n') as csvfile:
    catreader = csv.reader(csvfile, delimiter=',', quotechar='"')
    catlist = [row[0] for row in catreader]

print (len(catlist), 'categories read.')
print ('sample:', np.random.choice(catlist), ',', np.random.choice(catlist), ',', np.random.choice(catlist))

reading arxiv categories ...
153 categories read.
sample: cs.MS , q-fin.PR , q-bio.TO


In [3]:
print ('generating queries ...')
q = 'http://export.arxiv.org/api/query?search_query=cat:{cat}&all:{yr}&start=0&max_results={n}'
yrlist = list(range(2001, 2019)) # from 2001 to 2018
qrylist = []

for yr in yrlist:
    for cat in catlist:
        qry = q.format(cat=cat, yr=yr, n=nof_pdf_per_query)
        qrylist.append(qry)
        
print (len(yrlist), 'years, ', len(catlist), 'categories, ', 'total', len(qrylist), 'queries.')
print ('sample:', np.random.choice(qrylist))

generating queries ...
18 years,  153 categories,  total 2754 queries.
sample: http://export.arxiv.org/api/query?search_query=cat:physics.space-ph&all:2017&start=0&max_results=3


In [19]:
# for reporting times
def t_print(t, msg):
    te = time.time()-t
    print (msg, 'in %.2fs' % te)
    return time.time()

# make 1 query, return an empty {} if error
def make_query(query, data={}):
    try:
        data[query] = urllib.request.urlopen(query).read()
    except:
        print (err)
    return data

# read data{} from pickle, otherwise create a new dict
def load_pickle(path):
    try:
        with open(path, 'rb') as f:
            data = pickle.load(f)
    except:
        print (err)
        data = {}
    return data

# dump data{} into pickle
def dump_pickle(data, path):
    pickle.dump(data, open(path, 'wb'))

# load or make 1 query, then save to pickle
def query_and_store(query, pickle_fpath='data.pickle', force=False):
    data = load_pickle(pickle_fpath)

    if query not in data:
        data[query] = make_query(query)
        dump_pickle(data, pickle_fpath)
    
    return data[query]

# load or make a list of queries, then save to pickle
def qrylist_and_store(qrylist, pickle_fpath='data.pickle', force=False, nof_thread=10):
    
    #init a timer
    t = time.time()
    
    # read data(dict) from pickle, otherwise create a new dict
    data = load_pickle(pickle_fpath)
    t = t_print(t, 'finish load_pickle')
    
    # find query which is not in data
    todo_qrylist = list(set(qrylist) - set(data)) if not force  else qrylist.copy()
    t = t_print(t, 'finish todo_qrylist')
    
    # make query for those which are not found
    if todo_qrylist:
        
        # init multiple threads
        pool = ThreadPool(nof_thread)
        
        # make queries asynchronously
        results = pool.imap(make_query, todo_qrylist)
        t = t_print(t, 'finish imap')
        
        # wait and report progress
        for i, result in enumerate(results):
            q = todo_qrylist[i]
            data[q] = result[q]
            sys.stdout.write('\rquery progress: {0}/{1}'.format(i, len(todo_qrylist)))
        print ('')
        t = t_print(t, 'finish results')
        
        # close the pool and wait for the work to finish 
        pool.close()
        pool.join()
        t = t_print(t, 'finish join')
        
        # save data to pickle
        dump_pickle(data, pickle_fpath)
        t = t_print(t, 'finish dump')
    
    else:
        print ('data found in "%s", no query is needed.' % pickle_fpath)
        
    return {qry : data[qry] for qry in qrylist}


In [20]:
data = qrylist_and_store(qrylist, pickle_fpath='data.pickle', nof_thread=100)


finish load_pickle in 0.03s
finish todo_qrylist in 0.00s
data found in "data.pickle", no query is needed.


In [20]:
a = {'a':'asdf', 'z':'zxcv', 's':'sdfg'}
b = {'a':'asdf', 'z':'zxcv', 'q':'qwer'}
c = {}

In [65]:
print (set(b)-set(a))
if a: print ('a')
if b: print ('b')
if c: print ('c')

{'q'}
a
b


In [22]:
print (a.keys())

dict_keys(['a', 'z', 's'])


In [43]:
x = {'a':'a'}
type(None) is not dict


True