In [1]:
import urllib
import pickle
import io
import os
import csv
import numpy as np
import time

nof_pdf_per_query = 3


In [2]:
print ('reading arxiv categories ...')

catlist = []
with io.open('ArxivSubjectCategory.csv', newline='\n') as csvfile:
    catreader = csv.reader(csvfile, delimiter=',', quotechar='"')
    catlist = [row[0] for row in catreader]

print (len(catlist), 'categories read.')
print ('sample:', np.random.choice(catlist), ',', np.random.choice(catlist), ',', np.random.choice(catlist))

reading arxiv categories ...
153 categories read.
sample: physics.ed-ph , cs.LG , cs.CC


In [3]:
print ('generating queries ...')
q = 'http://export.arxiv.org/api/query?search_query=cat:{cat}&all:{yr}&start=0&max_results={n}'
yrlist = list(range(2001, 2019)) # from 2001 to 2018
qrylist = []

for yr in yrlist:
    for cat in catlist:
        qry = q.format(cat=cat, yr=yr, n=nof_pdf_per_query)
        qrylist.append(qry)
        
print (len(yrlist), 'years, ', len(catlist), 'categories, ', 'total', len(qrylist), 'queries.')
print ('sample:', np.random.choice(qrylist))

generating queries ...
18 years,  153 categories,  total 2754 queries.
sample: http://export.arxiv.org/api/query?search_query=cat:cs.DM&all:2016&start=0&max_results=3


In [47]:
t1 = t2 = 0
t = {}
def timestart():
    t1 = time.time() / 1000000000
def timestop(mod='other'):
    t2 = time.time() / 1000000000
    t[mod] += t2 - t1
def printtimestat():
    print ('urllib', t['urllib'], 'pickle', t['pickle'], 'other', t['other'])
def timereset():
    t1 = t2 = 0
    t['urllib'] = 0
    t['pickle'] = 0
    t['other'] = 0
timereset()

In [66]:
def make_query(query):
    return urllib.request.urlopen(query).read()

def make_query_save_data(query, data):
    data[query] = urllib.request.urlopen(query).read()


def query_and_store(query, pickle_fpath='data.pickle', force=False):
    # read data(dict) from pickle, otherwise create a new dict
    timestart()
    try:
        f = open(pickle_fpath, 'rb')
        data = pickle.load(f)
        f.close()
    except IOError:
        data = {}
    timestop('pickle')

    if query not in data:
        timestart()
        data[query] = make_query(query)
        timestop('urllib')

    timestart()
    pickle.dump(data, open(pickle_fpath, 'wb'))
    timestop('pickle')
    
    return data[query]


import threading
def queryfew_and_storeonce(query, pickle_fpath='data.pickle', force=False):
    # read data(dict) from pickle, otherwise create a new dict
    timestart()
    try:
        f = open(pickle_fpath, 'rb')
        data = pickle.load(f)
        f.close()
    except IOError:
        data = {}
    timestop('pickle')

    # if query not found in local data or forced, do query and save back to the pickle file
    tobequery = set(query) - set(data) # compute query which is not in data
    if len(tobequery) != 0 or force: # not found in data{}, need some query
        timestart()
#       data[qry] = make_query(qry)

        threads = [threading.Thread(target=make_query_save_data, args=(qry,data,)) for qry in tobequery]
        for thread in threads:
            thread.start()
        for thread in threads:
            thread.join()

        timestop('urllib')

        timestart()
        pickle.dump(data, open(pickle_fpath, 'wb'))
        timestop('pickle')
        
    return {qry : data[qry] for qry in query}

def qrylist_and_store(qrylist, pickle_fpath='data.pickle', force=False):
    data = {}
    i = 0
#     smalllist = []
    for qry in qrylist:
#         smalllist.append(qry)
        data[qry] = query_and_store(qry, pickle_fpath=pickle_fpath, force=force)
        
        i += 1
        if i % 10 == 0:
#             smalldata = queryfew_and_storeonce(smalllist, pickle_fpath=pickle_fpath, force=force)
#             data.update(smalldata)
#             smalllist = []
            printtimestat()
            print (i)
    return data

def qrylist_and_store2(qrylist, pickle_fpath='data.pickle', force=False):
    data = {}
    i = 0
    smalllist = []
    for qry in qrylist:
        smalllist.append(qry)
#         data[qry] = query_and_store(qry, pickle_fpath=pickle_fpath, force=force)
        
        i += 1
        if i % 2 == 0:
            smalldata = queryfew_and_storeonce(smalllist, pickle_fpath=pickle_fpath, force=force)
            data.update(smalldata)
            smalllist = []
            printtimestat()
            print (i)
    return data

In [67]:
timereset()
data = qrylist_and_store2(qrylist)
print (len(data), len(qrylist))
for d in data:
    print (data[d])
    break

urllib 0 pickle 1.54022455733707 other 0
2
urllib 0 pickle 3.080449114716792 other 0
4
urllib 0 pickle 4.620673672135732 other 0
6
urllib 0 pickle 6.160898229593857 other 0
8
urllib 0 pickle 7.701122787092654 other 0
10
urllib 0 pickle 9.24134734463212 other 0
12
urllib 0 pickle 10.78157190221024 other 0
14
urllib 0 pickle 12.321796459829066 other 0
16
urllib 0 pickle 13.862021017486086 other 0
18
urllib 0 pickle 15.402245575178444 other 0
20
urllib 0 pickle 16.942470132906855 other 0
22
urllib 0 pickle 18.48269469068288 other 0
24
urllib 0 pickle 20.02291924850209 other 0
26
urllib 0 pickle 21.563143806360983 other 0
28
urllib 0 pickle 23.10336836426005 other 0
30
urllib 0 pickle 24.643592922200288 other 0
32
urllib 0 pickle 26.18381748018169 other 0
34
urllib 0 pickle 27.724042038204264 other 0
36
urllib 0 pickle 29.264266596265067 other 0
38
urllib 0 pickle 30.80449115436653 other 0
40
urllib 0 pickle 32.34471571250566 other 0
42
urllib 0 pickle 33.884940270681 other 0
44
urllib 0 p

urllib 0 pickle 275.70019646256844 other 0
358
urllib 0 pickle 277.24042102769175 other 0
360
urllib 0 pickle 278.7806455928513 other 0
362
urllib 0 pickle 280.32087015805195 other 0
364
urllib 0 pickle 281.8610947232933 other 0
366
urllib 0 pickle 283.40131928857534 other 0
368
urllib 0 pickle 284.9415438538971 other 0
370
urllib 0 pickle 286.4817684192594 other 0
372
urllib 0 pickle 288.021992984663 other 0
374
urllib 0 pickle 289.56221755010716 other 0
376
urllib 0 pickle 291.10244211559205 other 0
378
urllib 0 pickle 292.6426666811186 other 0
380
urllib 0 pickle 294.18289124668183 other 0
382
urllib 0 pickle 295.7231158122843 other 0
384
urllib 0 pickle 297.26334037792896 other 0
386
urllib 0 pickle 298.8035649436475 other 0
388
urllib 0 pickle 300.34378950940516 other 0
390
urllib 0 pickle 301.88401407520695 other 0
392
urllib 0 pickle 303.4242386410509 other 0
394
urllib 0 pickle 304.9644632069386 other 0
396
urllib 0 pickle 306.50468777287165 other 0
398
urllib 0 pickle 308.0449

urllib 0 pickle 556.0210680438358 other 0
722
urllib 0 pickle 557.561292616977 other 0
724
urllib 0 pickle 559.1015171901597 other 0
726
urllib 0 pickle 560.6417417633788 other 0
728
urllib 0 pickle 562.1819663366375 other 0
730
urllib 0 pickle 563.7221909099413 other 0
732
urllib 0 pickle 565.2624154833056 other 0
734
urllib 0 pickle 566.8026400567176 other 0
736
urllib 0 pickle 568.3428646301712 other 0
738
urllib 0 pickle 569.8830892036585 other 0
740
urllib 0 pickle 571.423313777184 other 0
742
urllib 0 pickle 572.9635383508053 other 0
744
urllib 0 pickle 574.5037629244922 other 0
746
urllib 0 pickle 576.043987498236 other 0
748
urllib 0 pickle 577.5842120720283 other 0
750
urllib 0 pickle 579.1244366458693 other 0
752
urllib 0 pickle 580.6646612197565 other 0
754
urllib 0 pickle 582.2048857936936 other 0
756
urllib 0 pickle 583.7451103676765 other 0
758
urllib 0 pickle 585.2853349417094 other 0
760
urllib 0 pickle 586.825559515797 other 0
762
urllib 0 pickle 588.365784089932 other

urllib 0 pickle 830.18104275795 other 0
1078
urllib 0 pickle 831.7212673388271 other 0
1080
urllib 0 pickle 833.2614919197446 other 0
1082
urllib 0 pickle 834.8017165007017 other 0
1084
urllib 0 pickle 836.3419410816985 other 0
1086
urllib 0 pickle 837.8821656627299 other 0
1088
urllib 0 pickle 839.4223902438022 other 0
1090
urllib 0 pickle 840.962614824915 other 0
1092
urllib 0 pickle 842.5028394060755 other 0
1094
urllib 0 pickle 844.0430639872721 other 0
1096
urllib 0 pickle 845.5832885685129 other 0
1098
urllib 0 pickle 847.1235131497954 other 0
1100
urllib 0 pickle 848.663737731119 other 0
1102
urllib 0 pickle 850.2039623124833 other 0
1104
urllib 0 pickle 851.7441868938968 other 0
1106
urllib 0 pickle 853.2844114753558 other 0
1108
urllib 0 pickle 854.824636056855 other 0
1110
urllib 0 pickle 856.3648606383993 other 0
1112
urllib 0 pickle 857.9050852199867 other 0
1114
urllib 0 pickle 859.4453098016184 other 0
1116
urllib 0 pickle 860.9855343832952 other 0
1118
urllib 0 pickle 86

urllib 0 pickle 1104.3410188104524 other 0
1434
urllib 0 pickle 1105.8812433985966 other 0
1436
urllib 0 pickle 1107.4214679867803 other 0
1438
urllib 0 pickle 1108.9616925750097 other 0
1440
urllib 0 pickle 1110.5019171632803 other 0
1442
urllib 0 pickle 1112.0421417515915 other 0
1444
urllib 0 pickle 1113.5823663399435 other 0
1446
urllib 0 pickle 1115.1225909283316 other 0
1448
urllib 0 pickle 1116.6628155167584 other 0
1450
urllib 0 pickle 1118.2030401052264 other 0
1452
urllib 0 pickle 1119.743264693735 other 0
1454
urllib 0 pickle 1121.2834892822839 other 0
1456
urllib 0 pickle 1122.823713870873 other 0
1458
urllib 0 pickle 1124.3639384595022 other 0
1460
urllib 0 pickle 1125.904163048172 other 0
1462
urllib 0 pickle 1127.4443876368905 other 0
1464
urllib 0 pickle 1128.9846122256497 other 0
1466
urllib 0 pickle 1130.5248368144466 other 0
1468
urllib 0 pickle 1132.0650614032822 other 0
1470
urllib 0 pickle 1133.6052859921535 other 0
1472
urllib 0 pickle 1135.1455105810685 other 0


urllib 0 pickle 1375.420547014341 other 0
1786
urllib 0 pickle 1376.9607716105859 other 0
1788
urllib 0 pickle 1378.5009962068714 other 0
1790
urllib 0 pickle 1380.0412208031964 other 0
1792
urllib 0 pickle 1381.5814453995633 other 0
1794
urllib 0 pickle 1383.1216699959748 other 0
1796
urllib 0 pickle 1384.6618945924245 other 0
1798
urllib 0 pickle 1386.2021191889132 other 0
1800
urllib 0 pickle 1387.7423437854427 other 0
1802
urllib 0 pickle 1389.2825683820158 other 0
1804
urllib 0 pickle 1390.8227929786292 other 0
1806
urllib 0 pickle 1392.3630175752808 other 0
1808
urllib 0 pickle 1393.903242171967 other 0
1810
urllib 0 pickle 1395.4434667686899 other 0
1812
urllib 0 pickle 1396.983691365452 other 0
1814
urllib 0 pickle 1398.5239159622563 other 0
1816
urllib 0 pickle 1400.0641405591036 other 0
1818
urllib 0 pickle 1401.60436515601 other 0
1820
urllib 0 pickle 1403.1445897529702 other 0
1822
urllib 0 pickle 1404.6848143499813 other 0
1824
urllib 0 pickle 1406.2250389470444 other 0
18

urllib 0 pickle 1643.4196274236795 other 0
2134
urllib 0 pickle 1644.9598520276386 other 0
2136
urllib 0 pickle 1646.500076631638 other 0
2138
urllib 0 pickle 1648.0403012356774 other 0
2140
urllib 0 pickle 1649.5805258397606 other 0
2142
urllib 0 pickle 1651.1207504439058 other 0
2144
urllib 0 pickle 1652.6609750481025 other 0
2146
urllib 0 pickle 1654.2011996523413 other 0
2148
urllib 0 pickle 1655.7414242566356 other 0
2150
urllib 0 pickle 1657.281648860975 other 0
2152
urllib 0 pickle 1658.8218734653608 other 0
2154
urllib 0 pickle 1660.362098069783 other 0
2156
urllib 0 pickle 1661.9023226742545 other 0
2158
urllib 0 pickle 1663.4425472787746 other 0
2160
urllib 0 pickle 1664.9827718833396 other 0
2162
urllib 0 pickle 1666.5229964879559 other 0
2164
urllib 0 pickle 1668.0632210926124 other 0
2166
urllib 0 pickle 1669.6034456973307 other 0
2168
urllib 0 pickle 1671.1436703020947 other 0
2170
urllib 0 pickle 1672.6838949068986 other 0
2172
urllib 0 pickle 1674.2241195117533 other 0


urllib 0 pickle 1909.8784845586395 other 0
2480
urllib 0 pickle 1911.418709169871 other 0
2482
urllib 0 pickle 1912.95893378114 other 0
2484
urllib 0 pickle 1914.499158392448 other 0
2486
urllib 0 pickle 1916.0393830037954 other 0
2488
urllib 0 pickle 1917.579607615182 other 0
2490
urllib 0 pickle 1919.1198322266055 other 0
2492
urllib 0 pickle 1920.6600568380686 other 0
2494
urllib 0 pickle 1922.2002814495718 other 0
2496
urllib 0 pickle 1923.7405060611156 other 0
2498
urllib 0 pickle 1925.2807306727007 other 0
2500
urllib 0 pickle 1926.8209552843248 other 0
2502
urllib 0 pickle 1928.3611798959862 other 0
2504
urllib 0 pickle 1929.9014045076813 other 0
2506
urllib 0 pickle 1931.4416291194143 other 0
2508
urllib 0 pickle 1932.9818537311833 other 0
2510
urllib 0 pickle 1934.522078342995 other 0
2512
urllib 0 pickle 1936.0623029548562 other 0
2514
urllib 0 pickle 1937.6025275667632 other 0
2516
urllib 0 pickle 1939.1427521787127 other 0
2518
urllib 0 pickle 1940.6829767907059 other 0
252

In [50]:
a = {'a':'asdf', 'z':'zxcv', 's':'sdfg'}
b = {'a':'asdf', 'z':'zxcv', 'q':'qwer'}

In [51]:
print (set(b)-set(a))

{'q'}
