# Building a dataset of pdf research papers
### Part 1. Collecting urls of pdf files from Arvix

In [1]:
import sys
import io
import os
import time
import urllib
import pickle
import csv
import numpy as np
from multiprocessing.dummy import Pool as ThreadPool

nof_pdf_per_query = 3


In [2]:
print ('reading arxiv categories ...')

catlist = []
with io.open('ArxivSubjectCategory.csv', newline='\n') as csvfile:
    catreader = csv.reader(csvfile, delimiter=',', quotechar='"')
    catlist = [row[0] for row in catreader]

print (len(catlist), 'categories read.')
print ('sample:', np.random.choice(catlist), ',', np.random.choice(catlist), ',', np.random.choice(catlist))

reading arxiv categories ...
153 categories read.
sample: stat.ML , stat.CO , physics.bio-ph


In [3]:
print ('generating queries ...')
q = 'http://export.arxiv.org/api/query?search_query=cat:{cat}&all:{yr}&start=0&max_results={n}'
yrlist = list(range(2001, 2019)) # from 2001 to 2018
qrylist = []

for yr in yrlist:
    for cat in catlist:
        qry = q.format(cat=cat, yr=yr, n=nof_pdf_per_query)
        qrylist.append(qry)
        
print (len(yrlist), 'years, ', len(catlist), 'categories, ', 'total', len(qrylist), 'queries.')
print ('sample:', np.random.choice(qrylist))

generating queries ...
18 years,  153 categories,  total 2754 queries.
sample: http://export.arxiv.org/api/query?search_query=cat:q-fin.RM&all:2005&start=0&max_results=3


In [31]:
# for reporting times
def t_print(t, msg):
    te = time.time()-t
    print (msg, 'in %.2fs' % te)
    return time.time()

# make 1 query, return an empty {} if error
def make_query(query, data={}):
    try:
        data[query] = urllib.request.urlopen(query).read()
    except:
        print ('urlopen error, returning empty{}')
    return data

# read data{} from pickle, otherwise create a new dict
def load_pickle(path):
    try:
        with open(path, 'rb') as f:
            data = pickle.load(f)
    except FileNotFoundError:
        print ('file not found, returning empty{}')
        data = {}
    return data

# dump data{} into pickle
def dump_pickle(data, path):
    pickle.dump(data, open(path, 'wb'))

# load or make 1 query, then save to pickle
def query_and_store(query, pickle_fpath='data.pickle', force=False):
    data = load_pickle(pickle_fpath)

    if query not in data:
        data[query] = make_query(query)
        dump_pickle(data, pickle_fpath)
    
    return data[query]

# run a function in multi-threads, reporting progress
def run_multi_thread(function, iterables, nof_thread=10):
    pool = ThreadPool(nof_thread)
    results = pool.imap(function, iterables)
    
    output = []
    for i, result in enumerate(results):
        output.append(result)
        sys.stdout.write('\rprogress: {0}/{1}'.format(i, len(iterables)))
    print ('')
    
    pool.close()
    pool.join()
    return output
    
# load or make a list of queries, then save to pickle
def qrylist_and_store(qrylist, pickle_fpath='data.pickle', force=False, nof_thread=10):
    
    #init a timer
    t = time.time()
    
    # read data(dict) from pickle, otherwise create a new dict
    data = load_pickle(pickle_fpath)
    t = t_print(t, 'finish load_pickle')
    
    # find query which is not in data
    todo_qrylist = list(set(qrylist) - set(data)) if not force  else qrylist.copy()
    
    # make query for those which are not found
    if todo_qrylist:
        
        # make query in multi-threads
        results = run_multi_thread(make_query, todo_qrylist, nof_thread=nof_thread)
        t = t_print(t, 'finish http requests')
        
        for i, result in enumerate(results):
            q = todo_qrylist[i]
            data[q] = result[q]
        if len(results) == 0: print ('ERROR: len(results) == 0')
        
        # save data to pickle
        dump_pickle(data, pickle_fpath)
        t = t_print(t, 'finish dump_pickle')
        
        print ('%d query(s) finished.' % len(todo_qrylist))
        
    else:
        print ('data found in "%s", no query is needed.' % pickle_fpath)
        
    return {qry : data[qry] for qry in qrylist}


In [47]:
print ('making queries thru arvix api ...')

data = qrylist_and_store(qrylist, pickle_fpath='xmls.pickle', nof_thread=100)
xmllist = list(data.values())

print ('sample:', np.random.choice(xmllist)[:1000], '...')

making queries thru arvix api ...
finish load_pickle in 0.04s
data found in "xmls.pickle", no query is needed.
sample: b'<?xml version="1.0" encoding="UTF-8"?>\n<feed xmlns="http://www.w3.org/2005/Atom">\n  <link href="http://arxiv.org/api/query?search_query%3Dcat%3Amath.AT%26id_list%3D%26start%3D0%26max_results%3D3" rel="self" type="application/atom+xml"/>\n  <title type="html">ArXiv Query: search_query=cat:math.AT&amp;id_list=&amp;start=0&amp;max_results=3</title>\n  <id>http://arxiv.org/api/gdeimEdqCKiAAq5pD+xAQCUkTbw</id>\n  <updated>2018-10-24T00:00:00-04:00</updated>\n  <opensearch:totalResults xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">9656</opensearch:totalResults>\n  <opensearch:startIndex xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">0</opensearch:startIndex>\n  <opensearch:itemsPerPage xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">3</opensearch:itemsPerPage>\n  <entry>\n    <id>http://arxiv.org/abs/math/9503230v1</id>\n    <updated>1995-03-29

### Part 2. Retrieving pdf files

In [48]:
import feedparser

required_field = ['id', 'title', 'authors', 'link', 'updated'] # these fields will be keys of the papers[i]{}


In [54]:
print ('parsing xmls (queried results) ...')

force = False
if not load_pickle('feeds.pickle') or force:
    feeds = run_multi_thread(feedparser.parse, xmllist, nof_thread=100)
    dump_pickle(feeds, 'feeds.pickle')

print ('sample:', np.random.choice(feeds).feed, '...')

parsing xmls (queried results) ...


NameError: name 'err' is not defined

In [41]:
xmlss[2]

'http://export.arxiv.org/api/query?search_query=cat:astro-ph.EP&all:2001&start=0&max_results=3'

In [33]:
i = 0
for f in feeds:
    i += 1
    print (i)
    print (f.title)

In [17]:
print ('converting feeds ...')

papers = []
for feed in feeds:
    for entry in feed.entries:
        paper = {}
        for i, k in enumerate(entry):
            if k in required_field:
                paper[k] = entry[k]
        papers.append(paper)

print (len(papers), 'papers read.')
print ('sample:', np.random.choice(papers))

converting feeds ...
0 papers read.


ValueError: a must be non-empty

In [66]:
i = 0
for xml in xmls:
    feed = feedparser.parse(xml)
    i += 1
    print (i)
    


1
2
3


KeyboardInterrupt: 

In [77]:
t = time.time()
pool = ThreadPool(200)
xmlss = list(xmls)[:200].copy()
results = pool.imap(feedparser.parse, xmlss)
t = t_print(t, 'start')
for i, result in enumerate(results):
    sys.stdout.write('\rquery progress: {0}/{1}'.format(i, len(xmlss)))
print ('')
t = t_print(t, 'finish')
pool.close()
pool.join()

start in 0.05s
query progress: 199/200
finish in 18.50s


In [61]:
dump_pickle(papers, 'papers.pickle')

In [64]:
x = load_pickle('papers.pickle')
print (x)

[{'id': 'http://arxiv.org/abs/astro-ph/9204001v1', 'link': 'http://arxiv.org/abs/astro-ph/9204001v1', 'updated': '1992-04-13T18:20:01Z', 'title': 'Gamma-Ray Bursts as the Death Throes of Massive Binary Stars', 'authors': [{'name': 'Ramesh Narayan'}, {'name': 'Bohdan Paczyński'}, {'name': 'Tsvi Piran'}]}, {'id': 'http://arxiv.org/abs/astro-ph/9204002v1', 'link': 'http://arxiv.org/abs/astro-ph/9204002v1', 'updated': '1992-04-26T17:54:00Z', 'title': 'Gravitational Lensing and the Variability of G', 'authors': [{'name': 'Lawrence Krauss'}, {'name': 'Martin White'}]}, {'id': 'http://arxiv.org/abs/astro-ph/9204003v2', 'link': 'http://arxiv.org/abs/astro-ph/9204003v2', 'updated': '1992-04-30T20:39:38Z', 'title': 'The Ptolemaic Gamma-Ray Burst Universe', 'authors': [{'name': 'J. I. Katz'}]}, {'id': 'http://arxiv.org/abs/0902.0003v1', 'link': 'http://arxiv.org/abs/0902.0003v1', 'updated': '2009-02-02T20:11:24Z', 'title': 'Characterizing the Properties of Clusters of Galaxies as a Function of\n 

In [31]:
def retrieve_pdf_urls(xml):
    for entry in feed.entries:
    print (entry.title)
    for link in entry.links:
        if link.get('title') == 'pdf':
            url = link.get('href') + '.pdf'
            print (url)



Gamma-Ray Bursts as the Death Throes of Massive Binary Stars
http://arxiv.org/pdf/astro-ph/9204001v1.pdf
Gravitational Lensing and the Variability of G
http://arxiv.org/pdf/astro-ph/9204002v1.pdf
The Ptolemaic Gamma-Ray Burst Universe
http://arxiv.org/pdf/astro-ph/9204003v2.pdf


In [None]:
def xml_to_pdfurls(xml):
    url_list = []
    feed = feedparser.parse(xml)
    
    for i in range(len(feed.entries)):
        for l in feed.entries[i].links:
            if l.get('title') == 'pdf':
                url = l.get('href')
                url_list.append(url+'.pdf')
                
    return url_list
                
def download_file(url, local_fpath, force=False):
    if not os.path.isfile(local_fpath) or force:
        print 'downloading ' + local_fpath + '...',
        urllib.urlretrieve (url, local_fpath)
        print 'done'

def download_arxivpdfs(url_list, force=False):
    for url in url_list:
        local_fpath = 'pdf/' + url.split('/')[-1]
        download_file(url, local_fpath)


In [35]:
a = {'a':'asdf', 'z':'zxcv', 's':'sdfg'}
b = {'a':'asdf', 'z':'zxcv', 'q':'qwer'}
c = {}

In [38]:
a.values()

dict_values(['asdf', 'zxcv', 'sdfg'])

In [65]:
print (set(b)-set(a))
if a: print ('a')
if b: print ('b')
if c: print ('c')

{'q'}
a
b


In [80]:
def run_multi_thread(function, iterables, nof_thread=10):
    pool = ThreadPool(nof_thread)
    results = pool.imap(function, iterables)
    
    output_list = []
    for i, result in enumerate(results):
        print ('$$', i, result, '$$')
#         output_list.append(result)
        sys.stdout.write('\rprogress: {0}/{1}'.format(i, len(iterables)))
    print ('')
    
    pool.close()
    pool.join()
    return results

def func1(x):
    return {x : x*x}

In [83]:
results = run_multi_thread(func1, [3, 4, 5, 6, 7])


$$ 0 {3: 9} $$
progress: 0/5$$ 1 {4: 16} $$
progress: 1/5$$ 2 {5: 25} $$
progress: 2/5$$ 3 {6: 36} $$
progress: 3/5$$ 4 {7: 49} $$
progress: 4/5
