First, we import the required packages.

In [1]:
## packages
import pandas as pd
import datetime
import time
import sys
import string
import gc
import requests
import xml.etree.ElementTree as ET
from urllib.parse import urlencode
from urllib.request import urlopen
from urllib.error import HTTPError

Now, let's specify the parameters as though we were going to pass them to the arxivsearch function.

In [2]:
## params
# date window, sent as the from/until values of the request (ISO 'YYYY-MM-DD' strings)
start_date = datetime.date(2020, 5, 1).isoformat()
end_date = datetime.date(2020, 5, 15).isoformat()

# keyword filters, matched against the lower-cased title + abstract text
kwd_req = ['online']     # every one of these must appear
kwd_exc = ['education']  # none of these may appear
kwd_one = [              # at least one keyword from each inner list must appear
    ['bandit', 'partial information'],
    ['regret', 'generalization error'],
]

subject = 'stat' # subjects = ['stat']
cols = ['id', 'title', 'authors', 'date', 'categories', 'abstract'] # not referenced again in this walkthrough

# output locations
export = '/Users/blairbilodeau/Desktop/arxiv/'    # folder the csv is written to
exportfile = ''                                   # '' means a dated default file name is generated
download = '/Users/blairbilodeau/Desktop/arxiv/'  # folder the pdfs are written to

We try accessing the url, which we print.

In [3]:
# set url to extract papers from this subject
OAI = '{http://www.openarchives.org/OAI/2.0/}'   # namespace prefix of the OAI-PMH envelope elements
ARXIV = '{http://arxiv.org/OAI/arXiv/}'          # namespace prefix of the arXiv metadata elements
BASE = 'http://export.arxiv.org/oai2?verb=ListRecords&'

url = BASE + 'from=' + start_date + '&until=' + end_date + '&metadataPrefix=arXiv&set=%s' % subject

## fetch the raw XML; when the server answers 503 (requesting too often), wait and try again
# NOTE(review): only this single response is read. OAI-PMH servers may split large result
# lists into pages linked by a resumptionToken; following it is not done here -- confirm
# whether the truncated list is acceptable.
MAX_ATTEMPTS = 5 # stop retrying after this many 503 answers in a row
for attempt in range(1, MAX_ATTEMPTS + 1):
    try:
        # the context manager closes the connection even if read() fails
        with urlopen(url) as url_response:
            xml = url_response.read() # get raw xml data from server
        break
    except HTTPError as e:
        # any other HTTP error is unexpected, and a 503 on the last attempt cannot be
        # retried any more: in both cases let the error surface instead of leaving
        # `xml` undefined for the following cells
        if e.code != 503 or attempt == MAX_ATTEMPTS:
            raise
        retry_time = int(e.headers.get('retry-after', 30))
        print('Got 503. Retry after {0:d} seconds.'.format(retry_time))
        time.sleep(retry_time)

In [4]:
# display the complete request URL assembled from BASE and the search parameters
url

'http://export.arxiv.org/oai2?verb=ListRecords&from=2020-05-01&until=2020-05-15&metadataPrefix=arXiv&set=stat'

Let's take a quick look at how unreadable raw XML is for humans, and then start extracting elements from it that we can work with.

In [5]:
# peek at the first 200 bytes of the raw server response
xml[:200]

b'<?xml version="1.0" encoding="UTF-8"?>\n<OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.openarchives.org/OAI/2'

In [6]:
# parse the raw response bytes into an element tree we can search
xml_root = ET.fromstring(xml) # get root of xml hierarchy
# a bare name on the last line displays the object: here, the root element
xml_root

<Element '{http://www.openarchives.org/OAI/2.0/}OAI-PMH' at 0x11f220290>

In [7]:
# path to every <record> element nested inside <ListRecords>; each tag carries the OAI namespace
record_path = OAI + 'ListRecords/' + OAI + 'record'
records = xml_root.findall(record_path)
print('We have fetched {0:d} records. Here is a sample.'.format(len(records)))
print(records[:3])

We have fetched 1000 records. Here is a sample.
[<Element '{http://www.openarchives.org/OAI/2.0/}record' at 0x11f234290>, <Element '{http://www.openarchives.org/OAI/2.0/}record' at 0x11f267710>, <Element '{http://www.openarchives.org/OAI/2.0/}record' at 0x11f267ef0>]


We will use Python's list comprehensions to extract the data. Here's an example of how they work.

In [8]:
## list comprehension
# explicit loop: start from an empty list and add one element per iteration
x = []
for i in range(10):
    x.append(i)
print('x = ',x)

# v.s.

# comprehension: the same list, built in a single expression
y = [i for i in range(10)]
print('y = ',y)

x =  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
y =  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]


In [9]:
## extract metadata for each record
# Look the arXiv element up with one path expression: for a record without a <metadata>
# child (OAI-PMH allows this, e.g. for deleted records) find() then returns None, whereas
# chaining .find().find() would raise AttributeError. Such records are left out.
# `is not None` is required: an element without children is falsy.
metadata = [
    meta
    for meta in (record.find(OAI + 'metadata/' + ARXIV + 'arXiv') for record in records)
    if meta is not None
]
print(metadata[0:3])

[<Element '{http://arxiv.org/OAI/arXiv/}arXiv' at 0x11f234b30>, <Element '{http://arxiv.org/OAI/arXiv/}arXiv' at 0x11f267950>, <Element '{http://arxiv.org/OAI/arXiv/}arXiv' at 0x11f269170>]


Now, let's extract fields from the records.

In [10]:
## use metadata to get info for each record
def _field_text(meta, tag):
    """Return the text of the arXiv-namespaced child `tag` of `meta`, stripped of surrounding whitespace."""
    return meta.find(ARXIV + tag).text.strip()

def _normalized(meta, tag):
    """Return the stripped text of `tag`, lower-cased and with newlines replaced by spaces."""
    return _field_text(meta, tag).lower().replace('\n', ' ')

def _latest_date(meta):
    """Return the 'updated' date when the record has one, otherwise the 'created' date."""
    tag = 'created' if meta.find(ARXIV + 'updated') is None else 'updated'
    return _field_text(meta, tag)

titles = [_normalized(meta, 'title') for meta in metadata]
dates = [_latest_date(meta) for meta in metadata]
ids = [_normalized(meta, 'id') for meta in metadata]
abstracts = [_normalized(meta, 'abstract') for meta in metadata]
# categories arrive as one whitespace-separated string; split it into a list per paper
category_lists = [_normalized(meta, 'categories').split() for meta in metadata]
# authors stay as XML elements for now
author_lists = [meta.findall(ARXIV + 'authors/' + ARXIV + 'author') for meta in metadata]
# the abstract page url is the fixed prefix followed by the paper id
urls = ['https://arxiv.org/abs/' + paper_id for paper_id in ids]

To demonstrate why we use the strip(), lower(), and replace() commands, here's a before and after of the first abstract.

In [11]:
# "before": the abstract text of the first record exactly as it appears in the XML
metadata[0].find(ARXIV + 'abstract').text

'  We prove a Gaussian process approximation for the sequence of random\ncompositions of a two-color randomly reinforced urn for both the cases with the\nequal and unequal reinforcement means. By using the Gaussian approximation, the\nlaw of the iterated logarithm and the functional limit central limit theorem in\nboth the stable convergence sense and the almost-sure conditional convergence\nsense are established. Also as a consequence, we are able to to prove that the\ndistribution of the urn composition has no points masses both when the\nreinforcement means are equal and unequal under the assumption of only finite\n$(2+\\epsilon)$-th moments.\n'

In [12]:
# "after": the same abstract once stripped, lower-cased and with newlines turned into spaces
abstracts[0]

'we prove a gaussian process approximation for the sequence of random compositions of a two-color randomly reinforced urn for both the cases with the equal and unequal reinforcement means. by using the gaussian approximation, the law of the iterated logarithm and the functional limit central limit theorem in both the stable convergence sense and the almost-sure conditional convergence sense are established. also as a consequence, we are able to to prove that the distribution of the urn composition has no points masses both when the reinforcement means are equal and unequal under the assumption of only finite $(2+\\epsilon)$-th moments.'

The author lists are currently still in element form.

In [13]:
# the authors of the first record, still as raw XML <author> elements
author_lists[0]

[<Element '{http://arxiv.org/OAI/arXiv/}author' at 0x11f234e90>]

Let's clean them up and check it worked.

In [14]:
## extract first and last names, clean them, and put them together to make human readable
def _lowered_text(element):
    """Return the lower-cased text of an XML element, or '' when the element or its text is missing."""
    if element is None or element.text is None:
        return ''
    return element.text.lower()

last_name_lists = [[author.find(ARXIV + 'keyname').text.lower() for author in author_list] for author_list in author_lists]
# not every author has a <forenames> element, so keep the (possibly None) elements first ...
first_name_meta_lists = [[author.find(ARXIV + 'forenames') for author in author_list] for author_list in author_lists]
# ... and turn a missing element (or one with empty text) into an empty first name
first_name_lists = [[_lowered_text(name) for name in first_name_meta_list] for first_name_meta_list in first_name_meta_lists]
# "first last" per author; strip() avoids a stray leading space when the first name is empty
full_name_lists = [
    [(first + ' ' + last).strip() for first, last in zip(first_names, last_names)]
    for first_names, last_names in zip(first_name_lists, last_name_lists)
]

In [15]:
# spot-check the third record: its last names, then first names, then full names
for name_lists in (last_name_lists, first_name_lists, full_name_lists):
    print(name_lists[2])

['mossel', 'mueller-frank', 'sly', 'tamuz']
['elchanan', 'manuel', 'allan', 'omer']
['elchanan mossel', 'manuel mueller-frank', 'allan sly', 'omer tamuz']


Now we're ready to put this into a dataframe (table).

In [16]:
## compile all info into big dataframe
# column names, in the same order as the lists zipped together below
record_columns = ['title','date','id','abstract','categories','url','authors']
# one tuple per paper
records_data = list(zip(
    titles,
    dates,
    ids,
    abstracts,
    category_lists,
    urls,
    full_name_lists,
))
records_df = pd.DataFrame(records_data, columns=record_columns)
# show the third paper as an example row
records_df.iloc[2]

title                                social learning equilibria
date                                                 2019-09-27
id                                                    1207.5895
abstract      we consider a large class of social learning m...
categories                          [math.st, econ.th, stat.th]
url                             https://arxiv.org/abs/1207.5895
authors       [elchanan mossel, manuel mueller-frank, allan ...
Name: 2, dtype: object

We can finally start searching to only pick out the papers we desire.

In [17]:
## merge each title with its abstract ("title. abstract") into one text blob to search for keywords in
abstract_title_concats = ['. '.join(pair) for pair in zip(titles, abstracts)]

# show the pieces and the merged text for the third paper, each followed by a blank line
for text in (titles[2], abstracts[2], abstract_title_concats[2]):
    print(text + '\n')

social learning equilibria

we consider a large class of social learning models in which a group of agents face uncertainty regarding a state of the world, share the same utility function, observe private signals, and interact in a general dynamic setting. we introduce social learning equilibria, a static equilibrium concept that abstracts away from the details of the given extensive form, but nevertheless captures the corresponding asymptotic equilibrium behavior. we establish general conditions for agreement, herding, and information aggregation in equilibrium, highlighting a connection between agreement and information aggregation.

social learning equilibria. we consider a large class of social learning models in which a group of agents face uncertainty regarding a state of the world, share the same utility function, observe private signals, and interact in a general dynamic setting. we introduce social learning equilibria, a static equilibrium concept that abstracts away from the 

First, we look at the keyword lists where we need at least one hit from each.

In [18]:
# each inner list is one group; a paper needs at least one keyword from every group
print(kwd_one)

[['bandit', 'partial information'], ['regret', 'generalization error']]


Here's an example of a hit.

In [19]:
# the merged title + abstract text of the paper at position 66, used as the worked example
print(abstract_title_concats[66])

thompson sampling algorithms for cascading bandits. motivated by efficient optimization for online recommender systems, we revisit the cascading bandit model proposed by kveton et al. (2015). while thompson sampling (ts) algorithms have been shown to be empirically superior to upper confidence bound (ucb) algorithms for cascading bandits, theoretical guarantees are only known for the latter, not the former. in this paper, we close the gap by designing and analyzing a ts algorithm, ts-cascade, that achieves the state-of-the-art regret bound for cascading bandits. next, we derive a nearly matching regret lower bound, with information-theoretic techniques and judiciously constructed cascading bandit instances. in complement, we also provide a problem-dependent upper bound on the regret of the thompson sampling algorithm with beta-bernoulli update; this upper bound is tighter than a recent derivation by huyuk and tekin (2019). finally, we consider a linear generalization of the cascading b

In [20]:
# test the example paper (position 66) against every keyword of both groups
example_text = abstract_title_concats[66]

print(kwd_one[0][0] in example_text) # check for bandit
print(kwd_one[0][1] in example_text) # check for partial information
print('')
print(kwd_one[1][0] in example_text) # check for regret
print(kwd_one[1][1] in example_text) # check for generalization error

True
False

True
False


In [21]:
## only abstracts and titles that mention at least one keyword of a group

# one set per kwd_one group: the positions of the papers whose title/abstract
# contains any keyword of that group
kwd_one_idxs_lists = [
    {idx for idx, text in enumerate(abstract_title_concats) if any(kwd in text for kwd in kwd_one_list)}
    for kwd_one_list in kwd_one
]

In [22]:
# positions (in abstract_title_concats) of the papers matching the first keyword group
kwd_one_idxs_lists[0] # everywhere that had bandits or partial information

{66, 86, 269, 386, 426, 467, 646, 672, 697, 736, 819, 891, 912, 942}

In [23]:
# positions (in abstract_title_concats) of the papers matching the second keyword group
kwd_one_idxs_lists[1] # everywhere that had regret or generalization error

{66,
 76,
 86,
 148,
 209,
 234,
 269,
 287,
 317,
 335,
 386,
 426,
 446,
 485,
 531,
 532,
 646,
 672,
 707,
 719,
 736,
 764,
 819,
 912,
 920,
 942,
 991}

In [24]:
# take intersection of all the kwd_one index sets
if kwd_one_idxs_lists:
    kwd_one_idxs = set.intersection(*kwd_one_idxs_lists)
else:
    # no "at least one of" groups were given, so this filter rules no paper out
    # (indexing the first set of an empty list would raise IndexError)
    kwd_one_idxs = set(range(len(abstract_title_concats)))
kwd_one_idxs # everywhere that has at least one from each list

{66, 86, 269, 386, 426, 646, 672, 736, 819, 912, 942}

In [25]:
# positions of the papers that contain every required keyword
kwd_req_idxs = {idx for idx, text in enumerate(abstract_title_concats) if all(kwd in text for kwd in kwd_req)}
# positions of the papers that contain none of the excluded keywords
kwd_exc_idxs = {idx for idx, text in enumerate(abstract_title_concats) if not any(kwd in text for kwd in kwd_exc)}

# a paper is kept only when it passes all three keyword filters
kwd_idxs = kwd_one_idxs.intersection(kwd_req_idxs, kwd_exc_idxs)
kwd_idxs

{66, 386, 736}

We didn't apply any author restrictions here, but the code for them works exactly the same way as for keywords.

Finally, we print a dataframe with the papers that we wanted.

In [26]:
## only take records_df rows that match the desired indices for keywords and authors
# Select by index label rather than by position. On the freshly built frame (default
# 0..n-1 index) the two are the same rows, but because the filtered frame keeps its
# original labels, re-running this cell returns the same rows again; positional iloc
# would fail or pick the wrong rows once records_df has been overwritten.
records_df = records_df.loc[list(kwd_idxs)]
records_df

Unnamed: 0,title,date,id,abstract,categories,url,authors
736,dtr bandit: learning to make response-adaptive...,2020-05-06,2005.02791,dynamic treatment regimes (dtrs) for are perso...,"[stat.ml, cs.lg, math.oc]",https://arxiv.org/abs/2005.02791,"[yichun hu, nathan kallus]"
66,thompson sampling algorithms for cascading ban...,2020-05-08,1810.01187,motivated by efficient optimization for online...,"[cs.lg, stat.ml]",https://arxiv.org/abs/1810.01187,"[wang chi cheung, zixin zhong, vincent y. f. tan]"
386,optimal no-regret learning in repeated first-p...,2020-05-08,2003.09795,we study online learning in repeated first-pri...,"[cs.lg, cs.gt, cs.it, math.it, stat.me, stat.ml]",https://arxiv.org/abs/2003.09795,"[yanjun han, zhengyuan zhou, tsachy weissman]"


Let's export this table to a csv.

In [27]:
# folder the csv will be written to (set in the parameters at the top)
print(export + '\n')

/Users/blairbilodeau/Desktop/arxiv/



In [28]:
# file to export csv to
if exportfile == '':
    # no file name supplied: fall back to today's date (YYYY-MM-DD) plus a fixed suffix
    today_stamp = datetime.date.today().isoformat()
    exportfile = today_stamp + '-arxiv-metadata.csv'
# export is expected to end with a path separator, since the two parts are simply concatenated
exportpath = export + exportfile

print(exportfile + '\n')
print(exportpath)

2020-05-30-arxiv-metadata.csv

/Users/blairbilodeau/Desktop/arxiv/2020-05-30-arxiv-metadata.csv


In [29]:
# write the selected papers to the csv file; index=False leaves the row labels out of the file
# NOTE(review): the 'categories' and 'authors' cells hold Python lists, which presumably end up
# as their string representation in the csv -- confirm that is the desired format
records_df.to_csv(exportpath, index=False)

We can also export the pdfs themselves.

In [30]:
# folder to download pdfs to (set in the parameters at the top); file names are appended
# to it directly, so it needs its trailing path separator
download

'/Users/blairbilodeau/Desktop/arxiv/'

In [31]:
## create urls to access pdfs
# every pdf lives at <prefix><arXiv id>.pdf
pdf_url_prefix = 'https://arxiv.org/pdf/'
pdf_urls = [pdf_url_prefix + paper_id + '.pdf' for paper_id in records_df.id] # one url per selected paper

print(pdf_urls)

['https://arxiv.org/pdf/2005.02791.pdf', 'https://arxiv.org/pdf/1810.01187.pdf', 'https://arxiv.org/pdf/2003.09795.pdf']


In [32]:
# create filenames to export pdfs to
# currently setup in date_lastnames format, e.g. 2020-05-06_hu_kallus.pdf
# pull out lastnames only: the last word of every author name (blank names are skipped, as they have no last word)
pdf_lastnames_full = ['_'.join([name.split()[-1] for name in namelist if name.split()]) for namelist in records_df.authors]
# make sure file names don't get longer than ~200 chars: fall back to "<first author>_et_al"
pdf_lastnames = [name if len(name) < 200 else name.split('_')[0] + '_et_al' for name in pdf_lastnames_full]

# full path for each file; two papers that share a date and author last names would map to
# the same path, and the later download would silently overwrite the earlier one
pdf_paths = []
for paper_date, lastname, paper_id in zip(records_df.date, pdf_lastnames, records_df.id):
    pdf_path = download + paper_date + '_' + lastname + '.pdf'
    if pdf_path in pdf_paths:
        # disambiguate with the arXiv id; ids may contain '/', which cannot be part of a file name
        pdf_path = download + paper_date + '_' + lastname + '_' + paper_id.replace('/', '_') + '.pdf'
    pdf_paths.append(pdf_path)

print(pdf_paths)

['/Users/blairbilodeau/Desktop/arxiv/2020-05-06_hu_kallus.pdf', '/Users/blairbilodeau/Desktop/arxiv/2020-05-08_cheung_zhong_tan.pdf', '/Users/blairbilodeau/Desktop/arxiv/2020-05-08_han_zhou_weissman.pdf']


In [33]:
# export pdfs: fetch every url and write the bytes to the matching path
for pdf_url, pdf_path in zip(pdf_urls, pdf_paths):
    # without a timeout a stalled connection would block the notebook indefinitely
    response = requests.get(pdf_url, timeout=60)
    if not response.ok:
        # do not save an HTTP error page under a .pdf name; report it and move on
        print('Skipping {0}: HTTP status {1:d}.'.format(pdf_url, response.status_code))
        continue
    # the with-block closes the file even if the write fails
    with open(pdf_path, 'wb') as pdf_file:
        pdf_file.write(response.content)
print('Download complete.')

Download complete.
