In [1]:
import os
from argparse import Namespace
from tqdm.auto import tqdm

from collections import defaultdict
import pandas as pd

import urllib.request
from bs4 import BeautifulSoup

from nltk.tokenize import RegexpTokenizer

In [2]:
class Article:
    """Plain record describing one paper: its title, author list and PDF link."""

    def __init__(self, title, authors, pdf):
        """Store the three fields as instance attributes, unchanged."""
        self.title = title
        self.authors = authors
        self.pdf = pdf

    def __repr__(self):
        """Render every instance attribute on its own line inside ``Article { ... }``.

        Attribute names are right-aligned in a 10-character column so the
        values line up when the object is displayed as a cell result.
        """
        rows = [f'{name:>10}: {value}\n' for name, value in vars(self).items()]
        return 'Article {\n' + ''.join(rows) + '}'

In [3]:
# Crawl settings gathered in a single namespace.
#   url      - listing URL template; placeholders are filled positionally
#              with (field, year, month, show).
#   fields   - sub-field codes inserted after "cs." in the URL.
#   years    - zero-padded two-digit year strings.
#   months   - zero-padded two-digit month strings.
#   max_show - value for the "show" query parameter.
CFG = Namespace(
    url='http://arxiv.org/list/cs.{}/{}{}?show={}',
    fields=['CV'],
    years=[f'{year:0>2d}' for year in range(18, 19)],
    months=[f'{month:0>2d}' for month in range(4, 5)],
    max_show=1000,
)

# Shared tokenizer: every run of word characters (\w+) becomes one token.
tokenizer = RegexpTokenizer(pattern=r'\w+')

In [4]:
def crawl(fields, years, months, max_show=1000):
    """Download listing pages and parse each one into a BeautifulSoup tree.

    One request is issued for every (field, year, month) combination. The
    request URL is built from the template stored in ``CFG.url``.

    Args:
        fields: iterable of sub-field codes substituted into the URL (e.g. 'CV').
        years: iterable of year strings substituted into the URL.
        months: iterable of month strings substituted into the URL.
        max_show: value substituted for the ``show`` query parameter.

    Returns:
        A list with one parsed page per combination, in iteration order
        (fields outermost, months innermost). Empty if any input is empty.

    Raises:
        Whatever ``urllib.request.urlopen`` raises for a failed request; errors
        are not caught here.
    """
    out = []

    for field in fields:
        for year in years:
            for month in months:
                query_url = CFG.url.format(field, year, month, max_show)
                print(f'> retrieving [{query_url}]')
                # The context manager closes the HTTP response even if read() fails.
                with urllib.request.urlopen(query_url) as uh:
                    raw = uh.read()
                # Hand the raw bytes to the parser. Wrapping them in str() would
                # feed it the "b'...'" repr instead, turning newlines into literal
                # backslash-n text and non-ASCII characters into \x.. escapes.
                # The parser is named explicitly so the resulting tree does not
                # depend on which optional parsers happen to be installed.
                soup = BeautifulSoup(raw, 'html.parser')
                out.append(soup)

    return out

In [5]:
# Fetch every configured listing page. max_show is passed explicitly so that
# editing CFG.max_show actually changes the request instead of being ignored
# in favour of crawl()'s default.
out = crawl(CFG.fields, CFG.years, CFG.months, CFG.max_show)

> retrieving [http://arxiv.org/list/cs.CV/1804?show=1000]


In [6]:
# Pull the three parallel element lists out of the first crawled page.
# find_all is the current spelling of the deprecated findAll alias.
titles = out[0].find_all('div', class_='list-title')
urls = out[0].find_all('span', class_='list-identifier')
authors = out[0].find_all('div', class_='list-authors')

# The three counts should match, since entries are later indexed in parallel.
len(titles), len(authors), len(urls)

(752, 752, 752)

In [7]:
# Build an Article from a single listing entry to sanity-check the parsing.
i = 1
# Look the link up by tag rather than by child position, and rewrite only the
# leading '/abs/' path segment so an 'abs' elsewhere in the href is untouched.
u = 'http://arxiv.org' + urls[i].find('a')['href'].replace('/abs/', '/pdf/', 1) + '.pdf'
# get_text() also works for anchors with nested markup, where .string is None.
a = [x.get_text(strip=True) for x in authors[i].find_all('a')]
# Drop any literal backslash-n sequences (present when the page was parsed from
# the repr of a bytes object) before trimming the surrounding whitespace.
t = titles[i].contents[-1].replace('\\n', '').strip()

ar = Article(t, a, u)
ar

Article {
     title: Class Subset Selection for Transfer Learning using Submodularity
   authors: ['Varun Manjunatha', 'Srikumar Ramalingam', 'Tim K. Marks', 'Larry Davis']
       pdf: http://arxiv.org/pdf/1804.00060.pdf
}