In [1]:
import os
from argparse import Namespace
from tqdm.auto import tqdm

from collections import defaultdict
import pandas as pd

import urllib.request
from bs4 import BeautifulSoup

from nltk.tokenize import RegexpTokenizer

In [2]:
class Article:
    """Simple record for one paper: its title, its authors and a URL."""

    def __init__(self, title, authors, url):
        # Assigned left to right, so attribute order is title, authors, url.
        self.title, self.authors, self.url = title, authors, url

    def __repr__(self):
        """Return a multi-line dump with one ``name: value`` row per attribute.

        Attribute names are right-aligned in a 10-character column and the
        rows are wrapped in ``Article { ... }``.
        """
        rows = []
        for name, value in vars(self).items():
            rows.append(f'{name:>10}: {value}\n')
        return 'Article {\n' + ''.join(rows) + '}'

In [3]:
# Crawl settings.
# `url` has four positional slots, filled in crawl() with:
# field, year, month, max_show.
CFG = Namespace(
    url='http://arxiv.org/list/cs.{}/{}{}?show={}',
    fields=['CV'],
    # Zero-padded two-digit strings: years -> ['18'], months -> ['04'].
    years=list(map('{:0>2d}'.format, range(18, 19))),
    months=list(map('{:0>2d}'.format, range(4, 5))),
    max_show=1000,
)

# Tokenizer built on the regex \w+ : each run of word characters is one token.
# NOTE(review): not referenced by any cell visible in this part of the notebook.
tokenizer = RegexpTokenizer(r'\w+')

In [4]:
def crawl(fields, years, months, max_show=1000, url_template=None):
    """Scrape arXiv listing pages and return one ``Article`` per listed paper.

    One HTTP request is made for every (field, year, month) combination.

    Args:
        fields: iterable of sub-field codes substituted after ``cs.`` in the
            URL, e.g. ``['CV']``.
        years: iterable of year strings as they appear in the URL, e.g. ``['18']``.
        months: iterable of month strings as they appear in the URL, e.g. ``['04']``.
        max_show: value substituted into the ``show=`` query parameter.
        url_template: format string with four positional ``{}`` slots
            (field, year, month, max_show). Defaults to ``CFG.url``.

    Returns:
        list of ``Article`` objects, in page order, grouped by field, then
        year, then month.

    Raises:
        urllib.error.URLError: propagated from ``urlopen`` when a page cannot
            be fetched.
        AssertionError: if a page does not contain the same number of title,
            author and identifier entries.
    """
    if url_template is None:
        url_template = CFG.url

    articles = []

    for field in fields:
        for year in years:
            for month in months:
                query_url = url_template.format(field, year, month, max_show)
                print(f'> retrieving [{query_url}]')

                # Use the response as a context manager so the connection is
                # closed even if reading fails.
                with urllib.request.urlopen(query_url) as uh:
                    raw = uh.read()
                    charset = uh.headers.get_content_charset()

                # Parse the raw bytes. Wrapping them in str() would parse the
                # bytes *repr* instead, turning non-ASCII characters,
                # apostrophes and newlines into backslash escapes.
                # The parser is named explicitly so results do not depend on
                # which optional parsers happen to be installed.
                soup = BeautifulSoup(raw, 'html.parser', from_encoding=charset)

                titles = soup.find_all('div', {'class': 'list-title'})
                authors = soup.find_all('div', {'class': 'list-authors'})
                urls = soup.find_all('span', {'class': 'list-identifier'})

                # The three lists are paired up positionally below, so they
                # must line up one-to-one.
                assert len(titles) == len(authors) == len(urls), (
                    f'mismatched entry counts on {query_url}: '
                    f'{len(titles)} titles, {len(authors)} author lists, '
                    f'{len(urls)} identifiers'
                )

                for title_div, authors_div, id_span in zip(titles, authors, urls):
                    # The last child of the title <div> is taken as the title
                    # text; collapse whitespace runs left by line wrapping.
                    t = ' '.join(str(title_div.contents[-1]).split())
                    a = [x.get_text().strip() for x in authors_div.find_all('a')]
                    # First link in the identifier span is the abstract page;
                    # rewrite only the leading path segment to reach the PDF.
                    href = id_span.find('a')['href']
                    u = 'http://arxiv.org' + href.replace('/abs/', '/pdf/', 1) + '.pdf'

                    articles.append(Article(title=t, authors=a, url=u))

    return articles

In [5]:
# Run the crawl with every setting taken from CFG. max_show is passed
# explicitly so that changing CFG.max_show in the config cell takes effect.
out = crawl(CFG.fields, CFG.years, CFG.months, max_show=CFG.max_show)

> retrieving [http://arxiv.org/list/cs.CV/1804?show=1000]


In [7]:
# Preview the first ten crawled articles (rendered via Article.__repr__).
out[:10]

[Article {
      title: Hierarchical Transfer Convolutional Neural Networks for Image  Classification
    authors: ['Xishuang Dong', 'Hsiang-Huang Wu', 'Yuzhong Yan', 'Lijun Qian']
        url: http://arxiv.org/pdf/1804.00021.pdf
 }, Article {
      title: Class Subset Selection for Transfer Learning using Submodularity
    authors: ['Varun Manjunatha', 'Srikumar Ramalingam', 'Tim K. Marks', 'Larry Davis']
        url: http://arxiv.org/pdf/1804.00060.pdf
 }, Article {
      title: Learning Beyond Human Expertise with Generative Models for Dental  Restorations
    authors: ['Jyh-Jing Hwang', 'Sergei Azernikov', 'Alexei A. Efros', 'Stella X. Yu']
        url: http://arxiv.org/pdf/1804.00064.pdf
 }, Article {
      title: FloorNet: A Unified Framework for Floorplan Reconstruction from 3D Scans
    authors: ['Chen Liu', 'Jiaye Wu', 'Yasutaka Furukawa']
        url: http://arxiv.org/pdf/1804.00090.pdf
 }, Article {
      title: Iterative Learning with Open-set Noisy Labels
    authors: ['Yi