In [1]:
import os
import time

from argparse import Namespace
from tqdm.auto import tqdm
import random
import pickle

import itertools

import urllib.request
from bs4 import BeautifulSoup

from common import Paper, get_abstract

In [2]:
# Crawl configuration; every tunable of the crawler lives on this namespace.
CFG = Namespace()

# Listing URL template, filled with (field, year, month, max entries per page).
CFG.url      = 'http://arxiv.org/list/cs.{}/{}{}?show={}'
# Sub-category codes inserted after "cs." in the listing URL.
CFG.fields   = ['CL', 'CV', 'IR']
# Two-digit years: '17', '18', '19'.
CFG.years    = ['{:0>2d}'.format(i) for i in range(17, 20)]
# Two-digit months '01'..'12'. The upper bound of range() is exclusive,
# so it must be 13 for December to be included.
CFG.months   = ['{:0>2d}'.format(i) for i in range(1, 13)]
# Value passed as the `show` query parameter of the listing URL.
CFG.max_show = 1000

In [3]:
def crawl(fields: [str], years: [str], months: [str], max_show=1000) -> ([Paper], bool):
    """Crawl the listing page of every (field, year, month) combination.

    For each combination whose pickle file does not exist yet, the listing
    page is fetched, one ``Paper`` is built per entry (title, authors, PDF
    link, abstract) and the list is pickled to
    ``out/papers-{field}-{year}-{month}.pkl``.

    Args:
        fields: sub-category codes substituted into ``CFG.url``.
        years: two-digit year strings.
        months: two-digit month strings.
        max_show: value of the ``show`` query parameter of the listing URL.

    Returns:
        A tuple ``(all_papers, done)``. ``all_papers`` holds only the papers
        crawled during this call (listings already on disk are skipped and
        not reloaded). ``done`` is ``False`` when at least one listing page
        could not be fetched, so the caller knows another pass is needed.

    Raises:
        AssertionError: if the listing page yields differing numbers of
            titles, author lists and identifiers.
    """
    all_papers = []
    # Stays True unless a listing fails to download. A later success must not
    # reset it, otherwise failed listings would be silently dropped.
    done = True

    info = list(itertools.product(fields, years, months))
    random.shuffle(info)

    for field, year, month in info:
        fname = f'out/papers-{field}-{year}-{month}.pkl'

        # An existing pickle marks this listing as already crawled.
        if os.path.exists(fname):
            continue

        query_url = CFG.url.format(field, year, month, max_show)

        try:
            headers = {'User-Agent': 'Mozilla/5.0'}
            req = urllib.request.Request(url=query_url, headers=headers)
            # Close the response deterministically and hand the raw bytes to
            # BeautifulSoup so it decodes them itself. str(bytes) would parse
            # the repr (b'...') and leave escape sequences in the text.
            with urllib.request.urlopen(req) as uh:
                soup = BeautifulSoup(uh.read(), features='lxml')
        except Exception as e:
            print(f'error for {query_url}: {e}')
            done = False
            continue

        titles = soup.findAll('div', {'class': 'list-title'})
        authors = soup.findAll('div', {'class': 'list-authors'})
        urls = soup.findAll('span', {'class': 'list-identifier'})

        # The three lists are zipped entry by entry below, so they must align.
        assert len(titles) == len(authors) == len(urls), \
            f'mismatched listing for {query_url}: ' \
            f'{len(titles)} titles, {len(authors)} authors, {len(urls)} ids'

        papers = []
        entries = zip(titles, authors, urls)
        for title_div, authors_div, id_span in tqdm(entries, total=len(titles),
                                                    desc=f'{field} | {year} | {month} '):
            # The title text is the last child of the div.
            t = title_div.contents[-1].replace('\n', '').strip()
            a = [x.string.strip() for x in authors_div.findAll('a')]

            # Last path component of the first link in the identifier span.
            ref = id_span.contents[0].attrs['href'].split('/')[-1]
            p = f'http://arxiv.org/pdf/{ref}.pdf'
            # NOTE(review): sleep=True presumably throttles requests inside
            # get_abstract; confirm against common.get_abstract.
            abstract = get_abstract(url=f'http://arxiv.org/abs/{ref}', sleep=True)

            papers.append(Paper(field=field, title=t, authors=a, pdf=p, abstract=abstract))

        # Write to a temporary file and rename it into place, so an interrupted
        # dump never leaves a truncated pickle that later counts as "crawled".
        os.makedirs(os.path.dirname(fname), exist_ok=True)
        tmp_fname = fname + '.tmp'
        with open(tmp_fname, 'wb') as fh:
            pickle.dump(papers, fh)
        os.replace(tmp_fname, fname)

        all_papers += papers

    return all_papers, done

### Crawler main

In [4]:
# Seconds to wait before starting another pass when crawl() reports that it
# is not done. Without a pause, persistent failures (network down, rate
# limiting) would make this loop re-issue requests as fast as it can.
RETRY_DELAY_SECONDS = 30

# Repeat full passes until crawl() reports completion. Listings already
# pickled on disk are skipped inside crawl(), so later passes are cheap.
while True:
    _, done = crawl(CFG.fields, CFG.years, CFG.months, CFG.max_show)
    if done:
        break
    time.sleep(RETRY_DELAY_SECONDS)

print('Done')

HBox(children=(IntProgress(value=0, description='IR | 17 | 09 ', max=64, style=ProgressStyle(description_width…




KeyboardInterrupt: 