In [73]:
import os
from argparse import Namespace
from tqdm.auto import tqdm
import random

from collections import defaultdict
import pandas as pd

import urllib.request
from bs4 import BeautifulSoup

from nltk.tokenize import RegexpTokenizer

In [63]:
def get_fields(domain='cs'):
    """Scrape the subject areas listed on an arXiv archive page.

    Parameters
    ----------
    domain : str
        Archive identifier interpolated into
        ``https://arxiv.org/archive/{domain}`` (default ``'cs'``).

    Returns
    -------
    dict
        Maps each two-letter field code to its description, e.g.
        ``{'AI': 'Artificial Intelligence'}``.

    Raises
    ------
    urllib.error.URLError
        Propagated from ``urlopen`` when the page cannot be fetched.
    IndexError
        If the page has fewer than three ``<ul>`` elements or an entry has
        no ``<b>`` tag.
    """
    # Close the HTTP response deterministically instead of leaking it.
    with urllib.request.urlopen(f'https://arxiv.org/archive/{domain}') as uh:
        raw = uh.read()

    # Feed the raw bytes to BeautifulSoup so it decodes them itself.
    # str(bytes) would parse the *repr* of the payload (b'...' prefix,
    # literal "\n", "\'" and "\xNN" escape sequences) instead of the page.
    # Naming the parser keeps the result independent of what is installed.
    soup = BeautifulSoup(raw, 'html.parser')

    # NOTE(review): assumes the third <ul> on the page is the subject list
    # and that each entry's first <b> reads "<domain>.<XX>" followed by a
    # three-character separator and the description -- confirm against the
    # live page layout.
    start = len(domain) + 1  # skip "<domain>."; equals 3 for 'cs'

    fields = {}
    for x in soup.findAll('ul')[2].findAll('li'):
        k = x.findAll('b')[0].text
        field, desc = k[start:start + 2], k[start + 5:]
        fields[field] = desc

    return fields

get_fields('cs')

{'AI': 'Artificial Intelligence',
 'CL': 'Computation and Language',
 'CC': 'Computational Complexity',
 'CE': 'Computational Engineering, Finance, and Science',
 'CG': 'Computational Geometry',
 'GT': 'Computer Science and Game Theory',
 'CV': 'Computer Vision and Pattern Recognition',
 'CY': 'Computers and Society',
 'CR': 'Cryptography and Security',
 'DS': 'Data Structures and Algorithms',
 'DB': 'Databases',
 'DL': 'Digital Libraries',
 'DM': 'Discrete Mathematics',
 'DC': 'Distributed, Parallel, and Cluster Computing',
 'ET': 'Emerging Technologies',
 'FL': 'Formal Languages and Automata Theory',
 'GL': 'General Literature',
 'GR': 'Graphics',
 'AR': 'Hardware Architecture',
 'HC': 'Human-Computer Interaction',
 'IR': 'Information Retrieval',
 'IT': 'Information Theory',
 'LO': 'Logic in Computer Science',
 'LG': 'Machine Learning',
 'MS': 'Mathematical Software',
 'MA': 'Multiagent Systems',
 'MM': 'Multimedia',
 'NI': 'Networking and Internet Architecture',
 'NE': 'Neural and E

In [67]:
class Article:
    """Record describing one crawled paper.

    Stores ``field``, ``title``, ``authors`` and ``url`` as plain instance
    attributes, in that order.
    """

    def __init__(self, field, title, authors, url):
        # Single assignment; attributes are still created left to right.
        self.field, self.title, self.authors, self.url = field, title, authors, url

    def __repr__(self):
        """Return a brace-delimited listing with one attribute per line.

        Attribute names are right-aligned in a 10-character column.
        """
        pieces = ['Article {\n']
        for name, value in vars(self).items():
            pieces.append(f'{name:>10}: {value}\n')
        pieces.append('}')
        return ''.join(pieces)

In [68]:
# Crawl settings gathered in one namespace and read back as attributes.
CFG = Namespace(
    # Listing-page template; crawl() fills the slots with
    # (field, year, month, page size).
    url='http://arxiv.org/list/cs.{}/{}{}?show={}',
    fields=['AI', 'CL', 'IR'],
    # Zero-padded two-digit strings: a single year '18' and month '04'.
    years=[*map('{:0>2d}'.format, range(18, 19))],
    months=[*map('{:0>2d}'.format, range(4, 5))],
    max_show=1000,
)

# Regex-based tokenizer configured with the pattern r'\w+'. No other cell
# visible in this part of the notebook references it.
tokenizer = RegexpTokenizer(r'\w+')

In [69]:
def crawl(fields, years, months, max_show=1000):
    """Download arXiv listing pages and collect one ``Article`` per entry.

    One request is issued per (field, year, month) combination. The URL is
    built from the module-level ``CFG.url`` template.

    Parameters
    ----------
    fields : iterable of str
        Field codes substituted into the URL template (e.g. ``'AI'``).
    years, months : iterable of str
        Year and month strings substituted into the URL template.
    max_show : int
        Value of the ``show`` query parameter (default 1000).

    Returns
    -------
    list of Article
        Each carries the field, title, list of author names and a PDF URL.

    Raises
    ------
    urllib.error.URLError
        Propagated from ``urlopen`` when a page cannot be fetched.
    AssertionError
        If a page yields differing numbers of titles, author blocks and
        identifiers.
    """
    articles = []

    for field in fields:
        for year in years:
            for month in months:
                query_url = CFG.url.format(field, year, month, max_show)
                print(f'> retrieving [{query_url}]')

                # Close each response as soon as its body has been read.
                with urllib.request.urlopen(query_url) as uh:
                    raw = uh.read()

                # Parse the raw bytes, not str(bytes). The repr form injects
                # literal "\n", "\'" and "\xNN" escapes into titles and
                # author names. The parser is named for determinism.
                soup = BeautifulSoup(raw, 'html.parser')

                titles = soup.findAll('div', {'class': 'list-title'})
                authors = soup.findAll('div', {'class': 'list-authors'})
                urls = soup.findAll('span', {'class': 'list-identifier'})

                # The three result lists are paired by position below.
                assert len(titles) == len(authors) == len(urls), \
                    f'mismatched listing columns for {query_url}'

                for title_div, authors_div, id_span in zip(titles, authors, urls):
                    # The title text is the last child of the div; collapse
                    # line breaks and runs of whitespace into single spaces.
                    t = ' '.join(title_div.contents[-1].split())
                    # get_text() also handles anchors with nested markup,
                    # where .string would be None.
                    a = [x.get_text().strip() for x in authors_div.findAll('a')]
                    # First child of the identifier span is taken to be the
                    # abstract link; rewrite it into the PDF location.
                    u = 'http://arxiv.org' + id_span.contents[0].attrs['href'].replace('abs', 'pdf') + '.pdf'

                    articles.append(Article(field=field, title=t, authors=a, url=u))

    return articles

In [70]:
# Pass the configured page size explicitly. Without it the crawl silently
# uses crawl()'s own default and ignores any change to CFG.max_show.
out = crawl(CFG.fields, CFG.years, CFG.months, max_show=CFG.max_show)

> retrieving [http://arxiv.org/list/cs.AI/1804?show=1000]
> retrieving [http://arxiv.org/list/cs.CL/1804?show=1000]
> retrieving [http://arxiv.org/list/cs.IR/1804?show=1000]


In [78]:
# Shuffle with a dedicated fixed-seed generator so a fresh run of the
# notebook yields the same row order, without touching the global random
# state. `out` is still shuffled in place, as before.
random.Random(0).shuffle(out)

# One row per article; columns follow the Article attribute order.
df = pd.DataFrame([vars(a) for a in out])
df

Unnamed: 0,field,title,authors,url
0,CL,Word Embedding Perturbation for Sentence Class...,"[Dongxu Zhang, Zhichao Yang]",http://arxiv.org/pdf/1804.08166.pdf
1,CL,A Tree Search Algorithm for Sequence Labeling,"[Yadi Lao, Jun Xu, Yanyan Lan, Jiafeng Guo, Sh...",http://arxiv.org/pdf/1804.10911.pdf
2,CL,Real Time Sentiment Change Detection of Twitte...,"[Sotiris K. Tasoulis, Aristidis G. Vrahatis, S...",http://arxiv.org/pdf/1804.00482.pdf
3,AI,Object Ordering with Bidirectional Matchings f...,"[Hao Tan, Mohit Bansal]",http://arxiv.org/pdf/1804.06870.pdf
4,AI,Learning to Run challenge: Synthesizing physio...,"[Łukasz Kidziński, Sharada P. Mohanty, Carmich...",http://arxiv.org/pdf/1804.00198.pdf
...,...,...,...,...
775,AI,Swarm robotics in wireless distributed protoco...,"[F. De Rango, N. Palmieri, X.S. Yang, S. Marano]",http://arxiv.org/pdf/1804.08096.pdf
776,AI,Compositional Obverter Communication Learning ...,"[Edward Choi, Angeliki Lazaridou, Nando de Fre...",http://arxiv.org/pdf/1804.02341.pdf
777,CL,Commonsense mining as knowledge base completio...,"[Stanisław Jastrzębski, Dzmitry Bahdanau, Seye...",http://arxiv.org/pdf/1804.09259.pdf
778,CL,Demo of Sanskrit-Hindi SMT System,"[Rajneesh Pandey, Atul Kr. Ojha, Girish Nath Jha]",http://arxiv.org/pdf/1804.06716.pdf
