In [1]:
import os
from argparse import Namespace
from tqdm.auto import tqdm
import random

from collections import defaultdict
import pandas as pd

import urllib.request
from bs4 import BeautifulSoup

from nltk.tokenize import RegexpTokenizer

In [2]:
def get_fields(domain='cs'):
    fields = {}

    uh = urllib.request.urlopen(f'https://arxiv.org/archive/{domain}')
    soup = BeautifulSoup(str(uh.read()))

    for x in soup.findAll('ul')[2].findAll('li'):
        k = x.findAll('b')[0].text
        field, desc = k[3:5], k[8:]
        fields[field] = desc

    return fields

def get_abstract(url):
    uh = urllib.request.urlopen(url)
    soup = BeautifulSoup(str(uh.read()))
    a = soup.findAll('blockquote', {'class': 'abstract mathjax'})[0].text
    a = a.replace('\\n', ' ')
    a = a.replace('Abstract:  ', '')
    return a

get_fields('cs')

{'AI': 'Artificial Intelligence',
 'CL': 'Computation and Language',
 'CC': 'Computational Complexity',
 'CE': 'Computational Engineering, Finance, and Science',
 'CG': 'Computational Geometry',
 'GT': 'Computer Science and Game Theory',
 'CV': 'Computer Vision and Pattern Recognition',
 'CY': 'Computers and Society',
 'CR': 'Cryptography and Security',
 'DS': 'Data Structures and Algorithms',
 'DB': 'Databases',
 'DL': 'Digital Libraries',
 'DM': 'Discrete Mathematics',
 'DC': 'Distributed, Parallel, and Cluster Computing',
 'ET': 'Emerging Technologies',
 'FL': 'Formal Languages and Automata Theory',
 'GL': 'General Literature',
 'GR': 'Graphics',
 'AR': 'Hardware Architecture',
 'HC': 'Human-Computer Interaction',
 'IR': 'Information Retrieval',
 'IT': 'Information Theory',
 'LO': 'Logic in Computer Science',
 'LG': 'Machine Learning',
 'MS': 'Mathematical Software',
 'MA': 'Multiagent Systems',
 'MM': 'Multimedia',
 'NI': 'Networking and Internet Architecture',
 'NE': 'Neural and E

In [3]:
class Article:
    def __init__(self, field, title, authors, pdf, abstract):
        self.field = field
        self.title = title
        self.authors = authors
        self.pdf = pdf
        self.abstract = abstract # TODO
        
    def __repr__(self):
        s = 'Article {\n'
        s += ''.join([f'{k:>10}: {v}\n' for k, v in self.__dict__.items()])
        s += '}'
        return s

In [4]:
CFG = Namespace()

CFG.url      = 'http://arxiv.org/list/cs.{}/{}{}?show={}'
CFG.fields   = ['AI', 'CL', 'IR'][:1]
CFG.years    = ['{:0>2d}'.format(i) for i in range(18, 19)]
CFG.months   = ['{:0>2d}'.format(i+1) for i in range(3, 4)]
CFG.max_show = 1000

tokenizer = RegexpTokenizer(r'\w+')

In [5]:
def crawl(fields, years, months, max_show=1000):
    articles = []
    
    for field in fields:
        for year in years:
            for month in months:
                query_url = CFG.url.format(field, year, month, max_show)
                print(f'> retrieving [{query_url}]')
                uh = urllib.request.urlopen(query_url)
                soup = BeautifulSoup(str(uh.read()))
                
                titles = soup.findAll('div', {'class': 'list-title'})
                authors = soup.findAll('div', {'class': 'list-authors'})
                urls = soup.findAll('span', {'class': 'list-identifier'})
                
                assert len(titles) == len(authors) == len(urls)
                
                for i in range(len(titles)):
                    t = titles[i].contents[-1].strip().replace('\\n', '')
                    a = [x.string.strip() for x in authors[i].findAll('a')]
                    
                    ref = urls[i].contents[0].attrs['href'].split('/')[-1]
                    
                    p = f'http://arxiv.org/pdf/{ref}.pdf'
                    abstract = get_abstract(url=f'http://arxiv.org/abs/{ref}')
                    
                    articles.append(Article(field=field, title=t, authors=a, pdf=p, abstract=abstract))

    return articles

In [6]:
out = crawl(CFG.fields, CFG.years, CFG.months, max_show=5)

> retrieving [http://arxiv.org/list/cs.AI/1804?show=5]


In [7]:
random.shuffle(out)
df = pd.DataFrame([a.__dict__ for a in out])
df

Unnamed: 0,field,title,authors,pdf,abstract
0,AI,Modeling Individual Differences in Game Behavi...,"[Sara Bunian, Alessandro Canossa, Randy Colvin...",http://arxiv.org/pdf/1804.00245.pdf,Player modeling is an important concept that h...
1,AI,Learning to Run challenge: Synthesizing physio...,"[Łukasz Kidziński, Sharada P. Mohanty, Carmich...",http://arxiv.org/pdf/1804.00198.pdf,Synthesizing physiologically-accurate human mo...
2,AI,Learning to Navigate in Cities Without a Map,"[Piotr Mirowski, Matthew Koichi Grimes, Mateus...",http://arxiv.org/pdf/1804.00168.pdf,Navigating through unstructured environments i...
3,AI,Overview: A Hierarchical Framework for Plan Ge...,"[Hang Ma, Wolfgang Hönig, Liron Cohen, Tansel ...",http://arxiv.org/pdf/1804.00038.pdf,The authors present an overview of a hierarchi...
4,AI,Efficient Encodings of Conditional Cardinality...,"[Abdelhamid Boudane, Said Jabbour, Badran Radd...",http://arxiv.org/pdf/1804.00211.pdf,In the encoding of many real-world problems to...


In [8]:
out[:2]

[Article {
      field: AI
      title: Modeling Individual Differences in Game Behavior using HMM
    authors: ['Sara Bunian', 'Alessandro Canossa', 'Randy Colvin', 'Magy Seif El-Nasr']
        pdf: http://arxiv.org/pdf/1804.00245.pdf
   abstract: Player modeling is an important concept that has gained much attention in game research due to its utility in developing adaptive techniques to target better designs for engagement and retention. Previous work has explored modeling individual differences using machine learning algorithms per- formed on aggregated game actions. However, players\' individual differences may be better manifested through sequential patterns of the in-game player\'s actions. While few works have explored sequential analysis of player data, none have explored the use of Hidden Markov Models (HMM) to model individual differences, which is the topic of this paper. In par- ticular, we developed a modeling approach using data col- lected from players playing a Role-Pl