## Crawl the arXiv papers

In [0]:
import os
import urllib
import urllib.error
import urllib.request

import pandas as pd
from bs4 import BeautifulSoup
from google.colab import drive

In [0]:
def crawl_page(field, yymm, skip, label_dict):
    """Crawl one arXiv listing page and save the labelled papers as a CSV file.

    Fetches ``https://arxiv.org/list/{field}/{yymm}?skip={skip}&show=1000``,
    then visits every listed paper's own page to read its abstract and its
    subjects. A paper is kept only when one of its subject codes (the text
    inside the parentheses of each ``;``-separated subject) is a key of
    ``label_dict``; the first matching code, in page order, gives the label.

    The rows are written to ``data/arxiv/{field}/{yymm}-{skip}.csv`` under the
    module-level ``cwd`` directory with the columns ``title``, ``abstract``
    and ``subject``. The file is written (header only) even when no paper
    was kept.

    Args:
        field: arXiv archive name used in the listing URL (e.g. ``'cs'``).
        yymm: Year/month of the listing as used in the URL (e.g. ``1910``).
        skip: Offset of the first listing entry to fetch.
        label_dict: Mapping from subject code to the label stored in the CSV.

    Raises:
        urllib.error.URLError: If the listing page itself cannot be fetched.
            An ``HTTPError`` on an individual paper page is reported and that
            paper is skipped instead.
    """
    columns = ['title', 'abstract', 'subject']
    url = f'https://arxiv.org/list/{field}/{yymm}?skip={skip}&show=1000'

    with urllib.request.urlopen(url) as respond:
        listing = BeautifulSoup(respond.read(), 'html.parser')

    titles = listing.find_all('div', {'class': 'list-title'})
    paper_urls = listing.find_all('span', {'class': 'list-identifier'})

    # The request headers are the same for every paper, so build them once.
    headers = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'}

    papers = []
    for i, (title_tag, id_tag) in enumerate(zip(titles, paper_urls)):
        # The title text is the last child of the 'list-title' div.
        title = title_tag.contents[-1].strip()

        paper_url = 'https://arxiv.org' + id_tag.find_all('a')[0].attrs['href']
        req = urllib.request.Request(paper_url, headers=headers)
        try:
            # The context manager closes the connection once the body is read.
            with urllib.request.urlopen(req) as respond:
                html = respond.read()
        except urllib.error.HTTPError:
            print(f'HTTPError occured: {i}-th paper of skip={skip}&show=1000')
            continue

        page = BeautifulSoup(html, 'html.parser')
        abstract_tag = page.find('meta', {'property': 'og:description'})
        subjects_cell = page.find('td', {'class': 'tablecell subjects'})
        if abstract_tag is None or subjects_cell is None:
            # find() returns None when the element is absent; skip the paper
            # rather than aborting the whole crawl with an AttributeError.
            print(f'Missing metadata: {i}-th paper of skip={skip}&show=1000')
            continue
        abstract = abstract_tag.attrs['content']

        # Keep the paper under the first subject code that has a known label;
        # papers with no known subject are dropped.
        for subject in subjects_cell.text.split(';'):
            code = subject.split('(')[-1].split(')')[0]
            if code in label_dict:
                papers.append([title, abstract, label_dict[code]])
                break

    out_path = os.path.join(cwd, f'data/arxiv/{field}/{yymm}-{skip}.csv')
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

    # Naming the columns up front keeps to_csv working when `papers` is empty
    # (a column-less frame cannot be written with a three-name header).
    dataframe = pd.DataFrame(papers, columns=columns)
    dataframe.to_csv(out_path, index=False, mode='w')

    print(f'Successfully written: {field}/{yymm}-{skip}.csv')

In [0]:
# Mount Google Drive and point `cwd` at the project folder stored on it.
drive.mount('/gdrive', force_remount=True)
cwd = '/gdrive/My Drive/paper-classification'

# Which listing page to crawl: archive, year/month and entry offset.
field, yymm, skip = 'cs', 1910, 0

# The label file holds whitespace-separated subject codes; a code's position
# in the file becomes its integer label.
label_path = os.path.join(cwd, f'data/label/{field}.subject')
with open(label_path, 'r') as f:
    label_list = f.read().split()
label_dict = dict(zip(label_list, range(len(label_list))))

crawl_page(field, yymm, skip, label_dict)

## Concat csv files (same yymm)

In [0]:
import glob

# Gather every per-page CSV crawled for this archive and month, in sorted
# path order, and show which files were picked up.
monthly_pattern = os.path.join(cwd, f'data/arxiv/{field}/{yymm}-*')
filelist = glob.glob(monthly_pattern)
filelist.sort()
print(filelist)

# Stack the pages row-wise into one frame and store it as the monthly file.
datalist = list(map(pd.read_csv, filelist))
dataframe = pd.concat(datalist, axis=0, ignore_index=False)
monthly_csv = os.path.join(cwd, f'data/arxiv/{field}-{yymm}.csv')
dataframe.to_csv(monthly_csv, index=False)

['/gdrive/My Drive/paper-classification/data/arxiv/new/1911-0.csv', '/gdrive/My Drive/paper-classification/data/arxiv/new/1911-1000.csv', '/gdrive/My Drive/paper-classification/data/arxiv/new/1911-2000.csv', '/gdrive/My Drive/paper-classification/data/arxiv/new/1911-3000.csv', '/gdrive/My Drive/paper-classification/data/arxiv/new/1911-4000.csv', '/gdrive/My Drive/paper-classification/data/arxiv/new/1911-5000.csv']


## Concat csv files (same field)

In [0]:
import glob

# Merge every CSV found in the archive's directory into a single data.csv.
#
# The merged file is written into the same directory that is globbed, so it
# has to be left out of the inputs: otherwise re-running this cell would read
# the previous data.csv back in and duplicate every row.
output_name = 'data.csv'
output_path = os.path.join(cwd, f'data/arxiv/{field}/{output_name}')

filelist = sorted(
    path
    for path in glob.glob(os.path.join(cwd, f'data/arxiv/{field}/*'))
    if os.path.basename(path) != output_name
)
print(filelist)

datalist = [pd.read_csv(f) for f in filelist]
dataframe = pd.concat(datalist, axis=0, ignore_index=False)
dataframe.to_csv(output_path, index=False)

['/gdrive/My Drive/paper-classification/data/arxiv/new/1910.csv', '/gdrive/My Drive/paper-classification/data/arxiv/new/1911.csv', '/gdrive/My Drive/paper-classification/data/arxiv/new/1912.csv', '/gdrive/My Drive/paper-classification/data/arxiv/new/2001.csv', '/gdrive/My Drive/paper-classification/data/arxiv/new/2002.csv', '/gdrive/My Drive/paper-classification/data/arxiv/new/2003.csv']
