## Crawl the arXiv papers

In [0]:
import os
import urllib
import urllib.error
import urllib.request

import pandas as pd
from bs4 import BeautifulSoup
from google.colab import drive

In [2]:
# Mount Google Drive (forcing a fresh mount) and point `cwd` at the project folder.
drive.mount('/gdrive', force_remount=True)
cwd = '/gdrive/My Drive/paper-classification'

# The label file is read as whitespace-separated subject codes; each code's
# position in that list becomes its integer class id.
label_path = os.path.join(cwd, 'data/label/cs.subject')
with open(label_path, 'r') as f:
    label_list = f.read().split()
    label_dict = dict(zip(label_list, range(len(label_list))))
print(label_dict)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /gdrive
{'cs.AI': 0, 'cs.CL': 1, 'cs.CC': 2, 'cs.CE': 3, 'cs.C': 4, 'cs.GT': 5, 'cs.CV': 6, 'cs.CY': 7, 'cs.CR': 8, 'cs.DS': 9, 'cs.DB': 10, 'cs.DL': 11, 'cs.DM': 12, 'cs.DC': 13, 'cs.ET': 14, 'cs.FL': 15, 'cs.GL': 16, 'cs.GR': 17, 'cs.AR': 18, 'cs.HC': 19, 'cs.IR': 20, 'cs.IT': 21, 'cs.LO': 22, 'cs.LG': 23, 'cs.MS': 24, 'cs.MA': 25, 'cs.MM': 26, 'cs.NI': 27, 'cs.NE': 28, 'cs.NA': 29, 'cs.OS': 30, 'cs.OH': 31, 'cs.PF': 32, 'cs.PL': 33, 'cs.RO': 34, 'cs.SI': 35, 'cs.SE': 36, 

In [0]:
def crawl_page(yymm, skip):
    """Crawl one arXiv CS listing page and save its papers to a CSV file.

    Fetches ``https://arxiv.org/list/cs/<yymm>?skip=<skip>&show=1000``, then
    requests the page linked from each listing entry to collect the paper's
    title, abstract and the integer id of the first subject that appears in
    the module-level ``label_dict``. The collected rows are written to
    ``<cwd>/data/arxiv/<yymm>-<skip>.csv`` with the columns ``title``,
    ``abstract`` and ``subject``.

    A paper is left out of the CSV when:
      * its page request raises ``urllib.error.HTTPError``,
      * its page lacks the abstract meta tag or the subjects cell, or
      * none of its subjects is a key of ``label_dict``.

    Args:
        yymm: Value inserted into the listing URL path (e.g. ``2002``).
        skip: Value of the listing URL's ``skip`` query parameter; also used
            in the output file name.

    Returns:
        None. The result is the CSV file plus progress messages on stdout.
    """
    columns = ['title', 'abstract', 'subject']

    # Browser-like headers sent with every per-paper request. They do not
    # change between papers, so build them once instead of per iteration.
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
               'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
               'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
               'Accept-Encoding': 'none',
               'Accept-Language': 'en-US,en;q=0.8',
               'Connection': 'keep-alive'}

    url = 'https://arxiv.org/list/cs/{}?skip={}&show=1000'.format(yymm, skip)
    with urllib.request.urlopen(url) as respond:
        listing = BeautifulSoup(respond.read(), 'html.parser')

    titles = listing.find_all('div', {'class': 'list-title'})
    identifiers = listing.find_all('span', {'class': 'list-identifier'})

    papers = []
    # zip() pairs each title with its identifier and stops at the shorter
    # list, so a count mismatch cannot raise IndexError.
    for j, (title_tag, identifier_tag) in enumerate(zip(titles, identifiers)):
        title = title_tag.contents[-1].strip()

        paper_url = 'https://arxiv.org' + identifier_tag.find_all('a')[0].attrs['href']
        req = urllib.request.Request(paper_url, headers=headers)
        try:
            # The context manager closes the connection once the body is read.
            with urllib.request.urlopen(req) as respond:
                html = respond.read()
        except urllib.error.HTTPError:
            print('HTTPError occured in {}-th paper of {}-th page'.format(j, skip))
            continue

        page = BeautifulSoup(html, 'html.parser')
        abstract_tag = page.find('meta', {'property': 'og:description'})
        subjects_tag = page.find('td', {'class': 'tablecell subjects'})
        if abstract_tag is None or subjects_tag is None:
            # Skip just this paper instead of crashing and losing the rows
            # already collected for the page.
            print('Missing abstract or subjects in {}-th paper of {}-th page'.format(j, skip))
            continue
        abstract = abstract_tag.attrs['content']

        # Subjects are ';'-separated; the code is the text inside the last
        # pair of parentheses. The first code known to label_dict becomes the
        # label; a paper with no known code is dropped.
        for subject in subjects_tag.text.split(';'):
            code = subject.split('(')[-1].split(')')[0]
            if code in label_dict:
                papers.append([title, abstract, label_dict[code]])
                break

    # Naming the columns at construction keeps the header correct even when
    # no paper was collected (a 0-column frame cannot take a 3-name header).
    dataframe = pd.DataFrame(papers, columns=columns)
    dataframe.to_csv(os.path.join(cwd, 'data/arxiv/{}-{}.csv'.format(yymm, skip)),
                     index=False,
                     mode='w')

    print('Successfully written: {}-{}.csv'.format(yymm, skip))

In [4]:
'''
for i in range (0, 6000, 1000):
    crawl_page(2003, i*1000)

-> Running the crawl in a for-loop like this keeps raising HTTPError (403 Forbidden).
-> Googling the fix suggests attaching headers, but the error persists even with them.
-> The same thing happens when using multiprocessing.
-> Unrolling the loop as below and crawling one page at a time is the best option.
-> Running several copies of this ipynb at the same time worked fine.

NOTE(review): in the loop above `i` already advances in steps of 1000, so
`i*1000` would pass skip = 0, 1000000, 2000000, ...; presumably
`crawl_page(2003, i)` was intended -- confirm.
'''
# Month to crawl; it is also used by the later cells that merge the CSVs.
yymm = 2002
# One page per run: uncomment exactly the page (skip offset) to fetch next.
crawl_page(yymm, 0)
#crawl_page(yymm, 1000)
#crawl_page(yymm, 2000)
#crawl_page(yymm, 3000)
#crawl_page(yymm, 4000)
#crawl_page(yymm, 5000)

Successfully written: 2002-0.csv


## Concat csv files

In [6]:
import glob
# Collect every path under data/arxiv whose name starts with "<yymm>-"
# and order the paths lexicographically.
page_csv_pattern = os.path.join(cwd, 'data/arxiv/{}-*'.format(yymm))
filelist = glob.glob(page_csv_pattern)
filelist.sort()
print(filelist)

['/gdrive/My Drive/paper-classification/data/arxiv/2002-0.csv', '/gdrive/My Drive/paper-classification/data/arxiv/2002-1000.csv', '/gdrive/My Drive/paper-classification/data/arxiv/2002-2000.csv', '/gdrive/My Drive/paper-classification/data/arxiv/2002-3000.csv', '/gdrive/My Drive/paper-classification/data/arxiv/2002-4000.csv', '/gdrive/My Drive/paper-classification/data/arxiv/2002-5000.csv']


In [0]:
# Load each listed CSV, stack the frames row-wise (each keeps its own row
# index in memory), and write the combined table to "<yymm>.csv" without
# an index column.
datalist = list(map(pd.read_csv, filelist))
dataframe = pd.concat(datalist, axis=0, ignore_index=False)
merged_csv_path = os.path.join(cwd, 'data/arxiv/{}.csv'.format(yymm))
dataframe.to_csv(merged_csv_path, index=False)