In [2]:
import json
import pandas as pd
import numpy as np

> Run the following cells only if you want to re-parse the Kaggle JSON snapshot; otherwise, skip ahead to the cell that reads the `papers.csv` file.

In [21]:
# Each line of the snapshot is parsed as its own JSON document, so the file is streamed
# line by line instead of being loaded into memory all at once.
def get_metadata(path=None):
    """Lazily yield the raw lines of the metadata file.

    Parameters
    ----------
    path : str or path-like, optional
        File to read. When omitted, the module-level ``data_file`` is used, so
        existing ``get_metadata()`` calls keep working.

    Yields
    ------
    str
        One line of the file at a time, including its trailing newline.
    """
    # An explicit encoding keeps non-ASCII titles intact regardless of the
    # platform's default locale encoding.
    with open(data_file if path is None else path, 'r', encoding='utf-8') as f:
        yield from f

In [22]:
data_file = 'arxiv-metadata-oai-snapshot.json'

# Parallel lists holding the fields retained from the JSON.
# Each list receives one entry per (paper, version) pair.
title = []
pid = []
vers = []
vers_cnt = []
cats = []
vers_date = []

metadata = get_metadata()
for paper in metadata:
    # Tolerate blank lines (e.g. a trailing empty line) instead of failing in json.loads
    if not paper.strip():
        continue
    js = json.loads(paper)
    # A record without a 'versions' entry is treated as having no versions
    vs = js.get('versions') or []
    # Only add a paper if it has more than one version and fewer than five (i.e. 2 to 4)
    if 1 < len(vs) < 5:
        # Emit one row per version; version numbers are 1-based positions in the list
        for number, version in enumerate(vs, start=1):
            title.append(js.get('title'))
            pid.append(js.get('id'))
            vers.append(number)
            vers_cnt.append(len(vs))
            cats.append(js.get('categories'))
            vers_date.append(version.get('created'))

In [23]:
# Assemble the per-version lists into one DataFrame (one row per paper version);
# the keyword order below fixes the column order.
papers = pd.DataFrame(
    dict(
        title=title,
        id=pid,
        categories=cats,
        version=vers,
        total_versions=vers_cnt,
        date=vers_date,
    )
)

In [24]:
# Convert the 'date' column (the raw 'created' strings of each version) to pandas datetimes.
papers['date'] = pd.to_datetime(papers['date'])

In [27]:
# Preview the first rows to sanity-check the extracted columns.
papers.head()

Unnamed: 0,title,id,categories,version,total_versions,date
0,Calculation of prompt diphoton production cros...,704.0001,hep-ph,1,2,2007-04-02 19:18:42+00:00
1,Calculation of prompt diphoton production cros...,704.0001,hep-ph,2,2,2007-07-24 20:10:27+00:00
2,Sparsity-certifying Graph Decompositions,704.0002,math.CO cs.CG,1,2,2007-03-31 02:26:18+00:00
3,Sparsity-certifying Graph Decompositions,704.0002,math.CO cs.CG,2,2,2008-12-13 17:26:00+00:00
4,The evolution of the Earth-Moon system based o...,704.0003,physics.gen-ph,1,3,2007-04-01 20:46:54+00:00


In [28]:
# Persist the parsed table so later runs can skip the JSON parsing;
# index=False keeps the row index out of the file.
papers.to_csv('papers.csv', index=False)

 > Start from here if you skipped reparsing the Kaggle JSON file.

In [4]:
# Pin the text columns to `str` instead of relying on dtype inference. Left to
# inference, numeric-looking identifiers may be parsed as floats (losing leading and
# trailing zeros), and chunked parsing can yield mixed types within a column.
# `low_memory=False` only worked around that mixed-type inference; with the dtypes
# given explicitly it is no longer needed. Dates are handled by `parse_dates`.
papers = pd.read_csv(
    'papers.csv',
    parse_dates=["date"],
    dtype={"title": str, "id": str, "categories": str},
)

In [33]:
# Preview the first rows of the table loaded from papers.csv.
papers.head()

Unnamed: 0,title,id,categories,version,total_versions,date
0,Calculation of prompt diphoton production cros...,704.0001,hep-ph,1,2,2007-04-02 19:18:42+00:00
1,Calculation of prompt diphoton production cros...,704.0001,hep-ph,2,2,2007-07-24 20:10:27+00:00
2,Sparsity-certifying Graph Decompositions,704.0002,math.CO cs.CG,1,2,2007-03-31 02:26:18+00:00
3,Sparsity-certifying Graph Decompositions,704.0002,math.CO cs.CG,2,2,2008-12-13 17:26:00+00:00
4,The evolution of the Earth-Moon system based o...,704.0003,physics.gen-ph,1,3,2007-04-01 20:46:54+00:00


Split the dataset into math and CS papers and save each subset to its own CSV file.

In [3]:
# Load the category listing. The cells below index it by field name
# ("mathematics", "computer science", "physics") and iterate over the entries
# found under each field.
categories_path = 'categories.json'
# The context manager closes the file handle as soon as the JSON has been read.
with open(categories_path, encoding='utf-8') as f:
    categories = json.load(f)

In [4]:
def is_math(x):
    """Return True if any whitespace-separated category in ``x`` equals an
    entry of ``categories["mathematics"]``, otherwise False."""
    return any(
        known == tag
        for tag in x.split()
        for known in categories["mathematics"]
    )

In [56]:
def is_cs(x):
    """Return True if any whitespace-separated category in ``x`` equals an
    entry of ``categories["computer science"]``, otherwise False."""
    return any(
        known == tag
        for tag in x.split()
        for known in categories["computer science"]
    )

In [5]:
def is_phys(x):
    """Return True if any whitespace-separated category in ``x`` equals an
    entry of ``categories["physics"]``, otherwise False."""
    return any(
        known == tag
        for tag in x.split()
        for known in categories["physics"]
    )

In [74]:
# Keep the rows whose category string passes is_math, then renumber them from 0.
math_papers = papers[papers["categories"].map(is_math)].reset_index(drop=True)

In [75]:
# Keep the rows whose category string passes is_cs, then renumber them from 0.
cs_papers = papers[papers["categories"].map(is_cs)].reset_index(drop=True)

In [76]:
# Write the math subset to its own CSV, without the row index.
math_papers.to_csv('math_papers.csv', index=False)

In [77]:
# Write the CS subset to its own CSV, without the row index.
cs_papers.to_csv('cs_papers.csv', index=False)