# Load data

arXiv.org submitters. (2024). arXiv Dataset [Data set]. Kaggle. https://doi.org/10.34740/KAGGLE/DSV/7548853

* Download the data manually, put it in folder `data`.
* Load all the data into a dataframe

In [101]:
import pandas as pd
import json

frames = []
max_bytes = 1024 * 1024 * 10  # max bytes to read from file
# one json per line
with open('data/arxiv-metadata-oai-snapshot.json') as json_file:    
    lines = json_file.readlines(max_bytes)
    for line in lines:
        data = json.loads(line)
        frames.append(pd.json_normalize(data))
arxiv_df = pd.concat(frames)  

In [102]:
# arxiv_df.head(3)

## Data cleanup
* Keep paper only if journal-ref is not None
* save compressed csv

In [103]:
total = arxiv_df.shape[0]
idx = arxiv_df['journal-ref'].isna()
arxiv_df = arxiv_df[~idx]
print(f"The cleaned up data has {arxiv_df.shape[0]} entries. {total - arxiv_df.shape[0]} entries were discarded.")

The cleaned up data has 3951 entries. 3243 entries were discarded.


Add a column with a less specific category, e.g. "physics.gen-ph" -> "physics"

See: https://arxiv.org/category_taxonomy

In [106]:
gen_categories = []  # the categories for all entries
for categories in arxiv_df['categories']:
    categories = categories.split()
    entry_categories = []  # the categorie(s) for this publication
    for category in categories:
        entry_category = category
        if "cs." in category: entry_categories.append("Computer Science")
        elif "econ." in category: entry_categories.append("Economics")
        elif "eess." in category: entry_categories.append("Electrical Engineering and Systems Science")
        elif "math." in category: 
            if "math.GM" in category: entry_categories.append("Junk")  # General Mathematics is a bin for papers that are obviously wrong
            else: entry_categories.append("Mathematics")
        elif "physics.gen-ph" in category: entry_categories.append("Junk")  # General Physics is a bin for papers that are obviously wrong
        elif ("astro-ph" in category) or ("cond-mat." in category) or ("gr-qc" in category) or \
             ("hep-" in category) or ("math-ph" in category) or ("nlin." in category)  or ("nucl-" in category) or \
             ("physics." in category) or ("quant-ph" in category): entry_categories.append("Physics")
        elif "q-bio." in category: entry_categories.append("Quantitative Biology")
        elif "q-fin" in category: entry_categories.append("Quantitative Finance")
        elif "stat." in category: entry_categories.append("Statistics")
        else: entry_categories.append(category)
    entry_categories = list(set(entry_categories))
    gen_categories.append(entry_categories)
gen_categories = pd.Series(gen_categories)

One-hot encode the general categories

In [105]:
one_hot = gen_categories.str.join('|').str.get_dummies()
arxiv_df = arxiv_df.join(one_hot)

In [112]:
gen_categories[:20]

0                                             [Physics]
1                                         [Mathematics]
2                                             [Physics]
3                                             [Physics]
4                                             [Physics]
5                                             [Physics]
6                                             [Physics]
7                                             [Physics]
8                                         [Mathematics]
9                                             [Physics]
10    [Quantitative Biology, Computer Science, Physics]
11                                            [Physics]
12                                            [Physics]
13                                        [Mathematics]
14                                            [Physics]
15                                            [Physics]
16                                            [Physics]
17                                            [P

In [113]:
one_hot[:20]

Unnamed: 0,Computer Science,Economics,Junk,Mathematics,Physics,Quantitative Biology,Quantitative Finance,Statistics
0,0,0,0,0,1,0,0,0
1,0,0,0,1,0,0,0,0
2,0,0,0,0,1,0,0,0
3,0,0,0,0,1,0,0,0
4,0,0,0,0,1,0,0,0
5,0,0,0,0,1,0,0,0
6,0,0,0,0,1,0,0,0
7,0,0,0,0,1,0,0,0
8,0,0,0,1,0,0,0,0
9,0,0,0,0,1,0,0,0


In [127]:
arxiv_df.join(one_hot)['Mathematics'].sum()

ValueError: columns overlap but no suffix specified: Index(['Computer Science', 'Economics', 'Junk', 'Mathematics', 'Physics',
       'Quantitative Biology', 'Quantitative Finance', 'Statistics'],
      dtype='object')

Save as compressed CSV

In [8]:
import zipfile as zf

with zf.ZipFile('data/arxiv.zip', 'w') as ziparchive:
    ziparchive.writestr('arxiv.csv', arxiv_df.to_csv())

Metadata EDA

Published papers by category

In [9]:
counts = arxiv_df['categories'].value_counts()

In [10]:
counts.shape

(3196,)