# Load metadata

arXiv.org submitters. (2024). arXiv Dataset [Data set]. Kaggle. https://doi.org/10.34740/KAGGLE/DSV/7548853

* Download the data manually and put it in the `data` folder (the code below expects `data/arxiv-metadata-oai-snapshot.json`).
* Drop abstracts and comments (to save memory)
* Load all the data into a dataframe

In [1]:
%%time
import pandas as pd
import json

# Load the arXiv metadata snapshot (one JSON object per line) into `arxiv_df`.
#
# The bulky free-text fields are removed from each record *before* any DataFrame
# is built, and the DataFrame is created once from the full list of records.
# Building one DataFrame per line and concatenating millions of them is orders of
# magnitude slower and uses far more memory.
dropped_fields = ('abstract', 'comments')  # large text fields that are not needed later
max_bytes = -1  # 1024 * 1024 * 10  # max bytes to read from file (-1 reads the whole file)

records = []
# one json per line
with open('data/arxiv-metadata-oai-snapshot.json', encoding='utf-8') as json_file:
    print("Reading file")
    lines = json_file.readlines(max_bytes)
    line_count = len(lines)
    for counter, line in enumerate(lines, start=1):
        if not line.strip():
            continue  # tolerate blank lines instead of failing in json.loads
        data = json.loads(line)
        for field in dropped_fields:
            data.pop(field, None)  # a missing field is fine; nothing to drop
        records.append(data)
        if counter % 100000 == 0: print(f"Processed {counter} / {line_count} lines")

# A single normalization pass over all records; yields an empty frame if nothing was read.
arxiv_df = pd.json_normalize(records)

Reading file


KeyboardInterrupt: 

## Data cleanup
* ~~Keep paper only if journal-ref is not None~~

In [2]:
# Disabled step, kept for reference (matches the struck-through bullet above).
# If re-enabled, it would keep only the rows whose 'journal-ref' is not null and
# print how many entries were discarded.
#total = arxiv_df.shape[0]
#idx = arxiv_df['journal-ref'].isna()
#arxiv_df = arxiv_df[~idx]
#arxiv_df = arxiv_df.reset_index()
#print(f"The cleaned up data has {arxiv_df.shape[0]} entries. {total - arxiv_df.shape[0]} entries were discarded.")

### Add a created date
Extract the date of the first version from the `versions` column and add it as a `created` column, together with derived `year` and `month` columns.

In [3]:
# Each entry of 'versions' is a list of version records; take the 'created'
# timestamp string of the first one (the initial submission, per the note above).
created = [versions[0]['created'] for versions in arxiv_df['versions']]
arxiv_df['created'] = pd.DatetimeIndex(created)

# Derive calendar parts with the vectorized .dt accessor rather than a Python
# loop over every Timestamp (same values, far faster on millions of rows).
arxiv_df['year'] = arxiv_df['created'].dt.year
arxiv_df['month'] = arxiv_df['created'].dt.month

NameError: name 'arxiv_df' is not defined

## Category
Add a column with a less specific category, e.g. "physics.gen-ph" -> "physics"

See: https://arxiv.org/category_taxonomy

Note that "math.GM" and "physics.gen-ph" are junk categories.

In [None]:
# Map every arXiv category of every paper to a general subject area.
#
# Matching is done on the *archive* (the text before the first '.'), not on
# substrings. Substring tests misclassify: "cs." is contained in "physics.",
# so every "physics.*" category would be labelled "Computer Science" and the
# "physics.gen-ph" junk rule would never fire.

# Categories that act as a bin for papers that are obviously wrong.
JUNK_CATEGORIES = {"math.GM", "physics.gen-ph"}

# Archive -> general subject area. Includes legacy archives that arXiv has
# since folded into the current taxonomy.
ARCHIVE_TO_GENERAL = {
    # Computer Science
    "cs": "Computer Science",
    "cmp-lg": "Computer Science",
    # Economics
    "econ": "Economics",
    # Electrical Engineering and Systems Science
    "eess": "Electrical Engineering and Systems Science",
    # Mathematics
    "math": "Mathematics",
    "alg-geom": "Mathematics",
    "dg-ga": "Mathematics",
    "funct-an": "Mathematics",
    "q-alg": "Mathematics",
    # Physics
    "astro-ph": "Physics",
    "cond-mat": "Physics",
    "gr-qc": "Physics",
    "math-ph": "Physics",
    "nlin": "Physics",
    "physics": "Physics",
    "quant-ph": "Physics",
    "acc-phys": "Physics",
    "adap-org": "Physics",
    "ao-sci": "Physics",
    "atom-ph": "Physics",
    "bayes-an": "Physics",
    "chao-dyn": "Physics",
    "chem-ph": "Physics",
    "comp-gas": "Physics",
    "mtrl-th": "Physics",
    "patt-sol": "Physics",
    "plasm-ph": "Physics",
    "solv-int": "Physics",
    "supr-con": "Physics",  # superconductivity: a condensed-matter archive, not biology
    # Quantitative Biology
    "q-bio": "Quantitative Biology",
    # Quantitative Finance
    "q-fin": "Quantitative Finance",
    # Statistics
    "stat": "Statistics",
}

# Archive families that share a prefix (hep-th, hep-ph, ..., nucl-th, nucl-ex).
PHYSICS_ARCHIVE_PREFIXES = ("hep-", "nucl-")


def general_category(category):
    """Return the general subject area for a single arXiv category string.

    Args:
        category: One arXiv category, e.g. "physics.optics" or "hep-th".

    Returns:
        "Junk" for the junk bins, the general area name for known archives,
        or the category unchanged when its archive is not recognised.
    """
    if category in JUNK_CATEGORIES:
        return "Junk"
    archive = category.split(".", 1)[0]
    if archive.startswith(PHYSICS_ARCHIVE_PREFIXES):
        return "Physics"
    return ARCHIVE_TO_GENERAL.get(archive, category)


# One de-duplicated list of general categories per paper; 'categories' holds
# whitespace-separated category strings. Sorting makes the lists deterministic.
gen_categories = pd.Series(
    [
        sorted({general_category(category) for category in categories.split()})
        for categories in arxiv_df['categories']
    ]
)

One-hot encode the general categories

In [None]:
# Collapse each paper's list of general categories into one '|'-delimited
# string, then expand those strings into one 0/1 indicator column per category.
category_strings = gen_categories.str.join('|')
one_hot = category_strings.str.get_dummies(sep='|')

# Index-aligned left join: attaches the indicator columns to the matching rows.
arxiv_df = arxiv_df.join(one_hot, how='left')

In [None]:
# Preview the first three rows as a quick sanity check of the columns added so far.
arxiv_df.head(n=3)

In [None]:
# List every column now present in the frame.
arxiv_df.columns