# Load metadata

arXiv.org submitters. (2024). arXiv Dataset [Data set]. Kaggle. https://doi.org/10.34740/KAGGLE/DSV/7548853

* Download the data manually, put it in folder `data`.
* Load all the data into a dataframe

In [1]:
import pandas as pd
import json

frames = []
max_bytes = -1  # 1024 * 1024 * 10  # max bytes to read from file
# one json per line
with open('data/arxiv-metadata-oai-snapshot.json') as json_file:    
    lines = json_file.readlines(max_bytes)
    for line in lines:
        data = json.loads(line)
#        data['entry_id'] = data.pop('id')  # rename 'id' to 'entry_id'
        frames.append(pd.json_normalize(data))
arxiv_df = pd.concat(frames, ignore_index=True)  

## Data cleanup
* ~~Keep paper only if journal-ref is not None~~
* Drop abstracts (to save memory)

In [2]:
#total = arxiv_df.shape[0]
#idx = arxiv_df['journal-ref'].isna()
#arxiv_df = arxiv_df[~idx]
#arxiv_df = arxiv_df.reset_index()
#print(f"The cleaned up data has {arxiv_df.shape[0]} entries. {total - arxiv_df.shape[0]} entries were discarded.")

In [3]:
arxiv_df = arxiv_df.drop('abstract', axis=1)

### Add a created date
Extract date of first versin from the version column, add a 'created' date column.

In [4]:
created = [version[0]['created'] for version in arxiv_df['versions']]
arxiv_df['created'] = pd.DatetimeIndex(created)
arxiv_df['year'] = [datetime.year for datetime in arxiv_df['created']]
arxiv_df['month'] = [datetime.month for datetime in arxiv_df['created']]

## Category
Add a column with a less specific category, e.g. "physics.gen-ph" -> "physics"

See: https://arxiv.org/category_taxonomy

Note that "math.GM" and "physics.gen-ph" are junk categories.

In [5]:
gen_categories = []  # the categories for all entries
for categories in arxiv_df['categories']:
    categories = categories.split()
    entry_categories = []  # the categorie(s) for this publication
    for category in categories:
        entry_category = category
        if ("cs." in category) or ("cmp-lg" in category): entry_categories.append("Computer Science")
        elif "econ." in category: entry_categories.append("Economics")
        elif "eess." in category: entry_categories.append("Electrical Engineering and Systems Science")
        elif ("math." in category) or ("alg-geom" in category) or ("dg-ga" in category) or ("funct-an" in category) or ("dg-ga" in category) or ("q-alg" in category): 
            if "math.GM" in category: entry_categories.append("Junk")  # General Mathematics is a bin for papers that are obviously wrong
            else: entry_categories.append("Mathematics")
        elif "physics.gen-ph" in category: entry_categories.append("Junk")  # General Physics is a bin for papers that are obviously wrong
        elif ("astro-ph" in category) or ("cond-mat." in category) or ("gr-qc" in category) or \
             ("hep-" in category) or ("math-ph" in category) or ("nlin." in category)  or ("nucl-" in category) or \
             ("physics." in category) or ("quant-ph" in category) or ("acc-phys" in category) or ("adap-org" in category) or \
             ("ao-sci" in category) or ("atom-ph" in category) or ("bayes-an" in category) or \
             ("chao-dyn" in category) or ("chem-ph" in category) or ("comp-gas" in category) or \
             ("cond-mat" in category) or ("mtrl-th" in category) or ("patt-sol" in category) or \
             ("plasm-ph" in category) or ("solv-int" in category): entry_categories.append("Physics")
        elif ("q-bio." in category) or ("q-bio" in category) or ("supr-con" in category): entry_categories.append("Quantitative Biology")
        elif "q-fin" in category: entry_categories.append("Quantitative Finance")
        elif "stat." in category: entry_categories.append("Statistics")
        else: entry_categories.append(category)
    entry_categories = list(set(entry_categories))
    gen_categories.append(entry_categories)
gen_categories = pd.Series(gen_categories)

One-hot encode the general categories

In [6]:
one_hot = gen_categories.str.join('|').str.get_dummies()
arxiv_df = arxiv_df.join(one_hot)

In [7]:
arxiv_df.head(3)

Unnamed: 0,id,submitter,authors,title,comments,journal-ref,doi,report-no,categories,license,...,month,Computer Science,Economics,Electrical Engineering and Systems Science,Junk,Mathematics,Physics,Quantitative Biology,Quantitative Finance,Statistics
0,704.0001,Pavel Nadolsky,"C. Bal\'azs, E. L. Berger, P. M. Nadolsky, C.-...",Calculation of prompt diphoton production cros...,"37 pages, 15 figures; published version","Phys.Rev.D76:013009,2007",10.1103/PhysRevD.76.013009,ANL-HEP-PR-07-12,hep-ph,,...,4,0,0,0,0,0,1,0,0,0
1,704.0002,Louis Theran,Ileana Streinu and Louis Theran,Sparsity-certifying Graph Decompositions,To appear in Graphs and Combinatorics,,,,math.CO cs.CG,http://arxiv.org/licenses/nonexclusive-distrib...,...,3,1,0,0,0,1,0,0,0,0
2,704.0003,Hongjun Pan,Hongjun Pan,The evolution of the Earth-Moon system based o...,"23 pages, 3 figures",,,,physics.gen-ph,,...,4,1,0,0,0,0,0,0,0,0


In [8]:
arxiv_df.columns

Index(['id', 'submitter', 'authors', 'title', 'comments', 'journal-ref', 'doi',
       'report-no', 'categories', 'license', 'versions', 'update_date',
       'authors_parsed', 'created', 'year', 'month', 'Computer Science',
       'Economics', 'Electrical Engineering and Systems Science', 'Junk',
       'Mathematics', 'Physics', 'Quantitative Biology',
       'Quantitative Finance', 'Statistics'],
      dtype='object')

## Save as compressed CSV

In [None]:
import zipfile as zf

with zf.ZipFile('data/arxiv_metadata.zip', 'w') as ziparchive:
    ziparchive.writestr('arxiv_metadata.csv', arxiv_df.to_csv())