# Data Acquisition

Kaggle source: [https://www.kaggle.com/datasets/Cornell-University/arxiv/data](https://www.kaggle.com/datasets/Cornell-University/arxiv/data)

In [1]:
import os
from pathlib import Path
from zipfile import ZipFile
from dotenv import load_dotenv; load_dotenv();

In [2]:
# Import KaggleApi, retrying once if the first attempt fails.
# NOTE(review): the recorded output of this cell shows the kaggle package
# complaining about a missing kaggle.json during import; the second attempt
# presumably succeeds where the first did not -- confirm against the
# installed kaggle version.
for i in range(2):
    try:
        from kaggle.api.kaggle_api_extended import KaggleApi
    except (Exception, SystemExit):
        # Deliberately best-effort: ordinary errors and an exit request raised
        # while importing are swallowed so the retry can run. Unlike a bare
        # `except:`, this does not catch KeyboardInterrupt, so the cell can
        # still be interrupted by the user.
        pass
    else:
        # Import succeeded; no need for another attempt.
        break

Could not find kaggle.json. Make sure it's located in /Users/nic/.kaggle. Or use the environment method. See setup instructions at https://github.com/Kaggle/kaggle-api/


In [3]:
def download_arxiv(api: KaggleApi, out_path: str | os.PathLike = "./data", force: bool = False) -> Path:    
    out_path = Path(out_path)
    out_file_path = out_path / "arxiv-metadata-oai-snapshot.json"
    zip_path = out_path / "arxiv.zip"
    data_exists = os.path.isfile(out_file_path)
    
    if not data_exists or force:
        print("Downloading...")
        api.dataset_download_files("Cornell-University/arxiv", path=out_path)
    
        print("Extracting ZIP...")
        with ZipFile(zip_path, 'r') as zipfile:
            zipfile.extractall(out_path)
            os.remove(zip_path)
        
    print(f"Done! Data is stored at '{out_file_path}'.")
    
    return out_file_path

In [4]:
# Create the Kaggle client and make sure the snapshot is available locally;
# the download is skipped when the JSON file is already in the data folder.
api = KaggleApi()
file_path = download_arxiv(api, out_path="./data", force=False)
file_path

Done! Data is stored at 'data/arxiv-metadata-oai-snapshot.json'.


PosixPath('data/arxiv-metadata-oai-snapshot.json')

In [20]:
import pandas as pd

# Read file in chunks of 100000 lines
chunk_size = 100000

# Start from an empty frame so `df` is always defined by this cell, even when
# the file yields no chunks (otherwise later cells would hit a NameError or
# silently reuse a `df` left over from a previous run).
df = pd.DataFrame()
total = 0

# The reader is used as a context manager so the underlying file handle is
# released even if parsing a chunk raises.
with pd.read_json(file_path, lines=True, chunksize=chunk_size) as json_reader:
    for i, chunk in enumerate(json_reader):
        # Keep only the first chunk as a sample to inspect; the remaining
        # chunks are just counted.
        if i == 0:
            df = chunk
        total += len(chunk)

print(f"Total rows: {total}")

Total rows: 2895350


In [21]:
# Preview the first rows of `df`, which holds only the first chunk of the file.
df.head()

Unnamed: 0,id,submitter,authors,title,comments,journal-ref,doi,report-no,categories,license,abstract,versions,update_date,authors_parsed
0,704.0001,Pavel Nadolsky,"C. Bal\'azs, E. L. Berger, P. M. Nadolsky, C.-...",Calculation of prompt diphoton production cros...,"37 pages, 15 figures; published version","Phys.Rev.D76:013009,2007",10.1103/PhysRevD.76.013009,ANL-HEP-PR-07-12,hep-ph,,A fully differential calculation in perturba...,"[{'version': 'v1', 'created': 'Mon, 2 Apr 2007...",2008-11-26,"[[Balázs, C., ], [Berger, E. L., ], [Nadolsky,..."
1,704.0002,Louis Theran,Ileana Streinu and Louis Theran,Sparsity-certifying Graph Decompositions,To appear in Graphs and Combinatorics,,,,math.CO cs.CG,http://arxiv.org/licenses/nonexclusive-distrib...,"We describe a new algorithm, the $(k,\ell)$-...","[{'version': 'v1', 'created': 'Sat, 31 Mar 200...",2008-12-13,"[[Streinu, Ileana, ], [Theran, Louis, ]]"
2,704.0003,Hongjun Pan,Hongjun Pan,The evolution of the Earth-Moon system based o...,"23 pages, 3 figures",,,,physics.gen-ph,,The evolution of Earth-Moon system is descri...,"[{'version': 'v1', 'created': 'Sun, 1 Apr 2007...",2008-01-13,"[[Pan, Hongjun, ]]"
3,704.0004,David Callan,David Callan,A determinant of Stirling cycle numbers counts...,11 pages,,,,math.CO,,We show that a determinant of Stirling cycle...,"[{'version': 'v1', 'created': 'Sat, 31 Mar 200...",2007-05-23,"[[Callan, David, ]]"
4,704.0005,Alberto Torchinsky,Wael Abu-Shammala and Alberto Torchinsky,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...,,"Illinois J. Math. 52 (2008) no.2, 681-689",,,math.CA math.FA,,In this paper we show how to compute the $\L...,"[{'version': 'v1', 'created': 'Mon, 2 Apr 2007...",2013-10-15,"[[Abu-Shammala, Wael, ], [Torchinsky, Alberto, ]]"


In [22]:
# List the column labels of `df` (the first chunk read from the snapshot).
df.columns

Index(['id', 'submitter', 'authors', 'title', 'comments', 'journal-ref', 'doi',
       'report-no', 'categories', 'license', 'abstract', 'versions',
       'update_date', 'authors_parsed'],
      dtype='object')