In [None]:
import kagglehub

# Fetch the most recent version of the arXiv metadata dataset through
# kagglehub. The value returned by dataset_download is kept in `path`,
# which the following cell extends with the snapshot file name.
dataset_handle = "Cornell-University/arxiv"
path = kagglehub.dataset_download(dataset_handle)

# Echo the location so it is visible in the notebook output.
print("Path to dataset files:", path)

In [None]:
import json
import pandas as pd
from datetime import datetime

# Name of the JSON-lines snapshot file inside the downloaded dataset directory.
SNAPSHOT_NAME = "arxiv-metadata-oai-snapshot.json"
# Records dated before this year are dropped.
MIN_YEAR = 2020
# Column order of the exported CSV; also guarantees a header row when no
# record passes the filter.
COLUMNS = ["title", "published", "category", "authors", "summary", "url"]

# Point `path` at the snapshot file. The guard keeps the cell idempotent:
# re-running it must not append the file name a second time.
if not path.endswith(SNAPSHOT_NAME):
    path = path + "/" + SNAPSHOT_NAME


def _parse_date(value):
    """Return the date encoded in a ``YYYY-MM-DD[T...]`` string, or None.

    Anything after a ``T`` is ignored. Missing, non-string, or unparsable
    values yield None instead of raising.
    """
    if not value:
        return None
    try:
        return datetime.strptime(value.split("T")[0], "%Y-%m-%d").date()
    except (ValueError, AttributeError):
        return None


def _format_authors(record):
    """Return the authors of *record* as one comma-separated string.

    Uses ``authors_parsed`` (a list of name-part sequences) when present,
    otherwise falls back to the raw ``authors`` field. Empty name parts are
    dropped so they do not leave stray spaces in the result.
    """
    authors_parsed = record.get("authors_parsed", [])
    if not authors_parsed:
        return record.get("authors", "")
    return ", ".join(
        " ".join(part for part in name_parts if part)
        for name_parts in authors_parsed
    )


all_entries = []
skipped = 0  # lines that could not be decoded or had an unexpected shape

with open(path, "r", encoding="utf-8") as f:
    # The file is read one JSON document per line, so it is never loaded
    # into memory as a whole.
    for line in f:
        if not line.strip():
            continue
        try:
            record = json.loads(line)
            published = _parse_date(
                record.get("update_date") or record.get("created")
            )
            # Keep only records with a usable date in the requested range.
            if published is None or published.year < MIN_YEAR:
                continue

            arxiv_id = record.get("id")
            all_entries.append({
                "title": record.get("title"),
                "published": published,
                "category": record.get("categories", ""),
                "authors": _format_authors(record),
                "summary": record.get("abstract", ""),
                "url": f"https://arxiv.org/abs/{arxiv_id}" if arxiv_id else None,
            })
        except (ValueError, TypeError, AttributeError):
            # Best effort: a malformed line (invalid JSON, which raises a
            # ValueError subclass, or a record that is not a mapping of the
            # expected shape) is skipped, but counted so the loss is visible.
            skipped += 1

print(f"Total articles {MIN_YEAR}+: {len(all_entries)}")
if skipped:
    print(f"Skipped malformed lines: {skipped}")

df = pd.DataFrame(all_entries, columns=COLUMNS)
df.to_csv("arxiv_data.csv", index=False)
