# ArXiv Data Ingestion
## Goal: Download and parse ArXiv bulk data for NLP papers

We're focusing on cs.CL (Computation and Language), cs.LG (Machine Learning), and stat.ML categories.

**Note:** ArXiv bulk data is HUGE. For testing, we'll work with a subset first.

In [None]:
# install required packages
!pip install pandas arxiv pyarrow -q

In [None]:
import os
import json
import pandas as pd
from datetime import datetime
import arxiv
from google.colab import drive

# Mount Google Drive at /content/drive so the generated data files can be
# copied there for persistence (requires a Colab runtime, since `drive`
# comes from google.colab)
drive.mount('/content/drive')

In [None]:
# Make sure the raw and processed data folders are present
# (exist_ok=True makes this a no-op when they already exist)
for data_dir in ('data/raw', 'data/processed'):
    os.makedirs(data_dir, exist_ok=True)

print("Directories created!")

## Method 1: Using the arXiv API (easier, but results are served in pages of at most 2000 papers)

Let's start with this approach for prototyping. We can switch to bulk data later if needed.

In [None]:
# Smoke-test the API with a tiny query before fetching anything substantial
client = arxiv.Client()
search = arxiv.Search(
    query='cat:cs.CL',
    max_results=5,
    sort_by=arxiv.SortCriterion.SubmittedDate,
)

# Print the main metadata fields of every returned paper, one block per paper
divider = "\n" + "=" * 80 + "\n"
for result in client.results(search):
    author_names = ', '.join(a.name for a in result.authors)
    print(f"Title: {result.title}")
    print(f"Authors: {author_names}")
    print(f"Published: {result.published}")
    print(f"Categories: {result.categories}")
    # Only the first 200 characters of the abstract are shown
    print("Abstract:", result.summary[:200], "...")
    print(divider)

Looks good! Now let's write a function to fetch papers from multiple categories.

In [None]:
def _arxiv_id_from_entry_url(entry_url):
    """Return the arXiv identifier embedded in an entry URL.

    Everything after '/abs/' is kept, so old-style identifiers that contain
    a slash (e.g. 'cs/9901001v1') retain their archive prefix instead of
    being cut down to the last path segment. If the URL has no '/abs/'
    part, the last path segment is returned as a fallback.
    """
    marker = '/abs/'
    if marker in entry_url:
        return entry_url.split(marker, 1)[1]
    return entry_url.rsplit('/', 1)[-1]


def fetch_arxiv_papers(categories, max_results_per_category=500):
    """
    Fetch paper metadata from arXiv for the given categories.

    One query ('cat:<category>', sorted by submission date) is issued per
    category and the results are concatenated. No de-duplication is done
    here, so a paper returned for more than one requested category appears
    once per category in the output.

    Args:
        categories: list of category strings like ['cs.CL', 'cs.LG'];
            a single string such as 'cs.CL' is treated as a one-item list
        max_results_per_category: how many papers to fetch per category

    Returns:
        list of paper dictionaries with the keys paper_id, title, authors,
        abstract, categories, primary_category, published, updated,
        pdf_url, venue and year; an empty list if no categories are given
    """
    # A bare string would otherwise be iterated character by character
    if isinstance(categories, str):
        categories = [categories]

    # Nothing to query: skip building a client altogether
    if not categories:
        return []

    client = arxiv.Client()
    papers = []

    for cat in categories:
        print(f"Fetching papers from {cat}...")

        search = arxiv.Search(
            query=f'cat:{cat}',
            max_results=max_results_per_category,
            sort_by=arxiv.SortCriterion.SubmittedDate,
        )

        # Stays 0 when the query yields no results
        count = 0
        for count, result in enumerate(client.results(search), start=1):
            papers.append({
                'paper_id': _arxiv_id_from_entry_url(result.entry_id),
                'title': result.title,
                'authors': [author.name for author in result.authors],
                'abstract': result.summary.replace('\n', ' '),  # flatten line breaks
                'categories': result.categories,
                'primary_category': result.primary_category,
                'published': result.published.isoformat(),
                'updated': result.updated.isoformat(),
                'pdf_url': result.pdf_url,
                'venue': 'arXiv',
                'year': result.published.year,
            })

            # print progress every 50 papers
            if count % 50 == 0:
                print(f"  Fetched {count} papers from {cat}...")

        print(f"Completed {cat}: {count} papers fetched\n")

    return papers

In [None]:
# arXiv categories this project targets
TARGET_CATEGORIES = ['cs.CL', 'cs.LG', 'stat.ML']

# Keep the per-category count small while prototyping and raise it later;
# the working assumption here is that roughly 2000 is feasible via the API
papers_data = fetch_arxiv_papers(
    TARGET_CATEGORIES,
    max_results_per_category=300,
)

total_fetched = len(papers_data)
print(f"\nTotal papers fetched: {total_fetched}")

In [None]:
# Pretty-print the first fetched record to eyeball its structure.
# ensure_ascii=False keeps non-ASCII names and titles readable (the JSON file
# export uses the same setting); the guard avoids an IndexError when the
# fetch returned nothing.
print("Sample paper structure:")
if papers_data:
    print(json.dumps(papers_data[0], indent=2, ensure_ascii=False))
else:
    print("(no papers fetched)")

## Save to JSON and Parquet

We'll save in both formats:
- JSON for easy inspection
- Parquet for efficient storage and loading

In [None]:
# Write the fetched records to disk as indented, human-readable JSON
json_path = 'data/raw/arxiv_papers.json'
with open(json_path, mode='w', encoding='utf-8') as json_file:
    json.dump(papers_data, json_file, indent=2, ensure_ascii=False)

print(f"Saved {len(papers_data)} papers to {json_path}")

# Report the on-disk size in megabytes
BYTES_PER_MB = 1024 * 1024
file_size = os.path.getsize(json_path) / BYTES_PER_MB
print(f"File size: {file_size:.2f} MB")

In [None]:
# Build a DataFrame from the fetched records ahead of the Parquet export
df = pd.DataFrame(papers_data)

# Structural overview: dimensions, column names and inferred dtypes
print("DataFrame shape:", df.shape)
print("\nColumns:", list(df.columns))
print("\nData types:")
print(df.dtypes)

In [None]:
# Distribution of papers across primary categories
category_counts = df['primary_category'].value_counts()
print("\nPapers per category:")
print(category_counts)

# Distribution across publication years, ordered by year
year_counts = df['year'].value_counts().sort_index()
print("\nPapers per year:")
print(year_counts)

In [None]:
# Persist the DataFrame as Parquet, leaving the index out of the file
parquet_path = 'data/raw/arxiv_papers.parquet'
df.to_parquet(parquet_path, index=False)

print(f"Saved to {parquet_path}")

# Compare on-disk sizes: Parquet vs. the JSON file (file_size, in MB)
parquet_size = os.stat(parquet_path).st_size / (1024 * 1024)
print(f"Parquet file size: {parquet_size:.2f} MB")
compression_ratio = file_size / parquet_size
print(f"Compression ratio: {compression_ratio:.2f}x")

## Quality Checks

In [None]:
# Count missing values per column
print("Missing values:")
print(df.isna().sum())

# Abstract length statistics (in characters), computed once and reused
abstract_lengths = df['abstract'].str.len()
print("\nChecking abstracts...")
print(f"Min abstract length: {abstract_lengths.min()}")
print(f"Max abstract length: {abstract_lengths.max()}")
print(f"Mean abstract length: {abstract_lengths.mean():.2f}")

# Abstracts under 100 characters are flagged as potential data issues
short_abstracts = df[abstract_lengths < 100]
n_short = len(short_abstracts)
print(f"\nPapers with abstracts < 100 chars: {n_short}")
if n_short > 0:
    print("Example:")
    print(short_abstracts[['title', 'abstract']].head(1))

In [None]:
# Papers are fetched with one query per category, so the same paper_id can
# show up more than once. keep=False flags every row of a duplicate group,
# so the count printed below is the number of rows involved, not the number
# of rows that will be dropped.
duplicates = df[df.duplicated(subset=['paper_id'], keep=False)]
print(f"Duplicate paper IDs found: {len(duplicates)}")

if len(duplicates) > 0:
    print("Removing duplicates...")
    df = df.drop_duplicates(subset=['paper_id'], keep='first')
    print(f"New shape: {df.shape}")

    # re-save the cleaned data
    df.to_parquet(parquet_path, index=False)
    print("Updated parquet file saved.")

    # Apply the same keep-first rule to the raw records and rewrite the JSON
    # export; otherwise it would still contain the duplicates that were just
    # removed from the Parquet file.
    seen_ids = set()
    unique_papers = []
    for paper in papers_data:
        if paper['paper_id'] not in seen_ids:
            seen_ids.add(paper['paper_id'])
            unique_papers.append(paper)
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(unique_papers, f, indent=2, ensure_ascii=False)
    print("Updated JSON file saved.")

In [None]:
# Show the first three papers. The running number comes from enumerate
# rather than the index label: after drop_duplicates the index can have
# gaps, so label + 1 is not guaranteed to be the row's position.
print("Sample papers:")
for position, (index_label, row) in enumerate(df.head(3).iterrows(), start=1):
    authors = row['authors']
    # List at most three authors and mark the list as truncated if there are more
    shown_authors = ', '.join(authors[:3])
    more_marker = '...' if len(authors) > 3 else ''
    print(f"\n{position}. {row['title']}")
    print(f"   Authors: {shown_authors}{more_marker}")
    print(f"   Year: {row['year']} | Category: {row['primary_category']}")
    print(f"   Abstract: {row['abstract'][:150]}...")

## Optional: Copy to Google Drive for persistence

In [None]:
# uncomment to save to drive
# !cp data/raw/arxiv_papers.parquet /content/drive/MyDrive/
# !cp data/raw/arxiv_papers.json /content/drive/MyDrive/
# print("Files copied to Google Drive!")

---
## Summary

✅ Fetched papers from ArXiv API  
✅ Extracted metadata (title, authors, abstract, categories, dates)  
✅ Saved to JSON and Parquet formats  
✅ Performed basic quality checks  

**Next steps:**
- Fetch more papers if needed (increase max_results)
- Consider bulk data download for comprehensive coverage
- Move to ACL Anthology ingestion