**PHASE 1: DATA PREPROCESSING - Load and Save Raw Data**

Purpose: Load ~2.4M arXiv paper records from the line-delimited JSON snapshot and save them as a pickle for fast reloading  
Input: arxiv-metadata-oai-snapshot.json (3.93 GB)  
Output: arxiv_full_raw.pkl (3.57 GB)  
ML Involved: None - Pure data loading  
Runtime: ~ 2 minutes  
Run Once: ✓ Never need to run again

In [None]:
#imports
import pandas as pd
import json
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm
import os

# Show where the kernel is running so the relative data paths below can be sanity-checked
working_dir = os.getcwd()
print(f"Working directory: {working_dir}")

Working directory: /Users/aleksandralyubarskaja/Desktop/arxiv_case_study


In [None]:
# Verify that the raw arXiv snapshot is on disk before trying to load it
data_path = 'data/raw/arxiv-metadata-oai-snapshot.json'
if not os.path.exists(data_path):
    # Nothing to load yet: tell the user where the file was expected
    print(f"x Dataset not found at: {data_path}")
    print("Download it first!")
else:
    # Report the on-disk size in GiB (bytes / 1024^3)
    size_gb = os.path.getsize(data_path) / (1024**3)
    print(f"✓ Dataset found! Size: {size_gb:.2f} GB")

✓ Dataset found! Size: 3.93 GB


In [None]:
# Load the arXiv snapshot (one JSON object per line) into a DataFrame.
# Only needs to run once: the cells below persist the result as a pickle.

# First pass: count the lines so the progress bar below has a known total.
# errors='ignore' makes the reader silently drop bytes that are not valid
# UTF-8 instead of raising, so decoding problems never surface as exceptions.
print("Counting total papers...")
with open(data_path, 'r', encoding='utf-8', errors='ignore') as f:
    total_lines = sum(1 for _ in f)
print(f"Total papers: {total_lines:,}")

# Second pass: parse every line, skipping (and counting) lines that are not valid JSON.
print("\nLoading dataset...")
papers = []
errors = 0

with open(data_path, 'r', encoding='utf-8', errors='ignore') as f:
    for line in tqdm(f, total=total_lines):
        try:
            papers.append(json.loads(line))
        except json.JSONDecodeError:
            # Best-effort load: a malformed (or blank) line is skipped, not fatal.
            errors += 1

# Build the frame once from the accumulated records.
df = pd.DataFrame(papers)
print(f"\n✓ Loaded: {len(df):,} papers")
# The counter only tracks JSON parse failures (undecodable bytes are dropped
# silently by errors='ignore'), so the message reports exactly that.
print(f"! Skipped {errors} lines that could not be parsed as JSON")
print(f"Memory usage: {df.memory_usage(deep=True).sum() / 1024**3:.2f} GB")

Counting total papers...
Total papers: 2,384,623

Loading dataset...


100%|██████████| 2384623/2384623 [01:06<00:00, 35671.15it/s] 



✓ Loaded: 2,384,622 papers
! Skipped 1 lines with encoding issues
Memory usage: 4.61 GB


OSError: Cannot save file into a non-existent directory: 'data/processed'

In [None]:
# Persist the loaded DataFrame so later notebooks can skip the JSON parsing step

# The output folder must exist first: to_pickle does not create missing directories
processed_dir = 'data/processed'
os.makedirs(processed_dir, exist_ok=True)
print("✓ Processed folder ready!")

# Write the full, unmodified frame as a pickle
raw_pickle_file = 'data/processed/arxiv_full_raw.pkl'
df.to_pickle(raw_pickle_file)
print("✓ Saved as pickle for fast reloading!")

In [None]:
# Confirm that the pickle was written and report its size

pickle_path = 'data/processed/arxiv_full_raw.pkl'

if not os.path.exists(pickle_path):
    print("Not saved yet")
else:
    # Size on disk in GiB (bytes / 1024^3)
    size_gb = os.path.getsize(pickle_path) / 1024**3
    print("✓✓✓ Success! ✓✓✓")
    print(f"Pickle saved: {size_gb:.2f} GB")

✓✓✓ SUCCESS! ✓✓✓
Pickle saved: 3.57 GB
