**PHASE 1: DATA PREPROCESSING - Text Cleaning**

Purpose: Clean abstracts for TF-IDF vectorization  
Input: arxiv_metadata_features.pkl  
Output: arxiv_text_cleaned.pkl  
Cleaning: Lowercase, replace everything except letters a-z (digits, punctuation, special chars) with spaces, normalize whitespace  
Columns Kept: id, title, abstract_clean, year, categories, metadata  
ML Involved: None - Text preprocessing  
Runtime: ~10-15 minutes  
Run Once: Yes - does not need to be re-run once the output file has been saved

In [None]:
# imports

import pandas as pd
import re
from tqdm import tqdm
import os

# enable progress bar for pandas
# (tqdm.pandas() registers the progress_apply method that the cleaning cells call)

tqdm.pandas()

Loaded: 2,384,622 papers
Columns: ['id', 'title', 'abstract', 'year', 'primary_category', 'all_categories', 'top_level_domain', 'num_categories', 'is_multi_category', 'has_journal', 'num_authors', 'abstract_length', 'title_length']


In [None]:
# Read the input metadata table, then report its size and schema as a
# quick sanity check.

df = pd.read_pickle('data/processed/arxiv_metadata_features.pkl')
print(
    f"Loaded: {len(df):,} papers",
    f"Columns: {list(df.columns)}",
    sep="\n",
)

In [2]:
def clean_text(text):

    """
    Clean a piece of text (abstract or title) for TF-IDF.

    Steps, in order:
    - fold accented letters and compatibility characters to plain ASCII
      (e.g. "ö" -> "o", "ﬁ" -> "fi") so that words are not split at accents
    - lowercase
    - replace every character that is not a-z or whitespace with a space
      (this removes digits and punctuation as well as special characters)
    - collapse runs of whitespace into one space and strip both ends

    Parameters
    ----------
    text : str, None or NaN
        Raw text. Non-string values are converted with str().

    Returns
    -------
    str
        The cleaned text, or "" when the input is missing or empty.
    """

    # local import: keeps this cell self-contained (no edit to the imports cell needed)
    import unicodedata

    if pd.isna(text) or text == '':
        return ""

    text = str(text)

    # Only non-ASCII text needs folding; pure-ASCII input skips this step, so
    # the common case costs one extra regex scan and yields the same result
    # as before.
    if re.search(r'[^\x00-\x7f]', text):
        # NFKD splits e.g. "ö" into "o" + a combining mark and expands
        # ligatures; dropping the combining marks keeps the base letters.
        decomposed = unicodedata.normalize('NFKD', text)
        text = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))

    text = text.lower()

    # remove special characters, keep only letters and spaces
    text = re.sub(r'[^a-z\s]', ' ', text)

    # remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text

# Smoke-test clean_text on the first abstract: show the first 200 characters
# of the raw text next to the first 200 characters of its cleaned version.

sample_text = df['abstract'].iloc[0]
print("Original:", sample_text[:200], sep="\n")
print("\nCleaned:", clean_text(sample_text)[:200], sep="\n")

Original:
  A fully differential calculation in perturbative quantum chromodynamics is
presented for the production of massive photon pairs at hadron colliders. All
next-to-leading order perturbative contributi

Cleaned:
a fully differential calculation in perturbative quantum chromodynamics is presented for the production of massive photon pairs at hadron colliders all next to leading order perturbative contributions


In [3]:
# Run clean_text over every abstract with a progress bar
# (this could take ~10-15 minutes)

print("Cleaning abstracts...")
df['abstract_clean'] = df['abstract'].progress_apply(clean_text)

# Summarise the effect of cleaning: mean length before and after, and how
# many abstracts ended up empty.

mean_len_original = df['abstract_length'].mean()
mean_len_cleaned = df['abstract_clean'].str.len().mean()
n_empty_abstracts = (df['abstract_clean'] == '').sum()

print(f"\nOriginal abstract length: {mean_len_original:.0f} chars")
print(f"Cleaned abstract length: {mean_len_cleaned:.0f} chars")
print(f"Empty abstracts: {n_empty_abstracts:,}")

Cleaning abstracts...


100%|██████████| 2384622/2384622 [00:55<00:00, 43192.95it/s]



Original abstract length: 1020 chars
Cleaned abstract length: 983 chars
Empty abstracts: 0


In [None]:
# Apply the same cleaning to titles (might use for visualization later).
# NOTE: 'title_clean' is not listed in keep_columns in the final clean-up
# cell, so it is dropped from the final saved file unless that list is extended.

print("Cleaning titles...")
cleaned_titles = df['title'].progress_apply(clean_text)
df['title_clean'] = cleaned_titles

n_empty_titles = cleaned_titles.eq('').sum()
print(f"Empty titles: {n_empty_titles:,}")

In [4]:
# Drop rows whose abstract is empty after cleaning, renumber the index,
# and report how many rows were removed.

df_before = len(df)
has_abstract = df['abstract_clean'].ne('')
df = df.loc[has_abstract].reset_index(drop=True)
df_after = len(df)

print(
    f"Removed {df_before - df_after:,} papers with no abstract",
    f"Remaining: {df_after:,} papers",
    sep="\n",
)

Removed 0 papers with no abstract
Remaining: 2,384,622 papers


In [5]:
# Persist the cleaned frame, then report its in-memory footprint.

df.to_pickle('data/processed/arxiv_text_cleaned.pkl')
print(f"✓ Saved to: data/processed/arxiv_text_cleaned.pkl")

memory_bytes = df.memory_usage(deep=True).sum()
print(f"Memory: {memory_bytes / 1024**3:.2f} GB")

✓ Saved to: data/processed/arxiv_text_cleaned.pkl
Memory: 5.70 GB


In [6]:
# Verify the save: confirm the pickle exists, report its size on disk, and
# reload it to check row count, columns and one cleaned abstract.

pickle_path = 'data/processed/arxiv_text_cleaned.pkl'

if not os.path.exists(pickle_path):
    print("x Not saved yet")
else:
    size_gb = os.path.getsize(pickle_path) / 1024**3
    print(f"✓✓✓ Success! ✓✓✓")
    print(f"File size: {size_gb:.2f} GB")

    # Reload from disk so the check reflects what was actually written
    df_check = pd.read_pickle(pickle_path)
    print(
        f"Papers: {len(df_check):,}",
        f"Columns: {list(df_check.columns)}",
        sep="\n",
    )
    print(f"\nSample cleaned abstract:")
    print(df_check['abstract_clean'].iloc[0][:300])

✓✓✓ Success! ✓✓✓
File size: 4.88 GB
Papers: 2,384,622
Columns: ['id', 'title', 'abstract', 'year', 'primary_category', 'all_categories', 'top_level_domain', 'num_categories', 'is_multi_category', 'has_journal', 'num_authors', 'abstract_length', 'title_length', 'abstract_clean']

Sample cleaned abstract:
a fully differential calculation in perturbative quantum chromodynamics is presented for the production of massive photon pairs at hadron colliders all next to leading order perturbative contributions from quark antiquark gluon anti quark and gluon gluon subprocesses are included as well as all orde


In [7]:
# Final trim: reload the saved pickle, keep only the columns needed for
# clustering and analysis, and overwrite the file with the slimmer frame.
# An explicit keep-list (rather than a drop-list) doubles as an overview of
# the final columns.

keep_columns = [
    'id',                    # paper identifier
    'title',                 # original title, for display in analysis
    'abstract_clean',        # cleaned abstract (for TF-IDF) - MAIN FEATURE
    'year',                  # temporal analysis
    'primary_category',      # main category
    'all_categories',        # full category list
    'top_level_domain',      # cs, math, physics, etc.
    'num_categories',        # number of categories
    'is_multi_category',     # multi-category flag
    'has_journal',           # published or preprint (quality signal)
    'num_authors',           # collaboration size
    'abstract_length',       # original length (before cleaning)
    'title_length',          # original title length
]

df_check = pd.read_pickle('data/processed/arxiv_text_cleaned.pkl')
print(
    f"Before: {df_check.shape}",
    f"Columns before: {list(df_check.columns)}",
    sep="\n",
)

# Select the keep-list as an independent copy of the reloaded frame
df_final = df_check.loc[:, keep_columns].copy()
n_removed = len(df_check.columns) - len(keep_columns)
final_memory_gb = df_final.memory_usage(deep=True).sum() / 1024**3

print(f"\nAfter: {df_final.shape}")
print(f"Columns after: {list(df_final.columns)}")
print(f"Removed {n_removed} columns")
print(f"Memory: {final_memory_gb:.2f} GB")

# Overwrite the earlier save with the trimmed version
df_final.to_pickle('data/processed/arxiv_text_cleaned.pkl')
print("\n✓ Saved optimized version")

Before: (2384622, 14)
Columns before: ['id', 'title', 'abstract', 'year', 'primary_category', 'all_categories', 'top_level_domain', 'num_categories', 'is_multi_category', 'has_journal', 'num_authors', 'abstract_length', 'title_length', 'abstract_clean']

After: (2384622, 13)
Columns after: ['id', 'title', 'abstract_clean', 'year', 'primary_category', 'all_categories', 'top_level_domain', 'num_categories', 'is_multi_category', 'has_journal', 'num_authors', 'abstract_length', 'title_length']
Removed 1 columns
Memory: 3.32 GB

✓ Saved optimized version
