**PHASE 1: DATA PREPROCESSING - Extract Metadata Features**

Purpose: Parse dates, categories, author counts, journal flags  
Input: arxiv_full_raw.pkl  
Output: arxiv_metadata_features.pkl  
Features: year, categories, domains, has_journal, num_authors, text lengths  
ML Involved: None - Feature extraction from metadata  
Runtime: ~10-15 minutes  
Run Once: ✓ Never need to run again

In [2]:
# load saved dataset

import pandas as pd

# Read the raw dataset pickle into memory.
# NOTE(review): unpickling can execute arbitrary code; only load files this project created.
raw_pickle_path = 'data/processed/arxiv_full_raw.pkl'
df = pd.read_pickle(raw_pickle_path)

# Report the row count and the available columns as a quick sanity check.
print(f"Loaded: {len(df):,} papers")
print(f"Columns: {list(df.columns)}")

# Preview the first rows (the last expression is rendered by the notebook).
df.head()

Loaded: 2,384,622 papers
Columns: ['id', 'submitter', 'authors', 'title', 'comments', 'journal-ref', 'doi', 'report-no', 'categories', 'license', 'abstract', 'versions', 'update_date', 'authors_parsed']


Unnamed: 0,id,submitter,authors,title,comments,journal-ref,doi,report-no,categories,license,abstract,versions,update_date,authors_parsed
0,704.0001,Pavel Nadolsky,"C. Bal\'azs, E. L. Berger, P. M. Nadolsky, C.-...",Calculation of prompt diphoton production cros...,"37 pages, 15 figures; published version","Phys.Rev.D76:013009,2007",10.1103/PhysRevD.76.013009,ANL-HEP-PR-07-12,hep-ph,,A fully differential calculation in perturba...,"[{'version': 'v1', 'created': 'Mon, 2 Apr 2007...",2008-11-26,"[[Balázs, C., ], [Berger, E. L., ], [Nadolsky,..."
1,704.0002,Louis Theran,Ileana Streinu and Louis Theran,Sparsity-certifying Graph Decompositions,To appear in Graphs and Combinatorics,,,,math.CO cs.CG,http://arxiv.org/licenses/nonexclusive-distrib...,"We describe a new algorithm, the $(k,\ell)$-...","[{'version': 'v1', 'created': 'Sat, 31 Mar 200...",2008-12-13,"[[Streinu, Ileana, ], [Theran, Louis, ]]"
2,704.0003,Hongjun Pan,Hongjun Pan,The evolution of the Earth-Moon system based o...,"23 pages, 3 figures",,,,physics.gen-ph,,The evolution of Earth-Moon system is descri...,"[{'version': 'v1', 'created': 'Sun, 1 Apr 2007...",2008-01-13,"[[Pan, Hongjun, ]]"
3,704.0004,David Callan,David Callan,A determinant of Stirling cycle numbers counts...,11 pages,,,,math.CO,,We show that a determinant of Stirling cycle...,"[{'version': 'v1', 'created': 'Sat, 31 Mar 200...",2007-05-23,"[[Callan, David, ]]"
4,704.0005,Alberto Torchinsky,Wael Abu-Shammala and Alberto Torchinsky,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...,,"Illinois J. Math. 52 (2008) no.2, 681-689",,,math.CA math.FA,,In this paper we show how to compute the $\L...,"[{'version': 'v1', 'created': 'Mon, 2 Apr 2007...",2013-10-15,"[[Abu-Shammala, Wael, ], [Torchinsky, Alberto, ]]"


In [3]:
# extract year from update_date

def extract_year(date_str):
    """Return the four-digit year at the start of a date value, or None.

    The value is converted with ``str()``, surrounding whitespace is
    stripped, and the first four characters must all be ASCII digits,
    e.g. ``"2024-11-12"`` -> ``2024``.

    Args:
        date_str: Date-like value; anything ``str()`` accepts.

    Returns:
        The year as an ``int``, or ``None`` when the value does not start
        with four digits (``None``, NaN, empty or malformed strings).
    """
    year_text = str(date_str).strip()[:4]
    # Validate explicitly rather than relying on a catch-all handler:
    # int() alone accepts padded or partial input such as " 202" or "+12",
    # which would silently produce a wrong year.
    if len(year_text) == 4 and year_text.isascii() and year_text.isdigit():
        return int(year_text)
    return None

# Compute one year value per row from its update_date.
update_dates = df['update_date']
df['year'] = update_dates.apply(extract_year)

# Summarise the span of years and how many rows have no year.
print(f"Year range: {df['year'].min()} - {df['year'].max()}")
print(f"Missing years: {df['year'].isna().sum()}")

Year range: 2007 - 2025
Missing years: 0


In [None]:
# clean category features

# Tokenise the whitespace-separated categories string into a list per row.
category_lists = df['categories'].str.split()
df['all_categories'] = category_lists

# The first listed category is taken as the primary one.
df['primary_category'] = category_lists.str.get(0)

# Number of categories listed for each paper.
df['num_categories'] = category_lists.str.len()

# 1 when more than one category is listed, otherwise 0.
df['is_multi_category'] = df['num_categories'].gt(1).astype(int)

print(f"Unique categories: {df['primary_category'].nunique()}")
print(f"Multi-category papers: {df['is_multi_category'].sum():,} ({df['is_multi_category'].mean()*100:.1f}%)")

Unique categories: 153
Multi-category papers: 1,197,269 (50.2%)


In [5]:
# extract top-level domain (cs, math, physics, etc.)

def get_top_domain(category):
    """Return the part of a category before its first dot.

    Examples: ``"cs.AI"`` -> ``"cs"``; ``"hep-ph"`` (no dot) is returned
    unchanged. Missing values (as judged by ``pd.isna``) give ``None``.
    """
    if pd.isna(category):
        return None
    # No dot: the category is already the top-level domain.
    if '.' not in category:
        return category
    domain, _, _ = category.partition('.')
    return domain

# Map every primary category to its top-level domain.
primary_categories = df['primary_category']
df['top_level_domain'] = primary_categories.apply(get_top_domain)

# Show the ten most frequent domains.
print("Top domains:")
domain_counts = df['top_level_domain'].value_counts()
print(domain_counts.head(10))

Top domains:
top_level_domain
cs          642055
math        493214
cond-mat    253535
astro-ph    243545
physics     180110
quant-ph     99370
hep-ph       85783
hep-th       66360
eess         62792
stat         55976
Name: count, dtype: int64


In [6]:
# create has_journal flag (1 if published, 0 if preprint only)

# Flag rows whose journal-ref is not null (1) versus null (0).
journal_ref_present = ~df['journal-ref'].isna()
df['has_journal'] = journal_ref_present.astype(int)

print(f"Papers with journal: {df['has_journal'].sum():,} ({df['has_journal'].mean()*100:.1f}%)")

Papers with journal: 684,967 (28.7%)


In [11]:
# count number of authors

def count_authors(authors_list):
    """Return the number of entries in a parsed-authors value.

    Args:
        authors_list: A sized collection with one entry per author
            (presumably a list of name-part lists; confirm against the
            dataset), or a missing value (``None`` / float NaN).

    Returns:
        ``len(authors_list)``, or 0 when the value is missing or empty.
    """
    if authors_list is None:
        return 0
    # Missing cells may surface as float NaN, which has no len().
    if isinstance(authors_list, float) and pd.isna(authors_list):
        return 0
    # No explicit `== []` comparison: len() already yields 0 for empty
    # collections, and equality against a list is elementwise (and not
    # usable as a boolean) for array-like inputs.
    return len(authors_list)

# Count the entries of authors_parsed for every row.
parsed_authors = df['authors_parsed']
df['num_authors'] = parsed_authors.apply(count_authors)

# Summary statistics, including a check for rows with no authors at all.
print(f"Average authors: {df['num_authors'].mean():.1f}")
print(f"Max authors: {df['num_authors'].max()}")
print(f"Papers with 0 authors: {(df['num_authors'] == 0).sum()}")

Average authors: 5.0
Max authors: 3435
Papers with 0 authors: 0


In [10]:
# abstract length

# Character counts of the text fields; a missing text counts as length 0.
# Abstract first, then title, so the new columns keep the same order.
for text_col, length_col in (('abstract', 'abstract_length'), ('title', 'title_length')):
    df[length_col] = df[text_col].fillna('').str.len()

print(f"Average abstract length: {df['abstract_length'].mean():.0f} chars")
print(f"Average title length: {df['title_length'].mean():.0f} chars")

Average abstract length: 1020 chars
Average title length: 76 chars


In [12]:
# keep only needed columns

# Columns carried into the saved feature table, in output order.
feature_cols = [
    # identifiers and raw text
    'id',                    # paper ID
    'title',                 # for display
    'abstract',              # for text processing
    # time
    'year',                  # temporal analysis
    # category-derived features
    'primary_category',      # main category
    'all_categories',        # all categories
    'top_level_domain',      # cs, math, etc.
    'num_categories',        # how many categories
    'is_multi_category',     # flag
    # publication / authorship
    'has_journal',           # published or not
    'num_authors',           # collaboration size
    # text sizes
    'abstract_length',       # text length
    'title_length',          # text length
]

# Take an independent copy so later edits do not touch the full frame.
df_features = df.loc[:, feature_cols].copy()

print(f"\nFinal shape: {df_features.shape}")
print(f"Memory: {df_features.memory_usage(deep=True).sum() / 1024**3:.2f} GB")


Final shape: (2384622, 13)
Memory: 3.44 GB


In [13]:
# save processed features

# Write the selected feature columns to disk as a pickle.
features_pickle_path = 'data/processed/arxiv_metadata_features.pkl'
df_features.to_pickle(features_pickle_path)
print("✓ Saved to: data/processed/arxiv_metadata_features.pkl")

✓ Saved to: data/processed/arxiv_metadata_features.pkl


In [None]:
import os

# Verify that the feature pickle exists on disk and can be read back.
pickle_path = 'data/processed/arxiv_metadata_features.pkl'

if os.path.exists(pickle_path):
    size_gb = os.path.getsize(pickle_path) / 1024**3
    print("✓✓✓ Success! ✓✓✓")
    print(f"File size: {size_gb:.2f} GB")

    # quick check: reload the file and report its row count and columns
    df_check = pd.read_pickle(pickle_path)
    print(f"Papers: {len(df_check):,}")
    print(f"Columns: {list(df_check.columns)}")
else:
    # Report a missing file explicitly so the check never passes silently.
    print(f"✗ Expected output not found: {pickle_path}")

✓✓✓ SUCCESS! ✓✓✓
File size: 2.69 GB
Papers: 2,384,622
Columns: ['id', 'title', 'abstract', 'year', 'primary_category', 'all_categories', 'top_level_domain', 'num_categories', 'is_multi_category', 'has_journal', 'num_authors', 'abstract_length', 'title_length']
