**PHASE 2: FEATURE ENGINEERING - TF-IDF Vectorization**

Purpose: Transform clean text into numerical feature matrix  
Input: arxiv_text_cleaned.pkl  
Output: tfidf_matrix.pkl (2.4M × 10K sparse), tfidf_vectorizer.pkl  
Parameters: max_features=10K, min_df=10, max_df=0.7, bigrams included  
ML Involved: YES - TF-IDF is a feature extraction technique  
Runtime: ~20-30 minutes  
Key Concept: Transforms text to numbers while preserving semantic meaning

In [1]:
# imports

import pandas as pd
import numpy as np
import os
import sys
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm
import joblib

# add project root to path for config

sys.path.append('..')
from config import TFIDF_MAX_FEATURES, TFIDF_MIN_DF, TFIDF_MAX_DF, TFIDF_NGRAM_RANGE, RANDOM_STATE

print("✓ All imports loaded")
print(f"TF-IDF Config: max_features={TFIDF_MAX_FEATURES}, min_df={TFIDF_MIN_DF}, max_df={TFIDF_MAX_DF}")

✓ All imports loaded
TF-IDF Config: max_features=1000, min_df=10, max_df=0.7


In [2]:
# load cleaned text data

df = pd.read_pickle('data/processed/arxiv_text_cleaned.pkl')

print(f"Loaded: {len(df):,} papers")
print(f"Columns: {list(df.columns)}")
print(f"\nSample abstract (cleaned):")
print(df['abstract_clean'].iloc[0][:200])

Loaded: 2,384,622 papers
Columns: ['id', 'title', 'abstract_clean', 'year', 'primary_category', 'all_categories', 'top_level_domain', 'num_categories', 'is_multi_category', 'has_journal', 'num_authors', 'abstract_length', 'title_length']

Sample abstract (cleaned):
a fully differential calculation in perturbative quantum chromodynamics is presented for the production of massive photon pairs at hadron colliders all next to leading order perturbative contributions


In [3]:
# verify no empty abstracts (should be 0 after preprocessing)

empty_count = (df['abstract_clean'] == '').sum()
print(f"Empty abstracts: {empty_count:,}")

if empty_count > 0:
    print(f"⚠ Warning: {empty_count} empty abstracts found")
    print("Removing them now...")
    df = df[df['abstract_clean'] != ''].reset_index(drop=True)
    print(f"Papers remaining: {len(df):,}")
else:
    print("✓ All abstracts have content")

Empty abstracts: 0
✓ All abstracts have content


In [4]:
# create TF-IDF vectorizer with config parameters

vectorizer = TfidfVectorizer(
    max_features=TFIDF_MAX_FEATURES,     # top 10K terms
    min_df=TFIDF_MIN_DF,                 # must appear in 10+ papers
    max_df=TFIDF_MAX_DF,                 # ignore if in >70% of papers
    ngram_range=TFIDF_NGRAM_RANGE,       # unigrams + bigrams
    stop_words='english',                # remove common English words
    dtype=np.float32,                    # use float32 to save memory
    strip_accents='unicode',             # handle special characters
    lowercase=True                       # already should be lowercase, but ensure
)

print("✓ Vectorizer initialized")
print(f"\nParameters:")
print(f"  max_features: {TFIDF_MAX_FEATURES:,}")
print(f"  min_df: {TFIDF_MIN_DF}")
print(f"  max_df: {TFIDF_MAX_DF}")
print(f"  ngram_range: {TFIDF_NGRAM_RANGE}")

✓ Vectorizer initialized

Parameters:
  max_features: 1,000
  min_df: 10
  max_df: 0.7
  ngram_range: (1, 2)


In [5]:
# fit TF-IDF and transform abstracts to feature matrix
# this step could be ~20-30 minutes for 2.4M papers

print("Fitting TF-IDF vectorizer and transforming abstracts...")
print("This will take 20-30 minutes...\n")

tfidf_matrix = vectorizer.fit_transform(df['abstract_clean'])

print("\n✓ TF-IDF transformation complete!")
print(f"\nMatrix shape: {tfidf_matrix.shape}")
print(f"  Papers: {tfidf_matrix.shape[0]:,}")
print(f"  Features: {tfidf_matrix.shape[1]:,}")
print(f"\nSparsity: {(1.0 - tfidf_matrix.nnz / (tfidf_matrix.shape[0] * tfidf_matrix.shape[1])) * 100:.2f}%")
print(f"Non-zero values: {tfidf_matrix.nnz:,}")
print(f"Memory usage: {tfidf_matrix.data.nbytes / 1024**3:.2f} GB")

Fitting TF-IDF vectorizer and transforming abstracts...
This will take 20-30 minutes...


✓ TF-IDF transformation complete!

Matrix shape: (2384622, 1000)
  Papers: 2,384,622
  Features: 1,000

Sparsity: 96.41%
Non-zero values: 85,574,897
Memory usage: 0.32 GB


In [6]:
# get feature names (the actual words/bigrams)

feature_names = vectorizer.get_feature_names_out()

print(f"Total features extracted: {len(feature_names):,}")
print(f"\nFirst 20 features:")
print(feature_names[:20])
print(f"\nLast 20 features:")
print(feature_names[-20:])
print(f"\nSample bigrams:")
bigrams = [f for f in feature_names if ' ' in f]
print(bigrams[:20])

Total features extracted: 1,000

First 20 features:
['ability' 'able' 'absorption' 'access' 'account' 'accretion' 'accuracy'
 'accurate' 'achieve' 'achieved' 'achieves' 'action' 'active' 'activity'
 'adaptive' 'addition' 'additional' 'additionally' 'address' 'adversarial']

Last 20 features:
['velocity' 'version' 'video' 'view' 'vision' 'visual' 'volume' 'wave'
 'waves' 'way' 'weak' 'weight' 'weighted' 'wide' 'widely' 'work' 'works'
 'world' 'years' 'zero']

Sample bigrams:
['black hole', 'dark matter', 'deep learning', 'et al', 'experimental results', 'github com', 'language models', 'large scale', 'machine learning', 'magnetic field', 'monte carlo', 'neural network', 'neural networks', 'paper propose', 'propose novel', 'proposed method', 'real time', 'real world', 'state art']


In [7]:
# see top terms for each research domain
# this helps verify TF-IDF captured meaningful terms

domains = df['top_level_domain'].value_counts().head(5).index

for domain in domains:
    # get papers in this domain
    domain_mask = df['top_level_domain'] == domain
    domain_indices = df[domain_mask].index
    
    # get TF-IDF scores for this domain
    domain_tfidf = tfidf_matrix[domain_indices].mean(axis=0)
    domain_tfidf_array = np.asarray(domain_tfidf).flatten()
    
    # get top 15 terms
    top_indices = domain_tfidf_array.argsort()[-15:][::-1]
    top_terms = [feature_names[i] for i in top_indices]
    
    print(f"\n{domain.upper()} - Top 15 terms:")
    print(", ".join(top_terms))


CS - Top 15 terms:
data, learning, model, based, models, performance, network, paper, training, methods, method, language, propose, information, approach

MATH - Top 15 terms:
prove, mathbb, paper, group, finite, space, problem, spaces, functions, result, theorem, study, results, mathcal, function

COND-MAT - Top 15 terms:
spin, magnetic, phase, temperature, quantum, field, transition, states, energy, electron, state, model, density, properties, topological

ASTRO-PH - Top 15 terms:
mass, stars, star, galaxies, ray, emission, observations, stellar, galaxy, data, formation, gas, high, observed, solar

PHYSICS - Top 15 terms:
energy, optical, high, field, model, time, laser, using, based, flow, beam, electron, method, light, results


In [8]:
# save top terms per domain for case study reference

print("\nSaving domain top terms analysis...")

domain_top_terms = {}

domains = df['top_level_domain'].value_counts().head(15).index

for domain in domains:
    # get papers in this domain
    domain_mask = df['top_level_domain'] == domain
    domain_indices = df[domain_mask].index
    
    # get TF-IDF scores for this domain
    domain_tfidf = tfidf_matrix[domain_indices].mean(axis=0)
    domain_tfidf_array = np.asarray(domain_tfidf).flatten()
    
    # get top 20 terms (save more than we show)
    top_indices = domain_tfidf_array.argsort()[-20:][::-1]
    top_terms = [feature_names[i] for i in top_indices]
    top_scores = [domain_tfidf_array[i] for i in top_indices]
    
    domain_top_terms[domain] = {
        'terms': top_terms,
        'scores': top_scores,
        'paper_count': domain_mask.sum()
    }

# save to pickle

domain_terms_path = 'data/processed/domain_top_terms.pkl'
joblib.dump(domain_top_terms, domain_terms_path)
print(f"✓ Saved domain top terms to: {domain_terms_path}")

# also save as readable text file for easy reference

txt_path = 'results/domain_top_terms.txt'
os.makedirs('results', exist_ok=True)

with open(txt_path, 'w') as f:
    f.write("TOP TERMS PER RESEARCH DOMAIN\n")
    f.write("Generated from TF-IDF analysis of 2.4M ArXiv papers\n")
    f.write("Higher scores indicate terms that are both frequent AND distinctive\n")
    f.write(f"Total features: {len(feature_names):,}\n")
    f.write(f"Date: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M')}\n\n")
    
    for domain in domains:
        info = domain_top_terms[domain]
        f.write(f"\n{domain.upper()}\n")
        f.write(f"Papers in domain: {info['paper_count']:,}\n")
        f.write(f"Percentage of corpus: {info['paper_count']/len(df)*100:.1f}%\n")
        f.write("Top 20 terms (by mean TF-IDF score):\n")
        
        for i, (term, score) in enumerate(zip(info['terms'], info['scores']), 1):
            f.write(f"  {i:2d}. {term:25s} (score: {score:.4f})\n")
        f.write("\n")

print(f"✓ Saved readable text to: {txt_path}")


Saving domain top terms analysis...
✓ Saved domain top terms to: data/processed/domain_top_terms.pkl
✓ Saved readable text to: results/domain_top_terms.txt


In [9]:
# save sparse matrix (efficient storage)

tfidf_path = 'data/processed/tfidf_matrix.pkl'
joblib.dump(tfidf_matrix, tfidf_path)
print(f"✓ Saved TF-IDF matrix to: {tfidf_path}")

# save vectorizer (need this to interpret features later)

vectorizer_path = 'data/processed/tfidf_vectorizer.pkl'
joblib.dump(vectorizer, vectorizer_path)
print(f"✓ Saved vectorizer to: {vectorizer_path}")

# save paper IDs in same order (critical for matching later!)

id_mapping_path = 'data/processed/tfidf_paper_ids.pkl'
df[['id', 'title']].to_pickle(id_mapping_path)
print(f"✓ Saved paper ID mapping to: {id_mapping_path}")

✓ Saved TF-IDF matrix to: data/processed/tfidf_matrix.pkl
✓ Saved vectorizer to: data/processed/tfidf_vectorizer.pkl
✓ Saved paper ID mapping to: data/processed/tfidf_paper_ids.pkl


In [10]:
# verify all files created successfully

files_to_check = [
    'data/processed/tfidf_matrix.pkl',
    'data/processed/tfidf_vectorizer.pkl',
    'data/processed/tfidf_paper_ids.pkl',
    'data/processed/domain_top_terms.pkl',      
    'results/domain_top_terms.txt'              
]
all_good = True
for file_path in files_to_check:
    if os.path.exists(file_path):
        size_mb = os.path.getsize(file_path) / 1024**2
        print(f"✓ {file_path}")
        print(f"  Size: {size_mb:.1f} MB")
    else:
        print(f"x Missing: {file_path}")
        all_good = False

if all_good:
    print("\n✓✓✓ Success! ✓✓✓ - All files created!")
    
    # quick reload test
    matrix_check = joblib.load('data/processed/tfidf_matrix.pkl')
    vectorizer_check = joblib.load('data/processed/tfidf_vectorizer.pkl')
    
    print(f"\nReload verification:")
    print(f"  Matrix shape: {matrix_check.shape}")
    print(f"  Feature count: {len(vectorizer_check.get_feature_names_out())}")
else:
    print("\nx Error - Some files missing!")

✓ data/processed/tfidf_matrix.pkl
  Size: 662.0 MB
✓ data/processed/tfidf_vectorizer.pkl
  Size: 0.0 MB
✓ data/processed/tfidf_paper_ids.pkl
  Size: 209.8 MB
✓ data/processed/domain_top_terms.pkl
  Size: 0.0 MB
✓ results/domain_top_terms.txt
  Size: 0.0 MB

✓✓✓ Success! ✓✓✓ - All files created!

Reload verification:
  Matrix shape: (2384622, 1000)
  Feature count: 1000


In [None]:
# DIAGNOSTIC: check TF-IDF matrix properties

print("TF-IDF MATRIX DIAGNOSTICS")

# load matrix
tfidf_matrix = joblib.load('data/processed/tfidf_matrix.pkl')

# Basic stats
print(f"\nShape: {tfidf_matrix.shape}")
print(f"Sparsity: {(1.0 - tfidf_matrix.nnz / (tfidf_matrix.shape[0] * tfidf_matrix.shape[1])) * 100:.2f}%")
print(f"Non-zero elements: {tfidf_matrix.nnz:,}")

# check for anomalies
print(f"\nMatrix statistics:")
print(f"  Min value: {tfidf_matrix.data.min():.6f}")
print(f"  Max value: {tfidf_matrix.data.max():.6f}")
print(f"  Mean value: {tfidf_matrix.data.mean():.6f}")
print(f"  Std value: {tfidf_matrix.data.std():.6f}")

# check row-wise (per paper)
row_sums = np.array(tfidf_matrix.sum(axis=1)).flatten()
print(f"\nPer-paper TF-IDF sums:")
print(f"  Min: {row_sums.min():.6f}")
print(f"  Max: {row_sums.max():.6f}")
print(f"  Mean: {row_sums.mean():.6f}")
print(f"  Median: {np.median(row_sums):.6f}")

# check if any rows are all zeros
zero_rows = (row_sums == 0).sum()
print(f"\nZero rows (papers with no features): {zero_rows}")

# check column-wise (per term)
col_sums = np.array(tfidf_matrix.sum(axis=0)).flatten()
print(f"\nPer-term TF-IDF sums:")
print(f"  Min: {col_sums.min():.6f}")
print(f"  Max: {col_sums.max():.6f}")
print(f"  Mean: {col_sums.mean():.6f}")

# check if any columns are all zeros
zero_cols = (col_sums == 0).sum()
print(f"\nZero columns (terms never used): {zero_cols}")

# check data type
print(f"\nData type: {tfidf_matrix.dtype}")

print("="*60)