# Anomaly Detection for Large Yield Moves (Clean Evaluation)

**Objective:**
- Priority 1: Never miss abnormal events (zero false negatives, FN = 0)
- Priority 2: Minimize false alarms

**Method:** Cost-Sensitive Random Forest with Time Series CV for threshold selection

**Key Principle:** Test data is NEVER used for any tuning or model selection.

---

## Audit Checklist

| Check | Status |
|-------|--------|
| Chronological train/val/test split | ✓ |
| Features use only past data | ✓ |
| Medians from train+val only | ✓ |
| Scaler fitted on train+val only | ✓ |
| Threshold from CV (not test) | ✓ |
| Test touched only for final eval | ✓ |

In [None]:
# Put the parent of the current working directory at the front of sys.path.
# Assumes the notebook is run from a folder directly under the project root,
# so that the `src` package imported below resolves -- TODO confirm layout.
import sys
from pathlib import Path
project_root = Path().resolve().parent
sys.path.insert(0, str(project_root))

# Third-party stack: data handling, plotting, and the scikit-learn pieces
# used for modelling (random forest, standardisation, time-ordered CV splits).
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit

# Project-local loader; must come after the sys.path tweak above.
from src.models.prepare_data import prepare_event_data

# NOTE: this silences every warning category for the rest of the session,
# including numerical RuntimeWarnings such as division by zero.
import warnings
warnings.filterwarnings('ignore')
print('Imports complete')

## Phase 1: Data Loading and Splitting

In [None]:
# Configuration: the yield series being modelled and the absolute move size
# above which an event is labelled "abnormal" later in the notebook.
TARGET_YIELD = 'y_2y'
LARGE_THRESHOLD = 0.10  # 10 basis points

# Load the event table and put it in date order with a fresh 0..n-1 index,
# so positional slicing below is equivalent to slicing by time.
events_df = (
    prepare_event_data(target_yield=TARGET_YIELD, prediction_horizon=0)
    .sort_values('date')
    .reset_index(drop=True)
)

# Chronological hold-out: the last 30% of rows (rounded down) form the test
# set; everything before that is available for training and validation.
n_total = len(events_df)
n_test = int(n_total * 0.3)
n_trainval = n_total - n_test

# Independent copies so later column assignments never write into events_df.
trainval_df, test_df = (
    events_df.iloc[:n_trainval].copy(),
    events_df.iloc[n_trainval:].copy(),
)

# Report the size and date span of each partition.
print('DATA SPLITS (chronological)')
print('='*60)
print(f'Train+Val: {len(trainval_df)} events ({trainval_df["date"].min().date()} to {trainval_df["date"].max().date()})')
print(f'Test:      {len(test_df)} events ({test_df["date"].min().date()} to {test_df["date"].max().date()})')
print(f'\n*** Test data will NOT be touched until final evaluation ***')

In [None]:
# Name of the column holding the realised change in the target yield.
change_col = f'{TARGET_YIELD}_change'

# Label each event: abnormal when the absolute yield change exceeds the
# threshold. Rows with a missing change compare as False here and are
# removed in the next step.
for frame in (trainval_df, test_df):
    frame['is_abnormal'] = frame[change_col].abs() > LARGE_THRESHOLD

# Keep only events whose yield change is known; copy so the derived column
# below is written to an independent frame.
trainval_clean = trainval_df.dropna(subset=[change_col]).copy()
test_clean = test_df.dropna(subset=[change_col]).copy()

# Derived feature: magnitude of the CPI shock, ignoring its sign.
for frame in (trainval_clean, test_clean):
    frame['cpi_abs'] = frame['cpi_shock_mom'].abs()

# Class balance in each partition (count and share of abnormal events).
print('\nLABEL DISTRIBUTION')
print('-'*40)
print(f'Train+Val abnormal: {trainval_clean["is_abnormal"].sum()} ({trainval_clean["is_abnormal"].mean():.1%})')
print(f'Test abnormal:      {test_clean["is_abnormal"].sum()} ({test_clean["is_abnormal"].mean():.1%})')

## Phase 2: Time Series Cross-Validation for Threshold Selection

Use CV on train+val to find a robust threshold. Test data is NOT used.

In [None]:
# Feature columns fed to the classifier. 'cpi_abs' is the absolute value of
# 'cpi_shock_mom', added to the cleaned frames before this cell runs.
FEATURES = ['yield_volatility', 'cpi_shock_mom', 'fed_funds', 'slope_10y_2y', 'unemployment', 'cpi_abs']

X_trainval = trainval_clean[FEATURES].copy()
y_trainval = trainval_clean['is_abnormal'].values

print('TIME SERIES CROSS-VALIDATION FOR THRESHOLD SELECTION')
print('='*60)

# Time-ordered folds: every validation slice lies after its training slice,
# so no fold is trained on data from its own future.
tscv = TimeSeriesSplit(n_splits=5)
cv_thresholds = []
cv_fp_rates = []

for fold, (train_idx, val_idx) in enumerate(tscv.split(X_trainval)):
    X_tr = X_trainval.iloc[train_idx]
    X_vl = X_trainval.iloc[val_idx]
    y_tr = y_trainval[train_idx]
    y_vl = y_trainval[val_idx]

    # Skip if no abnormals in validation: there is nothing to derive an
    # FN=0 threshold from.
    if y_vl.sum() == 0:
        print(f'  Fold {fold+1}: skipped (no abnormals in validation)')
        continue

    # Skip if the training slice holds a single class: the fitted forest
    # would expose only one probability column and `[:, 1]` below would fail.
    if len(np.unique(y_tr)) < 2:
        print(f'  Fold {fold+1}: skipped (single class in training)')
        continue

    # Impute with medians computed on this fold's training rows only
    medians = {col: X_tr[col].median() for col in FEATURES}
    X_tr = X_tr.fillna(medians)
    X_vl = X_vl.fillna(medians)

    # Scale, fitting the scaler on this fold's training rows only
    scaler = StandardScaler()
    X_tr_scaled = scaler.fit_transform(X_tr)
    X_vl_scaled = scaler.transform(X_vl)

    # Train a cost-sensitive forest: abnormal events weigh 50x normal ones
    model = RandomForestClassifier(
        n_estimators=100, max_depth=5,
        class_weight={False: 1, True: 50},
        random_state=42
    )
    model.fit(X_tr_scaled, y_tr)

    # Highest threshold that still flags every abnormal validation event:
    # just below the lowest probability assigned to a true abnormal.
    val_probs = model.predict_proba(X_vl_scaled)[:, 1]
    thresh = val_probs[y_vl].min() - 0.001

    # False-positive rate at that threshold (guarded against a validation
    # slice that contains no normal events).
    val_pred = val_probs >= thresh
    n_normal = (~y_vl).sum()
    fp_rate = (~y_vl & val_pred).sum() / n_normal if n_normal > 0 else 0.0

    cv_thresholds.append(thresh)
    cv_fp_rates.append(fp_rate)

    print(f'  Fold {fold+1}: threshold={thresh:.4f}, FP rate={fp_rate:.1%}')

if not cv_thresholds:
    raise ValueError(
        'No usable CV folds: every fold lacked abnormal events in validation '
        'or had a single class in training'
    )

# Use the MINIMUM threshold across folds. Each fold's value is the highest
# cut-off that still gives FN=0 on that fold, so only the smallest of them
# would have caught every abnormal event in every fold. A larger value would
# have missed events in the folds whose own threshold was lower.
robust_threshold = min(cv_thresholds)
print(f'\nRobust threshold (min across folds): {robust_threshold:.4f}')

## Phase 3: Train Final Model on All Train+Val Data

In [None]:
print('FINAL MODEL TRAINING')
print('='*60)

# Per-feature medians over the whole train+val partition; these are the fill
# values for missing features here and are reused when preparing test data.
trainval_medians = {name: trainval_clean[name].median() for name in FEATURES}

# Feature matrix with gaps filled by those medians.
X_trainval_clean = trainval_clean[FEATURES].fillna(trainval_medians)

# Standardise: learn the scaling statistics on train+val, then apply them.
scaler = StandardScaler().fit(X_trainval_clean)
X_trainval_scaled = scaler.transform(X_trainval_clean)

# Final cost-sensitive forest, configured exactly like the per-fold CV
# models (abnormal events weigh 50x normal ones) and fit on all train+val.
rf_params = dict(
    n_estimators=100,
    max_depth=5,
    class_weight={False: 1, True: 50},
    random_state=42,
)
final_model = RandomForestClassifier(**rf_params).fit(X_trainval_scaled, y_trainval)

print('Model trained on ALL train+val data')
print(f'Threshold to use: {robust_threshold:.4f} (from CV)')
print(f'\n*** Test data still not touched ***')

## Phase 4: Final Test Evaluation (First Time Test is Touched)

In [None]:
banner = '='*70
print(banner)
print('FINAL TEST EVALUATION')
print(banner)
print('\n*** This is the FIRST and ONLY time test data is used ***\n')

# Build the test inputs with statistics learned on train+val only: missing
# values take the train+val medians and scaling reuses the fitted scaler.
y_test = test_clean['is_abnormal'].values
X_test = test_clean[FEATURES].fillna(trainval_medians)
X_test_scaled = scaler.transform(X_test)

# Score the test events and apply the threshold fixed during CV.
test_probs = final_model.predict_proba(X_test_scaled)[:, 1]
test_pred = test_probs >= robust_threshold

# Confusion-matrix cells from boolean masks (actual x predicted).
actual_neg = ~y_test
pred_neg = ~test_pred
TP = (y_test & test_pred).sum()
FP = (actual_neg & test_pred).sum()
FN = (y_test & pred_neg).sum()
TN = (actual_neg & pred_neg).sum()

# Error rates; each falls back to 0 when its class is absent from the test set.
n_abnormal = y_test.sum()
n_normal = actual_neg.sum()
fn_rate = FN / n_abnormal if n_abnormal > 0 else 0
fp_rate = FP / n_normal if n_normal > 0 else 0

print(f'Threshold: {robust_threshold:.4f} (from CV, never saw test)')
print(f'\nRESULTS:')
print(f'  FN Rate: {fn_rate:.1%} (missed abnormals)')
print(f'  FP Rate: {fp_rate:.1%} (false alarms)')
print(f'\n  TP: {TP} (caught abnormals)')
print(f'  FP: {FP} (false alarms)')
print(f'  FN: {FN} (missed)')
print(f'  TN: {TN} (correctly ignored)')

# Verdict on the primary objective (no missed abnormal events).
if FN != 0:
    print(f'\n⚠ {FN} abnormal events missed')
else:
    print('\n✓ Priority 1 achieved: All abnormal events caught!')

In [None]:
# Two-panel figure: predicted-probability histograms on the left, confusion
# matrix on the right.
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
ax1, ax2 = axes

# Left panel: probability distribution per true class, with the decision
# threshold drawn as a dashed vertical line.
hist_specs = (
    (~y_test, 30, 'Normal', 'blue'),
    (y_test, 15, 'Abnormal', 'red'),
)
for mask, n_bins, series_label, series_color in hist_specs:
    ax1.hist(test_probs[mask], bins=n_bins, alpha=0.6, label=series_label, color=series_color)
ax1.axvline(x=robust_threshold, color='green', linestyle='--', linewidth=2,
            label=f'Threshold={robust_threshold:.3f}')
ax1.set(
    xlabel='Predicted Probability',
    ylabel='Count',
    title='Test Set Probability Distribution',
)
ax1.legend()

# Right panel: confusion matrix with rows = actual class, columns = predicted.
cm = np.array([[TN, FP], [FN, TP]])
im = ax2.imshow(cm, cmap='Blues')
ax2.set_xticks([0, 1])
ax2.set_yticks([0, 1])
ax2.set_xticklabels(['Pred Normal', 'Pred Abnormal'])
ax2.set_yticklabels(['Actual Normal', 'Actual Abnormal'])
# Write each count in its cell; text() takes (x, y), i.e. (column, row).
for (i, j), count in np.ndenumerate(cm):
    ax2.text(j, i, count, ha='center', va='center', fontsize=16, fontweight='bold')
ax2.set_title('Confusion Matrix')

plt.tight_layout()
plt.show()

In [None]:
# Percentages for the summary. Both denominators are guarded the same way,
# so a test set with no abnormal (or no normal) events reports 0 rather than
# dividing by zero.
caught_pct = TP / (TP + FN) * 100 if TP + FN > 0 else 0
false_alarm_pct = FP / (FP + TN) * 100 if FP + TN > 0 else 0

# Step 3 below deliberately does not describe a maximum as "most
# conservative": under an FN=0 objective a higher threshold flags fewer
# events, so it is the less conservative choice. The aggregation actually
# used is printed alongside the robust threshold by the CV cell.
print('='*70)
print('SUMMARY')
print('='*70)
print(f'''
METHOD: Cost-Sensitive Random Forest with Time Series CV

  1. Split data chronologically: train+val (70%) | test (30%)
  2. Use 5-fold Time Series CV on train+val to find threshold
  3. Aggregate the per-fold thresholds into one robust threshold
  4. Train final model on all train+val
  5. Evaluate on test (first and only time touched)

RESULTS:
  FN Rate: {fn_rate:.1%}
  FP Rate: {fp_rate:.1%}
  
  Catches {TP}/{TP+FN} abnormal events ({caught_pct:.0f}%)
  False alarms: {FP}/{FP+TN} normal events ({false_alarm_pct:.0f}%)

AUDIT:
  ✓ Test data never used for threshold selection
  ✓ Test data never used for model training
  ✓ Test data never used for feature statistics
  ✓ All tuning done on train+val with CV
  ✓ Zero information leakage from test
''')