In [4]:
"""
Severe Data Leakage Example: Feature Engineering with Target Information

This demonstrates how using target information during feature creation
before splitting leads to dramatically inflated performance metrics.
"""

import numpy as np
import pandas as pd
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
import warnings
warnings.filterwarnings('ignore')

# --- Banner ---------------------------------------------------------------
# A 70-character rule frames the title of the demonstration.
header_rule = "=" * 70
print(header_rule)
print("SEVERE DATA LEAKAGE: Creating Features Using Target Before Split")
print("Dataset: Diabetes Progression")
print(header_rule)

# --- Data -----------------------------------------------------------------
# Load the scikit-learn diabetes bunch and build a single frame holding the
# feature columns followed by a 'target' column.
diabetes = load_diabetes()
df = pd.DataFrame(diabetes.data, columns=diabetes.feature_names).assign(
    target=diabetes.target
)

# Quick overview of what was loaded: dimensions, column names, leading rows.
print(f"\nDataset shape: {df.shape}")
print("Target: Disease progression one year after baseline")
print(f"\nFeatures: {diabetes.feature_names}")
print("\nFirst few rows:")
print(df.head())

print("\n" + "=" * 70)
print("SCENARIO 1: SEVERE LEAKAGE - Using Target Info BEFORE Split")
print("=" * 70)

# DELIBERATELY WRONG: every engineered column below is computed from the
# target over the ENTIRE frame, i.e. before any train/test split exists, so
# rows that end up in the test set have already contributed their labels.

# Leak 1: bin BMI into four quantile buckets and attach each bucket's mean
# target to its rows.
df['bmi_quartile'] = pd.qcut(df['bmi'], q=4, labels=False)
bmi_target_means = df['target'].groupby(df['bmi_quartile']).mean()
df['bmi_target_mean_LEAKED'] = df['bmi_quartile'].map(bmi_target_means)

# Leak 2: same idea for age.
# NOTE: q=5 yields (at most) five quantile bins, i.e. quintiles, even though
# the column name and the printed message call them "deciles".
df['age_decile'] = pd.qcut(df['age'], q=5, labels=False, duplicates='drop')
age_target_means = df['target'].groupby(df['age_decile']).mean()
df['age_target_mean_LEAKED'] = df['age_decile'].map(age_target_means)

# Leak 3 (extreme): the target itself, merely rescaled by its global mean.
overall_mean = df['target'].mean()
df['target_ratio_LEAKED'] = df['target'].div(overall_mean)

print("\n".join((
    "\nCreated 3 LEAKED features:",
    "1. bmi_target_mean_LEAKED - average target per BMI quartile",
    "2. age_target_mean_LEAKED - average target per age decile",
    "3. target_ratio_LEAKED - ratio of target to overall mean",
    "\nBMI Quartile Target Means (using ALL data):",
)))
print(bmi_target_means)

# Model inputs: the ten original columns plus the three leaked ones.
leaked_features = [
    'age', 'sex', 'bmi', 'bp',
    's1', 's2', 's3', 's4', 's5', 's6',
    'bmi_target_mean_LEAKED',
    'age_target_mean_LEAKED',
    'target_ratio_LEAKED',
]
X_leaked = df[leaked_features].to_numpy()
y = df['target'].to_numpy()

# The 70/30 split happens only now, after the leaked columns already exist.
leaked_split = train_test_split(X_leaked, y, test_size=0.3, random_state=42)
X_train_leaked, X_test_leaked, y_train_leaked, y_test_leaked = leaked_split

# Fit the forest on the contaminated training matrix.
model_leaked = RandomForestRegressor(
    n_estimators=100,
    max_depth=10,
    random_state=42,
)
model_leaked.fit(X_train_leaked, y_train_leaked)

# Score on the (equally contaminated) held-out rows.
y_pred_leaked = model_leaked.predict(X_test_leaked)
r2_leaked = r2_score(y_test_leaked, y_pred_leaked)
mse_leaked = mean_squared_error(y_test_leaked, y_pred_leaked)
rmse_leaked = np.sqrt(mse_leaked)

print("\n" + "=" * 50)
print("LEAKED MODEL PERFORMANCE (LOOKS AMAZING - BUT FAKE!)")
print("=" * 50)
print(f"Test R² Score:  {r2_leaked:.4f}")
print(f"Test RMSE:      {rmse_leaked:.4f}")

print("\n" + "=" * 70)
print("SCENARIO 2: CORRECT - Creating Features AFTER Split")
print("=" * 70)

# Start again from the raw data and split BEFORE any feature engineering, so
# nothing derived from the test rows can influence the engineered columns.
df_clean = pd.DataFrame(diabetes.data, columns=diabetes.feature_names)
df_clean['target'] = diabetes.target

X_basic = df_clean[diabetes.feature_names].values
y = df_clean['target'].values

X_train, X_test, y_train, y_test = train_test_split(
    X_basic, y, test_size=0.3, random_state=42
)

# Rebuild labelled frames for each side of the split. Only the training
# frame is given the target column.
train_df = pd.DataFrame(X_train, columns=diabetes.feature_names)
train_df['target'] = y_train
test_df = pd.DataFrame(X_test, columns=diabetes.feature_names)

# Learn the quantile bin edges from the TRAINING rows only. `retbins=True`
# returns the edges so the very same bins can be applied to the test rows.
train_df['bmi_quartile'], bmi_bin_edges = pd.qcut(
    train_df['bmi'], q=4, labels=False, retbins=True
)
bmi_target_means_correct = train_df.groupby('bmi_quartile')['target'].mean()

# NOTE: q=5 gives (at most) five bins, i.e. quintiles; the 'age_decile'
# column name is kept unchanged from the original script.
train_df['age_decile'], age_bin_edges = pd.qcut(
    train_df['age'], q=5, labels=False, retbins=True, duplicates='drop'
)
age_target_means_correct = train_df.groupby('age_decile')['target'].mean()

# Attach the per-bin training means to the training rows.
# NOTE(review): each training row's own target is part of its bin mean; an
# out-of-fold encoding would remove that residual in-sample optimism.
train_df['bmi_target_mean'] = train_df['bmi_quartile'].map(bmi_target_means_correct)
train_df['age_target_mean'] = train_df['age_decile'].map(age_target_means_correct)

# Open the outermost edges so test values below the training minimum or above
# the training maximum fall into the first/last training bin instead of NaN.
bmi_bin_edges = np.concatenate(([-np.inf], bmi_bin_edges[1:-1], [np.inf]))
age_bin_edges = np.concatenate(([-np.inf], age_bin_edges[1:-1], [np.inf]))

# Bin the test rows with the TRAINING edges. Re-running qcut on the test set
# would derive edges from the test distribution, so bin k in test would not
# correspond to bin k in train and the mapped means would be misaligned.
test_df['bmi_quartile'] = pd.cut(test_df['bmi'], bins=bmi_bin_edges, labels=False)
test_df['age_decile'] = pd.cut(test_df['age'], bins=age_bin_edges, labels=False)
test_df['bmi_target_mean'] = test_df['bmi_quartile'].map(bmi_target_means_correct)
test_df['age_target_mean'] = test_df['age_decile'].map(age_target_means_correct)

# Safety net for rows that still could not be binned (e.g. missing inputs).
# The result is assigned back rather than using `inplace=True` on a column
# selection, which is chained assignment and does not update the frame under
# pandas Copy-on-Write.
test_df['bmi_target_mean'] = test_df['bmi_target_mean'].fillna(
    bmi_target_means_correct.mean()
)
test_df['age_target_mean'] = test_df['age_target_mean'].fillna(
    age_target_means_correct.mean()
)

print("\nBMI Quartile Target Means (using TRAINING data only):")
print(bmi_target_means_correct)

# Prepare final features (without the extreme leakage feature)
correct_features = ['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6',
                    'bmi_target_mean', 'age_target_mean']
X_train_correct = train_df[correct_features].values
X_test_correct = test_df[correct_features].values

# Train with the same hyperparameters as the leaked model so the two
# scenarios differ only in how the features were built.
model_correct = RandomForestRegressor(n_estimators=100, random_state=42, max_depth=10)
model_correct.fit(X_train_correct, y_train)

# Evaluate on the held-out rows.
y_pred_correct = model_correct.predict(X_test_correct)
r2_correct = r2_score(y_test, y_pred_correct)
rmse_correct = np.sqrt(mean_squared_error(y_test, y_pred_correct))

print(f"\n{'='*50}")
print(f"CORRECT MODEL PERFORMANCE (REALISTIC)")
print(f"{'='*50}")
print(f"Test R² Score:  {r2_correct:.4f}")
print(f"Test RMSE:      {rmse_correct:.4f}")

print("\n" + "=" * 70)
print("COMPARISON: THE SHOCKING TRUTH")
print("=" * 70)

# Absolute R² gap, and that gap relative to the honest score. The denominator
# is floored at 0.01 so a near-zero honest R² cannot blow up the percentage.
r2_gap = r2_leaked - r2_correct
r2_inflation_pct = r2_gap / max(abs(r2_correct), 0.01) * 100

# Share of the honest RMSE that the leaked evaluation fails to report.
rmse_hidden_pct = (rmse_correct - rmse_leaked) / rmse_correct * 100

print("\n".join((
    f"\nR² Score WITH leakage:    {r2_leaked:.4f} ⚠️  DANGEROUSLY HIGH",
    f"R² Score WITHOUT leakage: {r2_correct:.4f} ✓  REALISTIC",
    f"\nDifference in R²:         {r2_gap:.4f}",
    f"Performance inflation:    {r2_inflation_pct:.1f}%",
    f"\nRMSE WITH leakage:        {rmse_leaked:.4f} (misleadingly low)",
    f"RMSE WITHOUT leakage:     {rmse_correct:.4f} (honest estimate)",
    f"Error underestimated by:  {rmse_hidden_pct:.1f}%",
)))

print("\n" + "=" * 70)
print("FEATURE IMPORTANCE: THE SMOKING GUN")
print("=" * 70)

# Pair every feature name with the importance its fitted forest assigned.
leaked_importance = {
    name: weight
    for name, weight in zip(leaked_features, model_leaked.feature_importances_)
}
correct_importance = {
    name: weight
    for name, weight in zip(correct_features, model_correct.feature_importances_)
}

# Print the five highest-weighted features for each model, leaked model first.
for heading, importance_by_feature in (
    ("\nTop 5 Features WITH LEAKAGE:", leaked_importance),
    ("\nTop 5 Features WITHOUT LEAKAGE:", correct_importance),
):
    print(heading)
    ranked = sorted(
        importance_by_feature.items(),
        key=lambda pair: pair[1],
        reverse=True,
    )
    for feat, imp in ranked[:5]:
        print(f"  {feat:30s}: {imp:.4f}")

closing_rule = "=" * 70
print("\n" + closing_rule)
print("WHAT WENT WRONG & WHY IT MATTERS")
print(closing_rule)

# Closing narrative: contrasts the two scenarios and lists illustrative
# consequences of trusting leaked metrics. Printed verbatim.
summary_text = """
The LEAKED Model:
✗ Used target information from test set during feature creation
✗ The 'target_ratio_LEAKED' feature is basically the answer itself!
✗ Model appears to perform amazingly well
✗ But it's cheating - using information it wouldn't have in production

The CORRECT Model:
✓ Features created using ONLY training data
✓ Test data remains completely unseen during preprocessing
✓ Performance metrics reflect real-world expectations
✓ Model will actually work in production

Real-World Impact:
- Medical: A hospital deploys a model thinking it's 99% accurate
  → Actually 60% accurate → Patients misdiagnosed

- Finance: A trading algorithm shows 95% accuracy in backtesting
  → Actually 55% accurate → Company loses millions

- Marketing: Campaign optimization model shows huge ROI
  → Actually negative ROI → Budget wasted

ALWAYS SPLIT FIRST, THEN ENGINEER FEATURES!
"""
print(summary_text)

SEVERE DATA LEAKAGE: Creating Features Using Target Before Split
Dataset: Diabetes Progression

Dataset shape: (442, 11)
Target: Disease progression one year after baseline

Features: ['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']

First few rows:
        age       sex       bmi        bp        s1        s2        s3  \
0  0.038076  0.050680  0.061696  0.021872 -0.044223 -0.034821 -0.043401   
1 -0.001882 -0.044642 -0.051474 -0.026328 -0.008449 -0.019163  0.074412   
2  0.085299  0.050680  0.044451 -0.005670 -0.045599 -0.034194 -0.032356   
3 -0.089063 -0.044642 -0.011595 -0.036656  0.012191  0.024991 -0.036038   
4  0.005383 -0.044642 -0.036385  0.021872  0.003935  0.015596  0.008142   

         s4        s5        s6  target  
0 -0.002592  0.019907 -0.017646   151.0  
1 -0.039493 -0.068332 -0.092204    75.0  
2 -0.002592  0.002861 -0.025930   141.0  
3  0.034309  0.022688 -0.009362   206.0  
4 -0.002592 -0.031988 -0.046641   135.0  

SCENARIO 1: SEVERE LEAKAGE - Us