# Phase 3: Feature Engineering

## Systematic Feature Construction for Diabetes Risk Modeling

**Pipeline Stages:**
1. Environment Configuration
2. Path Configuration and Constants
3. Data Loading and Validation
4. Outlier Refinement
5. Clinical Discretization
6. Interaction Feature Synthesis
7. Feature Aggregation
8. Advanced Features
9. Feature Quality Control and Selection
10. Stratified Data Partitioning
11. Robust Numerical Scaling
12. Multi-Task Data Export
13. Pipeline Summary

---
## 1. Environment Configuration

In [1]:
import os
import sys
import json
from pathlib import Path
from typing import List, Dict, Tuple, Optional

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor
import joblib
import warnings

warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 35)
pd.set_option('display.float_format', '{:.4f}'.format)
np.random.seed(42)

print(f"Python: {sys.version}")
print(f"Pandas: {pd.__version__}")

Python: 3.13.11 | packaged by Anaconda, Inc. | (main, Dec 10 2025, 21:21:58) [MSC v.1929 64 bit (AMD64)]
Pandas: 3.0.0


---
## 2. Path Configuration and Constants

In [2]:
PROJECT_ROOT = Path.cwd()
DATA_PROCESSED_DIR = PROJECT_ROOT / "data" / "processed"
OUTPUT_DIR = PROJECT_ROOT / "outputs" / "feature_engineering"

INPUT_FILE = DATA_PROCESSED_DIR / "CDC_Diabetes_Cleaned.csv"

# === BASELINE OUTPUT FILES ===
# These files contain engineered features WITHOUT cluster-derived features.
# Purpose: 
#   1. Baseline for ablation studies (measuring cluster feature contribution)
#   2. Input for clustering pipeline (Phase 4) to load labels and weights
TRAIN_OUTPUT = DATA_PROCESSED_DIR / "CDC_Train_Classification_BASELINE.csv"
TEST_OUTPUT = DATA_PROCESSED_DIR / "CDC_Test_Classification_BASELINE.csv"

# Raw data prepared for K-Prototypes
CLUSTERING_RAW_OUTPUT = DATA_PROCESSED_DIR / "CDC_Clustering_RAW.csv"
# Fully scaled data prepared for PCA/GMM
CLUSTERING_SCALED_OUTPUT = DATA_PROCESSED_DIR / "CDC_Clustering_SCALED.csv"
METADATA_OUTPUT = OUTPUT_DIR / "feature_metadata.json"
SCALER_OUTPUT = OUTPUT_DIR / "robust_scaler.pkl"

OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Thresholds
BMI_EXTREME_THRESHOLD = 50.0
BMI_WINSORIZE_PERCENTILE = 99
VARIANCE_THRESHOLD = 0.01
VIF_THRESHOLD = 10.0
CORRELATION_THRESHOLD = 0.85
TEST_SIZE = 0.2
RANDOM_STATE = 42

print(f"[CONFIG] Input: {INPUT_FILE}")
print(f"[CONFIG] Output: {OUTPUT_DIR}")

[CONFIG] Input: D:\ProgramSoftware\PyCharm\WorkPlace\DiaMetric-CDC\data\processed\CDC_Diabetes_Cleaned.csv
[CONFIG] Output: D:\ProgramSoftware\PyCharm\WorkPlace\DiaMetric-CDC\outputs\feature_engineering


---
## 3. Data Loading and Validation

In [3]:
def load_and_validate_data(filepath: Path) -> pd.DataFrame:
    """Load dataset and validate Sample_Weight normalization (mean ≈ 1.0)."""
    if not filepath.exists():
        raise FileNotFoundError(f"Input file not found: {filepath}")
    
    df = pd.read_csv(filepath)
    
    required_cols = ['Diabetes_binary', 'Sample_Weight', 'BMI', 'Age', 'GenHlth']
    missing = [c for c in required_cols if c not in df.columns]
    assert len(missing) == 0, f"Missing columns: {missing}"
    
    weight_mean = df['Sample_Weight'].mean()
    assert np.isclose(weight_mean, 1.0, atol=0.01), f"Weight mean: {weight_mean:.4f}"
    
    print(f"[LOADED] {len(df):,} records, {df.shape[1]} columns")
    print(f"[VALIDATED] Sample_Weight mean: {weight_mean:.6f}")
    return df


df = load_and_validate_data(INPUT_FILE)
print(f"\nColumns: {df.columns.tolist()}")
df.head()

[LOADED] 229,296 records, 23 columns
[VALIDATED] Sample_Weight mean: 1.000000

Columns: ['HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker', 'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies', 'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'GenHlth', 'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education', 'Income', 'Diabetes_binary', 'Sample_Weight']


Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income,Diabetes_binary,Sample_Weight
0,0.0,0.0,0.0,14.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,3.0,4.0,4.0,0.0,1.0,11.0,6.0,8.0,0,0.9054
1,0.0,0.0,0.0,15.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,5.0,7.0,0,0.9054
2,0.0,0.0,0.0,15.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,5.0,20.0,28.0,1.0,0.0,10.0,6.0,4.0,0,0.9054
3,0.0,0.0,0.0,15.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,3.0,0.0,29.0,0.0,0.0,7.0,5.0,2.0,0,0.9054
4,0.0,0.0,0.0,16.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,5.0,5.0,0,0.9054


---
## 4. Outlier Refinement

In [4]:
def mark_extreme_values(df: pd.DataFrame, col: str, threshold: float, marker: str) -> pd.DataFrame:
    """Create binary flag for values > threshold."""
    df = df.copy()
    df[marker] = (df[col] > threshold).astype(int)
    print(f"[MARKER] {marker}: {df[marker].sum():,} flagged ({df[marker].mean()*100:.2f}%)")
    return df


def winsorize_column(df: pd.DataFrame, col: str, pct: float = 99) -> pd.DataFrame:
    """Clip upper tail at specified percentile."""
    df = df.copy()
    upper = np.percentile(df[col], pct)
    orig_max = df[col].max()
    df[col] = df[col].clip(upper=upper)
    print(f"[WINSORIZE] {col}: max {orig_max:.2f} -> {upper:.2f} (P{pct})")
    return df


df = mark_extreme_values(df, 'BMI', BMI_EXTREME_THRESHOLD, 'Is_Extreme_BMI')
df = winsorize_column(df, 'BMI', BMI_WINSORIZE_PERCENTILE)
print(f"\nBMI after refinement:")
print(df['BMI'].describe())

[MARKER] Is_Extreme_BMI: 2,172 flagged (0.95%)
[WINSORIZE] BMI: max 98.00 -> 50.00 (P99)

BMI after refinement:
count   229296.0000
mean        28.5725
std          6.1899
min         12.0000
25%         24.0000
50%         27.0000
75%         32.0000
max         50.0000
Name: BMI, dtype: float64


---
## 5. Clinical Discretization

In [5]:
def discretize_bmi_who(df: pd.DataFrame) -> pd.DataFrame:
    """Discretize BMI into WHO categories: 1=Underweight, 2=Normal, 3=Overweight, 4=Obese."""
    df = df.copy()
    df['BMI_WHO'] = pd.cut(df['BMI'], bins=[0, 18.5, 25, 30, np.inf], 
                          labels=[1, 2, 3, 4], include_lowest=True).astype(int)
    print(f"[BMI_WHO]\n{df['BMI_WHO'].value_counts().sort_index()}")
    return df


def discretize_age_lifecycle(df: pd.DataFrame) -> pd.DataFrame:
    """Discretize age into lifecycle stages: 1=18-29, 2=30-44, 3=45-59, 4=60-74, 5=75+."""
    df = df.copy()
    mapping = {1:1, 2:1, 3:2, 4:2, 5:3, 6:3, 7:3, 8:4, 9:4, 10:4, 11:5, 12:5, 13:5}
    df['Age_Group'] = df['Age'].map(mapping)
    print(f"[Age_Group]\n{df['Age_Group'].value_counts().sort_index()}")
    return df


def discretize_health_days(df: pd.DataFrame, src: str, tgt: str) -> pd.DataFrame:
    """Discretize health burden days: 0=None, 1=Moderate(1-13d), 2=Severe(14-30d)."""
    df = df.copy()
    df[tgt] = pd.cut(df[src], bins=[-1, 0, 13, 30], 
                     labels=[0, 1, 2], include_lowest=True).astype(int)
    print(f"[{tgt}]\n{df[tgt].value_counts().sort_index()}")
    return df


print("="*50)
df = discretize_bmi_who(df)
print()
df = discretize_age_lifecycle(df)
print()
df = discretize_health_days(df, 'MentHlth', 'MentHlth_Cat')
print()
df = discretize_health_days(df, 'PhysHlth', 'PhysHlth_Cat')
print("="*50)

[BMI_WHO]
BMI_WHO
1     3043
2    73563
3    81366
4    71324
Name: count, dtype: int64

[Age_Group]
Age_Group
1    12562
2    22218
3    54377
4    86004
5    54135
Name: count, dtype: int64

[MentHlth_Cat]
MentHlth_Cat
0    152263
1     52586
2     24447
Name: count, dtype: int64

[PhysHlth_Cat]
PhysHlth_Cat
0    136811
1     59261
2     33224
Name: count, dtype: int64


---
## 6. Interaction Feature Synthesis

In [6]:
def create_interaction_features(df: pd.DataFrame) -> pd.DataFrame:
    """Generate Age×BMI interaction, CVD co-morbidity, and metabolic syndrome risk features."""
    df = df.copy()
    df['Age_BMI_Interaction'] = df['Age'] * df['BMI']
    df['CVD_Risk'] = (df['HighBP'] * df['HighChol']).astype(int)
    df['MetSyn_Risk'] = df['HighBP'] + df['HighChol'] + (df['BMI_WHO'] >= 4).astype(int)
    
    print(f"[Age_BMI_Interaction] range: [{df['Age_BMI_Interaction'].min():.1f}, {df['Age_BMI_Interaction'].max():.1f}]")
    print(f"[CVD_Risk] positive: {df['CVD_Risk'].sum():,} ({df['CVD_Risk'].mean()*100:.1f}%)")
    print(f"[MetSyn_Risk] mean: {df['MetSyn_Risk'].mean():.2f}")
    return df


print("="*50)
df = create_interaction_features(df)
print("="*50)

[Age_BMI_Interaction] range: [14.0, 650.0]
[CVD_Risk] positive: 62,174 (27.1%)
[MetSyn_Risk] mean: 1.21


---
## 7. Feature Aggregation

In [7]:
def create_aggregate_features(df: pd.DataFrame) -> pd.DataFrame:
    """Generate composite indices: chronic conditions, SDOH, lifestyle, risk behavior."""
    df = df.copy()
    df['Chronic_Count'] = df['HighBP'] + df['HighChol'] + df['HeartDiseaseorAttack'] + df['Stroke']
    df['SDOH_Index'] = (df['Education'] + df['Income']) / 2
    df['Lifestyle_Score'] = df['PhysActivity'] + df['Fruits'] + df['Veggies']
    df['Risk_Behavior'] = df['Smoker'] + df['HvyAlcoholConsump']
    
    print(f"[Chronic_Count] range: [0, {df['Chronic_Count'].max()}], mean: {df['Chronic_Count'].mean():.2f}")
    print(f"[SDOH_Index] range: [{df['SDOH_Index'].min():.1f}, {df['SDOH_Index'].max():.1f}]")
    print(f"[Lifestyle_Score] range: [0, {df['Lifestyle_Score'].max()}]")
    print(f"[Risk_Behavior] range: [0, {df['Risk_Behavior'].max()}]")
    return df


print("="*50)
df = create_aggregate_features(df)
print("="*50)

[Chronic_Count] range: [0, 4.0], mean: 1.04
[SDOH_Index] range: [1.0, 7.0]
[Lifestyle_Score] range: [0, 3.0]
[Risk_Behavior] range: [0, 2.0]


---
## 8. Advanced Features

In [8]:
def create_advanced_features(df: pd.DataFrame) -> pd.DataFrame:
    """Generate BMI² for non-linear effects and mental-physical health imbalance."""
    df = df.copy()
    df['BMI_Squared'] = df['BMI'] ** 2
    df['Health_Imbalance'] = df['MentHlth_Cat'] - df['PhysHlth_Cat']
    
    print(f"[BMI_Squared] range: [{df['BMI_Squared'].min():.1f}, {df['BMI_Squared'].max():.1f}]")
    print(f"[Health_Imbalance] range: [{df['Health_Imbalance'].min()}, {df['Health_Imbalance'].max()}]")
    return df


print("="*50)
df = create_advanced_features(df)
print(f"\nCurrent feature count: {df.shape[1]}")
print("="*50)

[BMI_Squared] range: [144.0, 2500.0]
[Health_Imbalance] range: [-2, 2]

Current feature count: 37


---
## 9. Feature Quality Control and Selection

In [9]:
def calculate_vif(df: pd.DataFrame, cols: List[str]) -> pd.DataFrame:
    """Calculate VIF with intercept for multicollinearity detection."""
    X = df[cols].values
    X_with_const = np.column_stack([np.ones(X.shape[0]), X])  # Add intercept
    vif_data = []
    for i, col in enumerate(cols):
        try:
            vif = variance_inflation_factor(X_with_const, i + 1)  # Skip intercept column
        except:
            vif = np.inf
        vif_data.append({'Feature': col, 'VIF': vif})
    return pd.DataFrame(vif_data).sort_values('VIF', ascending=False)


def filter_high_vif(df: pd.DataFrame, cols: List[str], threshold: float = 10.0) -> Tuple[List[str], List[str]]:
    """Iteratively remove features with VIF > threshold."""
    remaining, removed = cols.copy(), []
    while True:
        if len(remaining) < 2:
            break
        vif_df = calculate_vif(df, remaining)
        max_vif = vif_df['VIF'].max()
        if max_vif <= threshold or np.isinf(max_vif):
            break
        worst = vif_df.iloc[0]['Feature']
        remaining.remove(worst)
        removed.append(worst)
        print(f"  [VIF] Removed {worst} (VIF={max_vif:.2f})")
    return remaining, removed


def filter_near_zero_var(df: pd.DataFrame, cols: List[str], threshold: float = 0.01) -> Tuple[List[str], List[str]]:
    """Remove features with variance < threshold."""
    retained, removed = [], []
    for col in cols:
        v = df[col].var()
        if v < threshold:
            removed.append(col)
            print(f"  [VAR] Removed {col} (var={v:.6f})")
        else:
            retained.append(col)
    return retained, removed


def filter_high_corr(df: pd.DataFrame, cols: List[str], threshold: float = 0.85) -> Tuple[List[str], List[str]]:
    """Remove one feature from highly correlated pairs (Spearman)."""
    corr = df[cols].corr(method='spearman').abs()
    to_remove = set()
    for i in range(len(cols)):
        if cols[i] in to_remove:
            continue  # Skip already marked features
        for j in range(i + 1, len(cols)):
            if cols[j] in to_remove:
                continue
            if corr.iloc[i, j] > threshold:
                ci, cj = cols[i], cols[j]
                drop = ci if df[ci].var() < df[cj].var() else cj
                to_remove.add(drop)
                print(f"  [CORR] {ci} vs {cj}: r={corr.iloc[i,j]:.3f}, removed {drop}")
    return [c for c in cols if c not in to_remove], list(to_remove)

In [10]:
TARGET_COL = 'Diabetes_binary'
WEIGHT_COL = 'Sample_Weight'
LOW_UTILITY = ['AnyHealthcare', 'CholCheck']

all_features = [c for c in df.columns if c not in [TARGET_COL, WEIGHT_COL]]
print(f"[INFO] Features before selection: {len(all_features)}")

# Tracking
removed_log = {'low_utility': [], 'near_zero_var': [], 'high_vif': [], 'high_corr': []}

[INFO] Features before selection: 35


In [11]:
print("="*50)
print("[STEP 1] Low Utility Removal")
for f in LOW_UTILITY:
    if f in all_features:
        all_features.remove(f)
        removed_log['low_utility'].append(f)
        print(f"  Removed {f}")
print(f"  Remaining: {len(all_features)}")

[STEP 1] Low Utility Removal
  Removed AnyHealthcare
  Removed CholCheck
  Remaining: 33


In [12]:
print("\n[STEP 2] Near-Zero Variance Filtering")
all_features, removed_var = filter_near_zero_var(df, all_features, VARIANCE_THRESHOLD)
removed_log['near_zero_var'] = removed_var
if not removed_var:
    print("  No features removed")
print(f"  Remaining: {len(all_features)}")


[STEP 2] Near-Zero Variance Filtering
  [VAR] Removed Is_Extreme_BMI (var=0.009383)
  Remaining: 32


In [13]:
print("\n[STEP 3] VIF Multicollinearity Filtering")
numeric_cols = df[all_features].select_dtypes(include=[np.number]).columns.tolist()
all_features, removed_vif = filter_high_vif(df, numeric_cols, VIF_THRESHOLD)
removed_log['high_vif'] = removed_vif
if not removed_vif:
    print("  No features removed")
print(f"  Remaining: {len(all_features)}")


[STEP 3] VIF Multicollinearity Filtering
  No features removed
  Remaining: 32


In [14]:
print("\n[STEP 4] High Correlation Filtering")
FINAL_FEATURES, removed_corr = filter_high_corr(df, all_features, CORRELATION_THRESHOLD)
removed_log['high_corr'] = removed_corr
if not removed_corr:
    print("  No features removed")
print(f"  Remaining: {len(FINAL_FEATURES)}")


[STEP 4] High Correlation Filtering
  [CORR] BMI vs BMI_WHO: r=0.946, removed BMI_WHO
  [CORR] BMI vs BMI_Squared: r=1.000, removed BMI
  [CORR] Smoker vs Risk_Behavior: r=0.946, removed Smoker
  [CORR] MentHlth vs MentHlth_Cat: r=0.991, removed MentHlth_Cat
  [CORR] PhysHlth vs PhysHlth_Cat: r=0.988, removed PhysHlth_Cat
  [CORR] Age vs Age_Group: r=0.964, removed Age_Group
  [CORR] Age vs Age_BMI_Interaction: r=0.851, removed Age
  [CORR] Income vs SDOH_Index: r=0.930, removed SDOH_Index
  Remaining: 24


In [15]:
print("\n" + "="*50)
print("FEATURE SELECTION SUMMARY")
print("="*50)
print(f"Initial: {len(df.columns) - 2} | Final: {len(FINAL_FEATURES)}")
print(f"\n[REMOVED LOG]")
for k, v in removed_log.items():
    print(f"  {k}: {v}")
all_removed = sum(removed_log.values(), [])
print(f"\nTotal removed: {len(all_removed)}")
print(f"\n[FINAL FEATURES]\n{FINAL_FEATURES}")
print("="*50)


FEATURE SELECTION SUMMARY
Initial: 35 | Final: 24

[REMOVED LOG]
  low_utility: ['AnyHealthcare', 'CholCheck']
  near_zero_var: ['Is_Extreme_BMI']
  high_vif: []
  high_corr: ['Age_Group', 'SDOH_Index', 'Age', 'Smoker', 'BMI_WHO', 'PhysHlth_Cat', 'BMI', 'MentHlth_Cat']

Total removed: 11

[FINAL FEATURES]
['HighBP', 'HighChol', 'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies', 'HvyAlcoholConsump', 'NoDocbcCost', 'GenHlth', 'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Education', 'Income', 'Age_BMI_Interaction', 'CVD_Risk', 'MetSyn_Risk', 'Chronic_Count', 'Lifestyle_Score', 'Risk_Behavior', 'BMI_Squared', 'Health_Imbalance']


---
## 10. Stratified Data Partitioning

In [16]:
def stratified_split(df: pd.DataFrame, features: List[str], target: str, weight: str,
                     test_size: float = 0.2, seed: int = 42):
    """Execute stratified train-test split preserving class distribution and sample weights."""
    X, y, w = df[features], df[target], df[weight]
    return train_test_split(X, y, w, test_size=test_size, stratify=y, random_state=seed)


print("="*50)
X_train, X_test, y_train, y_test, w_train, w_test = stratified_split(
    df, FINAL_FEATURES, TARGET_COL, WEIGHT_COL, TEST_SIZE, RANDOM_STATE
)

print(f"Train: {len(X_train):,} ({(1-TEST_SIZE)*100:.0f}%)")
print(f"Test:  {len(X_test):,} ({TEST_SIZE*100:.0f}%)")

train_rate, test_rate = y_train.mean(), y_test.mean()
print(f"\nClass distribution:")
print(f"  Train positive: {train_rate:.4f}")
print(f"  Test positive:  {test_rate:.4f}")

assert np.isclose(train_rate, test_rate, atol=0.01), "Stratification failed"
print("\n[VALIDATED] Stratification verified")
print("="*50)

Train: 183,436 (80%)
Test:  45,860 (20%)

Class distribution:
  Train positive: 0.1727
  Test positive:  0.1727

[VALIDATED] Stratification verified


---
## 11. Robust Numerical Scaling

In [17]:
def identify_continuous(df: pd.DataFrame, threshold: int = 10) -> List[str]:
    """Identify features with >threshold unique values as continuous."""
    return [c for c in df.columns if df[c].nunique() > threshold]


def apply_robust_scaling(X_train: pd.DataFrame, X_test: pd.DataFrame, 
                         cols: List[str]) -> Tuple[pd.DataFrame, pd.DataFrame, RobustScaler]:
    """Apply RobustScaler (median/IQR normalization) fitted on training set only."""
    Xtr, Xte = X_train.copy(), X_test.copy()
    cols_scale = [c for c in cols if c in Xtr.columns]
    if not cols_scale:
        print("[WARNING] No continuous columns")
        return Xtr, Xte, None
    scaler = RobustScaler()
    Xtr[cols_scale] = scaler.fit_transform(Xtr[cols_scale])
    Xte[cols_scale] = scaler.transform(Xte[cols_scale])
    print(f"[SCALING] Applied to {len(cols_scale)} features: {cols_scale}")
    return Xtr, Xte, scaler


print("="*50)
continuous_features = identify_continuous(X_train, threshold=10)
print(f"Continuous features: {len(continuous_features)}")

X_train_scaled, X_test_scaled, scaler = apply_robust_scaling(X_train, X_test, continuous_features)

if scaler:
    print(f"\nPost-scaling validation (train):")
    for c in continuous_features[:3]:
        if c in X_train_scaled.columns:
            print(f"  {c}: median={X_train_scaled[c].median():.4f}")
print("\n[VALIDATED] Scaling complete")
print("="*50)

Continuous features: 4
[SCALING] Applied to 4 features: ['MentHlth', 'PhysHlth', 'Age_BMI_Interaction', 'BMI_Squared']

Post-scaling validation (train):
  MentHlth: median=0.0000
  PhysHlth: median=0.0000
  Age_BMI_Interaction: median=0.0000

[VALIDATED] Scaling complete


---
## 12. Multi-Task Data Export

In [18]:
def export_classification(X_tr, X_te, y_tr, y_te, w_tr, w_te, tr_path, te_path):
    """Export train and test datasets with features, target, and sample weights."""
    train_df = X_tr.copy()
    train_df['Diabetes_binary'] = y_tr.values
    train_df['Sample_Weight'] = w_tr.values
    
    test_df = X_te.copy()
    test_df['Diabetes_binary'] = y_te.values
    test_df['Sample_Weight'] = w_te.values
    
    train_df.to_csv(tr_path, index=False)
    test_df.to_csv(te_path, index=False)
    print(f"[EXPORT] Train: {tr_path.name} ({len(train_df):,})")
    print(f"[EXPORT] Test: {te_path.name} ({len(test_df):,})")


def export_clustering(df, features, path, scaler=None, cont_cols=None):
    """Export feature matrix for clustering tasks without target column."""
    cluster_df = df[features].copy()
    if scaler and cont_cols:
        cols = [c for c in cont_cols if c in cluster_df.columns]
        if cols:
            cluster_df[cols] = scaler.transform(cluster_df[cols])
    cluster_df.to_csv(path, index=False)
    print(f"[EXPORT] Clustering: {path.name} ({len(cluster_df):,}, {len(features)} features)")


def export_metadata(features, removed, cont_cols, path):
    """Export feature metadata including binning definitions and pipeline configuration."""
    # Binning definitions for engineered categorical features
    binning_definitions = {
        'BMI_WHO': {
            'description': 'WHO BMI classification',
            'bins': {'1': 'Underweight (BMI < 18.5)', '2': 'Normal (18.5 <= BMI < 25)', 
                     '3': 'Overweight (25 <= BMI < 30)', '4': 'Obese (BMI >= 30)'}
        },
        'Age_Group': {
            'description': 'Life stage categories derived from Age (1-13)',
            'bins': {'1': '18-29 (Young Adult)', '2': '30-44 (Early Middle Age)', 
                     '3': '45-59 (Late Middle Age)', '4': '60-74 (Young-Old)', '5': '75+ (Old-Old)'}
        },
        'MentHlth_Cat': {
            'description': 'Mental health burden (days in past 30)',
            'bins': {'0': 'None (0 days)', '1': 'Moderate (1-13 days)', '2': 'Severe (14-30 days)'}
        },
        'PhysHlth_Cat': {
            'description': 'Physical health burden (days in past 30)',
            'bins': {'0': 'None (0 days)', '1': 'Moderate (1-13 days)', '2': 'Severe (14-30 days)'}
        },
        'Is_Extreme_BMI': {
            'description': 'Extreme obesity indicator',
            'bins': {'0': 'BMI <= 50', '1': 'BMI > 50 (extreme)'}
        }
    }
    
    # Composite feature definitions
    composite_definitions = {
        'Age_BMI_Interaction': 'Age × BMI product term for non-linear synergy',
        'CVD_Risk': 'HighBP × HighChol (cardiovascular co-morbidity indicator)',
        'MetSyn_Risk': 'HighBP + HighChol + (BMI_WHO >= 4), range [0-3]',
        'Chronic_Count': 'Sum of HighBP, HighChol, HeartDiseaseorAttack, Stroke',
        'SDOH_Index': '(Education + Income) / 2, social determinants composite',
        'Lifestyle_Score': 'PhysActivity + Fruits + Veggies, range [0-3]',
        'Risk_Behavior': 'Smoker + HvyAlcoholConsump, range [0-2]',
        'BMI_Squared': 'BMI² for non-linear effects',
        'Health_Imbalance': 'MentHlth_Cat - PhysHlth_Cat, range [-2, 2]'
    }
    
    meta = {
        'final_features': features,
        'feature_count': len(features),
        'removed_features': removed,
        'continuous_features': cont_cols,
        'scaling_method': 'RobustScaler',
        'train_test_split': {'test_size': TEST_SIZE, 'random_state': RANDOM_STATE, 'stratified': True},
        'binning_definitions': binning_definitions,
        'composite_definitions': composite_definitions,
        'thresholds': {
            'BMI_EXTREME': BMI_EXTREME_THRESHOLD,
            'BMI_WINSORIZE_PCT': BMI_WINSORIZE_PERCENTILE,
            'VARIANCE': VARIANCE_THRESHOLD,
            'VIF': VIF_THRESHOLD,
            'CORRELATION': CORRELATION_THRESHOLD
        }
    }
    with open(path, 'w') as f:
        json.dump(meta, f, indent=2)
    print(f"[EXPORT] Metadata: {path.name}")

In [19]:
print("="*50)
print("MULTI-TASK DATA EXPORT")
print("="*50)

export_classification(X_train_scaled, X_test_scaled, y_train, y_test, 
                      w_train, w_test, TRAIN_OUTPUT, TEST_OUTPUT)

# === DATA LEAKAGE PREVENTION: Split Column Methodology ===
# Test set must remain unseen during all model fitting steps.
# A 'split' column is appended to clustering data to maintain explicit
# train/test provenance throughout the pipeline.
# Prevents index-based errors when CSV files are exported without indices.
df_with_split = df.copy()
df_with_split['split'] = 'train'
df_with_split.loc[X_test.index, 'split'] = 'test'

print(f"\n[SPLIT] Train samples: {(df_with_split['split'] == 'train').sum():,}")
print(f"[SPLIT] Test samples: {(df_with_split['split'] == 'test').sum():,}")

# === DUAL-SCALE EXPORT STRATEGY ===
# Mixed-type clustering requires raw unscaled data to preserve
# categorical semantics and numerical scales.
export_clustering(
    df_with_split,
    FINAL_FEATURES + ['split'],
    CLUSTERING_RAW_OUTPUT,
    scaler=None
)

# === STANDARDIZED DATA FOR PCA ===
# PCA requires standardized features to prevent scale-dominance bias.
# Export StandardScaler-normalized data for dimensionality reduction
full_scaler = StandardScaler()
X_fully_scaled = full_scaler.fit_transform(df[FINAL_FEATURES])
df_fully_scaled = pd.DataFrame(X_fully_scaled, columns=FINAL_FEATURES, index=df.index)
df_fully_scaled['split'] = df_with_split['split']
export_clustering(
    df_fully_scaled,
    FINAL_FEATURES + ['split'],
    CLUSTERING_SCALED_OUTPUT,
    scaler=None
)

export_metadata(FINAL_FEATURES, removed_log, continuous_features, METADATA_OUTPUT)
export_metadata(FINAL_FEATURES, removed_log, continuous_features, METADATA_OUTPUT)
if scaler:
    joblib.dump(scaler, SCALER_OUTPUT)
    print(f"[EXPORT] Scaler: {SCALER_OUTPUT.name}")

print("="*50)

MULTI-TASK DATA EXPORT
[EXPORT] Train: CDC_Train_Classification_BASELINE.csv (183,436)
[EXPORT] Test: CDC_Test_Classification_BASELINE.csv (45,860)

[SPLIT] Train samples: 183,436
[SPLIT] Test samples: 45,860
[EXPORT] Clustering: CDC_Clustering_RAW.csv (229,296, 25 features)
[EXPORT] Clustering: CDC_Clustering_SCALED.csv (229,296, 25 features)
[EXPORT] Metadata: feature_metadata.json
[EXPORT] Metadata: feature_metadata.json
[EXPORT] Scaler: robust_scaler.pkl


---
## 13. Pipeline Summary

In [20]:
print("\n" + "#"*50)
print("FEATURE ENGINEERING COMPLETE")
print("#"*50)

print("\n[STAGES]")
print("  1. Environment Configuration: Library imports and random seed initialization")
print("  2. Path Configuration and Constants: File paths and threshold definitions")
print("  3. Data Loading and Validation: CSV loading with Sample_Weight normalization check")
print("  4. Outlier Refinement: Is_Extreme_BMI marker + P99 winsorization")
print("  5. Clinical Discretization: BMI_WHO, Age_Group, MentHlth_Cat, PhysHlth_Cat")
print("  6. Interaction Feature Synthesis: Age_BMI_Interaction, CVD_Risk, MetSyn_Risk")
print("  7. Feature Aggregation: Chronic_Count, SDOH_Index, Lifestyle_Score, Risk_Behavior")
print("  8. Advanced Features: BMI_Squared, Health_Imbalance")
print("  9. Feature Quality Control and Selection: Low-utility + Variance + VIF + Correlation filtering")
print("  10. Stratified Data Partitioning: 80/20 split with class balance preservation")
print("  11. Robust Numerical Scaling: RobustScaler fitted on training set only")
print("  12. Multi-Task Data Export: Classification (train/test) + Clustering (raw/scaled) datasets")
print("  13. Pipeline Summary: Complete workflow overview and file manifest")

print(f"\n[STATS] Initial: {len(df.columns)-2} | Final: {len(FINAL_FEATURES)} | Removed: {len(all_removed)}")

print(f"\n[FILES]")
print(f"  {TRAIN_OUTPUT.name}")
print(f"  {TEST_OUTPUT.name}")
print(f"  {CLUSTERING_RAW_OUTPUT.name} (For K-Prototypes)")
print(f"  {CLUSTERING_SCALED_OUTPUT.name} (For PCA/GMM)")
print(f"  {METADATA_OUTPUT.name}")
print(f"  {SCALER_OUTPUT.name}")

print("#"*50)
print("\n[READY FOR] Phase 4: Clustering | Phase 5: Classification")


##################################################
FEATURE ENGINEERING COMPLETE
##################################################

[STAGES]
  1. Environment Configuration: Library imports and random seed initialization
  2. Path Configuration and Constants: File paths and threshold definitions
  3. Data Loading and Validation: CSV loading with Sample_Weight normalization check
  4. Outlier Refinement: Is_Extreme_BMI marker + P99 winsorization
  5. Clinical Discretization: BMI_WHO, Age_Group, MentHlth_Cat, PhysHlth_Cat
  6. Interaction Feature Synthesis: Age_BMI_Interaction, CVD_Risk, MetSyn_Risk
  7. Feature Aggregation: Chronic_Count, SDOH_Index, Lifestyle_Score, Risk_Behavior
  8. Advanced Features: BMI_Squared, Health_Imbalance
  9. Feature Quality Control and Selection: Low-utility + Variance + VIF + Correlation filtering
  10. Stratified Data Partitioning: 80/20 split with class balance preservation
  11. Robust Numerical Scaling: RobustScaler fitted on training set only
  12. M