In [1]:
# ==================================================================
# RAI DASHBOARD - CLINICAL FEATURES ANALYSIS (With Tuned XGBoost)
# ==================================================================

# --- 1. IMPORTS & CONFIGURATION ---
import pandas as pd
import numpy as np
from pathlib import Path
import os
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import RobustScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import roc_auc_score, classification_report

# RAI imports
from responsibleai import RAIInsights
from raiwidgets import ResponsibleAIDashboard

warnings.filterwarnings("ignore")

# ==================================================================
# 1. DATA LOADING & PREPROCESSING
# ==================================================================
print("üîç Loading and preparing data...")

# Setup paths
PROJECT_ROOT = Path(os.environ.get('PROJECT_ROOT', '../../')).resolve()
DATA_RAW_DIR = Path(os.environ.get('DATA_RAW_DIR', PROJECT_ROOT / 'data/raw')).resolve()
data_file = DATA_RAW_DIR / 'heart disease.parquet'

# Load dataset
if not data_file.is_file():
    raise FileNotFoundError(f"Data file not found: {data_file.resolve()}")

df = pd.read_parquet(data_file)
df = df.drop(columns=['id']) if 'id' in df.columns else df
df = df.drop_duplicates().reset_index(drop=True)

# Column mapping
column_mapping = {
    'age': 'Age',  
    'gender': 'Sex',
    'height': 'Height',
    'weight': 'Weight',
    'ap_hi': 'Systolic_BP',
    'ap_lo': 'Diastolic_BP',
    'cholesterol': 'Cholesterol_Level',
    'gluc': 'Glucose_Level',
    'smoke': 'Smoking_Status',
    'alco': 'Alcohol_Intake',
    'active': 'Physical_Activity',
    'cardio': 'target'
}
df = df.rename(columns={k: v for k, v in column_mapping.items() if k in df.columns})

# Convert Age from days to years
if 'Age' in df.columns:
    df['Age_Years'] = (df['Age'] / 365.25).round().astype(int)
    df = df.drop(columns=['Age'])

# Remove invalid ages
df = df[df['Age_Years'].between(0, 100)]

# Calculate BMI and drop original columns
if 'Height' in df.columns and 'Weight' in df.columns:
    df['BMI'] = df['Weight'] / (df['Height'] / 100) ** 2
    df = df.drop(columns=['Height', 'Weight'])

# Calculate and validate Pulse Pressure
if 'Systolic_BP' in df.columns and 'Diastolic_BP' in df.columns:
    df['Pulse_Pressure'] = df['Systolic_BP'] - df['Diastolic_BP']
    invalid_bp = (df['Pulse_Pressure'] < 20) | (df['Pulse_Pressure'] > 100)
    df.loc[invalid_bp, ['Systolic_BP', 'Diastolic_BP', 'Pulse_Pressure']] = np.nan

print(f"‚úÖ Initial preprocessing complete. Shape: {df.shape}")

# ==================================================================
# 2. PREPARE RAI FEATURES
# ==================================================================
print("\nüìä Preparing features for RAI...")

rai_features = [
    'Age_Years', 'Sex', 'BMI', 
    'Systolic_BP', 'Diastolic_BP', 'Pulse_Pressure',
    'Cholesterol_Level', 'Glucose_Level',
    'Smoking_Status', 'Alcohol_Intake', 'Physical_Activity'
]
target_col = 'target'
MAX_SAMPLES = 5000

# Keep only existing features and handle missing values
rai_features = [f for f in rai_features if f in df.columns]
df_rai = df[rai_features + [target_col]].copy()

# Handle missing values
for col in df_rai.columns:
    if df_rai[col].isna().any():
        if df_rai[col].dtype in ['int64', 'float64']:
            df_rai[col].fillna(df_rai[col].median(), inplace=True)
        else:
            df_rai[col].fillna(df_rai[col].mode()[0], inplace=True)

# First stratified split to get initial 5000 samples
if len(df_rai) > MAX_SAMPLES:
    _, df_rai = train_test_split(
        df_rai, 
        train_size=MAX_SAMPLES,
        stratify=df_rai[target_col],
        random_state=42
    )
    df_rai = df_rai.reset_index(drop=True)

print(f"‚úÖ Final dataset shape: {df_rai.shape}")
print("\nüìä Class distribution:")
class_dist = df_rai[target_col].value_counts()
for cls in sorted(class_dist.index):
    count = class_dist[cls]
    print(f"   Class {cls}: {count:,} samples ({count/len(df_rai):.1%})")

# ==================================================================
# 3. TRAIN/TEST SPLIT & MODEL SETUP
# ==================================================================
print("\nü§ñ Setting up model...")

X = df_rai[rai_features]
y = df_rai[target_col]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Define feature types
numerical_features = ['Age_Years', 'BMI', 'Systolic_BP', 'Diastolic_BP', 'Pulse_Pressure']
ordinal_features = ['Cholesterol_Level', 'Glucose_Level']
nominal_features = ['Sex', 'Smoking_Status', 'Alcohol_Intake', 'Physical_Activity']

# Create preprocessor
preprocessor = ColumnTransformer(transformers=[
    ('num', RobustScaler(), numerical_features),
    ('ord', OrdinalEncoder(categories=[[1, 2, 3]] * 2), ordinal_features),
    ('nom', OneHotEncoder(drop='first', sparse=False), nominal_features)
], remainder='drop')

# Best XGBoost parameters (from Optuna)
best_params = {
    'n_estimators': 422,
    'max_depth': 4,
    'learning_rate': 0.06439748935039273,
    'subsample': 0.7224788588671935,
    'colsample_bytree': 0.6288331170227341,
    'random_state': 42,
    'use_label_encoder': False,
    'eval_metric': 'logloss'
}

# Pipeline with SMOTE and tuned XGBoost
pipeline = ImbPipeline([
    ('preprocessor', preprocessor),
    ('balancer', SMOTE(random_state=42)),
    ('classifier', XGBClassifier(**best_params))
])

# Train model
print("\n‚öôÔ∏è Training model with tuned XGBoost parameters...")
pipeline.fit(X_train, y_train)

# Evaluate
y_pred = pipeline.predict(X_test)
y_pred_proba = pipeline.predict_proba(X_test)[:, 1]

print("\nüìä Model Performance (Tuned):")
print(f"ROC AUC: {roc_auc_score(y_test, y_pred_proba):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# ==================================================================
# 4. RAI DASHBOARD SETUP
# ==================================================================
print("\nüöÄ Setting up RAI Dashboard...")

train_with_target = X_train.copy()
train_with_target[target_col] = y_train
test_with_target = X_test.copy()
test_with_target[target_col] = y_test

rai_insights = RAIInsights(
    model=pipeline,
    train=train_with_target,
    test=test_with_target,
    target_column=target_col,
    task_type='classification',
    categorical_features=ordinal_features + nominal_features
)

# Add explainer and error analysis
rai_insights.explainer.add()
rai_insights.error_analysis.add()

# Compute insights
print("\n‚öôÔ∏è Computing RAI insights (this may take several minutes)...")
rai_insights.compute()

# Launch dashboard
print("\nüéâ Launching RAI Dashboard...")
ResponsibleAIDashboard(rai_insights)

print("\n" + "="*60)
print("‚ú® RAI DASHBOARD IS NOW RUNNING ‚ú®")
print("="*60)
print(f"üìä Access the dashboard at: http://localhost:5000")
print(f"üìà Features analyzed: {len(rai_features)}")
print(f"üìù Training samples: {len(X_train):,}")
print(f"üìù Test samples: {len(X_test):,}")
print("="*60)
print("\n‚ö° Press Ctrl+C to stop the server")
print("="*60)

# Keep notebook running
try:
    import time
    while True:
        time.sleep(1)
except KeyboardInterrupt:
    print("\n\nüëã Dashboard stopped")

  from .autonotebook import tqdm as notebook_tqdm


üîç Loading and preparing data...
‚úÖ Initial preprocessing complete. Shape: (69976, 12)

üìä Preparing features for RAI...
‚úÖ Final dataset shape: (64976, 12)

üìä Class distribution:
   Class 0: 32,503 samples (50.0%)
   Class 1: 32,473 samples (50.0%)

ü§ñ Setting up model...

‚öôÔ∏è Training model with tuned XGBoost parameters...

üìä Model Performance (Tuned):
ROC AUC: 0.7932

Classification Report:
              precision    recall  f1-score   support

           0       0.71      0.76      0.74      6501
           1       0.74      0.69      0.71      6495

    accuracy                           0.73     12996
   macro avg       0.73      0.73      0.72     12996
weighted avg       0.73      0.73      0.72     12996


üöÄ Setting up RAI Dashboard...

‚öôÔ∏è Computing RAI insights (this may take several minutes)...
Causal Effects
Current Status: Generating Causal Effects.
Current Status: Finished generating causal effects.
Time taken: 0.0 min 2.2000000171829015e-05 sec
Co

  File "p:\projects\heart disease prediction\env\lib\site-packages\joblib\externals\loky\backend\context.py", line 199, in _count_physical_cores
    cpu_info = subprocess.run(
  File "p:\projects\heart disease prediction\env\lib\subprocess.py", line 503, in run
    with Popen(*popenargs, **kwargs) as process:
  File "p:\projects\heart disease prediction\env\lib\subprocess.py", line 971, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "p:\projects\heart disease prediction\env\lib\subprocess.py", line 1456, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,


Current Status: Finished generating error analysis reports.
Time taken: 0.0 min 0.30314739999994345 sec
Explanations
Current Status: Explaining 11 features
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001067 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 532
[LightGBM] [Info] Number of data points in the train set: 51980, number of used features: 11
[LightGBM] [Info] Start training from score 0.011971
Current Status: Explained 11 features.
Time taken: 0.0 min 0.7783541000007972 sec

üéâ Launching RAI Dashboard...
ResponsibleAI started at http://localhost:8704

‚ú® RAI DASHBOARD IS NOW RUNNING ‚ú®
üìä Access the dashboard at: http://localhost:5000
üìà Features analyzed: 11
üìù Training samples: 51,980
üìù Test samples: 12,996

‚ö° Press Ctrl+C to stop the server


üëã Dashboard stopped


In [6]:
# Import TomekLinks
from imblearn.under_sampling import TomekLinks

# ==================================================================
# 2. CORRECT MUTUALLY EXCLUSIVE FEATURE CREATION
# ==================================================================

print("\n‚ú® Creating PROPER mutually exclusive features...")

df_after = df.copy()

# 1. For "Sedentary Elderly" - Physical Activity == 0 should be EXCLUDED from original
df_after['Sedentary_Elderly'] = (
    (df_after['Age_Years'] > 55) & (df_after['Physical_Activity'] == 0)
).astype(int)

# Create modified physical activity where class 0 is EXCLUDED/REMOVED
df_after['Physical_Activity_mod'] = df_after['Physical_Activity'].copy()
# Remove the specific class (0) that went into the new feature
df_after.loc[df_after['Sedentary_Elderly'] == 1, 'Physical_Activity_mod'] = np.nan
# Option: Remove entirely or create a new category like 'excluded_sedentary'

# 2. For "Pre-Hypertension with Normal Cholesterol" - Cholesterol_Level == 1 should be EXCLUDED
df_after['PreHtn_NormalChol'] = (
    (df_after['Systolic_BP'].between(120, 139)) & (df_after['Cholesterol_Level'] == 1)
).astype(int)

# Create modified cholesterol where class 1 is EXCLUDED/REMOVED  
df_after['Cholesterol_Level_mod'] = df_after['Cholesterol_Level'].copy()
df_after.loc[df_after['PreHtn_NormalChol'] == 1, 'Cholesterol_Level_mod'] = np.nan

print("‚úÖ Proper mutually exclusive features created.")

# ==================================================================
# 3. VERIFICATION OF MUTUAL EXCLUSIVITY
# ==================================================================

def verify_mutual_exclusivity(df_after):
    """Verify that the mutual exclusivity logic is working correctly"""
    
    print("\n" + "="*50)
    print("MUTUAL EXCLUSIVITY VERIFICATION")
    print("="*50)
    
    # Check 1: Sedentary Elderly vs Physical_Activity_mod
    sedentary_cases = df_after[df_after['Sedentary_Elderly'] == 1]
    print(f"\n1. Sedentary Elderly Cases: {len(sedentary_cases)}")
    print("   Physical_Activity_mod values in these cases:")
    print(f"   {sedentary_cases['Physical_Activity_mod'].value_counts(dropna=False)}")
    
    # Check 2: PreHtn_NormalChol vs Cholesterol_Level_mod
    prehtn_cases = df_after[df_after['PreHtn_NormalChol'] == 1]
    print(f"\n2. PreHtn_NormalChol Cases: {len(prehtn_cases)}")
    print("   Cholesterol_Level_mod values in these cases:")
    print(f"   {prehtn_cases['Cholesterol_Level_mod'].value_counts(dropna=False)}")
    
    # Check 3: Verify no double-counting
    overlapping = df_after[
        (df_after['Sedentary_Elderly'] == 1) & 
        (df_after['Physical_Activity_mod'] == 0)
    ]
    print(f"\n3. Cases with double-counting (should be 0): {len(overlapping)}")

verify_mutual_exclusivity(df_after)

# ==================================================================
# 4. UPDATED ERROR ANALYSIS WITH CORRECT SUBGROUP DEFINITION
# ==================================================================

def run_error_analysis_corrected(X, y, description, original_df, use_modified_features=False):
    """Trains a model with CORRECT mutual exclusivity logic."""
    
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Handle NaN values created by mutual exclusivity
    if use_modified_features:
        X_train = X_train.fillna(-999)  # Special value for excluded cases
        X_test = X_test.fillna(-999)

    # Preprocessing setup
    numerical_features = X.select_dtypes(include=np.number).columns.tolist()
    categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()
    
    # Handle modified features
    modified_features = ['Physical_Activity_mod', 'Cholesterol_Level_mod']
    for f in modified_features:
        if f in X.columns:
            if f in numerical_features:
                numerical_features.remove(f)
            if f not in categorical_features:
                categorical_features.append(f)

    preprocessor = ColumnTransformer(transformers=[
        ('num', RobustScaler(), numerical_features),
        ('cat', OneHotEncoder(drop='first', sparse=False, handle_unknown='ignore'), categorical_features)
    ], remainder='passthrough')

    pipeline = ImbPipeline([
        ('preprocessor', preprocessor),
        ('balancer', TomekLinks()),
        ('classifier', XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'))
    ])
    
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    results_df = X_test.copy()
    results_df['true_label'] = y_test
    results_df['predicted_label'] = y_pred
    
    # Join with original data for consistent subgroup definition
    # Use the index to join and add _orig suffix for original features
    original_cols = ['Cholesterol_Level', 'Physical_Activity', 'Age_Years']
    for col in original_cols:
        if col in original_df.columns:
            results_df[f'{col}_orig'] = original_df.loc[results_df.index, col]

    # Define subgroup using ORIGINAL features (consistent definition)
    subgroup = results_df[
        (results_df['Systolic_BP'] <= 139.5) &
        (results_df['Age_Years_orig'] > 57.5) &
        (results_df['Cholesterol_Level_orig'] != 3) &
        (results_df['BMI'] > 21.97) &
        (results_df['Physical_Activity_orig'] == 0)
    ]
    
    if len(subgroup) == 0:
        print(f"\n‚ö†Ô∏è Subgroup for '{description}' is empty. Cannot calculate error rate.")
        return

    errors = (subgroup['true_label'] != subgroup['predicted_label']).sum()
    error_rate = errors / len(subgroup)
    
    print(f"\n--- {description} ---")
    print(f"Subgroup size: {len(subgroup)}")
    print(f"Error Rate: {error_rate:.2%}")
    
    # Show mutual exclusivity impact
    if use_modified_features:
        print(f"\nMutual Exclusivity Impact:")
        sedentary_in_subgroup = subgroup['Sedentary_Elderly'].sum()
        prehtn_in_subgroup = subgroup['PreHtn_NormalChol'].sum()
        
        print(f"  - {sedentary_in_subgroup}/{len(subgroup)} cases captured by 'Sedentary_Elderly'")
        print(f"  - {prehtn_in_subgroup}/{len(subgroup)} cases captured by 'PreHtn_NormalChol'")
        
        # Show what happened to the original features for these cases
        if sedentary_in_subgroup > 0:
            sedentary_cases = subgroup[subgroup['Sedentary_Elderly'] == 1]
            unique_vals = sedentary_cases['Physical_Activity_mod'].unique()
            print(f"  - Physical_Activity_mod values for sedentary cases: {unique_vals}")

# Run analyses
print("\n" + "="*60)
print("RUNNING CORRECTED MUTUAL EXCLUSIVITY ANALYSIS")
print("="*60)

# BEFORE analysis (original features)
base_features = [
    'Age_Years', 'Sex', 'BMI', 'Systolic_BP', 'Diastolic_BP', 'Pulse_Pressure',
    'Cholesterol_Level', 'Glucose_Level', 'Smoking_Status', 'Alcohol_Intake', 'Physical_Activity'
]
X_before = df[base_features]
y_before = df['target']
run_error_analysis_corrected(X_before, y_before, "BEFORE New Features", df, False)

# AFTER analysis with PROPER mutually exclusive features
extended_features = [
    'Age_Years', 'Sex', 'BMI', 'Systolic_BP', 'Diastolic_BP', 'Pulse_Pressure',
    'Glucose_Level', 'Smoking_Status', 'Alcohol_Intake',
    'Physical_Activity_mod', 'Cholesterol_Level_mod',  # Modified originals
    'PreHtn_NormalChol', 'Sedentary_Elderly', 'Age_Weighted_BMI'  # New features
]
extended_features = [col for col in extended_features if col in df_after.columns]

X_after = df_after[extended_features]
y_after = df_after['target']
run_error_analysis_corrected(X_after, y_after, "AFTER New Features (Mutually Exclusive)", df, True)


‚ú® Creating PROPER mutually exclusive features...
‚úÖ Proper mutually exclusive features created.

MUTUAL EXCLUSIVITY VERIFICATION

1. Sedentary Elderly Cases: 6009
   Physical_Activity_mod values in these cases:
   NaN    6009
Name: Physical_Activity_mod, dtype: int64

2. PreHtn_NormalChol Cases: 30754
   Cholesterol_Level_mod values in these cases:
   NaN    30754
Name: Cholesterol_Level_mod, dtype: int64

3. Cases with double-counting (should be 0): 0

RUNNING CORRECTED MUTUAL EXCLUSIVITY ANALYSIS

--- BEFORE New Features ---
Subgroup size: 481
Error Rate: 37.63%

--- AFTER New Features (Mutually Exclusive) ---
Subgroup size: 481
Error Rate: 36.80%

Mutual Exclusivity Impact:
  - 481/481 cases captured by 'Sedentary_Elderly'
  - 352/481 cases captured by 'PreHtn_NormalChol'
  - Physical_Activity_mod values for sedentary cases: [-999.]
