In [None]:
#STACKED ENSEMBLE WITH SHAP + LIME EXPLAINABILITY + GRADIO INTERFACE - MEMORY OPTIMIZED FOR MAXIMUM FEATURES

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.inspection import permutation_importance
import seaborn as sns
import matplotlib.pyplot as plt
import subprocess
import sys
import os
import gc
import psutil
import time
import gradio as gr
import joblib

# Stage 1: turn the raw CSV into an encoded CSV that the modelling stage reloads.
df = pd.read_csv("Dataset.csv")

# Identifier columns are removed before any encoding happens.
df = df.drop(columns=["ID_Patient_Care_Situation", "Patient_ID"])

# Binary encodings. "Cannot say" (and any value absent from a mapping)
# becomes a missing value.
smoker_codes = {"YES": 1, "NO": 0, "Cannot say": None}
residence_codes = {"RURAL": 1, "URBAN": 0}
df["Patient_Smoker"] = df["Patient_Smoker"].map(smoker_codes)
df["Patient_Rural_Urban"] = df["Patient_Rural_Urban"].map(residence_codes)

# 'Treated_with_drugs' is split on commas into one indicator column per token;
# the indicator columns are appended and the original text column is dropped.
drug_indicators = df["Treated_with_drugs"].str.get_dummies(sep=",")
df = df.join(drug_indicators).drop(columns=["Treated_with_drugs"])

# Persist the encoded table for the later stages of this script.
df.to_csv("processed_dataset.csv", index=False)

print("Preprocessing completed. Processed dataset saved as 'processed_dataset.csv'")


# Every figure, array and CSV produced by the analysis is written here.
os.makedirs('explainability_plots_memory_optimized', exist_ok=True)

# Analysis configuration
N_BACKGROUND_SAMPLES = 100   # number of k-means background points given to SHAP
N_TEST_SAMPLES = 20000       # upper bound on test rows explained with SHAP
SHAP_NSAMPLES = 200          # `nsamples` argument of each shap_values() call
BATCH_SIZE = 50              # test rows handed to the explainer per batch
MAX_FEATURES_DISPLAY = 46    # maximum number of features drawn in the plots

# Echo the configuration so it is recorded alongside the run output.
_rule = "=" * 60
print(_rule)
print("MEMORY-OPTIMIZED SHAP ANALYSIS")
print(_rule)
print(
    "Configuration:",
    f"- Background samples: {N_BACKGROUND_SAMPLES}",
    f"- Test samples: {N_TEST_SAMPLES}",
    f"- SHAP nsamples: {SHAP_NSAMPLES}",
    f"- Batch processing: {BATCH_SIZE}",
    f"- Features to display: {MAX_FEATURES_DISPLAY}",
    sep="\n",
)
print(_rule)

def get_memory_usage():
    """Return this process's resident set size in MB (bytes / 1024 / 1024)."""
    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    bytes_per_mb = 1024 * 1024
    return rss_bytes / bytes_per_mb

def clear_memory():
    """Trigger a garbage-collection pass; the collector's return value is discarded."""
    gc.collect()
    return None

# Record the process footprint before the explainability packages are
# installed and imported.
startup_rss_mb = get_memory_usage()
print(f"Initial memory usage: {startup_rss_mb:.1f} MB")

# Install required packages
def install_package(package, import_name=None):
    """Make sure *package* is available, invoking pip only when necessary.

    Args:
        package: Requirement string passed to ``pip install``.
        import_name: Module name used to check whether the package can already
            be imported. Defaults to *package*; pass it explicitly when the
            distribution name and the module name differ.

    Returns:
        True if the module is already importable or pip exited successfully;
        False if pip could not be started or exited with a non-zero status.
    """
    # Function-scope import keeps this block usable without touching the
    # file-level import section.
    import importlib.util

    module_name = package if import_name is None else import_name
    try:
        already_present = importlib.util.find_spec(module_name) is not None
    except (ImportError, ValueError):
        # Raised for names that are not plain module names (for example a
        # requirement with a version pin); fall through to pip in that case.
        already_present = False
    if already_present:
        return True

    try:
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])
    except (subprocess.CalledProcessError, OSError) as exc:
        # Installation stays best-effort: report the reason and let the caller
        # skip the optional analysis. KeyboardInterrupt/SystemExit propagate.
        print(f"Could not install {package!r}: {exc}")
        return False
    return True

# SHAP and LIME are optional: each flag below gates the matching analysis
# section later in the script.
shap_installed = install_package("shap")
lime_installed = install_package("lime")

# A successful install does not guarantee a working import, so an import
# failure switches the flag off instead of aborting the whole script.
if shap_installed:
    try:
        import shap
    except ImportError as exc:
        print(f"SHAP could not be imported, SHAP analysis will be skipped: {exc}")
        shap_installed = False

if lime_installed:
    try:
        import lime
        import lime.lime_tabular
        from lime.lime_tabular import LimeTabularExplainer
    except ImportError as exc:
        print(f"LIME could not be imported, LIME analysis will be skipped: {exc}")
        lime_installed = False

# Stage 2: reload the encoded CSV, shrink numeric dtypes, split and scale.
print("Loading dataset with memory optimization...")
df = pd.read_csv("processed_dataset.csv")
# Rows without a smoker value are discarded; every other gap is zero-filled.
df = df.dropna(subset=['Patient_Smoker']).fillna(0)

# Narrow 64-bit numeric columns to their 32-bit counterparts.
print("Optimizing data types...")
narrower_dtype = {'float64': 'float32', 'int64': 'int32'}
dtype_plan = {
    column: narrower_dtype[str(dtype)]
    for column, dtype in df.dtypes.items()
    if str(dtype) in narrower_dtype
}
df = df.astype(dtype_plan)

# One-hot encode whatever non-numeric columns remain (first level dropped).
df = pd.get_dummies(df, drop_first=True)

print(f"Dataset shape: {df.shape}")
print(f"Memory usage after optimization: {get_memory_usage():.1f} MB")

# Separate the target from the predictors.
y = df["Survived_1_year"]
X = df.drop(columns=["Survived_1_year"])
feature_names = list(X.columns)

print(f"Total features: {len(feature_names)}")

# Stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# The scaler is fitted on the training rows only; outputs are cast to float32.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train).astype(np.float32)
X_test_scaled = scaler.transform(X_test).astype(np.float32)

print(f"Memory after scaling: {get_memory_usage():.1f} MB")
clear_memory()

# Stage 3: fit the stacked ensemble on the scaled training split and score it
# on the held-out split.
print("Training stacked ensemble...")
# Mapping of short name -> base learner, flattened into the (name, estimator)
# pairs that StackingClassifier expects. Insertion order is preserved.
base_models = list({
    'dt': DecisionTreeClassifier(random_state=42),
    'lr': LogisticRegression(max_iter=1000, random_state=42),
    'nb': BernoulliNB(),
    'rf': RandomForestClassifier(n_estimators=50, random_state=42),
    'xgb': XGBClassifier(eval_metric='logloss', n_estimators=50, random_state=42),
    'knn': KNeighborsClassifier(n_neighbors=3),
}.items())

# A logistic regression combines the base learners' outputs.
meta_model = LogisticRegression(max_iter=1000, random_state=42)
stacked_clf = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_model,
    cv=3,
    n_jobs=-1,
)
stacked_clf.fit(X_train_scaled, y_train)

# Hard labels and class probabilities for the held-out rows.
y_pred = stacked_clf.predict(X_test_scaled)
y_pred_proba = stacked_clf.predict_proba(X_test_scaled)

cm = confusion_matrix(y_test, y_pred)
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print("Model Performance:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

# Confusion-matrix heatmap, saved as a PDF and then displayed.
cm_fig, cm_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=cm_ax)
cm_ax.set_xlabel("Predicted")
cm_ax.set_ylabel("Actual")
cm_ax.set_title("Confusion Matrix - Memory Optimized Ensemble")
plt.savefig('explainability_plots_memory_optimized/confusion_matrix.pdf', bbox_inches='tight', dpi=300)
plt.show()

# MEMORY-OPTIMIZED SHAP Analysis
# Explains the stacked ensemble's positive-class probability with a
# KernelExplainer, computing SHAP values batch by batch, then writes the
# values, a per-feature importance table and several plots to
# 'explainability_plots_memory_optimized'. Any failure is reported and
# disables the SHAP part of the final summary instead of stopping the script.
if shap_installed:
    try:
        print(f"\nMemory before SHAP: {get_memory_usage():.1f} MB")

        # Summarise the training data into N_BACKGROUND_SAMPLES k-means centres
        print(f"Generating {N_BACKGROUND_SAMPLES} background samples using K-means...")
        background_samples = shap.kmeans(X_train_scaled, N_BACKGROUND_SAMPLES)
        print(f"Background samples shape: {background_samples.data.shape}")

        # Never request more rows than the test split actually holds
        actual_test_samples = min(N_TEST_SAMPLES, len(X_test_scaled))
        print(f"Sampling {actual_test_samples} test samples...")
        X_test_sample = shap.sample(X_test_scaled, actual_test_samples, random_state=42)
        print(f"Test sample shape: {X_test_sample.shape}")

        clear_memory()
        print(f"Memory after sampling: {get_memory_usage():.1f} MB")

        # Model prediction wrapper: probability of class index 1 only
        def model_predict(X):
            return stacked_clf.predict_proba(X)[:, 1]

        # Create explainer
        print("Creating SHAP KernelExplainer...")
        explainer = shap.KernelExplainer(model_predict, background_samples)

        # BATCH PROCESSING for SHAP values to manage memory
        print(f"Computing SHAP values in batches of {BATCH_SIZE}...")

        all_shap_values = []
        # Ceiling division so a final partial batch is still processed
        num_batches = (len(X_test_sample) + BATCH_SIZE - 1) // BATCH_SIZE

        for batch_idx in range(num_batches):
            start_idx = batch_idx * BATCH_SIZE
            end_idx = min((batch_idx + 1) * BATCH_SIZE, len(X_test_sample))

            print(f"Processing batch {batch_idx + 1}/{num_batches} (samples {start_idx}-{end_idx})...")
            batch_data = X_test_sample[start_idx:end_idx]

            batch_shap_values = explainer.shap_values(batch_data, nsamples=SHAP_NSAMPLES)
            all_shap_values.append(batch_shap_values)

            # Clear memory after each batch
            clear_memory()
            print(f"Memory after batch {batch_idx + 1}: {get_memory_usage():.1f} MB")

        # Combine all SHAP values
        print("Combining SHAP values from all batches...")
        shap_values = np.vstack(all_shap_values)
        print(f"Final SHAP values shape: {shap_values.shape}")

        # Clear intermediate results
        del all_shap_values
        clear_memory()

        # Save SHAP values to avoid recomputation
        print("Saving SHAP values...")
        np.save('explainability_plots_memory_optimized/shap_values.npy', shap_values)
        np.save('explainability_plots_memory_optimized/X_test_sample.npy', X_test_sample)

        # SHAP Feature Importance Plot - Show MORE features
        print(f"Creating SHAP feature importance plot ({MAX_FEATURES_DISPLAY} features)...")
        plt.figure(figsize=(12, MAX_FEATURES_DISPLAY * 0.4))  # Dynamic height
        shap.summary_plot(shap_values, X_test_sample, feature_names=feature_names,
                         plot_type="bar", show=False, max_display=MAX_FEATURES_DISPLAY)
        plt.title(f"SHAP Feature Importance - Top {MAX_FEATURES_DISPLAY} Features")
        plt.tight_layout()
        plt.savefig('explainability_plots_memory_optimized/shap_feature_importance.pdf', bbox_inches='tight', dpi=300)
        plt.show()

        # SHAP Summary Plot - Show MORE features
        print(f"Creating SHAP summary plot ({MAX_FEATURES_DISPLAY} features)...")
        plt.figure(figsize=(12, MAX_FEATURES_DISPLAY * 0.4))
        shap.summary_plot(shap_values, X_test_sample, feature_names=feature_names,
                         show=False, max_display=MAX_FEATURES_DISPLAY)
        plt.title(f"SHAP Summary Plot - Top {MAX_FEATURES_DISPLAY} Features")
        plt.tight_layout()
        plt.savefig('explainability_plots_memory_optimized/shap_summary_plot.pdf', bbox_inches='tight', dpi=300)
        plt.show()

        # SHAP Waterfall Plot for the first sampled row
        print("Creating SHAP waterfall plot...")
        plt.figure(figsize=(12, 8))
        shap.waterfall_plot(shap.Explanation(values=shap_values[0],
                                           base_values=explainer.expected_value,
                                           data=X_test_sample[0],
                                           feature_names=feature_names),
                           max_display=15, show=False)  # Show top 15 in waterfall
        plt.title("SHAP Waterfall Plot - Instance 1")
        plt.tight_layout()
        plt.savefig('explainability_plots_memory_optimized/shap_waterfall_plot.pdf', bbox_inches='tight', dpi=300)
        plt.show()

        # Enhanced Global Feature Importance Analysis:
        # mean and spread of |SHAP| per feature across the explained rows
        print("Computing comprehensive feature importance...")
        shap_importance = np.abs(shap_values).mean(0)
        shap_std = np.abs(shap_values).std(0)

        shap_feature_importance = pd.DataFrame({
            'feature': feature_names,
            'mean_importance': shap_importance,
            'std_importance': shap_std,
            # Small epsilon avoids dividing by zero for features with no effect
            'cv': shap_std / (shap_importance + 1e-8)
        }).sort_values('mean_importance', ascending=False)

        # Save detailed results
        shap_feature_importance.to_csv('explainability_plots_memory_optimized/feature_importance_detailed.csv', index=False)

        # Enhanced Global Feature Importance Plot
        print(f"Creating enhanced global feature importance plot ({MAX_FEATURES_DISPLAY} features)...")
        plt.figure(figsize=(14, MAX_FEATURES_DISPLAY * 0.4))
        top_features = shap_feature_importance.head(MAX_FEATURES_DISPLAY)

        plt.barh(range(len(top_features)), top_features['mean_importance'])
        plt.yticks(range(len(top_features)), top_features['feature'])
        plt.xlabel('Mean |SHAP Value|')
        plt.title(f'Global Feature Importance - Top {MAX_FEATURES_DISPLAY} Features (Memory Optimized)')
        plt.gca().invert_yaxis()

        # Add error bars
        plt.errorbar(top_features['mean_importance'], range(len(top_features)),
                    xerr=top_features['std_importance'], fmt='none', color='red', alpha=0.5)

        plt.tight_layout()
        plt.savefig('explainability_plots_memory_optimized/global_feature_importance_shap.pdf', bbox_inches='tight', dpi=300)
        plt.show()

        # Feature Importance Tiers Analysis
        print("Creating feature importance tiers...")
        plt.figure(figsize=(14, 8))

        # Create tiers; slicing past the end of the table simply yields an
        # empty tier, which is skipped below
        tier1 = shap_feature_importance.head(10)
        tier2 = shap_feature_importance.iloc[10:25]
        tier3 = shap_feature_importance.iloc[25:50]

        x_pos = 0
        colors = ['#1f77b4', '#ff7f0e', '#2ca02c']

        # Tick positions/labels are collected across ALL tiers and applied once
        # after the loop: each plt.yticks() call replaces the previous ticks,
        # so setting them per tier would leave only the last tier labelled.
        tick_positions = []
        tick_labels = []

        for tier, label, color in [(tier1, 'Tier 1 (Top 10)', colors[0]),
                                   (tier2, 'Tier 2 (11-25)', colors[1]),
                                   (tier3, 'Tier 3 (26-50)', colors[2])]:
            if len(tier) > 0:
                positions = range(x_pos, x_pos + len(tier))
                plt.barh(positions, tier['mean_importance'],
                        color=color, alpha=0.7, label=label)
                tick_positions.extend(positions)
                tick_labels.extend(tier['feature'])
                x_pos += len(tier)

        plt.yticks(tick_positions, tick_labels, fontsize=8)

        plt.xlabel('Mean |SHAP Value|')
        plt.title('SHAP Feature Importance by Tiers')
        plt.legend()
        plt.gca().invert_yaxis()
        plt.tight_layout()
        plt.savefig('explainability_plots_memory_optimized/feature_importance_tiers.pdf', bbox_inches='tight', dpi=300)
        plt.show()

        print("SHAP analysis completed successfully!")
        print(f"Final memory usage: {get_memory_usage():.1f} MB")

    except Exception as e:
        # Best-effort section: report the failure with its traceback and mark
        # SHAP as unavailable so the final summary skips SHAP results.
        print(f"SHAP failed: {e}")
        import traceback
        traceback.print_exc()
        shap_installed = False

# LIME Analysis with more features
# Produces one local explanation plot for each of the first test rows (at most
# three) and saves it as a PDF. Failures are reported without stopping the script.
if lime_installed:
    try:
        print("\nRunning LIME analysis...")
        lime_explainer = LimeTabularExplainer(
            X_train_scaled,
            feature_names=feature_names,
            class_names=['Did not survive', 'Survived'],
            mode='classification',
            discretize_continuous=True,
            # Seeded like the rest of the script so repeated runs give the
            # same perturbation samples and therefore the same explanations.
            random_state=42
        )

        def predict_fn(X):
            return stacked_clf.predict_proba(X)

        # Guard against a test split with fewer than three rows
        n_lime_instances = min(3, len(X_test_scaled))

        for i in range(n_lime_instances):
            instance = X_test_scaled[i]
            exp = lime_explainer.explain_instance(
                instance,
                predict_fn,
                num_features=15,  # Show more features
                top_labels=2
            )

            # label=1 selects the explanation for the second class name
            fig = exp.as_pyplot_figure(label=1)
            plt.title(f"LIME Explanation - Instance {i+1} (15 Features)")
            plt.tight_layout()
            plt.savefig(f'explainability_plots_memory_optimized/lime_explanation_instance_{i+1}.pdf', bbox_inches='tight', dpi=300)
            plt.show()

    except Exception as e:
        # Same reporting style as the SHAP section: message plus traceback
        print(f"LIME failed: {e}")
        import traceback
        traceback.print_exc()

# Summary
# Reports SHAP statistics only when the SHAP section actually finished, and
# reports the number of rows that were really explained rather than the
# configured upper bound.
print("\n" + "="*60)
print("MEMORY-OPTIMIZED ANALYSIS COMPLETED!")
print("="*60)

# At module level locals() is the global namespace, so this detects whether the
# SHAP section got far enough to build its importance table.
shap_completed = shap_installed and 'shap_feature_importance' in locals()

if shap_completed:
    display_features = min(20, len(shap_feature_importance))
    print(f"Top {display_features} Most Important Features:")
    print(shap_feature_importance.head(display_features)[['feature', 'mean_importance']].to_string(index=False))

print("\nMemory optimization results:")
if shap_completed:
    analyzed_features = min(MAX_FEATURES_DISPLAY, len(shap_feature_importance))
    print(f"- Analyzed {analyzed_features} features in detail")
    # One row of SHAP values exists per explained test sample
    print(f"- Used {len(shap_values)} test samples")
    print("- Batch processing prevented memory crashes")
else:
    print("- SHAP analysis was skipped or failed; no SHAP statistics to report")
print(f"- Final memory usage: {get_memory_usage():.1f} MB")
print("\nAll results saved to 'explainability_plots_memory_optimized' directory")

# ----------------------
# Train full model for Gradio Interface
# ----------------------
# A second ensemble is fitted on every available row (no hold-out split); the
# interface's prediction function uses this model together with `scaler_full`.
_banner = "=" * 60
print("\n" + _banner)
print("TRAINING FULL MODEL FOR GRADIO INTERFACE")
print(_banner)

# Same cleaning steps as the evaluation pipeline: drop rows with no smoker
# value, zero-fill the rest, one-hot encode remaining non-numeric columns.
df_full = (
    pd.read_csv("processed_dataset.csv")
    .dropna(subset=['Patient_Smoker'])
    .fillna(0)
)
df_full = pd.get_dummies(df_full, drop_first=True)

y_full = df_full["Survived_1_year"]
X_full = df_full.drop(columns=["Survived_1_year"])

# Scaler fitted on the complete feature table.
scaler_full = StandardScaler().fit(X_full)
X_full_scaled = scaler_full.transform(X_full)

print("Training stacked ensemble on full dataset for interface...")
_seeded = {"random_state": 42}
base_models_full = [
    ('dt', DecisionTreeClassifier(**_seeded)),
    ('lr', LogisticRegression(max_iter=1000, **_seeded)),
    ('rf', RandomForestClassifier(n_estimators=100, **_seeded)),
    ('xgb', XGBClassifier(eval_metric='logloss', n_estimators=100, **_seeded)),
    ('knn', KNeighborsClassifier()),
]

meta_model_full = LogisticRegression(max_iter=1000, **_seeded)
stacked_clf_full = StackingClassifier(
    estimators=base_models_full,
    final_estimator=meta_model_full,
    cv=3,
    n_jobs=-1,
)
stacked_clf_full.fit(X_full_scaled, y_full)

print("Full model training completed!")

# ----------------------
# Gradio prediction function
# ----------------------
def predict_survival(age, bmi, smoker, rural_urban, prev_conditions):
    """Predict one-year survival for a single patient entered in the UI.

    Args:
        age: Patient age (number).
        bmi: Patient body-mass index (number).
        smoker: "YES" or "NO".
        rural_urban: "RURAL" or "URBAN".
        prev_conditions: Number of previous conditions (number).

    Returns:
        A ``(label, detail)`` tuple of strings. On success ``label`` is the
        predicted outcome and ``detail`` the survival probability. If any
        field was left empty, ``label`` is "Invalid input" and ``detail``
        names the missing fields; the model is not called in that case.
    """
    # Empty Gradio fields arrive as None. Without this check a missing radio
    # choice would silently be treated as "NO"/"URBAN" and a missing number
    # would reach the model as NaN.
    provided = {
        "Patient Age": age,
        "Patient BMI": bmi,
        "Smoker": smoker,
        "Rural/Urban": rural_urban,
        "Previous Conditions": prev_conditions,
    }
    missing = [label for label, value in provided.items() if value is None]
    if missing:
        return "Invalid input", "Please provide: " + ", ".join(missing)

    # Convert categorical inputs
    smoker_val = 1 if smoker == "YES" else 0
    rural_val = 1 if rural_urban == "RURAL" else 0

    # Create a dataframe row for input
    input_data = pd.DataFrame([[age, bmi, smoker_val, rural_val, prev_conditions]],
                              columns=["Patient_Age", "Patient_Body_Mass_Index",
                                       "Patient_Smoker", "Patient_Rural_Urban",
                                       "Number_of_prev_cond"])

    # Align columns with training data. Every training column that the form
    # does not collect is filled with 0 before scaling. All five inputs are
    # already numeric, so no one-hot encoding step is needed here.
    input_data = input_data.reindex(columns=X_full.columns, fill_value=0)

    # Scale with the scaler fitted on the full training table
    input_scaled = scaler_full.transform(input_data)

    # Prediction: hard label plus probability of class index 1
    pred = stacked_clf_full.predict(input_scaled)[0]
    prob = stacked_clf_full.predict_proba(input_scaled)[0][1]

    result = "Likely to Survive (1 year)" if pred == 1 else "Not Likely to Survive (1 year)"
    return result, f"Survival Probability: {prob:.2f}"

# ----------------------
# Build Gradio UI (Full-Screen Layout)
# ----------------------
_ui_banner = "=" * 60
print("\n" + _ui_banner)
print("LAUNCHING GRADIO INTERFACE")
print(_ui_banner)

# The custom CSS removes the container's max-width cap and hides the footer.
with gr.Blocks(css="""
    .gradio-container {max-width: 100% !important;}
    footer {display: none !important;}
""") as demo:
    gr.Markdown("## Patient Survival Prediction (1 Year)")

    # First row: the five patient inputs, in the order predict_survival expects.
    with gr.Row():
        age = gr.Number(label="Patient Age")
        bmi = gr.Number(label="Patient BMI")
        smoker = gr.Radio(["YES", "NO"], label="Smoker")
        rural_urban = gr.Radio(["URBAN", "RURAL"], label="Rural/Urban")
        prev_conditions = gr.Number(label="Previous Conditions")
    prediction_inputs = [age, bmi, smoker, rural_urban, prev_conditions]

    # Second row: predicted label and probability text.
    with gr.Row():
        output = gr.Label(label="Prediction")
        output_prob = gr.Textbox(label="Probability Score")
    prediction_outputs = [output, output_prob]

    # Clicking the button runs the prediction and fills both output widgets.
    btn = gr.Button("Predict Survival")
    btn.click(predict_survival, inputs=prediction_inputs, outputs=prediction_outputs)

demo.launch()

Preprocessing completed. Processed dataset saved as 'processed_dataset.csv'
It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://5d052f24cda2f1678a.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


