<a href="https://colab.research.google.com/github/ashur7035-lgtm/Healthcare_Data_Dashboard/blob/main/Diabetes_Prediction_Pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, roc_curve

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

# --- Configuration ---
RANDOM_STATE = 42
OUTPUT_DIR = 'output'
MODEL_NAME = 'RandomForest'

# Ensure the output directory exists
os.makedirs(OUTPUT_DIR, exist_ok=True)

def generate_synthetic_data(n_samples=1000, random_state=RANDOM_STATE):
    """
    Generates a synthetic dataset mimicking the structure and class imbalance
    of the Pima Indians Diabetes Dataset for demonstration purposes.
    """
    np.random.seed(random_state)

    # 8 features + 1 target
    features = {
        'Pregnancies': np.random.randint(0, 15, n_samples),
        'Glucose': np.random.normal(120, 30, n_samples),
        'BloodPressure': np.random.normal(70, 10, n_samples),
        'SkinThickness': np.random.normal(25, 10, n_samples),
        'Insulin': np.random.normal(120, 100, n_samples),
        'BMI': np.random.normal(32, 7, n_samples),
        'DiabetesPedigreeFunction': np.random.lognormal(np.log(0.47), 0.4, n_samples),
        'Age': np.random.randint(21, 75, n_samples)
    }
    df = pd.DataFrame(features)

    # Create an imbalanced target variable (Outcome: 1=Diabetic, 0=Non-Diabetic)
    # The chance of diabetes increases with Glucose, BMI, and Age.
    base_risk = (df['Glucose'] * 0.005) + (df['BMI'] * 0.05) + (df['Age'] * 0.01)

    # Apply a probability threshold and enforce imbalance (approx 35% positive cases)
    probabilities = 1 / (1 + np.exp(-(base_risk - 5))) # Logistic function
    df['Outcome'] = (probabilities > 0.35).astype(int)

    print(f"--- Data Generated ---")
    print(f"Total Samples: {len(df)}")
    print(f"Diabetic Cases (1): {df['Outcome'].sum()}")
    print(f"Non-Diabetic Cases (0): {len(df) - df['Outcome'].sum()}")
    print("-" * 20)

    # Clean up unrealistic values (e.g., BloodPressure cannot be 0 in real life)
    for col in ['Glucose', 'BloodPressure', 'BMI']:
        df[col] = df[col].replace(0, df[col].median())

    return df

def run_diabetes_prediction_pipeline():
    """
    Executes the full machine learning workflow: loading, preprocessing,
    modeling, evaluation, and visualization.
    """
    # 1. Data Loading (Simulated)
    data = generate_synthetic_data()
    X = data.drop('Outcome', axis=1)
    y = data['Outcome']

    # 2. Data Splitting
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=RANDOM_STATE, stratify=y
    )

    print(f"Training set size (before SMOTE): {len(X_train)}")
    print(f"Test set size: {len(X_test)}")
    print("-" * 20)

    # 3. Model Pipeline Definition
    # The pipeline handles scaling, SMOTE (for handling imbalance), and classification.
    # The ImbPipeline is used to ensure SMOTE is only applied to the training data.
    pipeline = ImbPipeline([
        ('scaler', StandardScaler()),                 # Normalize data
        ('smote', SMOTE(random_state=RANDOM_STATE)), # Balance the training data
        ('classifier', RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE))
    ])

    # 4. Training
    print("Starting model training (Random Forest with SMOTE)...")
    pipeline.fit(X_train, y_train)
    print("Training complete.")
    print("-" * 20)

    # 5. Prediction and Evaluation
    y_pred = pipeline.predict(X_test)
    y_proba = pipeline.predict_proba(X_test)[:, 1] # Probability for the positive class (1)

    # Classification Report
    print(f"--- Classification Report for {MODEL_NAME} ---")
    print(classification_report(y_test, y_pred, target_names=['Non-Diabetic (0)', 'Diabetic (1)']))

    # Calculate and print AUC
    auc_score = roc_auc_score(y_test, y_proba)
    print(f"AUC-ROC Score: {auc_score:.4f}")
    print("-" * 20)

    # 6. Visualization: Feature Importance
    plot_feature_importance(pipeline, X.columns)

    # 7. Visualization: ROC Curve
    plot_roc_curve(y_test, y_proba, auc_score)

def plot_feature_importance(pipeline, feature_names):
    """Generates and saves a plot of feature importance."""

    # Get the trained classifier from the pipeline
    classifier = pipeline.named_steps['classifier']
    importances = classifier.feature_importances_
    feature_importance = pd.Series(importances, index=feature_names).sort_values(ascending=False)

    plt.figure(figsize=(10, 6))
    sns.barplot(x=feature_importance, y=feature_importance.index, palette="viridis")
    plt.title(f'Feature Importance for {MODEL_NAME} Classifier')
    plt.xlabel('Importance Score')
    plt.ylabel('Feature')
    plt.tight_layout()
    plt.savefig(os.path.join(OUTPUT_DIR, 'feature_importance_rf.png'))
    print(f"Saved Feature Importance plot to {OUTPUT_DIR}/feature_importance_rf.png")

def plot_roc_curve(y_test, y_proba, auc_score):
    """Generates and saves the ROC curve plot."""

    fpr, tpr, thresholds = roc_curve(y_test, y_proba)

    plt.figure(figsize=(8, 8))
    plt.plot(fpr, tpr, color='coral', label=f'ROC Curve (AUC = {auc_score:.4f})')
    plt.plot([0, 1], [0, 1], 'k--', label='Random Guessing')

    plt.xlabel('False Positive Rate (FPR)')
    plt.ylabel('True Positive Rate (TPR) / Recall')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend()
    plt.grid(True, linestyle='--')
    plt.tight_layout()
    plt.savefig(os.path.join(OUTPUT_DIR, 'roc_curve_comparison.png'))
    print(f"Saved ROC Curve plot to {OUTPUT_DIR}/roc_curve_comparison.png")

if __name__ == "__main__":
    run_diabetes_prediction_pipeline()

    print("\n----------------------------------------------------")
    print("Project run complete. Check the 'output' directory for plots.")
    print("----------------------------------------------------")

--- Data Generated ---
Total Samples: 1000
Diabetic Cases (1): 0
Non-Diabetic Cases (0): 1000
--------------------
Training set size (before SMOTE): 750
Test set size: 250
--------------------
Starting model training (Random Forest with SMOTE)...


ValueError: The target 'y' needs to have more than 1 class. Got 1 class instead