In [None]:
import pandas as pd
import shap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier


In [None]:
# Read the processed CSV into a DataFrame.
df = pd.read_csv("../data/processed/diabetic_data_clean.csv")

# Label column.  TODO: confirm this matches the header in the CSV.
TARGET_COL = "readmitted"

# Separate the label from the predictors.
y = df[TARGET_COL]
X = df.drop(columns=[TARGET_COL])

# Hold out 20% of the rows for evaluation.  The split is seeded so it is
# reproducible, and stratified on the label so both partitions keep the
# same class proportions.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


In [None]:
# Route every predictor column to exactly one transformer.
#
# ColumnTransformer drops any column that is not listed (remainder="drop" is
# the default), so selecting only the exact dtypes int64/float64/object would
# silently discard columns with any other dtype (bool, int32, category,
# string, ...).  Partitioning on "number" vs. "not number" guarantees that
# no column is lost.
numeric_features = X.select_dtypes(include="number").columns.tolist()
categorical_features = X.select_dtypes(exclude="number").columns.tolist()

# Numeric columns are fed to the forest unchanged; tree models do not need scaling.
numeric_transformer = "passthrough"
# Categories not seen during fit are encoded as all-zero rows instead of raising.
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# Preprocessing and model live in one Pipeline so the encoder is fitted on the
# training data only and re-applied consistently at prediction time.
rf = Pipeline(
    steps=[
        ("preprocess", preprocess),
        ("model", RandomForestClassifier(
            n_estimators=200,
            random_state=42,  # seeded for reproducible trees
            n_jobs=-1         # use all available cores
        )),
    ]
)

rf.fit(X_train, y_train)


In [None]:
# Pull the fitted steps out of the pipeline.
rf_model = rf.named_steps["model"]
fitted_preprocess = rf.named_steps["preprocess"]

# SHAP needs the features exactly as the forest saw them.  Use transform(),
# not fit_transform(): the preprocessor was already fitted by rf.fit(), and
# re-fitting it here would mutate the trained pipeline.
X_train_transformed = fitted_preprocess.transform(X_train)

# With a OneHotEncoder inside, the ColumnTransformer may return a scipy sparse
# matrix; the SHAP explainer and plot below are given a dense array instead.
# NOTE(review): densifying can be memory-hungry for large, high-cardinality
# data -- consider explaining a row sample if this becomes a problem.
if hasattr(X_train_transformed, "toarray"):
    X_train_transformed = X_train_transformed.toarray()

# Column names after encoding, so the plot shows real feature labels
# rather than generic "Feature N" placeholders.
feature_names = fitted_preprocess.get_feature_names_out()

explainer = shap.TreeExplainer(rf_model)
shap_values = explainer.shap_values(X_train_transformed)

# Select the SHAP values for the class at index 1 of rf_model.classes_.
# Older shap releases return a list with one (n_samples, n_features) array per
# class; newer releases return a single (n_samples, n_features, n_classes)
# array, where plain `shap_values[1]` would pick the second *sample* instead.
if isinstance(shap_values, list):
    class1_shap_values = shap_values[1]
elif getattr(shap_values, "ndim", None) == 3:
    class1_shap_values = shap_values[:, :, 1]
else:
    # Already a single 2-D array of per-feature contributions.
    class1_shap_values = shap_values

shap.summary_plot(
    class1_shap_values,
    X_train_transformed,
    feature_names=feature_names,
    show=False,
)
