Loading the dataset

In [None]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder, MinMaxScaler


In [None]:
# Read the kidney-disease CSV into a DataFrame.
# NOTE(review): absolute path; "/content" looks like a Colab location, so
# adjust it when running elsewhere.
path = "/content/kidney_disease.csv"
data = pd.read_csv(path)

# Sanity check: dimensions and a preview of the first rows.
print("Shape:", data.shape)
data.head()


In [None]:
# Work on a copy so the frame read from disk stays untouched.
data_clean = data.copy()

# Remove whichever spelling of the identifier column is present.
id_spellings = ("id", "ID", "Id")
for col in [name for name in id_spellings if name in data_clean.columns]:
    data_clean = data_clean.drop(columns=[col])

print("Shape after dropping ID (if found):", data_clean.shape)
data_clean.head()


In [None]:
# Strip whitespace from all object (text) columns, but leave missing cells as
# real NaN. A blanket astype(str) would turn NaN into the literal string "nan",
# which isna()/fillna() no longer recognise as missing.
for col in data_clean.select_dtypes(include="object").columns:
    text = data_clean[col]
    data_clean[col] = text.where(text.isna(), text.astype(str).str.strip())

# Replace typical missing marker with NaN
data_clean = data_clean.replace("?", np.nan)

data_clean.head()


In [None]:
# Coerce the listed columns to numbers; cells that cannot be parsed become NaN.
lab_columns = ["pcv", "wc", "rc"]
for col in (name for name in lab_columns if name in data_clean.columns):
    data_clean[col] = pd.to_numeric(data_clean[col], errors="coerce")

data_clean[lab_columns].dtypes


In [None]:
# y is the label column
# Separate the label column (y) from every other column (x).
target_name = "classification"
y = data_clean[target_name]
x = data_clean.drop(target_name, axis=1)

print("x shape:", x.shape)
print("y shape:", y.shape)


In [None]:
# Drop the ID column. Rebinding instead of mutating in place, and ignoring an
# already-missing column, keeps this cell safe to re-run (the in-place version
# raised KeyError on the second run).
data = data.drop(columns=["id"], errors="ignore")

In [None]:
# Target and predictors taken from the raw frame `data`.
y = data["classification"]
x = data.drop(columns=["classification"])


In [None]:
# Per-column count of cells pandas treats as missing in the raw frame.
data.isnull().sum()



Data splitting

In [None]:
# Group the predictor columns by dtype: object columns are treated as
# categorical, everything else as numerical.
# NOTE(review): x here comes from the raw frame `data`, before any
# pd.to_numeric conversion, so text-typed numeric columns land in
# categorical_cols - confirm this is intended.
categorical_cols = x.select_dtypes(include="object").columns
numerical_cols = x.select_dtypes(exclude="object").columns

print("Categorical columns:", categorical_cols)
print("Numerical columns:", numerical_cols)


In [None]:
 # making a copy of the dataset to experiment cleaning
 # NOTE: this rebinds data_clean to a fresh copy of `data`, so the cleaning
 # applied to the previous data_clean in earlier cells is discarded and is
 # redone in the cells that follow.
 data_clean = data.copy()

In [None]:
# Turn the "?" missing-value marker into NaN. The pattern also matches a "?"
# padded with whitespace (e.g. "\t?"), because whitespace is only stripped in a
# later cell and an exact-match replace would leave such cells as text.
data_clean = data_clean.replace(r"^\s*\?\s*$", np.nan, regex=True)

In [None]:
# Coerce the listed columns to numbers; cells that cannot be parsed become NaN.
for col in (name for name in ("pcv", "wc", "rc") if name in data_clean.columns):
    data_clean[col] = pd.to_numeric(data_clean[col], errors="coerce")

In [None]:
# Trim leading/trailing whitespace in every object-dtype column.
# .str.strip() leaves NaN cells as NaN.
for col in data_clean.select_dtypes(include="object").columns:
    data_clean[col] = data_clean[col].str.strip()

In [None]:
# Re-derive the column groups from data_clean itself. The lists built earlier
# came from the raw frame, before pd.to_numeric was applied, so any column
# converted since then would otherwise be mode-imputed (and later
# label-encoded) as if it were categorical.
features_now = data_clean.drop(columns=["classification"])
numerical_cols = features_now.select_dtypes(exclude="object").columns
categorical_cols = features_now.select_dtypes(include="object").columns

# Handle numerical missing values (column mean)
for col in numerical_cols:
    data_clean[col] = data_clean[col].fillna(data_clean[col].mean())

# Handle categorical missing values (most frequent value)
for col in categorical_cols:
    data_clean[col] = data_clean[col].fillna(data_clean[col].mode()[0])


In [None]:
# Remaining missing values per column after imputation.
data_clean.isna().sum()

Label encoding (most categories are binary)

In [None]:
from sklearn.preprocessing import LabelEncoder

# One encoder object is refitted for every column, so after the loop it only
# holds the mapping of the last column processed.
encoder = LabelEncoder()

# Replace each categorical column with its integer codes.
for col in categorical_cols:
    data_clean[col] = encoder.fit_transform(data_clean[col])

In [None]:
# Split the cleaned frame into target and predictors.
y = data_clean["classification"]
x = data_clean.drop(columns=["classification"])

In [None]:
# Encode the target: "ckd" -> 1, "notckd" -> 0; any other value becomes NaN.
# NOTE: not idempotent - running this on an already-encoded y maps 0/1 to NaN.
y = y.map({"ckd": 1, "notckd": 0})

In [None]:
# Class balance of the encoded target (NaN labels are not counted).
y.value_counts()


In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale the numerical columns of x to [0, 1].
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# made later in the notebook.
scaler = MinMaxScaler()
x[numerical_cols] = scaler.fit_transform(x[numerical_cols])


In [None]:
from sklearn.preprocessing import MinMaxScaler

# Same rescaling as the previous cell, applied to data_clean itself with a new
# scaler (this rebinds `scaler`).
scaler = MinMaxScaler()
data_clean[numerical_cols] = scaler.fit_transform(data_clean[numerical_cols])


In [None]:
# Preview the scaled, encoded frame.
data_clean.head()

In [None]:
# Encode the target column: "ckd" -> 1, "notckd" -> 0.
# Only map while the column is still text, so re-running this cell cannot turn
# already-encoded 0/1 labels into NaN.
if data_clean["classification"].dtype == "object":
    data_clean["classification"] = data_clean["classification"].map({
        "ckd": 1,
        "notckd": 0
    })


Features extraction

In [None]:
# Make sure the listed columns are numeric; unparseable cells become NaN.
lab_columns = ["pcv", "wc", "rc"]
for col in (name for name in lab_columns if name in data_clean.columns):
    data_clean[col] = pd.to_numeric(data_clean[col], errors="coerce")

data_clean[lab_columns].dtypes


In [None]:
# y is the label column
# Separate the label column (y) from every other column (x).
target_name = "classification"
y = data_clean[target_name]
x = data_clean.drop(target_name, axis=1)

print("x shape:", x.shape)
print("y shape:", y.shape)


In [None]:
# Recompute the column groups from the current x (taken from data_clean in the
# previous cell): object dtype -> categorical, everything else -> numerical.
categorical_cols = x.select_dtypes(include="object").columns
numerical_cols = x.select_dtypes(exclude="object").columns

print("Categorical columns:", list(categorical_cols))
print("Numerical columns:", list(numerical_cols))


In [None]:
# Numerical: fill NaN with mean
for col in numerical_cols:
    x[col] = x[col].fillna(x[col].mean())

# Categorical: fill NaN with mode
for col in categorical_cols:
    x[col] = x[col].fillna(x[col].mode()[0])

print("Total missing values in x:", x.isna().sum().sum())


In [None]:
# Integer-encode each categorical column with its own fresh LabelEncoder.
# The encoders are not kept, so the mapping cannot be inverted afterwards.
for col in categorical_cols:
    le = LabelEncoder()
    x[col] = le.fit_transform(x[col])

# Verify no object columns remain
print("Object columns left in x:", list(x.select_dtypes(include="object").columns))
x.head()


In [None]:
# Rescale every numerical column of x to the [0, 1] range.
scaler = MinMaxScaler()
x[numerical_cols] = scaler.fit_transform(x[numerical_cols])

# Verify pcv/wc/rc ranges
lab_columns = ["pcv", "wc", "rc"]
if set(lab_columns).issubset(x.columns):
    print(x[lab_columns].agg(["min","max"]))


In [None]:
# Encode target ONLY if it is still text
# Encode target ONLY if it is still text; the dtype guard makes this cell safe
# to re-run on an already-encoded y.
if y.dtype == "object":
    y = y.str.strip().map({"ckd": 1, "notckd": 0})

# Class balance and dtype of the target.
print(y.value_counts())
print(y.dtype)


In [None]:
# Final sanity checks before feature selection: no text columns, no NaNs.
print("x dtypes check -> any object?", x.select_dtypes(include="object").shape[1] > 0)
print("x NaNs count:", x.isna().sum().sum())

print("x shape:", x.shape)
print("y shape:", y.shape)

x.head()


Feature correlation Filtering

Filter Method: Correlation Test

In [None]:
# Combine x and y temporarily
# Temporary frame holding the features plus the target, for the correlation test.
corr_data = x.assign(classification=y)

corr_data.head()


In [None]:
# Pairwise correlation between all columns (pandas defaults to Pearson).
corr_matrix = corr_data.corr()


In [None]:
# Correlation of each feature with the target; the target's own entry is dropped.
ckd_corr = corr_matrix["classification"].drop("classification")

# Signed values, most positive first.
ckd_corr.sort_values(ascending=False)


In [None]:
import matplotlib.pyplot as plt

# Order features by the magnitude of their correlation with the target.
ckd_corr_sorted = ckd_corr.sort_values(key=abs, ascending=False)

# Bar chart of the signed correlations on an explicit axes object.
fig, ax = plt.subplots(figsize=(12, 4))
ckd_corr_sorted.plot(kind="bar", ax=ax)
ax.set_title("Feature Correlation with CKD")
ax.set_ylabel("Correlation Coefficient")
plt.show()


In [None]:
# Keep features whose absolute correlation with the target reaches the cut-off.
threshold = 0.3

is_strong = ckd_corr.abs() >= threshold
selected_corr_features = ckd_corr.loc[is_strong].index.tolist()

print("Selected features (correlation-based):")
selected_corr_features


In [None]:
# Feature matrix restricted to the correlation-selected columns.
x_corr = x[selected_corr_features]

print("Original shape:", x.shape)
print("Correlation-selected shape:", x_corr.shape)


Filter Method: Chi-Square Test

In [None]:
from sklearn.feature_selection import chi2


In [None]:
# Chi-square statistic and p-value of every column of x against y.
# NOTE(review): computed on the whole dataset, before any train/test split.
chi2_scores, chi2_pvalues = chi2(x, y)


In [None]:
# One row per feature with its chi-square score and p-value, highest score first.
chi2_results = (
    pd.DataFrame({
        "feature": x.columns,
        "chi2_score": chi2_scores,
        "p_value": chi2_pvalues
    })
    .sort_values("chi2_score", ascending=False)
)
chi2_results



In [None]:
# Names of the k highest-scoring features (chi2_results is sorted by score).
k = 15
top_chi2_rows = chi2_results.head(k)
selected_chi2_features = top_chi2_rows["feature"].tolist()

selected_chi2_features


In [None]:
# Feature matrix restricted to the chi-square-selected columns.
x_chi2 = x[selected_chi2_features]

print("Original shape:", x.shape)
print("Chi-square selected shape:", x_chi2.shape)


Wrapper Method: Recursive Feature Elimination, RFE

Logistic Regression

In [None]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic Regression model
lr_model = LogisticRegression(max_iter=1000, solver="liblinear")

In [None]:
# Number of features to select
n_features = 15

rfe = RFE(
    estimator=lr_model,
    n_features_to_select=n_features
)

rfe.fit(x, y)


In [None]:
# Column names RFE kept, read from its boolean support mask.
rfe_mask = rfe.support_
selected_rfe_features = x.columns[rfe_mask].tolist()

print("Selected features (RFE):")
selected_rfe_features

In [None]:
# Feature matrix restricted to the RFE-selected columns.
x_rfe = x[selected_rfe_features]

print("Original shape:", x.shape)
print("RFE-selected shape:", x_rfe.shape)

Wrapper Method: Sequential Forward Selection – SFS

In [None]:
from sklearn.feature_selection import SequentialFeatureSelector

In [None]:
# Fresh logistic-regression estimator for the sequential selector (rebinds lr_model).
lr_model = LogisticRegression(max_iter=1000, solver="liblinear")

In [None]:
# Forward selection: grow the feature set up to 15 features, scoring each
# candidate by 5-fold cross-validated accuracy.
sfs = SequentialFeatureSelector(
    estimator=lr_model,
    n_features_to_select=15,
    direction="forward",
    scoring="accuracy",
    cv=5
)

# NOTE(review): fitted on the full x/y, before any train/test split.
sfs.fit(x, y)

In [None]:
# Column names SFS kept, read from its boolean support mask.
selected_sfs_features = x.columns[sfs.get_support()].tolist()

print("Selected features (SFS):")
selected_sfs_features

In [None]:
# Feature matrix restricted to the SFS-selected columns.
x_sfs = x[selected_sfs_features]

print("Original shape:", x.shape)
print("SFS-selected shape:", x_sfs.shape)

Training Lasso model

In [None]:
from sklearn.linear_model import LogisticRegression

# L1-penalised logistic regression; features whose coefficient ends up exactly
# zero are discarded in the next cell.
lasso_model = LogisticRegression(
    penalty="l1",
    solver="liblinear",
    max_iter=1000
)

# NOTE(review): fitted on the full x/y, before any train/test split.
lasso_model.fit(x, y)


In [None]:
import pandas as pd

# Label the fitted coefficients with their feature names.
lasso_coeffs = pd.Series(lasso_model.coef_[0], index=x.columns)

# Features with a non-zero weight are the ones Lasso kept.
has_weight = lasso_coeffs != 0
selected_lasso_features = lasso_coeffs.loc[has_weight].index.tolist()

print("How many features Lasso kept:", len(selected_lasso_features))
print("Lasso selected features:")
selected_lasso_features


In [None]:
# Feature matrix restricted to the Lasso-selected columns.
x_lasso = x[selected_lasso_features]

print("Original x shape:", x.shape)
print("x_lasso shape:", x_lasso.shape)


Tree-based Embedded Selection :Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest whose feature importances drive the tree-based selection;
# random_state is fixed so the ranking is repeatable.
rf_model = RandomForestClassifier(
    n_estimators=300,
    random_state=42
)

# NOTE(review): fitted on the full x/y, before any train/test split.
rf_model.fit(x, y)


In [None]:
# Importances labelled by feature name, most important first.
importance_by_feature = pd.Series(rf_model.feature_importances_, index=x.columns)
rf_importance = importance_by_feature.sort_values(ascending=False)

print("Top 15 features by Random Forest importance:")
rf_importance.head(15)


In [None]:
# Keep the top_n features by Random Forest importance (rf_importance is sorted).
top_n = 15
selected_tree_features = rf_importance.head(top_n).index.tolist()

x_tree = x[selected_tree_features]

# A bare expression in the middle of a cell is not displayed, so print the
# list explicitly to show it under its heading.
print("Selected tree-based features:")
print(selected_tree_features)

print("\nOriginal x shape:", x.shape)
print("x_tree shape:", x_tree.shape)


Feature selection done for each method

Model Training & Evaluation

Which model performs best, and which feature-selection method helps most?

Because this is a medical prediction task, we'll focus on:
Recall / Sensitivity (catch CKD cases)
Precision
F1
ROC-AUC
Confusion Matrix

dictionary of datasets

In [None]:
# One entry per feature-selection method: display name -> feature matrix.
# Every matrix is a column subset of the same x, so the rows line up with y.
datasets = {
    "All features (x)": x,
    "Correlation (x_corr)": x_corr,
    "Chi-square (x_chi2)": x_chi2,
    "RFE (x_rfe)": x_rfe,
    "SFS (x_sfs)": x_sfs,
    "Lasso (x_lasso)": x_lasso,
    "Tree importance (x_tree)": x_tree
}

# Shape of each candidate feature matrix.
{key: val.shape for key, val in datasets.items()}


Train/test split (stratified)

In [None]:
from sklearn.model_selection import train_test_split

# Train/test feature matrices for each dataset name.
X_train_dict, X_test_dict = {}, {}

# Every iteration passes the same y, test_size, random_state and stratify.
# y_train / y_test are rebound on each pass, so after the loop they hold the
# labels from the last iteration.
# NOTE(review): later cells reuse those labels for every dataset; this assumes
# identical arguments give the same row partition each time - confirm.
for name, Xset in datasets.items():
    X_train, X_test, y_train, y_test = train_test_split(
        Xset, y, test_size=0.2, random_state=42, stratify=y
    )
    X_train_dict[name] = X_train
    X_test_dict[name] = X_test

print("y_train distribution:\n", y_train.value_counts(normalize=True))
print("y_test distribution:\n", y_test.value_counts(normalize=True))


Define models to compare

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Candidate classifiers, keyed by display name.
# SVC gets probability=True so predict_proba exists for the ROC-AUC metric.
# NOTE(review): the SVC has no random_state, unlike the tree-based models.
models = {
    "LogReg": LogisticRegression(max_iter=1000, solver="liblinear"),
    "SVM (RBF)": SVC(probability=True),
    "Naive Bayes": GaussianNB(),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=300, random_state=42),
}


Train & evaluate all models on all datasets

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# One result row per (dataset, model) pair.
results = []

for dname in datasets.keys():
    X_train = X_train_dict[dname]
    X_test = X_test_dict[dname]

    # The same estimator objects are refitted for every dataset; y_train and
    # y_test are the labels left over from the splitting cell.
    for mname, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Probabilities for AUC (if available)
        if hasattr(model, "predict_proba"):
            y_prob = model.predict_proba(X_test)[:, 1]
        else:
            y_prob = None

        row = {
            "Dataset": dname,
            "Model": mname,
            "Accuracy": accuracy_score(y_test, y_pred),
            "Precision": precision_score(y_test, y_pred),
            "Recall": recall_score(y_test, y_pred),
            "F1": f1_score(y_test, y_pred),
            "ROC_AUC": roc_auc_score(y_test, y_prob) if y_prob is not None else np.nan
        }
        results.append(row)

# Rank by recall first, then F1, then accuracy.
results_df = pd.DataFrame(results).sort_values(["Recall", "F1", "Accuracy"], ascending=False)
results_df.head(15)


In [None]:
# Show the best configuration

In [None]:
# First row of the ranked table: the best (dataset, model) pair by
# recall, then F1, then accuracy.
best = results_df.iloc[0]
best


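Perfect scores on a single split can be misleading, so we applied stratified cross-validation to verify the model's generalization and to lessen the effect of one favourable split. Caveat: the scaling, imputation and feature selection above were fitted on the full dataset, so cross-validation alone does not remove that source of data leakage.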

In [None]:
from sklearn.model_selection import StratifiedKFold, cross_validate

# Shuffled, stratified 5-fold splitter with a fixed seed.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# random_state on the SVC seeds the internal shuffling used for its
# probability estimates, so the ROC-AUC figures are repeatable.
models_cv = {
    "Random Forest": RandomForestClassifier(n_estimators=300, random_state=42),
    "SVM (RBF)": SVC(probability=True, random_state=42)
}

for name, model in models_cv.items():
    scores = cross_validate(
        model,
        x_lasso,
        y,
        cv=cv,
        scoring=["accuracy", "precision", "recall", "f1", "roc_auc"]
    )

    # Report mean ± std of every test metric (skips fit_time / score_time).
    print(f"\n{name} on x_lasso:")
    for metric in scores:
        if metric.startswith("test_"):
            print(f"{metric}: {scores[metric].mean():.3f} ± {scores[metric].std():.3f}")


Cross-validation for ALL models on x_lasso


In [None]:
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import pandas as pd
import numpy as np

# Shuffled, stratified 5-fold splitter with a fixed seed.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# random_state on the SVC seeds the internal shuffling used for its
# probability estimates, so the ROC-AUC column is repeatable.
models_cv = {
    "Logistic Regression": LogisticRegression(max_iter=1000, solver="liblinear"),
    "Naive Bayes": GaussianNB(),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=300, random_state=42),
    "SVM (RBF)": SVC(kernel="rbf", probability=True, random_state=42)
}

# Output column name -> key of the matching array returned by cross_validate.
metric_keys = {
    "Accuracy": "test_accuracy",
    "Precision": "test_precision",
    "Recall": "test_recall",
    "F1": "test_f1",
    "ROC_AUC": "test_roc_auc",
}

cv_results = []

for model_name, model in models_cv.items():
    scores = cross_validate(
        model,
        x_lasso,
        y,
        cv=cv,
        scoring=["accuracy", "precision", "recall", "f1", "roc_auc"]
    )

    # One row per model, each metric formatted as "mean ± std" over the folds.
    summary_row = {"Model": model_name}
    for column, key in metric_keys.items():
        summary_row[column] = f"{scores[key].mean():.3f} ± {scores[key].std():.3f}"
    cv_results.append(summary_row)

cv_results_df = pd.DataFrame(cv_results)
cv_results_df


What makes THESE results credible:
✅ Stratified 5-fold cross-validation
✅ Mean ± standard deviation reported
✅ Multiple models compared
✅ Fixed feature set (x_lasso)
✅ Performance differences between models


Train final SVM and predict

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# Train-test split (same stratification)
X_train, X_test, y_train, y_test = train_test_split(
    x_lasso, y, test_size=0.2, random_state=42, stratify=y
)

# Final SVM model
# NOTE(review): no random_state is set on this SVC.
final_svm = SVC(kernel="rbf", probability=True)
final_svm.fit(X_train, y_train)

# Predictions
# y_prob is column 1 of predict_proba: the probability of class 1 (CKD).
y_pred = final_svm.predict(X_test)
y_prob = final_svm.predict_proba(X_test)[:, 1]


Confusion Matrix (Final Model)


In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Pin the class order so rows/columns always match the tick labels below,
# even if one class is absent from y_test or y_pred.
class_names = ["Not CKD", "CKD"]
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])

fig, ax = plt.subplots(figsize=(5, 4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
            xticklabels=class_names,
            yticklabels=class_names,
            ax=ax)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Confusion Matrix — SVM (Lasso features)")
plt.show()


ROC Curve

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score

# ROC points and area under the curve from the predicted CKD probabilities.
fpr, tpr, _ = roc_curve(y_test, y_prob)
auc_score = roc_auc_score(y_test, y_prob)

fig, ax = plt.subplots(figsize=(6, 5))
ax.plot(fpr, tpr, label=f"SVM (AUC = {auc_score:.3f})")
# Diagonal reference line: a classifier that guesses at random.
ax.plot([0, 1], [0, 1], linestyle="--", color="gray")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curve — SVM (Lasso features)")
ax.legend()
plt.show()


------------------------------------------

Calibrate the SVM

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.calibration import CalibratedClassifierCV

# Same split arguments as the final-SVM cell; rebinds X_train/X_test/y_train/y_test.
X_train, X_test, y_train, y_test = train_test_split(
    x_lasso, y, test_size=0.2, random_state=42, stratify=y
)

# Base SVM
svm = SVC(kernel="rbf")

# Calibrated SVM -> better probabilities
# (sigmoid calibration with 5-fold cross-validation on the training set)
final_model = CalibratedClassifierCV(svm, method="sigmoid", cv=5)
final_model.fit(X_train, y_train)

# Overwrites y_prob / y_pred from the uncalibrated SVM above.
y_prob = final_model.predict_proba(X_test)[:, 1]
y_pred = final_model.predict(X_test)


Risk tier function

In [None]:
def risk_tier(p):
    """Bucket a CKD probability into "Low", "Medium" or "High".

    Bands are half-open: p < 0.33 is "Low", p < 0.66 is "Medium", and
    everything else is "High". The cut-offs can be tuned later.
    """
    for upper_bound, label in ((0.33, "Low"), (0.66, "Medium")):
        if p < upper_bound:
            return label
    return "High"

def severity_score(p):
    """Convert a probability into an integer score (p * 100, rounded) for the UI."""
    percent = round(p * 100)
    return int(percent)


Create a “results” table for UI

In [None]:
import pandas as pd

# Start from the test features and attach labels, predictions and risk columns.
results_ui = X_test.copy()
# .values drops y_test's index so the labels are assigned by position.
results_ui["true_label"] = y_test.values
results_ui["pred_label"] = y_pred
results_ui["ckd_probability"] = y_prob
results_ui["risk_tier"] = [risk_tier(p) for p in y_prob]
results_ui["severity_score_0_100"] = [severity_score(p) for p in y_prob]

# Map labels to text for readability
results_ui["pred_text"] = results_ui["pred_label"].map({0: "Not CKD", 1: "CKD"})
results_ui["true_text"] = results_ui["true_label"].map({0: "Not CKD", 1: "CKD"})

results_ui[["true_text", "pred_text", "ckd_probability", "risk_tier", "severity_score_0_100"]].head(10)


In [None]:
def severity_label(p):
    """Describe a CKD probability as mild, moderate or severe risk.

    Uses the same 0.33 / 0.66 cut-offs as risk_tier, with wording aimed at
    display.
    """
    for upper_bound, label in ((0.33, "Mild risk"), (0.66, "Moderate risk")):
        if p < upper_bound:
            return label
    return "Severe risk"

# Attach the text severity label for every test row, then preview.
results_ui["severity_label"] = [severity_label(p) for p in y_prob]
results_ui[["pred_text", "ckd_probability", "severity_label"]].head(10)


----------------------------------------------------------------

Risk tier & severity functions

In [None]:
def risk_tier_clinical(p):
    """Bucket a CKD probability into five tiers, each 0.20 wide.

    Bands are half-open: p < 0.20 is "Very Low", then "Low", "Moderate" and
    "High"; anything at or above 0.80 is "Very High".
    """
    bands = (
        (0.20, "Very Low"),
        (0.40, "Low"),
        (0.60, "Moderate"),
        (0.80, "High"),
    )
    for upper_bound, label in bands:
        if p < upper_bound:
            return label
    return "Very High"

def severity_score(p):
    """Convert a probability into an integer score (p * 100, rounded) for the UI."""
    percent = round(p * 100)
    return int(percent)


Apply to final calibrated SVM output

In [None]:
# Rebuild the results table, this time with the five-band clinical tiers.
results_ui = X_test.copy()

# .values drops y_test's index so the labels are assigned by position.
results_ui["true_label"] = y_test.values
results_ui["pred_label"] = y_pred
results_ui["ckd_probability"] = y_prob

results_ui["risk_tier"] = results_ui["ckd_probability"].apply(risk_tier_clinical)
results_ui["severity_score_0_100"] = results_ui["ckd_probability"].apply(severity_score)

# Human-readable labels
results_ui["pred_text"] = results_ui["pred_label"].map({0: "Not CKD", 1: "CKD"})
results_ui["true_text"] = results_ui["true_label"].map({0: "Not CKD", 1: "CKD"})

results_ui[[
    "true_text",
    "pred_text",
    "ckd_probability",
    "risk_tier",
    "severity_score_0_100"
]].head(10)


---------------------------------------------------------------

In [None]:
import numpy as np
import pandas as pd
import joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.calibration import CalibratedClassifierCV

Define the exact feature set + mappings

In [None]:
# Feature names the deployed model consumes, in the order the model expects.
# NOTE(review): hard-coded rather than taken from selected_lasso_features -
# confirm the list matches what the Lasso step actually selected.
LASSO_FEATURES = ["sg", "al", "hemo", "pcv", "wc", "htn", "dm", "appet", "pe"]
# Features that get MinMax-scaled.
NUM_FEATURES   = ["sg", "al", "hemo", "pcv", "wc"]
# Features encoded through the maps below.
CAT_FEATURES   = ["htn", "dm", "appet", "pe"]

# Encoding maps (adjust if your encoding differs)
# Already-encoded values (0/1 as int or str) are accepted alongside the labels.
MAP_YESNO = {"no": 0, "yes": 1, 0: 0, 1: 1, "0": 0, "1": 1}
MAP_APPET = {"poor": 0, "good": 1, 0: 0, 1: 1, "0": 0, "1": 1}


Build a “raw-to-model-ready” transformer function

In [None]:
def transform_for_model(df_raw, scaler):
    """Turn raw feature rows into the numeric matrix layout the model expects.

    Parameters
    ----------
    df_raw : pandas.DataFrame
        Must contain every column named in LASSO_FEATURES; extra columns are
        ignored. The input frame is not modified.
    scaler : object with a ``transform`` method
        Already-fitted scaler applied to the NUM_FEATURES columns.

    Returns
    -------
    pandas.DataFrame
        Copy of the input restricted to LASSO_FEATURES (in that order), with
        categorical columns encoded as 0/1 and numeric columns scaled. Values
        that cannot be mapped or parsed come back as NaN.

    Raises
    ------
    ValueError
        If a required feature column is missing.
    """
    for c in LASSO_FEATURES:
        if c not in df_raw.columns:
            raise ValueError(f"Missing required feature: {c}")

    df = df_raw.copy()

    def _encode(series, mapping):
        # Normalise text labels first so "Yes", " yes" and "YES" all hit the
        # lower-case keys of the map instead of silently becoming NaN.
        normalised = series.map(
            lambda v: v.strip().lower() if isinstance(v, str) else v
        )
        return normalised.map(mapping)

    # Encode categorical
    for c in ("htn", "dm", "pe"):
        df[c] = _encode(df[c], MAP_YESNO)
    df["appet"] = _encode(df["appet"], MAP_APPET)

    # Safety: convert to numeric (unparseable cells become NaN)
    for c in NUM_FEATURES + CAT_FEATURES:
        df[c] = pd.to_numeric(df[c], errors="coerce")

    # Scale numeric
    df[NUM_FEATURES] = scaler.transform(df[NUM_FEATURES])

    # Return in correct order
    return df[LASSO_FEATURES]


In [None]:
# Use the raw loaded data and clean only the columns we need
raw_df = data.copy()

# strip and replace missing markers if needed
for col in raw_df.select_dtypes(include="object").columns:
    raw_df[col] = raw_df[col].astype(str).str.strip()
raw_df = raw_df.replace("?", np.nan)

# Force numeric for numeric UI inputs
for col in ["sg","al","hemo","pcv","wc"]:
    raw_df[col] = pd.to_numeric(raw_df[col], errors="coerce")

# Target
y_raw = raw_df["classification"].astype(str).str.strip().map({"ckd": 1, "notckd": 0})

# Keep only rows with known y
mask = y_raw.notna()
raw_df = raw_df[mask].copy()
y_raw = y_raw[mask].astype(int)

# Fill missing numeric with mean (simple, consistent)
for col in NUM_FEATURES:
    raw_df[col] = raw_df[col].fillna(raw_df[col].mean())

# Fill missing categorical with mode
for col in CAT_FEATURES:
    raw_df[col] = raw_df[col].fillna(raw_df[col].mode()[0])

raw_df[LASSO_FEATURES].head(), y_raw.value_counts()


In [None]:
from sklearn.impute import SimpleImputer

# 1. Start from raw data
X_raw = raw_df[LASSO_FEATURES].copy()

# 2. Encode categorical features safely
# (labels not present in the maps become NaN and are imputed in step 4)
X_raw["htn"] = X_raw["htn"].map(MAP_YESNO)
X_raw["dm"]  = X_raw["dm"].map(MAP_YESNO)
X_raw["pe"]  = X_raw["pe"].map(MAP_YESNO)
X_raw["appet"] = X_raw["appet"].map(MAP_APPET)

# 3. Force numeric conversion
for c in NUM_FEATURES + CAT_FEATURES:
    X_raw[c] = pd.to_numeric(X_raw[c], errors="coerce")

# CHECKPOINT - verify NaNs exist (for learning)
print("NaNs before imputation:\n", X_raw.isna().sum())

# 4. Impute missing values
# (mean for numeric columns, most frequent value for categorical ones)
num_imputer = SimpleImputer(strategy="mean")
cat_imputer = SimpleImputer(strategy="most_frequent")

X_raw[NUM_FEATURES] = num_imputer.fit_transform(X_raw[NUM_FEATURES])
X_raw[CAT_FEATURES] = cat_imputer.fit_transform(X_raw[CAT_FEATURES])

# CHECKPOINT - confirm NaNs are gone
print("\nNaNs after imputation:\n", X_raw.isna().sum())

# 5. Scale numeric features
# This scaler is the one saved in the artifact and reused by the UI.
scaler = MinMaxScaler()
X_raw[NUM_FEATURES] = scaler.fit_transform(X_raw[NUM_FEATURES])

# 6. Final model input
X_model = X_raw[LASSO_FEATURES].values
y_model = y_raw.values

# 7. Train calibrated SVM
# The deployment model is fitted on all labelled rows (no hold-out here).
base_svm = SVC(kernel="rbf")
model = CalibratedClassifierCV(base_svm, method="sigmoid", cv=5)
model.fit(X_model, y_model)

print("‚úÖ Trained calibrated SVM successfully (no NaNs).")


In [None]:
# Background data for SHAP (small sample to keep it fast)
bg = X_model[np.random.choice(len(X_model), size=min(80, len(X_model)), replace=False)]

artifact = {
    "model": model,
    "scaler": scaler,
    "features": LASSO_FEATURES,
    "num_features": NUM_FEATURES,
    "cat_features": CAT_FEATURES,
    "shap_background": bg
}

joblib.dump(artifact, "ckd_ui_artifacts.joblib")
print("Saved: ckd_ui_artifacts.joblib")


In [None]:
# NOTE(review): writes the same artifact to the same file as the previous
# cell; this looks redundant - confirm whether this cell is still needed.
joblib.dump(artifact, "ckd_ui_artifacts.joblib")


#UI

In [None]:
!pip -q install gradio shap joblib matplotlib


In [None]:
import numpy as np
import pandas as pd
import joblib
import gradio as gr
import shap
import matplotlib.pyplot as plt

# --- Load artifacts ---
# joblib files are pickle-based: only load an artifact file you trust.
artifact = joblib.load("ckd_ui_artifacts.joblib")
model = artifact["model"]
scaler = artifact["scaler"]
FEATURES = artifact["features"]
NUM_FEATURES = artifact["num_features"]
CAT_FEATURES = artifact["cat_features"]
# Background sample handed to the SHAP explainer below.
BG = artifact["shap_background"]

# --- Risk tier logic (granular, clinical-feeling) ---
def risk_tier_clinical(p):
    """Bucket a CKD probability into five tiers, each 0.20 wide.

    Bands are half-open: p < 0.20 is "Very Low", then "Low", "Moderate" and
    "High"; anything at or above 0.80 is "Very High".
    """
    bands = (
        (0.20, "Very Low"),
        (0.40, "Low"),
        (0.60, "Moderate"),
        (0.80, "High"),
    )
    for upper_bound, label in bands:
        if p < upper_bound:
            return label
    return "Very High"

def tier_color_hex(tier):
    """Return the badge colour (hex string) for a risk tier, green through red.

    Raises KeyError for a tier name that is not one of the five known tiers.
    """
    tiers = ("Very Low", "Low", "Moderate", "High", "Very High")
    shades = ("#2ecc71", "#27ae60", "#f1c40f", "#e67e22", "#e74c3c")
    palette = dict(zip(tiers, shades))
    return palette[tier]

def recommendation_text(tier):
    """Return the follow-up message shown for a risk tier.

    The wording is deliberately phrased as model output, not medical advice.
    Raises KeyError for an unknown tier name.
    """
    messages = {}
    messages["Very Low"] = "Model suggests CKD is unlikely. Continue routine health monitoring."
    messages["Low"] = "Low model-based risk. Consider periodic checkups if symptoms or risk factors exist."
    messages["Moderate"] = "Moderate model-based risk. Consider follow-up lab testing and clinical evaluation."
    messages["High"] = "High model-based risk. Clinical review and confirmatory testing are recommended."
    messages["Very High"] = "Very high model-based risk. Prompt clinical assessment and confirmatory testing are recommended."
    return messages[tier]

def severity_score(p):
    """Convert a probability into an integer score (p * 100, rounded)."""
    percent = round(p * 100)
    return int(percent)

# --- Build model input row ---
def build_model_row(sg, al, hemo, pcv, wc, htn, dm, appet, pe):
    """Turn the raw form inputs into one model-ready feature row.

    Numeric inputs are cast to float, the dropdown labels ("No"/"Yes",
    "Poor"/"Good") are encoded as 0/1, and the NUM_FEATURES columns are scaled
    with the loaded scaler.

    Returns a tuple: the 2-D value array ordered as FEATURES, and the
    single-row DataFrame it was taken from. Raises KeyError for an unknown
    dropdown label and TypeError/ValueError for a non-numeric number input.
    """
    yes_no_codes = {"No": 0, "Yes": 1}
    appetite_codes = {"Poor": 0, "Good": 1}

    record = {
        "sg": float(sg),
        "al": float(al),
        "hemo": float(hemo),
        "pcv": float(pcv),
        "wc": float(wc),
        "htn": yes_no_codes[htn],
        "dm": yes_no_codes[dm],
        "appet": appetite_codes[appet],
        "pe": yes_no_codes[pe],
    }
    row = pd.DataFrame([record])

    # Apply the scaling from the saved artifact to the numeric columns.
    scaled_numeric = scaler.transform(row[NUM_FEATURES])
    row[NUM_FEATURES] = scaled_numeric

    # Column order must match the order the model was trained with.
    model_input = row[FEATURES].values
    return model_input, row

# --- SHAP explainer (KernelExplainer works with any model, but can be slow) ---
# We'll initialize once to avoid repeated setup cost.
# The explained function is the probability of class 1 (CKD), so SHAP values
# are contributions to that probability; BG is the saved background sample.
explainer = shap.KernelExplainer(lambda z: model.predict_proba(z)[:, 1], BG)

def predict_ckd(sg, al, hemo, pcv, wc, htn, dm, appet, pe):
    """Gradio callback: score one patient and explain the prediction.

    Takes the nine form inputs and returns a 3-tuple matching the UI outputs:
    an HTML summary (risk tier badge, prediction, probability, severity score,
    recommendation), a DataFrame of SHAP contributions ordered by absolute
    impact, and a matplotlib figure of the eight largest contributions.

    Any exception is caught so the UI keeps running; the caller then receives
    an HTML error message and ``None`` for the table and the plot.
    """
    import html  # stdlib; imported locally so this cell needs no extra setup

    try:
        X_in, _ = build_model_row(sg, al, hemo, pcv, wc, htn, dm, appet, pe)

        prob_ckd = float(model.predict_proba(X_in)[:, 1][0])
        pred = int(model.predict(X_in)[0])

        tier = risk_tier_clinical(prob_ckd)
        color = tier_color_hex(tier)
        sev = severity_score(prob_ckd)

        # Pretty HTML output
        badge = f"""
        <div style="padding:12px;border-radius:12px;background:{color};color:white;font-weight:700;">
            Risk Tier: {tier}
        </div>
        """

        summary = f"""
        {badge}
        <p><b>Prediction:</b> {"CKD" if pred==1 else "Not CKD"}</p>
        <p><b>CKD probability:</b> {prob_ckd:.3f}</p>
        <p><b>Severity score:</b> {sev}/100</p>
        <p><b>Recommendation:</b> {recommendation_text(tier)}</p>
        <hr>
        <p style="font-size:0.9em;color:#555;">
        Disclaimer: This is a machine-learning demo for educational purposes only and is not medical advice.
        </p>
        """

        # --- SHAP explanation ---
        # nsamples is capped to keep the callback reasonably fast.
        shap_vals = explainer.shap_values(X_in, nsamples=200)
        shap_series = pd.Series(shap_vals[0], index=FEATURES).sort_values(key=abs, ascending=False)

        # Table output: shap_series is already ordered by absolute impact, so
        # no second sort is needed.
        shap_table = shap_series.to_frame("SHAP value").reset_index().rename(columns={"index": "Feature"})

        # Bar plot (top 8); reversed so the largest bar is drawn at the top.
        fig, ax = plt.subplots(figsize=(7, 4))
        shap_series.iloc[:8][::-1].plot(kind="barh", ax=ax)
        ax.set_title("Top feature contributions (SHAP)")
        ax.set_xlabel("Impact on CKD probability")
        fig.tight_layout()
        # Unregister the figure from pyplot so repeated clicks do not pile up
        # open figures; the Figure object itself is still returned for display.
        plt.close(fig)

        return summary, shap_table, fig

    except Exception as e:
        # Escape the message: it is inserted into HTML and may echo user input.
        return f"<p style='color:red;'><b>Error:</b> {html.escape(str(e))}</p>", None, None

# --- Gradio UI ---
# Two-column layout: inputs and the Predict button on the left, results on the right.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# ü©∫ CKD Risk Predictor (Model-based)")
    gr.Markdown("Educational demo only ‚Äî **not medical advice**. Outputs are *model-based risk tiers*, not clinical staging.")

    with gr.Row():
        with gr.Column():
            # Numeric inputs, each with a default value.
            sg = gr.Number(label="Specific Gravity (sg)", value=1.020)
            al = gr.Number(label="Albumin (al)", value=1.0)
            hemo = gr.Number(label="Hemoglobin (hemo)", value=12.0)
            pcv = gr.Number(label="Packed Cell Volume (pcv)", value=40.0)
            wc = gr.Number(label="White Blood Cell Count (wc)", value=8000.0)

            # Dropdown labels must match the keys used in build_model_row.
            htn = gr.Dropdown(["No", "Yes"], label="Hypertension (htn)", value="No")
            dm = gr.Dropdown(["No", "Yes"], label="Diabetes Mellitus (dm)", value="No")
            appet = gr.Dropdown(["Poor", "Good"], label="Appetite (appet)", value="Good")
            pe = gr.Dropdown(["No", "Yes"], label="Pedal Edema (pe)", value="No")

            btn = gr.Button("Predict")

        with gr.Column():
            output_html = gr.HTML(label="Result")
            shap_df = gr.Dataframe(label="SHAP Contributions (table)", interactive=False)
            shap_plot = gr.Plot(label="SHAP Bar Plot")

    # Input order must match predict_ckd's parameter order; the three outputs
    # match the tuple it returns.
    btn.click(
        predict_ckd,
        inputs=[sg, al, hemo, pcv, wc, htn, dm, appet, pe],
        outputs=[output_html, shap_df, shap_plot]
    )

# Start the app with debug=True.
demo.launch(debug=True)


In [None]:
# ===== CKD Gradio UI (Full Version: Risk + SHAP Direction + Download Report) =====

import numpy as np
import pandas as pd
import joblib
import gradio as gr
import shap
import matplotlib.pyplot as plt
from datetime import datetime

# ------------------ Load artifacts ------------------
# joblib files are pickle-based: only load an artifact file you trust.
artifact = joblib.load("ckd_ui_artifacts.joblib")
model = artifact["model"]
scaler = artifact["scaler"]
FEATURES = artifact["features"]
NUM_FEATURES = artifact["num_features"]
CAT_FEATURES = artifact["cat_features"]
# Background sample handed to the SHAP explainer below.
BG = artifact["shap_background"]

# ------------------ Risk tier logic (granular) ------------------
def risk_tier_clinical(p):
    """Bucket a CKD probability into five tiers, each 0.20 wide.

    Bands are half-open: p < 0.20 is "Very Low", then "Low", "Moderate" and
    "High"; anything at or above 0.80 is "Very High".
    """
    bands = (
        (0.20, "Very Low"),
        (0.40, "Low"),
        (0.60, "Moderate"),
        (0.80, "High"),
    )
    for upper_bound, label in bands:
        if p < upper_bound:
            return label
    return "Very High"

def tier_color_hex(tier):
    """Return the badge colour (hex string) for a risk tier.

    Raises KeyError for a tier name that is not one of the five known tiers.
    """
    tiers = ("Very Low", "Low", "Moderate", "High", "Very High")
    shades = ("#2ecc71", "#27ae60", "#f1c40f", "#e67e22", "#e74c3c")
    palette = dict(zip(tiers, shades))
    return palette[tier]

def recommendation_text(tier):
    """Return the follow-up message shown for a risk tier.

    Raises KeyError for an unknown tier name.
    """
    messages = {}
    messages["Very Low"] = "Model suggests CKD is unlikely. Continue routine health monitoring."
    messages["Low"] = "Low model-based risk. Consider periodic checkups if symptoms or risk factors exist."
    messages["Moderate"] = "Moderate model-based risk. Consider follow-up lab testing and clinical evaluation."
    messages["High"] = "High model-based risk. Clinical review and confirmatory testing are recommended."
    messages["Very High"] = "Very high model-based risk. Prompt clinical assessment and confirmatory testing are recommended."
    return messages[tier]

def severity_score(p):
    """Convert a probability into an integer score (p * 100, rounded)."""
    percent = round(p * 100)
    return int(percent)

# ------------------ Build model input ------------------
def build_model_row(sg, al, hemo, pcv, wc, htn, dm, appet, pe):
    """Turn the raw form inputs into one model-ready feature row.

    Numeric inputs are cast to float, the dropdown labels ("No"/"Yes",
    "Poor"/"Good") are encoded as 0/1, and the NUM_FEATURES columns are scaled
    with the loaded scaler.

    Returns the 2-D value array ordered as FEATURES. Raises KeyError for an
    unknown dropdown label and TypeError/ValueError for a non-numeric number
    input.
    """
    yes_no_codes = {"No": 0, "Yes": 1}
    appetite_codes = {"Poor": 0, "Good": 1}

    record = {
        "sg": float(sg),
        "al": float(al),
        "hemo": float(hemo),
        "pcv": float(pcv),
        "wc": float(wc),
        "htn": yes_no_codes[htn],
        "dm": yes_no_codes[dm],
        "appet": appetite_codes[appet],
        "pe": yes_no_codes[pe],
    }
    row = pd.DataFrame([record])

    # Apply the scaling from the saved artifact to the numeric columns.
    scaled_numeric = scaler.transform(row[NUM_FEATURES])
    row[NUM_FEATURES] = scaled_numeric

    # Column order must match the order the model was trained with.
    return row[FEATURES].values

# ------------------ SHAP explainer ------------------
# KernelExplainer is universal; BG keeps it fast enough for demo
# The explained function is the probability of class 1 (CKD), so SHAP values
# are contributions to that probability.
explainer = shap.KernelExplainer(lambda z: model.predict_proba(z)[:, 1], BG)

# ------------------ Prediction function ------------------
def predict_ckd(sg, al, hemo, pcv, wc, htn, dm, appet, pe):
    """Run one CKD prediction and assemble everything the UI displays.

    The arguments are the raw widget values and are forwarded unchanged to
    `build_model_row`.

    Returns
    -------
    A 4-tuple ``(summary_html, shap_table, fig, report_path)``:
      * summary_html -- HTML with the risk-tier badge, prediction, probability,
        severity score, recommendation and disclaimer;
      * shap_table   -- DataFrame with columns Feature / SHAP value / Direction /
        Arrow, sorted by absolute SHAP value (largest first);
      * fig          -- matplotlib Figure with a horizontal bar chart of the
        top contributions;
      * report_path  -- path of a CSV report written to the current working
        directory (inputs, prediction summary and SHAP table as sections).

    Any exception is caught and reported as ``(error_html, None, None, None)``
    so the UI shows a message instead of failing.
    """
    import html  # local import: only used to escape the error message

    fig = None
    try:
        X_in = build_model_row(sg, al, hemo, pcv, wc, htn, dm, appet, pe)

        prob_ckd = float(model.predict_proba(X_in)[:, 1][0])
        pred = int(model.predict(X_in)[0])

        tier = risk_tier_clinical(prob_ckd)
        color = tier_color_hex(tier)
        sev = severity_score(prob_ckd)
        pred_label = "CKD" if pred == 1 else "Not CKD"
        recommendation = recommendation_text(tier)

        badge = f"""
        <div style="padding:12px;border-radius:12px;background:{color};color:white;font-weight:700;">
            Risk Tier: {tier}
        </div>
        """

        summary_html = f"""
        {badge}
        <p><b>Prediction:</b> {pred_label}</p>
        <p><b>CKD probability:</b> {prob_ckd:.3f}</p>
        <p><b>Severity score:</b> {sev}/100</p>
        <p><b>Recommendation:</b> {recommendation}</p>
        <hr>
        <p style="font-size:0.9em;color:#555;">
        Disclaimer: This is a machine-learning demo for educational purposes only and is not medical advice.
        </p>
        """

        # ---------- SHAP values ----------
        shap_vals = explainer.shap_values(X_in, nsamples=200)
        # X_in holds a single row, so index 0 is that row's per-feature values.
        shap_series = pd.Series(shap_vals[0], index=FEATURES)

        # Direction helpers
        def direction_text(v):
            if v > 0:
                return "Toward CKD"
            elif v < 0:
                return "Away from CKD"
            else:
                return "Neutral"

        # The glyphs were stored as mojibake (UTF-8 decoded with the wrong
        # codec); they are written as escapes so they cannot be garbled again.
        def arrow(v):
            if v > 0:
                return "\u2b06\ufe0f"  # up arrow
            elif v < 0:
                return "\u2b07\ufe0f"  # down arrow
            return "\u2796"  # heavy minus sign

        shap_table = (
            shap_series.to_frame("SHAP value")
            .reset_index()
            .rename(columns={"index": "Feature"})
        )
        shap_table["Direction"] = shap_table["SHAP value"].apply(direction_text)
        shap_table["Arrow"] = shap_table["SHAP value"].apply(arrow)
        shap_table["AbsImpact"] = shap_table["SHAP value"].abs()
        shap_table = shap_table.sort_values("AbsImpact", ascending=False).drop(columns=["AbsImpact"])

        # ---------- SHAP plot ----------
        topn = 8
        top_feats = shap_table["Feature"].iloc[:topn].tolist()
        top_vals = shap_series[top_feats]

        fig, ax = plt.subplots(figsize=(7, 4))
        # Reverse so the largest contribution ends up at the top of the chart.
        top_vals[::-1].plot(kind="barh", ax=ax)
        ax.axvline(0, linewidth=1)
        # \u2212 is the typographic minus sign (previously mojibake).
        ax.set_title("Top feature contributions (SHAP)\n(+ pushes toward CKD, \u2212 pushes away)")
        ax.set_xlabel("Impact on CKD probability")
        # Lay out this figure explicitly rather than pyplot's global "current" one.
        fig.tight_layout()

        # ---------- Create downloadable report ----------
        ts = datetime.now().strftime("%Y%m%d_%H%M%S")
        report_path = f"ckd_report_{ts}.csv"

        inputs_df = pd.DataFrame([{
            "sg": sg, "al": al, "hemo": hemo, "pcv": pcv, "wc": wc,
            "htn": htn, "dm": dm, "appet": appet, "pe": pe
        }])

        summary_df = pd.DataFrame([{
            "prediction": pred_label,
            "ckd_probability": prob_ckd,
            "risk_tier": tier,
            "severity_score_0_100": sev,
            "recommendation": recommendation
        }])

        # Write one CSV with sections (simple + reliable)
        with open(report_path, "w", encoding="utf-8") as f:
            f.write("=== INPUTS ===\n")
        inputs_df.to_csv(report_path, mode="a", index=False)

        with open(report_path, "a", encoding="utf-8") as f:
            f.write("\n=== PREDICTION SUMMARY ===\n")
        summary_df.to_csv(report_path, mode="a", index=False)

        with open(report_path, "a", encoding="utf-8") as f:
            f.write("\n=== SHAP EXPLANATION ===\n")
        shap_table.to_csv(report_path, mode="a", index=False)

        return summary_html, shap_table, fig, report_path

    except Exception as e:
        # Escape the message: exception text may contain characters such as
        # "<" that would otherwise be interpreted as HTML.
        return f"<p style='color:red;'><b>Error:</b> {html.escape(str(e))}</p>", None, None, None

    finally:
        # pyplot keeps every figure it creates alive until it is closed, so a
        # long-running app would leak one per click. Closing only unregisters
        # it from pyplot; the returned Figure object can still be rendered.
        if fig is not None:
            plt.close(fig)

# ------------------ Gradio UI ------------------
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    # The emoji and the dash were stored as mojibake (UTF-8 decoded with the
    # wrong codec); they are written as escapes so they cannot be garbled again.
    gr.Markdown("# \U0001FA7A CKD Risk Predictor (Model-based)")  # stethoscope emoji
    gr.Markdown(
        "Educational demo only \u2014 **not medical advice**. "  # em dash
        "Outputs are *model-based risk tiers*, not clinical staging."
    )

    with gr.Row():
        # Left column: the nine model inputs plus the trigger button.
        with gr.Column():
            sg = gr.Number(label="Specific Gravity (sg)", value=1.020)
            al = gr.Number(label="Albumin (al)", value=1.0)
            hemo = gr.Number(label="Hemoglobin (hemo)", value=12.0)
            pcv = gr.Number(label="Packed Cell Volume (pcv)", value=40.0)
            wc = gr.Number(label="White Blood Cell Count (wc)", value=8000.0)

            # Choice labels must match the keys that build_model_row maps to 0/1.
            htn = gr.Dropdown(["No", "Yes"], label="Hypertension (htn)", value="No")
            dm = gr.Dropdown(["No", "Yes"], label="Diabetes Mellitus (dm)", value="No")
            appet = gr.Dropdown(["Poor", "Good"], label="Appetite (appet)", value="Good")
            pe = gr.Dropdown(["No", "Yes"], label="Pedal Edema (pe)", value="No")

            btn = gr.Button("Predict")

        # Right column: one component per element of predict_ckd's return tuple.
        with gr.Column():
            output_html = gr.HTML(label="Result")
            shap_df = gr.Dataframe(label="SHAP Contributions (with direction)", interactive=False)
            shap_plot = gr.Plot(label="SHAP Bar Plot")
            report_file = gr.File(label="Download Report (CSV)")

    # Input order must match predict_ckd's positional parameters.
    btn.click(
        predict_ckd,
        inputs=[sg, al, hemo, pcv, wc, htn, dm, appet, pe],
        outputs=[output_html, shap_df, shap_plot, report_file]
    )

demo.launch(debug=True)
