# ✅ CKD Model Training with Calibration and Class Balance
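This notebook trains a class-weighted, probability-calibrated logistic regression model for chronic kidney disease (CKD) detection. It covers data cleaning, feature selection, scaling, evaluation, and saving the fitted artifacts.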

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import classification_report, confusion_matrix
import joblib
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [None]:
# Read the raw CKD table from disk.
df = pd.read_csv("../data/kidney_disease.csv")

# Expand the abbreviated column headers into descriptive names; the target
# column 'classification' becomes 'class'.
column_names = {
    'bp': 'blood pressure',
    'sg': 'specific gravity',
    'al': 'albumin',
    'su': 'sugar',
    'rbc': 'red blood cells',
    'pc': 'pus cell',
    'pcc': 'pus cell clumps',
    'ba': 'bacteria',
    'bgr': 'blood glucose random',
    'bu': 'blood urea',
    'sc': 'serum creatinine',
    'sod': 'sodium',
    'pot': 'potassium',
    'hemo': 'haemoglobin',
    'pcv': 'packed cell volume',
    'wc': 'white blood cell count',
    'rc': 'red blood cell count',
    'htn': 'hypertension',
    'dm': 'diabetes mellitus',
    'cad': 'coronary artery disease',
    'appet': 'appetite',
    'pe': 'pedal edema',
    'ane': 'anemia',
    'classification': 'class',
}
df = df.rename(columns=column_names)

In [None]:
# Basic cleaning
# The row identifier carries no predictive information.
df = df.drop(columns=['id'], errors='ignore')

# Categorical tokens -> binary codes; "?" marks a missing value.
value_map = {
    "yes": 1, "no": 0, "ckd": 1, "notckd": 0,
    "normal": 1, "abnormal": 0, "present": 1, "notpresent": 0,
    "good": 1, "poor": 0, "?": np.nan,
}

# Convert object columns to numeric
for col in df.select_dtypes(include=['object']).columns:
    # Strip stray whitespace BEFORE mapping. Otherwise padded tokens such as
    # "ckd\t", " yes" or "\tno" miss the lookup, get coerced to NaN below and
    # are then silently replaced by an imputed value.
    cleaned = df[col].map(lambda v: v.strip() if isinstance(v, str) else v)
    cleaned = cleaned.replace(value_map)
    # Anything still non-numeric (unknown tokens, empty strings) becomes NaN.
    df[col] = pd.to_numeric(cleaned, errors='coerce')

# A row whose label could not be parsed has no usable target; drop it rather
# than letting the imputation below invent a label for it.
if 'class' in df.columns:
    df = df.dropna(subset=['class'])

# Fill missing values
# Median first; every column is numeric at this point.
df = df.fillna(df.median(numeric_only=True))
# Columns that are entirely NaN have no median: fall back to the mode, or to
# the sentinel -1 when there is no mode either.
for col in df.columns[df.isna().any()]:
    modes = df[col].mode()
    df[col] = df[col].fillna(modes.iloc[0] if not modes.empty else -1)

In [None]:
# Separate the predictors from the integer-coded target.
y = df['class'].astype(int)
X = df.drop(columns=['class'])

# Keep the 10 features with the highest chi-squared score against the target.
# NOTE(review): the selector is fitted on the full dataset, before the
# train/test split in the next cell - confirm this is acceptable, since the
# held-out rows influence which features are chosen.
selector = SelectKBest(score_func=chi2, k=10)
selector.fit(X, y)
X_selected = selector.transform(X)

# Recover the names of the retained columns from the boolean support mask.
support_mask = selector.get_support()
selected_features = X.columns[support_mask]
print("Top 10 Features:", selected_features.tolist())

In [None]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both partitions; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_selected,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize the selected features. The scaler is fitted on the training
# partition only and then applied to both partitions.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Persist the fitted scaler to disk.
joblib.dump(scaler, "scaler.pkl")


In [None]:
# Class-weighted logistic regression, wrapped in 5-fold cross-validated
# probability calibration.
base_model = LogisticRegression(
    class_weight='balanced',
    max_iter=10000,
    random_state=42,
)
calibrated_model = CalibratedClassifierCV(base_model, cv=5)
calibrated_model.fit(X_train, y_train)

# Class probabilities and hard labels for the held-out partition.
y_prob = calibrated_model.predict_proba(X_test)
y_pred = calibrated_model.predict(X_test)

# Per-class precision / recall / F1 summary.
report = classification_report(y_test, y_pred)
print("Classification Report:", report, sep="\n")

In [None]:
# Persist the fitted calibrated classifier and the fitted feature selector.
model_path = "calibrated_ckd_model.pkl"
selector_path = "select_k_best.pkl"
joblib.dump(calibrated_model, model_path)
joblib.dump(selector, selector_path)

In [None]:
# Confusion matrix of the test-set predictions, drawn as an annotated heatmap
# with integer cell counts.
conf_mat = confusion_matrix(y_test, y_pred)
ax = sns.heatmap(conf_mat, cmap="Blues", annot=True, fmt="d")
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()