In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_selection import SequentialFeatureSelector, RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [2]:
# Colab-only step: prompts for a file upload through the notebook UI.
# The cell output shows the chosen file being saved as kidney_disease.csv,
# which is the path the next cell reads.
# NOTE(review): this import fails outside Google Colab — confirm the intended runtime.
from google.colab import files
uploaded = files.upload()

Saving kidney_disease.csv to kidney_disease.csv


In [3]:
# Load the uploaded kidney-disease table; every later cell works on `df`.
df = pd.read_csv(filepath_or_buffer='kidney_disease.csv')

In [4]:
# Preview the first five raw rows before any cleaning.
df.head(n=5)

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,...,38,6000,,no,no,no,good,no,no,ckd
2,2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,...,31,7500,,no,yes,no,poor,no,yes,ckd
3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,...,35,7300,4.6,no,no,no,good,no,no,ckd


In [5]:
# Normalise header names in two steps: trim surrounding whitespace, then lowercase.
trimmed_names = df.columns.str.strip()
df.columns = trimmed_names.str.lower()

In [6]:
def _strip_if_text(column):
    """Return `column` with surrounding whitespace removed when it has object dtype.

    Columns of any other dtype are returned unchanged.
    """
    if column.dtype == "object":
        return column.str.strip()
    return column


# Apply the whitespace clean-up column by column.
df = df.apply(_strip_if_text)

In [7]:
# Count nulls per column so the imputation needs are visible.
missing_per_column = df.isnull().sum()
print("\nMissing Values:\n", missing_per_column)


Missing Values:
 id                  0
age                 9
bp                 12
sg                 47
al                 46
su                 49
rbc               152
pc                 65
pcc                 4
ba                  4
bgr                44
bu                 19
sc                 17
sod                87
pot                88
hemo               52
pcv                70
wc                105
rc                130
htn                 2
dm                  2
cad                 2
appet               1
pe                  1
ane                 1
classification      0
dtype: int64


In [8]:
# Print dtype and non-null count for every column; used to spot numeric
# measurements that were read as object (text) and need coercion.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 26 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              400 non-null    int64  
 1   age             391 non-null    float64
 2   bp              388 non-null    float64
 3   sg              353 non-null    float64
 4   al              354 non-null    float64
 5   su              351 non-null    float64
 6   rbc             248 non-null    object 
 7   pc              335 non-null    object 
 8   pcc             396 non-null    object 
 9   ba              396 non-null    object 
 10  bgr             356 non-null    float64
 11  bu              381 non-null    float64
 12  sc              383 non-null    float64
 13  sod             313 non-null    float64
 14  pot             312 non-null    float64
 15  hemo            348 non-null    float64
 16  pcv             330 non-null    object 
 17  wc              295 non-null    obj

In [9]:
# Columns handled as categories (mode-imputed, then label-encoded further down).
categorical_cols = ["rbc", "pc", "pcc", "ba", "htn", "dm", "cad", "appet", "pe", "ane"]
# Columns handled as numbers (median-imputed, then standardised further down).
numerical_cols = ["age", "bp", "sg", "al", "su", "bgr", "bu", "sc", "sod", "pot", "hemo", "pcv", "wc", "rc"]

# Force these three columns to numeric; errors="coerce" turns any entry that
# cannot be parsed into NaN so it is filled by the imputation step.
for text_col in ("pcv", "wc", "rc"):
    df[text_col] = pd.to_numeric(df[text_col], errors="coerce")

In [10]:
# Fill missing numeric values with each column's median.
# NOTE(review): the imputer is fit on all rows before the train/test split
# performed later, so test rows contribute to the medians (data leakage).
num_imputer = SimpleImputer(strategy="median")
num_imputer.fit(df[numerical_cols])
df[numerical_cols] = num_imputer.transform(df[numerical_cols])

In [11]:
# Fill missing categorical values with each column's most frequent value
# (`mode()[0]` picks the first mode when there are ties).
mode_by_col = {col: df[col].mode()[0] for col in categorical_cols}
df[categorical_cols] = df[categorical_cols].fillna(mode_by_col)

In [12]:
# Total number of null cells left in the whole frame; expected to be zero here.
total_missing = df.isnull().sum().sum()
print("\nMissing Values After Imputation:\n", total_missing)


Missing Values After Imputation:
 0


In [13]:
from sklearn.preprocessing import OneHotEncoder

# Age bucket edges and the label for each bucket.
# The labels describe right-inclusive ranges ("0-20", "21-40", ...), so the
# intervals must be (0, 20], (20, 40], ... for the labels to be truthful.
bins = [0, 20, 40, 60, 80, 100]
labels = ["0-20", "21-40", "41-60", "61-80", "81-100"]

# Create a new column with age categories.
# right=True closes each interval on its upper edge so that age 20 lands in
# "0-20" (not "21-40") and age 100 lands in "81-100" (instead of becoming NaN);
# include_lowest=True keeps age 0 inside the first bucket.
df["age_group"] = pd.cut(
    df["age"],
    bins=bins,
    labels=labels,
    right=True,
    include_lowest=True,
)

In [14]:
# Preview the first five rows, now including the derived age_group column.
df.head(n=5)

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,wc,rc,htn,dm,cad,appet,pe,ane,classification,age_group
0,0,48.0,80.0,1.02,1.0,0.0,normal,normal,notpresent,notpresent,...,7800.0,5.2,yes,yes,no,good,no,no,ckd,41-60
1,1,7.0,50.0,1.02,4.0,0.0,normal,normal,notpresent,notpresent,...,6000.0,4.8,no,no,no,good,no,no,ckd,0-20
2,2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,...,7500.0,4.8,no,yes,no,poor,no,yes,ckd,61-80
3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,...,6700.0,3.9,yes,no,no,poor,yes,yes,ckd,41-60
4,4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,...,7300.0,4.6,no,no,no,good,no,no,ckd,41-60


In [15]:
# Replace age_group with one indicator column per bucket, named "Age_<label>".
df = pd.get_dummies(data=df, prefix="Age", columns=["age_group"])
df.head(n=5)

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,cad,appet,pe,ane,classification,Age_0-20,Age_21-40,Age_41-60,Age_61-80,Age_81-100
0,0,48.0,80.0,1.02,1.0,0.0,normal,normal,notpresent,notpresent,...,no,good,no,no,ckd,False,False,True,False,False
1,1,7.0,50.0,1.02,4.0,0.0,normal,normal,notpresent,notpresent,...,no,good,no,no,ckd,True,False,False,False,False
2,2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,...,no,poor,no,yes,ckd,False,False,False,True,False
3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,...,no,poor,yes,yes,ckd,False,False,True,False,False
4,4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,...,no,good,no,no,ckd,False,False,True,False,False


In [16]:
# Binary target: 1 = chronic kidney disease, 0 = not.
# Any label not present in the mapping becomes NaN after .map().
target_codes = {"ckd": 1, "notckd": 0}
df["classification"] = df["classification"].map(target_codes)

In [17]:
# Convert every categorical column to integer codes.
# The same LabelEncoder instance is refit for each column, so once the loop
# ends it only holds the classes of the last column processed.
encoder = LabelEncoder()
for col in categorical_cols:
    encoded = encoder.fit_transform(df[col])
    df[col] = encoded

In [18]:
# Standardise the numeric columns (zero mean, unit variance).
# NOTE(review): the scaler is fit on all rows before the train/test split, so
# test rows influence the scaling statistics (data leakage). "age" is also
# scaled here even though it is dropped from the feature matrix afterwards.
scaler = StandardScaler()
scaler.fit(df[numerical_cols])
df[numerical_cols] = scaler.transform(df[numerical_cols])

In [19]:
# Feature matrix: everything except the row id, the raw age (represented by the
# Age_* indicator columns instead) and the target itself.
non_feature_cols = ["id", "age", "classification"]
X = df.drop(columns=non_feature_cols, errors="ignore")
y = df["classification"]

print("\nShape of X (Features):", X.shape)
print("Shape of y (Target):", y.shape)

# 80/20 split, stratified on the target so both parts keep the class ratio;
# the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)



Shape of X (Features): (400, 28)
Shape of y (Target): (400,)


In [20]:
# Display (rows, columns) of the fully processed frame as a sanity check.
df.shape

(400, 31)

In [25]:
# Base estimator used by both feature-selection cells.
log_reg = LogisticRegression(max_iter=500)

# Forward selection: start with no features and add one at a time until 10 are chosen.
sfs = SequentialFeatureSelector(
    estimator=log_reg,
    n_features_to_select=10,
    direction="forward",
)
sfs.fit(X_train, y_train)

# Boolean mask over the training columns -> names of the retained features.
forward_mask = sfs.get_support()
selected_features_forward = X_train.columns[forward_mask]
print("\nSelected Features (Forward Selection):\n", selected_features_forward.tolist())


Selected Features (Forward Selection):
 ['al', 'su', 'rbc', 'pc', 'pcc', 'ba', 'pot', 'hemo', 'dm', 'cad']


In [26]:
# Recursive feature elimination with the same logistic-regression estimator,
# keeping 10 features. The result is printed for comparison only; the
# modelling cells that follow use the forward-selection set.
rfe = RFE(estimator=log_reg, n_features_to_select=10)
rfe.fit(X_train, y_train)

backward_mask = rfe.get_support()
selected_features_backward = X_train.columns[backward_mask]
print("\nSelected Features (Backward Elimination):\n", selected_features_backward.tolist())


Selected Features (Backward Elimination):
 ['sg', 'al', 'su', 'sc', 'hemo', 'pcv', 'rc', 'htn', 'dm', 'appet']


In [27]:
# Restrict both splits to the features chosen by forward selection.
X_train_fs = X_train.loc[:, selected_features_forward]
X_test_fs = X_test.loc[:, selected_features_forward]

In [28]:
# Candidate classifiers, keyed by the display name used in the results below.
models = {
    # max_iter matches the LogisticRegression used for feature selection, so
    # both fits share the same convergence budget.
    "Logistic Regression": LogisticRegression(max_iter=500),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    # probability=True enables an internal cross-validated calibration that
    # shuffles the data; random_state pins it so reruns are reproducible.
    "Support Vector Machine": SVC(kernel="linear", probability=True, random_state=42),
}

In [29]:
# Test-set accuracy per model name.
results = {}

for name, model in models.items():
    print(f"\nTraining {name}...")

    # Fit on the selected training features, then score on the held-out split.
    y_pred = model.fit(X_train_fs, y_train).predict(X_test_fs)
    acc = accuracy_score(y_true=y_test, y_pred=y_pred)
    print(f"{name} Accuracy: {acc:.4f}")

    results[name] = acc


Training Logistic Regression...
Logistic Regression Accuracy: 0.9625

Training Random Forest...
Random Forest Accuracy: 0.9875

Training Support Vector Machine...
Support Vector Machine Accuracy: 0.9375


In [31]:
# Pick the model with the highest test accuracy (first one wins on ties).
# NOTE(review): the winner is chosen and then reported on the same test split,
# so the reported metrics are optimistic.
best_model_name = max(results.keys(), key=lambda model_name: results[model_name])
best_model = models[best_model_name]
y_pred_best = best_model.predict(X_test_fs)

best_report = classification_report(y_test, y_pred_best)
best_confusion = confusion_matrix(y_test, y_pred_best)

print(f"\nBest Model: {best_model_name}")
print("\nClassification Report:\n", best_report)
print("\nConfusion Matrix:\n", best_confusion)


Best Model: Random Forest

Classification Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.98        30
           1       1.00      0.98      0.99        50

    accuracy                           0.99        80
   macro avg       0.98      0.99      0.99        80
weighted avg       0.99      0.99      0.99        80


Confusion Matrix:
 [[30  0]
 [ 1 49]]
