In [2]:
import pandas as pd 

In [3]:
df=pd.read_csv("kidney_disease.csv")
df.head()

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,...,38,6000,,no,no,no,good,no,no,ckd
2,2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,...,31,7500,,no,yes,no,poor,no,yes,ckd
3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,...,35,7300,4.6,no,no,no,good,no,no,ckd


In [4]:
df.shape

(400, 26)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 26 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              400 non-null    int64  
 1   age             391 non-null    float64
 2   bp              388 non-null    float64
 3   sg              353 non-null    float64
 4   al              354 non-null    float64
 5   su              351 non-null    float64
 6   rbc             248 non-null    object 
 7   pc              335 non-null    object 
 8   pcc             396 non-null    object 
 9   ba              396 non-null    object 
 10  bgr             356 non-null    float64
 11  bu              381 non-null    float64
 12  sc              383 non-null    float64
 13  sod             313 non-null    float64
 14  pot             312 non-null    float64
 15  hemo            348 non-null    float64
 16  pcv             330 non-null    object 
 17  wc              295 non-null    obj

In [6]:
df.isnull().sum()

id                  0
age                 9
bp                 12
sg                 47
al                 46
su                 49
rbc               152
pc                 65
pcc                 4
ba                  4
bgr                44
bu                 19
sc                 17
sod                87
pot                88
hemo               52
pcv                70
wc                105
rc                130
htn                 2
dm                  2
cad                 2
appet               1
pe                  1
ane                 1
classification      0
dtype: int64

In [8]:
df.columns

Index(['age', 'bp', 'sg', 'al', 'su', 'rbc', 'pc', 'pcc', 'ba', 'bgr', 'bu',
       'sc', 'sod', 'pot', 'hemo', 'pcv', 'wc', 'rc', 'htn', 'dm', 'cad',
       'appet', 'pe', 'ane', 'classification'],
      dtype='object')

In [7]:
df.select_dtypes(include=['object']).columns

Index(['rbc', 'pc', 'pcc', 'ba', 'pcv', 'wc', 'rc', 'htn', 'dm', 'cad',
       'appet', 'pe', 'ane', 'classification'],
      dtype='object')

In [8]:
df["classification"].value_counts()

classification
ckd       248
notckd    150
ckd\t       2
Name: count, dtype: int64

In [9]:
df["al"].value_counts()

al
0.0    199
1.0     44
2.0     43
3.0     43
4.0     24
5.0      1
Name: count, dtype: int64

In [4]:
df.drop(columns=['id'], inplace=True)
for col in ['pcv', 'wc', 'rc']:
    df[col] = pd.to_numeric(df[col], errors='coerce')  
for col in df.select_dtypes(include=['float64']).columns:
    df[col] = df[col].fillna(df[col].median())  
for col in df.select_dtypes(include=['object']).columns:
    df[col] = df[col].fillna(df[col].mode()[0])  
df.isnull().sum().sum()  

0

Encoding and standardization

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

categorical_cols = df.select_dtypes(include=['object']).columns
label_encoders = {}

for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le  

X = df.drop(columns=['classification'])  
y = df['classification']  

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42, stratify=y)

X_train.shape, X_test.shape, y_train.shape, y_test.shape


((320, 24), (80, 24), (320,), (80,))

In [7]:
X_train

array([[ 0.4384865 , -1.23024623,  0.4214856 , ..., -0.50780078,
        -0.4843221 , -0.42008403],
       [ 0.49744266, -0.48801623,  0.4214856 , ..., -0.50780078,
        -0.4843221 , -0.42008403],
       [ 1.08700434, -0.48801623, -0.4997944 , ...,  1.96927621,
         2.0647416 , -0.42008403],
       ...,
       [ 0.14370566,  0.25421378,  0.4214856 , ...,  1.96927621,
         2.0647416 , -0.42008403],
       [ 0.49744266,  0.25421378,  0.4214856 , ..., -0.50780078,
        -0.4843221 , -0.42008403],
       [ 0.91013583,  0.99644378,  0.4214856 , ..., -0.50780078,
        -0.4843221 , -0.42008403]])

Logistic regression

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

log_reg = LogisticRegression(random_state=42)
log_reg.fit(X_train, y_train)

y_pred = log_reg.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

accuracy,classification_rep


(0.975,
 '              precision    recall  f1-score   support\n\n           0       1.00      0.96      0.98        50\n           2       0.94      1.00      0.97        30\n\n    accuracy                           0.97        80\n   macro avg       0.97      0.98      0.97        80\nweighted avg       0.98      0.97      0.98        80\n')

In [13]:
from sklearn.metrics import confusion_matrix
cm=confusion_matrix(y_test, y_pred)
cm

array([[48,  2],
       [ 0, 30]], dtype=int64)

forward selection

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SequentialFeatureSelector
log_reg1 = LogisticRegression(max_iter=500)
sfs = SequentialFeatureSelector(log_reg1, n_features_to_select=10, direction="forward",cv=2)
sfs.fit(X_train, y_train)
X_train_df = pd.DataFrame(X_train, columns=['age', 'bp', 'sg', 'al', 'su', 'rbc', 'pc', 'pcc', 'ba', 'bgr', 'bu',
       'sc', 'sod', 'pot', 'hemo', 'pcv', 'wc', 'rc', 'htn', 'dm', 'cad',
       'appet', 'pe', 'ane'])
selected_features_forward = X_train_df.columns[sfs.get_support()]
print("\nSelected Features (Forward Selection):\n", list(selected_features_forward))


Selected Features (Forward Selection):
 ['age', 'sg', 'su', 'rbc', 'pcc', 'ba', 'bu', 'sc', 'hemo', 'htn']


Backward elimination

In [11]:
from sklearn.feature_selection import RFE
rfe = RFE(estimator=log_reg1, n_features_to_select=10)
rfe.fit(X_train, y_train)
X_train_df = pd.DataFrame(X_train, columns=X.columns)
selected_features_backward = X_train_df.columns[rfe.get_support()]
print("\nSelected Features (Backward Elimination):\n", list(selected_features_backward))


Selected Features (Backward Elimination):
 ['sg', 'al', 'su', 'sc', 'hemo', 'pcv', 'rc', 'htn', 'appet', 'pe']


In [14]:
df.value_counts()

age   bp    sg     al   su   rbc  pc  pcc  ba  bgr    bu     sc   sod    pot  hemo   pcv   wc      rc   htn  dm  cad  appet  pe  ane  classification
2.0   80.0  1.010  3.0  0.0  1    0   0    0   121.0  42.0   1.3  138.0  4.4  12.65  40.0  8000.0  4.8  0    3   1    0      1   0    0                 1
60.0  90.0  1.015  3.0  0.0  1    1   0    0   74.0   25.0   1.1  142.0  3.2  12.20  39.0  7800.0  4.4  1    4   1    0      1   0    0                 1
61.0  80.0  1.015  2.0  0.0  0    0   0    0   173.0  148.0  3.9  135.0  5.2  7.70   24.0  9200.0  3.2  1    4   2    1      1   1    0                 1
                   0.0  4.0  1    1   0    0   360.0  19.0   0.7  137.0  4.4  15.20  44.0  8300.0  5.2  1    4   1    0      0   0    0                 1
      70.0  1.025  0.0  0.0  1    1   0    0   133.0  38.0   1.0  142.0  3.6  13.70  47.0  9200.0  4.9  0    3   1    0      0   0    2                 1
                                                                                 

Random Forest 

In [15]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(random_state=42)    
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)
accuracy,classification_rep

(0.975,
 '              precision    recall  f1-score   support\n\n           0       0.96      1.00      0.98        50\n           2       1.00      0.93      0.97        30\n\n    accuracy                           0.97        80\n   macro avg       0.98      0.97      0.97        80\nweighted avg       0.98      0.97      0.97        80\n')