In [336]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import joblib
import numpy as np

In [338]:
df = pd.read_csv("../data/kidney_disease.csv")

In [340]:
df.head()

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,...,38,6000,,no,no,no,good,no,no,ckd
2,2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,...,31,7500,,no,yes,no,poor,no,yes,ckd
3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,...,35,7300,4.6,no,no,no,good,no,no,ckd


In [342]:
len(df)

400

In [344]:
df = df.replace({
    "yes": 1, "no": 0,
    "ckd": 1, "notckd": 0,
    "normal": 1, "abnormal": 0,
    "present": 1, "notpresent": 0,
    "good": 1, "poor": 0
})


  df = df.replace({


### Special cases: Tabs and other anomalities

In [347]:
df = df.replace({'\tyes': 1, '\tno': 0, '?': np.nan})

  df = df.replace({'\tyes': 1, '\tno': 0, '?': np.nan})


In [349]:
df = df.drop(columns=['id'])

In [351]:
df.head()

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,48.0,80.0,1.02,1.0,0.0,,1.0,0.0,0.0,121.0,...,44,7800,5.2,1.0,1,0.0,1.0,0.0,0.0,1
1,7.0,50.0,1.02,4.0,0.0,,1.0,0.0,0.0,,...,38,6000,,0.0,0,0.0,1.0,0.0,0.0,1
2,62.0,80.0,1.01,2.0,3.0,1.0,1.0,0.0,0.0,423.0,...,31,7500,,0.0,1,0.0,0.0,0.0,1.0,1
3,48.0,70.0,1.005,4.0,0.0,1.0,0.0,1.0,0.0,117.0,...,32,6700,3.9,1.0,0,0.0,0.0,1.0,1.0,1
4,51.0,80.0,1.01,2.0,0.0,1.0,1.0,0.0,0.0,106.0,...,35,7300,4.6,0.0,0,0.0,1.0,0.0,0.0,1


In [353]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 25 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             391 non-null    float64
 1   bp              388 non-null    float64
 2   sg              353 non-null    float64
 3   al              354 non-null    float64
 4   su              351 non-null    float64
 5   rbc             248 non-null    float64
 6   pc              335 non-null    float64
 7   pcc             396 non-null    float64
 8   ba              396 non-null    float64
 9   bgr             356 non-null    float64
 10  bu              381 non-null    float64
 11  sc              383 non-null    float64
 12  sod             313 non-null    float64
 13  pot             312 non-null    float64
 14  hemo            348 non-null    float64
 15  pcv             330 non-null    object 
 16  wc              295 non-null    object 
 17  rc              270 non-null    obj

We will convert all the columns object type to numeric types

In [356]:
for col in df.select_dtypes(include=['object']).columns:
    # First, clean the strings by removing extra spaces and replacing "?" with NaN
    df[col] = df[col].astype(str).str.strip().replace('?', np.nan).replace(' ?', np.nan)
    # Then convert to numeric, coercing errors to NaN
    df[col] = pd.to_numeric(df[col], errors='coerce')

In [358]:
num_cols = df.select_dtypes(include=[np.number]).columns
df[num_cols] = df[num_cols].apply(lambda x: x.fillna(x.median()))

In [360]:
for col in df.columns[df.isna().any()]:
    df[col] = df[col].fillna(df[col].mode()[0] if not df[col].mode().empty else -1)

In [362]:
print("Data types for cleaning: ")
print(df.dtypes)

Data types for cleaning: 
age               float64
bp                float64
sg                float64
al                float64
su                float64
rbc               float64
pc                float64
pcc               float64
ba                float64
bgr               float64
bu                float64
sc                float64
sod               float64
pot               float64
hemo              float64
pcv               float64
wc                float64
rc                float64
htn               float64
dm                float64
cad               float64
appet             float64
pe                float64
ane               float64
classification    float64
dtype: object


In [364]:
X = df.drop(columns=['classification'])
y = df['classification']

In [366]:
X.columns

Index(['age', 'bp', 'sg', 'al', 'su', 'rbc', 'pc', 'pcc', 'ba', 'bgr', 'bu',
       'sc', 'sod', 'pot', 'hemo', 'pcv', 'wc', 'rc', 'htn', 'dm', 'cad',
       'appet', 'pe', 'ane'],
      dtype='object')

In [368]:
y

0      1.0
1      1.0
2      1.0
3      1.0
4      1.0
      ... 
395    0.0
396    0.0
397    0.0
398    0.0
399    0.0
Name: classification, Length: 400, dtype: float64

In [370]:
y = y.replace({'ckd': 1, 'notckd': 0})  # In case any weren't converted
y = pd.to_numeric(y, errors='coerce').fillna(-1)

In [372]:
X.columns

Index(['age', 'bp', 'sg', 'al', 'su', 'rbc', 'pc', 'pcc', 'ba', 'bgr', 'bu',
       'sc', 'sod', 'pot', 'hemo', 'pcv', 'wc', 'rc', 'htn', 'dm', 'cad',
       'appet', 'pe', 'ane'],
      dtype='object')

In [374]:
try:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
except ValueError as e:
    print("Stratification failed, trying without:")
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

In [376]:
rf_ckd = RandomForestClassifier(random_state = 42)
rf_ckd.fit(X_train, y_train)

y_pred_cks = rf_ckd.predict(X_test)

In [378]:
print("Random Forest Performance:")
print("Accuracy:", accuracy_score(y_test, y_pred_cks))
print(classification_report(y_test, y_pred_cks))

Random Forest Performance:
Accuracy: 1.0
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00        30
         1.0       1.00      1.00      1.00        50

    accuracy                           1.00        80
   macro avg       1.00      1.00      1.00        80
weighted avg       1.00      1.00      1.00        80



In [380]:
# Check if any features perfectly correlate with the target
correlations = df.corr()['classification'].abs().sort_values(ascending=False)
print(correlations.head(10))

classification    1.000000
hemo              0.726368
pcv               0.673129
sg                0.659504
htn               0.590438
rc                0.566163
dm                0.555959
al                0.531562
appet             0.393341
bgr               0.379321
Name: classification, dtype: float64


In [382]:
importances = rf_ckd.feature_importances_
feature_imp = pd.DataFrame({"Feature": X.columns, "Importance": importances})
feature_imp = feature_imp.sort_values("Importance", ascending=False)
print(feature_imp.head(10))

   Feature  Importance
15     pcv    0.171513
14    hemo    0.155467
11      sc    0.147123
17      rc    0.145620
2       sg    0.099248
3       al    0.059065
19      dm    0.043617
10      bu    0.035261
18     htn    0.035246
9      bgr    0.021925


In [384]:
from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(rf_ckd, X, y, cv=5)  # 5-fold CV
print(f"CV Accuracy: {cv_scores.mean():.2f} ± {cv_scores.std():.2f}")

CV Accuracy: 0.99 ± 0.01


In [385]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Train Logistic Regression
lr_ckd = LogisticRegression(max_iter=3000, random_state=42)
lr_ckd.fit(X_train, y_train)

# Predict
y_pred_lr = lr_ckd.predict(X_test)

# Evaluate
print("Logistic Regression CKD Performance:")
print("Accuracy:", accuracy_score(y_test, y_pred_lr))
print(classification_report(y_test, y_pred_lr))

Logistic Regression CKD Performance:
Accuracy: 0.9375
              precision    recall  f1-score   support

         0.0       0.90      0.93      0.92        30
         1.0       0.96      0.94      0.95        50

    accuracy                           0.94        80
   macro avg       0.93      0.94      0.93        80
weighted avg       0.94      0.94      0.94        80



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [397]:
joblib.dump(lr_ckd, 'lr_ckd.pkl')

['lr_ckd.pkl']

In [393]:
X.columns

Index(['age', 'bp', 'sg', 'al', 'su', 'rbc', 'pc', 'pcc', 'ba', 'bgr', 'bu',
       'sc', 'sod', 'pot', 'hemo', 'pcv', 'wc', 'rc', 'htn', 'dm', 'cad',
       'appet', 'pe', 'ane'],
      dtype='object')