In [108]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier , plot_tree
from sklearn.preprocessing import LabelEncoder

import warnings
warnings.filterwarnings("ignore")

In [None]:
#Load Dataset
df = pd.read_csv("Datasets/kidney_disease.csv")
df.drop("id",axis = 1, inplace = True)
df.info()
df.columns

In [110]:
df.columns =['age', 'blood_pressure', 'specific_gravity', 'albumin', 'sugar', 'red_blood_cells', 'pus_cell', 'pus_cell_clumbs', 'bacteria', 'blood_glucose_random', 'blood_urea',
       'serum_creatinine', 'sodium', 'potassium', 'hemoglobin', 'packet_cell_volume', 'white_blood_cell_count', 'red_blood_cell_count', 'hypertension', 'diabetes_mellitus', 'coronary_artery_disase',
       'appetite', 'peda_edama', 'aanemia', 'class']

df.info()

df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 25 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   age                     391 non-null    float64
 1   blood_pressure          388 non-null    float64
 2   specific_gravity        353 non-null    float64
 3   albumin                 354 non-null    float64
 4   sugar                   351 non-null    float64
 5   red_blood_cells         248 non-null    object 
 6   pus_cell                335 non-null    object 
 7   pus_cell_clumbs         396 non-null    object 
 8   bacteria                396 non-null    object 
 9   blood_glucose_random    356 non-null    float64
 10  blood_urea              381 non-null    float64
 11  serum_creatinine        383 non-null    float64
 12  sodium                  313 non-null    float64
 13  potassium               312 non-null    float64
 14  hemoglobin              348 non-null    fl

Unnamed: 0,age,blood_pressure,specific_gravity,albumin,sugar,blood_glucose_random,blood_urea,serum_creatinine,sodium,potassium,hemoglobin
count,391.0,388.0,353.0,354.0,351.0,356.0,381.0,383.0,313.0,312.0,348.0
mean,51.483376,76.469072,1.017408,1.016949,0.450142,148.036517,57.425722,3.072454,137.528754,4.627244,12.526437
std,17.169714,13.683637,0.005717,1.352679,1.099191,79.281714,50.503006,5.741126,10.408752,3.193904,2.912587
min,2.0,50.0,1.005,0.0,0.0,22.0,1.5,0.4,4.5,2.5,3.1
25%,42.0,70.0,1.01,0.0,0.0,99.0,27.0,0.9,135.0,3.8,10.3
50%,55.0,80.0,1.02,0.0,0.0,121.0,42.0,1.3,138.0,4.4,12.65
75%,64.5,80.0,1.02,2.0,0.0,163.0,66.0,2.8,142.0,4.9,15.0
max,90.0,180.0,1.025,5.0,5.0,490.0,391.0,76.0,163.0,47.0,17.8


In [111]:
df.head()

Unnamed: 0,age,blood_pressure,specific_gravity,albumin,sugar,red_blood_cells,pus_cell,pus_cell_clumbs,bacteria,blood_glucose_random,...,packet_cell_volume,white_blood_cell_count,red_blood_cell_count,hypertension,diabetes_mellitus,coronary_artery_disase,appetite,peda_edama,aanemia,class
0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,121.0,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,,...,38,6000,,no,no,no,good,no,no,ckd
2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,423.0,...,31,7500,,no,yes,no,poor,no,yes,ckd
3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,117.0,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,106.0,...,35,7300,4.6,no,no,no,good,no,no,ckd


In [112]:
df["packet_cell_volume"] = pd.to_numeric(df["packet_cell_volume"],errors="coerce")
df["white_blood_cell_count"] = pd.to_numeric(df["white_blood_cell_count"],errors="coerce")
df["red_blood_cell_count"] = pd.to_numeric(df["red_blood_cell_count"],errors="coerce")

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 25 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   age                     391 non-null    float64
 1   blood_pressure          388 non-null    float64
 2   specific_gravity        353 non-null    float64
 3   albumin                 354 non-null    float64
 4   sugar                   351 non-null    float64
 5   red_blood_cells         248 non-null    object 
 6   pus_cell                335 non-null    object 
 7   pus_cell_clumbs         396 non-null    object 
 8   bacteria                396 non-null    object 
 9   blood_glucose_random    356 non-null    float64
 10  blood_urea              381 non-null    float64
 11  serum_creatinine        383 non-null    float64
 12  sodium                  313 non-null    float64
 13  potassium               312 non-null    float64
 14  hemoglobin              348 non-null    fl

In [113]:
#EDA
cat_cols =[col for col in df.columns if df[col].dtype == "object"]
num_cols =[col for col in df.columns if df[col].dtype != "object"]

for col in cat_cols:
    print(f"{col}: {df[col].unique()}")

red_blood_cells: [nan 'normal' 'abnormal']
pus_cell: ['normal' 'abnormal' nan]
pus_cell_clumbs: ['notpresent' 'present' nan]
bacteria: ['notpresent' 'present' nan]
hypertension: ['yes' 'no' nan]
diabetes_mellitus: ['yes' 'no' ' yes' '\tno' '\tyes' nan]
coronary_artery_disase: ['no' 'yes' '\tno' nan]
appetite: ['good' 'poor' nan]
peda_edama: ['no' 'yes' nan]
aanemia: ['no' 'yes' nan]
class: ['ckd' 'ckd\t' 'notckd']


In [114]:
df["diabetes_mellitus"].replace(to_replace={' yes':"yes", '\tno':"no", '\tyes':"yes"}, inplace=True)
df["coronary_artery_disase"].replace(to_replace={'\tno':"no"}, inplace=True)
df["class"].replace(to_replace={'ckd\t':"ckd"}, inplace=True)

for col in cat_cols:
    print(f"{col}: {df[col].unique()}")

red_blood_cells: [nan 'normal' 'abnormal']
pus_cell: ['normal' 'abnormal' nan]
pus_cell_clumbs: ['notpresent' 'present' nan]
bacteria: ['notpresent' 'present' nan]
hypertension: ['yes' 'no' nan]
diabetes_mellitus: ['yes' 'no' nan]
coronary_artery_disase: ['no' 'yes' nan]
appetite: ['good' 'poor' nan]
peda_edama: ['no' 'yes' nan]
aanemia: ['no' 'yes' nan]
class: ['ckd' 'notckd']


In [115]:
df["class"]= df["class"].map({"ckd":0, "notckd":1})

In [116]:
%matplotlib qt 

plt.figure()
plotnumber = 1

for col in num_cols:
    if plotnumber <= 14:  
        ax = plt.subplot(3, 5, plotnumber)
        sns.histplot(df[col], kde=True) 
        plt.xlabel(col)

    plotnumber += 1

plt.tight_layout()
plt.show()

In [117]:
numeric_df = df.select_dtypes(include=['float64', 'int64'])
plt.figure(figsize=(12, 8))  # Daha geniş bir grafik için
sns.heatmap(numeric_df.corr(), annot=True, linecolor="white", linewidths=2, cmap="coolwarm")
plt.show()

In [118]:
def kde(col):
    grid = sns.FacetGrid(df, hue="class", height=6, aspect=2)
    grid.map(sns.kdeplot , col)
    grid.add_legend()

kde("hemoglobin")
kde("white_blood_cell_count")
kde("packet_cell_volume")
kde("albumin")
kde("specific_gravity")

In [119]:
#Missing Values
df.isna().sum()

age                         9
blood_pressure             12
specific_gravity           47
albumin                    46
sugar                      49
red_blood_cells           152
pus_cell                   65
pus_cell_clumbs             4
bacteria                    4
blood_glucose_random       44
blood_urea                 19
serum_creatinine           17
sodium                     87
potassium                  88
hemoglobin                 52
packet_cell_volume         71
white_blood_cell_count    106
red_blood_cell_count      131
hypertension                2
diabetes_mellitus           2
coronary_artery_disase      2
appetite                    1
peda_edama                  1
aanemia                     1
class                       0
dtype: int64

In [120]:
def solve_mv_random(column):
    random_sample = df[column].dropna().sample(df[column].isna().sum())
    random_sample.index = df[df[column].isnull()].index
    df.loc[df[column].isnull(),column]= random_sample

for col in num_cols:
    solve_mv_random(col)


def solve_mv_mode(column):
    mode = df[column].mode()[0]
    df[column]= df[column].fillna(mode)

for col in cat_cols:
    solve_mv_mode(col)


In [121]:
print(df[num_cols].isnull().sum())
print(df[cat_cols].isnull().sum())

age                       0
blood_pressure            0
specific_gravity          0
albumin                   0
sugar                     0
blood_glucose_random      0
blood_urea                0
serum_creatinine          0
sodium                    0
potassium                 0
hemoglobin                0
packet_cell_volume        0
white_blood_cell_count    0
red_blood_cell_count      0
dtype: int64
red_blood_cells           0
pus_cell                  0
pus_cell_clumbs           0
bacteria                  0
hypertension              0
diabetes_mellitus         0
coronary_artery_disase    0
appetite                  0
peda_edama                0
aanemia                   0
class                     0
dtype: int64


In [122]:
#Prepocessing-Feature Encoding
for col in cat_cols:
    print(f"{col}: {df[col].nunique()}")

red_blood_cells: 2
pus_cell: 2
pus_cell_clumbs: 2
bacteria: 2
hypertension: 2
diabetes_mellitus: 2
coronary_artery_disase: 2
appetite: 2
peda_edama: 2
aanemia: 2
class: 2


In [123]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

for col in cat_cols:
    df[col] = encoder.fit_transform(df[col])

In [124]:
df.head()

Unnamed: 0,age,blood_pressure,specific_gravity,albumin,sugar,red_blood_cells,pus_cell,pus_cell_clumbs,bacteria,blood_glucose_random,...,packet_cell_volume,white_blood_cell_count,red_blood_cell_count,hypertension,diabetes_mellitus,coronary_artery_disase,appetite,peda_edama,aanemia,class
0,48.0,80.0,1.02,1.0,0.0,1,1,0,0,121.0,...,44.0,7800.0,5.2,1,1,0,0,0,0,0
1,7.0,50.0,1.02,4.0,0.0,1,1,0,0,239.0,...,38.0,6000.0,2.9,0,0,0,0,0,0,0
2,62.0,80.0,1.01,2.0,3.0,1,1,0,0,423.0,...,31.0,7500.0,4.1,0,1,0,1,0,1,0
3,48.0,70.0,1.005,4.0,0.0,1,0,1,0,117.0,...,32.0,6700.0,3.9,1,0,0,1,1,1,0
4,51.0,80.0,1.01,2.0,0.0,1,1,0,0,106.0,...,35.0,7300.0,4.6,0,0,0,0,0,0,0


In [125]:
#Model training and testing
x_col=[col for col in df.columns if col !="class"]
y_col="class"

X=df[x_col]
y=df[y_col]

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=42)

dtc = DecisionTreeClassifier()
dtc.fit(X_train, y_train)

y_pred = dtc.predict(X_test)

dtc_acc = accuracy_score(y_test , y_pred)
print("Accuracy:",dtc_acc)


Accuracy: 0.9666666666666667


In [126]:
cm = confusion_matrix(y_test , y_pred)

cr = classification_report(y_test , y_pred)

print("Confusion Matrix", cm)
print("Classification Report:", cr)

Confusion Matrix [[74  2]
 [ 2 42]]
Classification Report:               precision    recall  f1-score   support

           0       0.97      0.97      0.97        76
           1       0.95      0.95      0.95        44

    accuracy                           0.97       120
   macro avg       0.96      0.96      0.96       120
weighted avg       0.97      0.97      0.97       120



In [127]:
class_names=["ckd", "notckd"]

plt.figure()
plot_tree(dtc , feature_names=x_col, filled = True ,fontsize=8)

[Text(0.4230769230769231, 0.9166666666666666, 'hemoglobin <= 12.95\ngini = 0.471\nsamples = 280\nvalue = [174, 106]'),
 Text(0.15384615384615385, 0.75, 'packet_cell_volume <= 41.5\ngini = 0.041\nsamples = 143\nvalue = [140, 3]'),
 Text(0.28846153846153844, 0.8333333333333333, 'True  '),
 Text(0.07692307692307693, 0.5833333333333334, 'gini = 0.0\nsamples = 127\nvalue = [127, 0]'),
 Text(0.23076923076923078, 0.5833333333333334, 'blood_urea <= 35.5\ngini = 0.305\nsamples = 16\nvalue = [13, 3]'),
 Text(0.15384615384615385, 0.4166666666666667, 'hemoglobin <= 11.85\ngini = 0.5\nsamples = 6\nvalue = [3, 3]'),
 Text(0.07692307692307693, 0.25, 'gini = 0.0\nsamples = 3\nvalue = [0, 3]'),
 Text(0.23076923076923078, 0.25, 'gini = 0.0\nsamples = 3\nvalue = [3, 0]'),
 Text(0.3076923076923077, 0.4166666666666667, 'gini = 0.0\nsamples = 10\nvalue = [10, 0]'),
 Text(0.6923076923076923, 0.75, 'specific_gravity <= 1.017\ngini = 0.373\nsamples = 137\nvalue = [34, 103]'),
 Text(0.5576923076923077, 0.833333

In [128]:
feature_importance = pd.DataFrame({"Feature":x_col, "Importance":dtc.feature_importances_})
print("Most important feature:",feature_importance.sort_values(by="Importance",ascending=False).iloc[0])

plt.figure()
sns.barplot(x="Importance" , y= "Feature", data=feature_importance)
plt.title("Feature Importance")
plt.show()

Most important feature: Feature       hemoglobin
Importance      0.590124
Name: 14, dtype: object
