# Early Detection of Chronic Kidney Disease Using Machine Learning Techniques


## Data Pre-Processing  

In [1]:
# importing all the necessary libraries and classes

import numpy as np
import pandas as pd

In [2]:
# Load the cleaned kidney-disease dataset and preview the first rows
cleaned_csv_path = 'kidney_disease_dataset_cleaned.csv'
df = pd.read_csv(cleaned_csv_path)
df.head()

Unnamed: 0,age,bloodPressure,specificGravity,albumin,sugar,redBloodCells,pusCells,pusCellClumps,bacteria,bloodGlucoseRandom,...,packedCellVolume,whiteBloodCellCount,redBloodCellCount,hypertension,diabetesMellitus,coronaryArteryDisease,appetite,pedalEdema,anemia,class
0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,121.0,...,44.0,7800.0,5.2,yes,yes,no,good,no,no,ckd
1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,,...,38.0,6000.0,,no,no,no,good,no,no,ckd
2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,423.0,...,31.0,7500.0,,no,yes,no,poor,no,yes,ckd
3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,117.0,...,32.0,6700.0,3.9,yes,no,no,poor,yes,yes,ckd
4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,106.0,...,35.0,7300.0,4.6,no,no,no,good,no,no,ckd


In [4]:
# Count the missing values in every column, listed from fewest to most
missing_per_column = df.isna().sum()
missing_per_column.sort_values()

class                      0
pedalEdema                 1
appetite                   1
anemia                     1
coronaryArteryDisease      2
diabetesMellitus           2
hypertension               2
pusCellClumps              4
bacteria                   4
age                        9
bloodPressure             12
serumCreatinine           17
bloodUrea                 19
bloodGlucoseRandom        44
albumin                   46
specificGravity           47
sugar                     49
haemoglobin               52
pusCells                  65
packedCellVolume          71
sodium                    87
potassium                 88
whiteBloodCellCount      106
redBloodCellCount        131
redBloodCells            152
dtype: int64

In [5]:
# Extracting categorical and numerical columns
#
# Columns are classified by "is the dtype numeric?" instead of by comparing
# against the 'object' dtype: text columns are not guaranteed to be stored as
# 'object' (pandas also has a dedicated string dtype), and such columns must
# still end up in categorical_cols so they are mode-imputed and label-encoded
# rather than treated as numbers.
column_is_numeric = {
    col: pd.api.types.is_numeric_dtype(dtype) for col, dtype in df.dtypes.items()
}

# Both lists keep the original column order of df.
categorical_cols = [col for col in df.columns if not column_is_numeric[col]]
numerical_cols = [col for col in df.columns if column_is_numeric[col]]

In [6]:
# Missing values per categorical column, fewest first
categorical_missing = df[categorical_cols].isna().sum()
categorical_missing.sort_values()

class                      0
appetite                   1
pedalEdema                 1
anemia                     1
hypertension               2
diabetesMellitus           2
coronaryArteryDisease      2
pusCellClumps              4
bacteria                   4
pusCells                  65
redBloodCells            152
dtype: int64

In [7]:
# Missing values per numerical column, fewest first
numerical_missing = df[numerical_cols].isna().sum()
numerical_missing.sort_values()

age                      9
bloodPressure           12
serumCreatinine         17
bloodUrea               19
bloodGlucoseRandom      44
albumin                 46
specificGravity         47
sugar                   49
haemoglobin             52
packedCellVolume        71
sodium                  87
potassium               88
whiteBloodCellCount    106
redBloodCellCount      131
dtype: int64

In [8]:
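# Filling null values: we will use two methods,
# 1) random sampling for columns with a higher number of null values, and
# 2) mode imputation for columns with a lower number of null values.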


def randomVal_imputation(col, random_state=None):
    """Fill the missing entries of ``df[col]`` with values drawn from its observed entries.

    The notebook-level DataFrame ``df`` is modified in place.

    Parameters
    ----------
    col : str
        Name of the column in ``df`` to impute.
    random_state : int, numpy RandomState/Generator or None, optional
        Forwarded to ``Series.sample``. ``None`` (the default) keeps the
        previous behaviour of drawing from NumPy's global random state.

    Raises
    ------
    ValueError
        If the column has missing entries but no observed value to sample from.
    """
    missing_mask = df[col].isna()
    n_missing = int(missing_mask.sum())
    if n_missing == 0:
        # Nothing to impute.
        return

    observed = df[col].dropna()
    if observed.empty:
        raise ValueError(
            f"Cannot impute column {col!r}: it has no observed values to sample from"
        )

    # Sample without replacement whenever enough observed values exist (the
    # original behaviour); fall back to sampling with replacement only when
    # there are more gaps than observed values, which would otherwise raise.
    fill_values = observed.sample(
        n=n_missing,
        replace=n_missing > len(observed),
        random_state=random_state,
    )

    # Assign positionally so the sampled rows' own index labels are irrelevant.
    df.loc[missing_mask, col] = fill_values.to_numpy()

# mode imputation, used for columns with few null values
def mode_imputation(col):
    """Replace the missing entries of ``df[col]`` with the column's mode.

    Works on the notebook-level DataFrame ``df`` and reassigns the column.
    When several values tie for most frequent, the first entry returned by
    ``Series.mode()`` is used.
    """
    column = df[col]
    most_frequent_values = column.mode()
    fill_value = most_frequent_values[0]
    df[col] = column.fillna(fill_value)

In [10]:
# filling numerical columns null values using random sampling method

# The imputation draws random samples. Seed NumPy's global random state
# (which pandas' .sample() falls back to when no random_state is passed) so
# that the imputed values are the same on every Restart & Run All.
IMPUTATION_SEED = 42
np.random.seed(IMPUTATION_SEED)

for col in numerical_cols:
    randomVal_imputation(col)

# Every numerical column should now report zero missing values
df[numerical_cols].isnull().sum().sort_values()

age                    0
bloodPressure          0
specificGravity        0
albumin                0
sugar                  0
bloodGlucoseRandom     0
bloodUrea              0
serumCreatinine        0
sodium                 0
potassium              0
haemoglobin            0
packedCellVolume       0
whiteBloodCellCount    0
redBloodCellCount      0
dtype: int64

In [12]:
# Fill the redBloodCells and pusCells columns using the random sampling method
for sampled_col in ('redBloodCells', 'pusCells'):
    randomVal_imputation(sampled_col)

# Fill whatever is still missing in the categorical columns with each column's mode
for cat_col in categorical_cols:
    mode_imputation(cat_col)

# Remaining missing values per categorical column
remaining_missing = df[categorical_cols].isnull().sum()
remaining_missing.sort_values()

redBloodCells            0
pusCells                 0
pusCellClumps            0
bacteria                 0
hypertension             0
diabetesMellitus         0
coronaryArteryDisease    0
appetite                 0
pedalEdema               0
anemia                   0
class                    0
dtype: int64

## Feature Encoding

In [21]:
# Show how many distinct categories each categorical column has, and which ones
for col in categorical_cols:
    n_categories = df[col].nunique()
    categories = df[col].unique()
    print(f"[{col}] column have {n_categories} categories : {categories} \n")

[redBloodCells] column have 2 categories : ['normal' 'abnormal'] 

[pusCells] column have 2 categories : ['normal' 'abnormal'] 

[pusCellClumps] column have 2 categories : ['notpresent' 'present'] 

[bacteria] column have 2 categories : ['notpresent' 'present'] 

[hypertension] column have 2 categories : ['yes' 'no'] 

[diabetesMellitus] column have 2 categories : ['yes' 'no'] 

[coronaryArteryDisease] column have 2 categories : ['no' 'yes'] 

[appetite] column have 2 categories : ['good' 'poor'] 

[pedalEdema] column have 2 categories : ['no' 'yes'] 

[anemia] column have 2 categories : ['no' 'yes'] 

[class] column have 2 categories : ['ckd' 'notckd'] 



In [26]:
# As all of the categorical columns have 2 categories, we can use a label encoder
from sklearn.preprocessing import LabelEncoder

# Keep one fitted encoder per column. Re-fitting a single shared encoder would
# discard every mapping except the last one, leaving no way to tell which
# original category each integer code stands for (or to inverse_transform).
label_encoders = {}
for col in categorical_cols:
    # Fit on this column and replace its values with the integer codes
    label_encoder = LabelEncoder()
    df[col] = label_encoder.fit_transform(df[col])
    # label_encoders[col].classes_[code] gives back the original category
    label_encoders[col] = label_encoder

df

Unnamed: 0,age,bloodPressure,specificGravity,albumin,sugar,redBloodCells,pusCells,pusCellClumps,bacteria,bloodGlucoseRandom,...,packedCellVolume,whiteBloodCellCount,redBloodCellCount,hypertension,diabetesMellitus,coronaryArteryDisease,appetite,pedalEdema,anemia,class
0,48.0,80.0,1.020,1.0,0.0,1,1,0,0,121.0,...,44.0,7800.0,5.2,1,1,0,0,0,0,0
1,7.0,50.0,1.020,4.0,0.0,1,1,0,0,93.0,...,38.0,6000.0,4.5,0,0,0,0,0,0,0
2,62.0,80.0,1.010,2.0,3.0,1,1,0,0,423.0,...,31.0,7500.0,4.8,0,1,0,1,0,1,0
3,48.0,70.0,1.005,4.0,0.0,1,0,1,0,117.0,...,32.0,6700.0,3.9,1,0,0,1,1,1,0
4,51.0,80.0,1.010,2.0,0.0,1,1,0,0,106.0,...,35.0,7300.0,4.6,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,55.0,80.0,1.020,0.0,0.0,1,1,0,0,140.0,...,47.0,6700.0,4.9,0,0,0,0,0,0,1
396,42.0,70.0,1.025,0.0,0.0,1,1,0,0,75.0,...,54.0,7800.0,6.2,0,0,0,0,0,0,1
397,12.0,80.0,1.020,0.0,0.0,1,1,0,0,100.0,...,49.0,6600.0,5.4,0,0,0,0,0,0,1
398,17.0,60.0,1.025,0.0,0.0,1,1,0,0,114.0,...,51.0,7200.0,5.9,0,0,0,0,0,0,1


In [27]:
# Write the imputed and encoded frame to disk, leaving out the row index
preprocessed_csv_path = 'kidney_disease_dataset_preprocessed.csv'
df.to_csv(preprocessed_csv_path, index=False)

In [34]:
# Optional exploratory step, currently disabled: annotated heatmap of the
# pairwise correlations between the columns of df. Uncomment to run.
# import matplotlib.pyplot as plt
# import seaborn as sns
# plt.figure(figsize=(20,10))
# corr = df.corr()
# sns.heatmap(corr, annot = True, linewidths=2)