# Early Detection of Chronic Kidney Disease Using Machine Learning Techniques


## Data Pre-Processing  

In [3]:
# importing all the necessary libraries and classes

import warnings

import numpy as ckd_np
import pandas as ckd_pd
from sklearn.preprocessing import LabelEncoder

warnings.filterwarnings('ignore')

In [4]:
# Read the cleaned CKD dataset from disk and preview its first rows

ckd_DataFrame = ckd_pd.read_csv(filepath_or_buffer='CKD_cleaned.csv')
ckd_DataFrame.head(n=5)


Unnamed: 0,age,bloodPressure,specificGravity,albumin,sugar,redBloodCells,pusCells,pusCellClumps,bacteria,bloodGlucoseRandom,...,packedCellVolume,whiteBloodCellCount,redBloodCellCount,hypertension,diabetesMellitus,coronaryArteryDisease,appetite,pedalEdema,anemia,class
0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,121.0,...,44.0,7800.0,5.2,yes,yes,no,good,no,no,ckd
1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,,...,38.0,6000.0,,no,no,no,good,no,no,ckd
2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,423.0,...,31.0,7500.0,,no,yes,no,poor,no,yes,ckd
3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,117.0,...,32.0,6700.0,3.9,yes,no,no,poor,yes,yes,ckd
4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,106.0,...,35.0,7300.0,4.6,no,no,no,good,no,no,ckd


In [5]:
# Count the missing values per column, smallest count first
ckd_DataFrame.isna().sum().sort_values()

class                      0
pedalEdema                 1
appetite                   1
anemia                     1
coronaryArteryDisease      2
diabetesMellitus           2
hypertension               2
pusCellClumps              4
bacteria                   4
age                        9
bloodPressure             12
serumCreatinine           17
bloodUrea                 19
bloodGlucoseRandom        44
albumin                   46
specificGravity           47
sugar                     49
haemoglobin               52
pusCells                  65
packedCellVolume          71
sodium                    87
potassium                 88
whiteBloodCellCount      106
redBloodCellCount        131
redBloodCells            152
dtype: int64

In [6]:
# Split the columns into numerical ones and everything else (treated as categorical).
# is_numeric_dtype is used instead of comparing the dtype against 'object' so that text
# columns stored with pandas' dedicated string dtype or as 'category' are still
# classified as categorical rather than silently falling into the numerical list.
categorical_columns = [
    featuredColumn
    for featuredColumn in ckd_DataFrame.columns
    if not ckd_pd.api.types.is_numeric_dtype(ckd_DataFrame[featuredColumn])
]
numerical_columns = [
    featuredColumn
    for featuredColumn in ckd_DataFrame.columns
    if ckd_pd.api.types.is_numeric_dtype(ckd_DataFrame[featuredColumn])
]

# Missing-value count of each categorical column, smallest count first
ckd_DataFrame[categorical_columns].isnull().sum().sort_values()

class                      0
appetite                   1
pedalEdema                 1
anemia                     1
hypertension               2
diabetesMellitus           2
coronaryArteryDisease      2
pusCellClumps              4
bacteria                   4
pusCells                  65
redBloodCells            152
dtype: int64

In [7]:
# Missing-value count of each numerical column, smallest count first
ckd_DataFrame.loc[:, numerical_columns].isna().sum().sort_values()

age                      9
bloodPressure           12
serumCreatinine         17
bloodUrea               19
bloodGlucoseRandom      44
albumin                 46
specificGravity         47
sugar                   49
haemoglobin             52
packedCellVolume        71
sodium                  87
potassium               88
whiteBloodCellCount    106
redBloodCellCount      131
dtype: int64

In [8]:
# Two strategies are used to fill the null values in the dataset.
# Random-sample imputation: used for the numerical columns and for columns with many nulls.

def randomValImpute(featuredColumn, random_state=None):
    """Fill the missing entries of one column with values drawn at random from its observed entries.

    The module-level ``ckd_DataFrame`` is modified in place.

    Parameters
    ----------
    featuredColumn : str
        Label of the column of ``ckd_DataFrame`` to impute.
    random_state : optional
        Forwarded unchanged to ``Series.sample``. Pass an integer to make the draw
        repeatable; the default ``None`` keeps the previous unseeded behaviour.

    Raises
    ------
    ValueError
        If the column has missing entries but no observed value to draw from.
    """
    missingMask = ckd_DataFrame[featuredColumn].isna()
    missingCount = int(missingMask.sum())
    if missingCount == 0:
        # Nothing to impute; leave the column untouched.
        return

    observedValues = ckd_DataFrame.loc[~missingMask, featuredColumn]
    if observedValues.empty:
        raise ValueError(
            f"Cannot impute column {featuredColumn!r}: it has no observed values to sample from"
        )

    # Sampling without replacement is impossible when the gaps outnumber the observed
    # values, so replacement is enabled only in that situation.
    randomSample = observedValues.sample(
        n=missingCount,
        replace=missingCount > len(observedValues),
        random_state=random_state,
    )

    # Assign by position so the row labels of the sampled donors play no role.
    ckd_DataFrame.loc[missingMask, featuredColumn] = randomSample.to_numpy()

# Mode imputation: used for columns with few nulls.
def modeImpute(featuredColumn):
    """Replace the missing entries of one column with that column's most frequent value.

    The column is read from, and written back to, the module-level ``ckd_DataFrame``.
    When several values tie for most frequent, the first one returned by
    ``Series.mode`` is used.
    """
    targetColumn = ckd_DataFrame[featuredColumn]
    mostFrequentValue = targetColumn.mode()[0]
    filledColumn = targetColumn.fillna(value=mostFrequentValue)
    ckd_DataFrame[featuredColumn] = filledColumn

In [9]:
# Fill the nulls of every numerical column by random-sample imputation.
# NumPy's global generator is seeded first: pandas' Series.sample draws from it when no
# random_state is supplied, so without a seed the imputed values (and therefore the
# exported dataset) would differ on every run of the notebook.

CKD_RANDOM_SEED = 42
ckd_np.random.seed(CKD_RANDOM_SEED)

for featuredColumn in numerical_columns:
    randomValImpute(featuredColumn)

# Confirm that no numerical column has nulls left
ckd_DataFrame[numerical_columns].isnull().sum().sort_values()

age                    0
bloodPressure          0
specificGravity        0
albumin                0
sugar                  0
bloodGlucoseRandom     0
bloodUrea              0
serumCreatinine        0
sodium                 0
potassium              0
haemoglobin            0
packedCellVolume       0
whiteBloodCellCount    0
redBloodCellCount      0
dtype: int64

In [10]:
# redBloodCells and pusCells have the most nulls among the categorical columns,
# so they are filled with the random sampling method
for featuredColumn in ('redBloodCells', 'pusCells'):
    randomValImpute(featuredColumn)

# Every categorical column then goes through mode imputation
for featuredColumn in categorical_columns:
    modeImpute(featuredColumn)

# Confirm that no categorical column has nulls left
ckd_DataFrame.loc[:, categorical_columns].isna().sum().sort_values()

redBloodCells            0
pusCells                 0
pusCellClumps            0
bacteria                 0
hypertension             0
diabetesMellitus         0
coronaryArteryDisease    0
appetite                 0
pedalEdema               0
anemia                   0
class                    0
dtype: int64

In [11]:
# Final check over the whole frame: every column should report zero nulls
ckd_DataFrame.isna().sum().sort_values()

age                      0
pedalEdema               0
appetite                 0
coronaryArteryDisease    0
diabetesMellitus         0
hypertension             0
redBloodCellCount        0
whiteBloodCellCount      0
packedCellVolume         0
haemoglobin              0
potassium                0
anemia                   0
sodium                   0
bloodUrea                0
bloodGlucoseRandom       0
bacteria                 0
pusCellClumps            0
pusCells                 0
redBloodCells            0
sugar                    0
albumin                  0
specificGravity          0
bloodPressure            0
serumCreatinine          0
class                    0
dtype: int64

## Feature Encoding

In [12]:
# Report how many distinct categories each categorical column holds, and what they are
for featuredColumn in categorical_columns:
    currentColumn = ckd_DataFrame[featuredColumn]
    categoryCount = currentColumn.nunique()
    categoryValues = currentColumn.unique()
    print(f"[{featuredColumn}] column have {categoryCount} categories : {categoryValues} \n")

[redBloodCells] column have 2 categories : ['normal' 'abnormal'] 

[pusCells] column have 2 categories : ['normal' 'abnormal'] 

[pusCellClumps] column have 2 categories : ['notpresent' 'present'] 

[bacteria] column have 2 categories : ['notpresent' 'present'] 

[hypertension] column have 2 categories : ['yes' 'no'] 

[diabetesMellitus] column have 2 categories : ['yes' 'no'] 

[coronaryArteryDisease] column have 2 categories : ['no' 'yes'] 

[appetite] column have 2 categories : ['good' 'poor'] 

[pedalEdema] column have 2 categories : ['no' 'yes'] 

[anemia] column have 2 categories : ['no' 'yes'] 

[class] column have 2 categories : ['ckd' 'notckd'] 



In [13]:
# Every categorical column has exactly two categories, so integer label encoding is enough.
# LabelEncoder assigns the codes in sorted order of the category labels
# (for example 'ckd' -> 0 and 'notckd' -> 1).
#
# One fitted encoder is kept per column in ckd_LabelEncoders so the integer codes can be
# mapped back later with ckd_LabelEncoders[column].inverse_transform(...). Re-fitting a
# single shared encoder on each column would discard every mapping except the last one.

ckd_LabelEncoders = {}
CKD_LabelEncoder = LabelEncoder()  # stays defined even when there is nothing to encode
for featuredColumn in categorical_columns:
    # Fit a fresh encoder on this column and replace the column with its integer codes
    CKD_LabelEncoder = LabelEncoder()
    ckd_DataFrame[featuredColumn] = CKD_LabelEncoder.fit_transform(ckd_DataFrame[featuredColumn])
    ckd_LabelEncoders[featuredColumn] = CKD_LabelEncoder

ckd_DataFrame

Unnamed: 0,age,bloodPressure,specificGravity,albumin,sugar,redBloodCells,pusCells,pusCellClumps,bacteria,bloodGlucoseRandom,...,packedCellVolume,whiteBloodCellCount,redBloodCellCount,hypertension,diabetesMellitus,coronaryArteryDisease,appetite,pedalEdema,anemia,class
0,48.0,80.0,1.020,1.0,0.0,1,1,0,0,121.0,...,44.0,7800.0,5.2,1,1,0,0,0,0,0
1,7.0,50.0,1.020,4.0,0.0,0,1,0,0,117.0,...,38.0,6000.0,3.4,0,0,0,0,0,0,0
2,62.0,80.0,1.010,2.0,3.0,1,1,0,0,423.0,...,31.0,7500.0,4.1,0,1,0,1,0,1,0
3,48.0,70.0,1.005,4.0,0.0,1,0,1,0,117.0,...,32.0,6700.0,3.9,1,0,0,1,1,1,0
4,51.0,80.0,1.010,2.0,0.0,1,1,0,0,106.0,...,35.0,7300.0,4.6,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,55.0,80.0,1.020,0.0,0.0,1,1,0,0,140.0,...,47.0,6700.0,4.9,0,0,0,0,0,0,1
396,42.0,70.0,1.025,0.0,0.0,1,1,0,0,75.0,...,54.0,7800.0,6.2,0,0,0,0,0,0,1
397,12.0,80.0,1.020,0.0,0.0,1,1,0,0,100.0,...,49.0,6600.0,5.4,0,0,0,0,0,0,1
398,17.0,60.0,1.025,0.0,0.0,1,1,0,0,114.0,...,51.0,7200.0,5.9,0,0,0,0,0,0,1


In [14]:
# Write the imputed and encoded frame to disk, leaving out the row index
ckd_DataFrame.to_csv(path_or_buf='CKD_Preprocessed.csv', index=False)