# Early Detection of Chronic Kidney Disease Using Machine Learning Techniques

## Data Collection and Cleaning

In [13]:
# importing all the necessary libraries and classes
import pandas as ckd_pd

In [14]:
# Read the raw CKD records from the CSV file into a DataFrame,
# then preview the first ten rows.
ckd_DataFrame = ckd_pd.read_csv(filepath_or_buffer='CKD_initial_dataset.csv')
ckd_DataFrame.head(n=10)


Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,...,44,7800.0,5.2,yes,yes,no,good,no,no,ckd
1,1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,...,38,6000.0,,no,no,no,good,no,no,ckd
2,2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,...,31,7500.0,,no,yes,no,poor,no,yes,ckd
3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,...,32,6700.0,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,...,35,7300.0,4.6,no,no,no,good,no,no,ckd
5,5,60.0,90.0,1.015,3.0,0.0,,,notpresent,notpresent,...,39,7800.0,4.4,yes,yes,no,good,yes,no,ckd
6,6,68.0,70.0,1.01,0.0,0.0,,normal,notpresent,notpresent,...,36,,,no,no,no,good,no,no,ckd
7,7,24.0,,1.015,2.0,4.0,normal,abnormal,notpresent,notpresent,...,44,6900.0,5.0,no,yes,no,good,yes,no,ckd
8,8,52.0,100.0,1.015,3.0,0.0,normal,abnormal,present,notpresent,...,33,9600.0,4.0,yes,yes,no,good,no,yes,ckd
9,9,53.0,90.0,1.02,2.0,0.0,abnormal,abnormal,present,notpresent,...,29,12100.0,3.7,yes,yes,no,poor,no,yes,ckd


In [15]:
# (number of rows, number of columns) of the freshly loaded table.
ckd_DataFrame.shape

(400, 26)

In [16]:

# Summary statistics (count, mean, std, min, quartiles, max) for the columns
# pandas currently treats as numeric; object-typed columns are not included.
ckd_DataFrame.describe()

Unnamed: 0,id,age,bp,sg,al,su,bgr,bu,sc,sod,pot,hemo
count,400.0,391.0,388.0,353.0,354.0,351.0,356.0,381.0,383.0,313.0,312.0,348.0
mean,199.5,51.483376,76.469072,1.017408,1.016949,0.450142,148.036517,57.425722,3.072454,137.528754,4.627244,12.526437
std,115.614301,17.169714,13.683637,0.005717,1.352679,1.099191,79.281714,50.503006,5.741126,10.408752,3.193904,2.912587
min,0.0,2.0,50.0,1.005,0.0,0.0,22.0,1.5,0.4,4.5,2.5,3.1
25%,99.75,42.0,70.0,1.01,0.0,0.0,99.0,27.0,0.9,135.0,3.8,10.3
50%,199.5,55.0,80.0,1.02,0.0,0.0,121.0,42.0,1.3,138.0,4.4,12.65
75%,299.25,64.5,80.0,1.02,2.0,0.0,163.0,66.0,2.8,142.0,4.9,15.0
max,399.0,90.0,180.0,1.025,5.0,5.0,490.0,391.0,76.0,163.0,47.0,17.8


In [17]:
# Per-column dtype and non-null count. Used to spot missing values and
# columns stored as object even though the preview shows numbers (e.g. 'pcv').
ckd_DataFrame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 26 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              400 non-null    int64  
 1   age             391 non-null    float64
 2   bp              388 non-null    float64
 3   sg              353 non-null    float64
 4   al              354 non-null    float64
 5   su              351 non-null    float64
 6   rbc             248 non-null    object 
 7   pc              335 non-null    object 
 8   pcc             396 non-null    object 
 9   ba              396 non-null    object 
 10  bgr             356 non-null    float64
 11  bu              381 non-null    float64
 12  sc              383 non-null    float64
 13  sod             313 non-null    float64
 14  pot             312 non-null    float64
 15  hemo            348 non-null    float64
 16  pcv             330 non-null    object 
 17  wc              295 non-null    obj

In [18]:
# Drop the 'id' column (in the preview above it simply mirrors the row index).
# Rebinding the name instead of using `inplace=True`, together with
# `errors='ignore'`, keeps this cell safe to re-run: a second execution is a
# no-op rather than a KeyError for the already-removed column.
ckd_DataFrame = ckd_DataFrame.drop(columns='id', errors='ignore')
ckd_DataFrame.columns

Index(['age', 'bp', 'sg', 'al', 'su', 'rbc', 'pc', 'pcc', 'ba', 'bgr', 'bu',
       'sc', 'sod', 'pot', 'hemo', 'pcv', 'wc', 'rc', 'htn', 'dm', 'cad',
       'appet', 'pe', 'ane', 'classification'],
      dtype='object')

In [19]:
# Convert 'rc', 'pcv' and 'wc' to a numeric dtype in one column-wise pass.
# errors='coerce' turns any entry that cannot be parsed as a number into NaN.
selectedFeatureColumns = ['rc', 'pcv', 'wc']
ckd_DataFrame[selectedFeatureColumns] = ckd_DataFrame[selectedFeatureColumns].apply(
    ckd_pd.to_numeric, errors='coerce'
)

# Re-check dtypes and non-null counts after the conversion.
ckd_DataFrame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 25 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             391 non-null    float64
 1   bp              388 non-null    float64
 2   sg              353 non-null    float64
 3   al              354 non-null    float64
 4   su              351 non-null    float64
 5   rbc             248 non-null    object 
 6   pc              335 non-null    object 
 7   pcc             396 non-null    object 
 8   ba              396 non-null    object 
 9   bgr             356 non-null    float64
 10  bu              381 non-null    float64
 11  sc              383 non-null    float64
 12  sod             313 non-null    float64
 13  pot             312 non-null    float64
 14  hemo            348 non-null    float64
 15  pcv             329 non-null    float64
 16  wc              294 non-null    float64
 17  rc              269 non-null    flo

In [20]:
# Partition the column names by dtype: object-typed columns are treated as
# categorical, everything else as numerical.
object_dtype_mask = ckd_DataFrame.dtypes == 'object'
categorical_columns = ckd_DataFrame.columns[object_dtype_mask].tolist()
numerical_columns = ckd_DataFrame.columns[~object_dtype_mask].tolist()

In [21]:
# List the distinct labels of every categorical column to expose
# inconsistent spellings and missing values.
for featuredColumn in categorical_columns:
    distinct_labels = ckd_DataFrame[featuredColumn].unique()
    print(f"Unique values in [{featuredColumn}] column are : {distinct_labels} \n")

Unique values in [rbc] column are : [nan 'normal' 'abnormal'] 

Unique values in [pc] column are : ['normal' 'abnormal' nan] 

Unique values in [pcc] column are : ['notpresent' 'present' nan] 

Unique values in [ba] column are : ['notpresent' 'present' nan] 

Unique values in [htn] column are : ['yes' 'no' nan] 

Unique values in [dm] column are : ['yes' 'no' ' yes' '\tno' '\tyes' nan] 

Unique values in [cad] column are : ['no' 'yes' '\tno' nan] 

Unique values in [appet] column are : ['good' 'poor' nan] 

Unique values in [pe] column are : ['no' 'yes' nan] 

Unique values in [ane] column are : ['no' 'yes' nan] 

Unique values in [classification] column are : ['ckd' 'ckd\t' 'notckd'] 



In [22]:
# Normalise labels that were stored with stray whitespace.
#
# The unique-value listing shows the same labels recorded with leading or
# trailing blanks/tabs ('\tno', '\tyes', ' yes', 'ckd\t') in the 'dm', 'cad'
# and 'classification' columns. Stripping whitespace repairs all of them with
# a single rule and, unlike an explicit lookup table passed to `.map`, does
# not silently turn a label missing from the table into NaN.
# Missing entries (NaN) stay NaN under the `.str` accessor.
whitespace_polluted_columns = ['dm', 'cad', 'classification']
ckd_DataFrame[whitespace_polluted_columns] = (
    ckd_DataFrame[whitespace_polluted_columns].apply(lambda labels: labels.str.strip())
)
ckd_DataFrame.head(10)

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,121.0,...,44.0,7800.0,5.2,yes,yes,no,good,no,no,ckd
1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,,...,38.0,6000.0,,no,no,no,good,no,no,ckd
2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,423.0,...,31.0,7500.0,,no,yes,no,poor,no,yes,ckd
3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,117.0,...,32.0,6700.0,3.9,yes,no,no,poor,yes,yes,ckd
4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,106.0,...,35.0,7300.0,4.6,no,no,no,good,no,no,ckd
5,60.0,90.0,1.015,3.0,0.0,,,notpresent,notpresent,74.0,...,39.0,7800.0,4.4,yes,yes,no,good,yes,no,ckd
6,68.0,70.0,1.01,0.0,0.0,,normal,notpresent,notpresent,100.0,...,36.0,,,no,no,no,good,no,no,ckd
7,24.0,,1.015,2.0,4.0,normal,abnormal,notpresent,notpresent,410.0,...,44.0,6900.0,5.0,no,yes,no,good,yes,no,ckd
8,52.0,100.0,1.015,3.0,0.0,normal,abnormal,present,notpresent,138.0,...,33.0,9600.0,4.0,yes,yes,no,good,no,yes,ckd
9,53.0,90.0,1.02,2.0,0.0,abnormal,abnormal,present,notpresent,70.0,...,29.0,12100.0,3.7,yes,yes,no,poor,no,yes,ckd


In [23]:
correctedColumns = ['dm', 'cad', 'classification']

# Re-list the distinct labels of the corrected columns, then show the column index.
for featureColumn in correctedColumns:
    remaining_labels = ckd_DataFrame[featureColumn].unique()
    print(f"values in [{featureColumn}] column : {remaining_labels} \n")
ckd_DataFrame.columns

values in [dm] column : ['yes' 'no' nan] 

values in [cad] column : ['no' 'yes' nan] 

values in [classification] column : ['ckd' 'notckd'] 



Index(['age', 'bp', 'sg', 'al', 'su', 'rbc', 'pc', 'pcc', 'ba', 'bgr', 'bu',
       'sc', 'sod', 'pot', 'hemo', 'pcv', 'wc', 'rc', 'htn', 'dm', 'cad',
       'appet', 'pe', 'ane', 'classification'],
      dtype='object')

In [24]:
# Replace the abbreviated column labels with the names listed in
# 'column_description.txt', then write the cleaned table to 'CKD_cleaned.csv'.
# NOTE(review): the layout of the description file is not visible here; the
# read_csv(sep='-') + reset_index() combination presumably yields one column of
# abbreviations and one column of descriptive names — confirm against the file.
newColumns=ckd_pd.read_csv('column_description.txt',sep='-')
newColumns=newColumns.reset_index()
newColumns.columns=['cols','abb_col_names']
# The rename is positional: the i-th value of 'abb_col_names' becomes the i-th
# column label, so the file's row order must match the DataFrame's column order.
# NOTE(review): nothing here checks 'cols' against the current labels — consider verifying.
ckd_DataFrame.columns=newColumns['abb_col_names'].values
# Bare expression in the middle of the cell; only a cell's last expression is displayed.
ckd_DataFrame.columns
# index=False: the row index is not written to the CSV.
ckd_DataFrame.to_csv('CKD_cleaned.csv', index=False)
ckd_DataFrame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 25 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   age                    391 non-null    float64
 1   bloodPressure          388 non-null    float64
 2   specificGravity        353 non-null    float64
 3   albumin                354 non-null    float64
 4   sugar                  351 non-null    float64
 5   redBloodCells          248 non-null    object 
 6   pusCells               335 non-null    object 
 7   pusCellClumps          396 non-null    object 
 8   bacteria               396 non-null    object 
 9   bloodGlucoseRandom     356 non-null    float64
 10  bloodUrea              381 non-null    float64
 11  serumCreatinine        383 non-null    float64
 12  sodium                 313 non-null    float64
 13  potassium              312 non-null    float64
 14  haemoglobin            348 non-null    float64
 15  packed