In [24]:
import pandas as pd
import numpy as np
from collections import Counter as c
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
import pickle

In [2]:
data=pd.read_csv("kidney_disease.csv")
data.head()

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,...,44.0,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,...,38.0,6000,,no,no,no,good,no,no,ckd
2,2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,...,31.0,7500,,no,yes,no,poor,no,yes,ckd
3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,...,32.0,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,...,35.0,7300,4.6,no,no,no,good,no,no,ckd


In [3]:
data.columns

Index(['id', 'age', 'bp', 'sg', 'al', 'su', 'rbc', 'pc', 'pcc', 'ba', 'bgr',
       'bu', 'sc', 'sod', 'pot', 'hemo', 'pcv', 'wc', 'rc', 'htn', 'dm', 'cad',
       'appet', 'pe', 'ane', 'classification'],
      dtype='object')

In [4]:
data.columns=['id','age','blood_pressure','specific_gravity','albumin',
              'sugar','red_blood_cells','pus_cell','pus_cell_clumps','bacteria',
              'blood glucose random','blood_urea','serum_creatinine','sodium','potassium',
              'hemoglobin','packed_cell_volume','white_blood_cell_count','red_blood_cell_count',
              'hypertension','diabetesmellitus','coronary_artery_disease','appetite',
              'pedal_edema','anemia','class']
data.columns              

Index(['id', 'age', 'blood_pressure', 'specific_gravity', 'albumin', 'sugar',
       'red_blood_cells', 'pus_cell', 'pus_cell_clumps', 'bacteria',
       'blood glucose random', 'blood_urea', 'serum_creatinine', 'sodium',
       'potassium', 'hemoglobin', 'packed_cell_volume',
       'white_blood_cell_count', 'red_blood_cell_count', 'hypertension',
       'diabetesmellitus', 'coronary_artery_disease', 'appetite',
       'pedal_edema', 'anemia', 'class'],
      dtype='object')

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 26 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       400 non-null    int64  
 1   age                      391 non-null    float64
 2   blood_pressure           388 non-null    float64
 3   specific_gravity         353 non-null    float64
 4   albumin                  354 non-null    float64
 5   sugar                    351 non-null    float64
 6   red_blood_cells          248 non-null    object 
 7   pus_cell                 335 non-null    object 
 8   pus_cell_clumps          396 non-null    object 
 9   bacteria                 396 non-null    object 
 10  blood glucose random     356 non-null    float64
 11  blood_urea               381 non-null    float64
 12  serum_creatinine         383 non-null    float64
 13  sodium                   313 non-null    float64
 14  potassium                3

In [6]:
data.isnull().any()

id                         False
age                         True
blood_pressure              True
specific_gravity            True
albumin                     True
sugar                       True
red_blood_cells             True
pus_cell                    True
pus_cell_clumps             True
bacteria                    True
blood glucose random        True
blood_urea                  True
serum_creatinine            True
sodium                      True
potassium                   True
hemoglobin                  True
packed_cell_volume          True
white_blood_cell_count      True
red_blood_cell_count        True
hypertension                True
diabetesmellitus            True
coronary_artery_disease     True
appetite                    True
pedal_edema                 True
anemia                      True
class                      False
dtype: bool

In [7]:
data['blood glucose random'].fillna(data['blood glucose random'].mean(),inplace=True)
data['blood_pressure'].fillna(data['blood_pressure'].mean(),inplace=True)
data['blood_urea'].fillna(data['blood_urea'].mean(),inplace=True)
data['hemoglobin'].fillna(data['hemoglobin'].mean(),inplace=True)
data['packed_cell_volume'].fillna(data['packed_cell_volume'].mean(),inplace=True)
data['potassium'].fillna(data['potassium'].mean(),inplace=True)
data['red_blood_cell_count'].fillna(data['red_blood_cell_count'].mean(),inplace=True)
data['serum_creatinine'].fillna(data['serum_creatinine'].mean(),inplace=True)
data['sodium'].fillna(data['sodium'].mean(),inplace=True)

In [8]:
data['age'].fillna(data['age'].mode()[0],inplace=True)
data['hypertension'].fillna(data['hypertension'].mode()[0],inplace=True)
data['pus_cell_clumps'].fillna(data['pus_cell_clumps'].mode()[0],inplace=True)
data['appetite'].fillna(data['appetite'].mode()[0],inplace=True)
data['albumin'].fillna(data['albumin'].mode()[0],inplace=True)
data['pus_cell'].fillna(data['pus_cell'].mode()[0],inplace=True)
data['red_blood_cells'].fillna(data['red_blood_cells'].mode()[0],inplace=True)
data['coronary_artery_disease'].fillna(data['coronary_artery_disease'].mode()[0],inplace=True)
data['bacteria'].fillna(data['bacteria'].mode()[0],inplace=True)
data['anemia'].fillna(data['anemia'].mode()[0],inplace=True)
data['sugar'].fillna(data['sugar'].mode()[0],inplace=True)
data['diabetesmellitus'].fillna(data['diabetesmellitus'].mode()[0],inplace=True)
data['pedal_edema'].fillna(data['pedal_edema'].mode()[0],inplace=True)
data['specific_gravity'].fillna(data['specific_gravity'].mode()[0],inplace=True)

In [9]:
catcols=set(data.dtypes[data.dtypes=='O'].index.values)
print(catcols)

{'pus_cell', 'pus_cell_clumps', 'anemia', 'red_blood_cells', 'hypertension', 'appetite', 'bacteria', 'coronary_artery_disease', 'pedal_edema', 'diabetesmellitus', 'class', 'white_blood_cell_count'}


In [10]:
for i in catcols:
  print('Columns:',i)
  print(c(data[i]))
  print('*'*120+'\n')

Columns: pus_cell
Counter({'normal': 324, 'abnormal': 76})
************************************************************************************************************************

Columns: pus_cell_clumps
Counter({'notpresent': 358, 'present': 42})
************************************************************************************************************************

Columns: anemia
Counter({'no': 340, 'yes': 60})
************************************************************************************************************************

Columns: red_blood_cells
Counter({'normal': 353, 'abnormal': 47})
************************************************************************************************************************

Columns: hypertension
Counter({'no': 253, 'yes': 147})
************************************************************************************************************************

Columns: appetite
Counter({'good': 318, 'poor': 82})
****************************************

In [11]:
#catcols.remove('red_blood_cell_count')
#catcols.remove('packed_cell_volume')
catcols.remove('white_blood_cell_count')
print(catcols)

{'pus_cell', 'pus_cell_clumps', 'anemia', 'red_blood_cells', 'hypertension', 'appetite', 'bacteria', 'coronary_artery_disease', 'pedal_edema', 'diabetesmellitus', 'class'}


In [25]:
catcols=['anemia','pedal_edema','appetite','bacteria','coronary_artery_disease','diabetesmellitus','hypertention','pus_cell','pus_cell_clumps','red_blood_cells']

In [28]:
from sklearn.preprocessing import LabelEncoder
for i in catcols:
  print("LABEL ENCODING OF:",i)
  LEi = LabelEncoder()
print(c(data[i]))
data[i] = LEi.fit_transform(data[i])
print(c(data[i]))
print("*"*100)  

LABEL ENCODING OF: anemia
LABEL ENCODING OF: pedal_edema
LABEL ENCODING OF: appetite
LABEL ENCODING OF: bacteria
LABEL ENCODING OF: coronary_artery_disease
LABEL ENCODING OF: diabetesmellitus
LABEL ENCODING OF: hypertention
LABEL ENCODING OF: pus_cell
LABEL ENCODING OF: pus_cell_clumps
LABEL ENCODING OF: red_blood_cells
Counter({'normal': 353, 'abnormal': 47})
Counter({1: 353, 0: 47})
****************************************************************************************************


In [32]:
cantcols=set(data.dtypes[data.dtypes!='O'].index.values)
print(cantcols)

{'blood glucose random', 'blood_urea', 'id', 'sugar', 'red_blood_cells', 'blood_pressure', 'potassium', 'packed_cell_volume', 'albumin', 'age', 'specific_gravity', 'hemoglobin', 'serum_creatinine', 'red_blood_cell_count', 'sodium'}


In [33]:
for i in cantcols:
  print("Continous Columns :",i)
  print(c(data[i]))
  print('*'*120+'\n')

Continous Columns : blood glucose random
Counter({148.0365168539326: 44, 99.0: 10, 100.0: 9, 93.0: 9, 107.0: 8, 117.0: 6, 140.0: 6, 92.0: 6, 109.0: 6, 131.0: 6, 130.0: 6, 70.0: 5, 114.0: 5, 95.0: 5, 123.0: 5, 124.0: 5, 102.0: 5, 132.0: 5, 104.0: 5, 125.0: 5, 122.0: 5, 121.0: 4, 106.0: 4, 76.0: 4, 91.0: 4, 129.0: 4, 133.0: 4, 94.0: 4, 88.0: 4, 118.0: 4, 139.0: 4, 111.0: 4, 113.0: 4, 120.0: 4, 119.0: 4, 74.0: 3, 108.0: 3, 171.0: 3, 137.0: 3, 79.0: 3, 150.0: 3, 112.0: 3, 127.0: 3, 219.0: 3, 172.0: 3, 89.0: 3, 128.0: 3, 214.0: 3, 105.0: 3, 78.0: 3, 103.0: 3, 82.0: 3, 97.0: 3, 81.0: 3, 138.0: 2, 490.0: 2, 208.0: 2, 98.0: 2, 204.0: 2, 207.0: 2, 144.0: 2, 253.0: 2, 141.0: 2, 86.0: 2, 360.0: 2, 163.0: 2, 158.0: 2, 165.0: 2, 169.0: 2, 210.0: 2, 101.0: 2, 153.0: 2, 213.0: 2, 424.0: 2, 303.0: 2, 192.0: 2, 80.0: 2, 110.0: 2, 96.0: 2, 85.0: 2, 83.0: 2, 75.0: 2, 423.0: 1, 410.0: 1, 380.0: 1, 157.0: 1, 263.0: 1, 173.0: 1, 156.0: 1, 264.0: 1, 159.0: 1, 270.0: 1, 162.0: 1, 246.0: 1, 182.0: 1, 146.0: 1,

In [36]:
cantcols.remove('specific_gravity')
cantcols.remove('albumin')
cantcols.add('sugar')
print(cantcols)

{'blood glucose random', 'blood_urea', 'id', 'sugar', 'red_blood_cells', 'blood_pressure', 'potassium', 'packed_cell_volume', 'age', 'hemoglobin', 'serum_creatinine', 'red_blood_cell_count', 'sodium'}


In [37]:
cantcols.add('red_blood_cell_count')
cantcols.add('packed_cell_volume')
cantcols.add('white_blood_cell_count')
print(cantcols)

{'blood glucose random', 'blood_urea', 'white_blood_cell_count', 'id', 'sugar', 'red_blood_cells', 'blood_pressure', 'potassium', 'packed_cell_volume', 'age', 'hemoglobin', 'serum_creatinine', 'red_blood_cell_count', 'sodium'}


In [44]:
#catcols.add('specific_gravity')
#catcols.add('albumin')
#catcols.add('sugar')
print(catcols)

['anemia', 'pedal_edema', 'appetite', 'bacteria', 'coronary_artery_disease', 'diabetesmellitus', 'hypertention', 'pus_cell', 'pus_cell_clumps', 'red_blood_cells']
