In [94]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [95]:
# Read the kidney-disease records from CSV and preview the first five rows.
df = pd.read_csv('kidney_disease.csv')
df.head(n=5)

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,...,38,6000,,no,no,no,good,no,no,ckd
2,2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,...,31,7500,,no,yes,no,poor,no,yes,ckd
3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,...,35,7300,4.6,no,no,no,good,no,no,ckd


In [96]:
# Load the column glossary, splitting every line on '-'.
# NOTE(review): presumably each line is "<abbreviation> - <description>"; a
# description that itself contains '-' would be split into extra fields —
# confirm against data_description.txt.
columns=pd.read_csv("data_description.txt",sep='-')
# Move the index back into a regular column so the lookup table ends up with
# two plain columns (they are named in the next cell).
columns_val=columns.reset_index()

In [97]:
# Label the two glossary columns. As the table displayed below shows, 'col'
# holds the abbreviation and 'abb_col_names' holds the descriptive name.
columns_val.columns=['col', 'abb_col_names']

In [98]:
# Display the abbreviation -> descriptive-name lookup table.
columns_val

Unnamed: 0,col,abb_col_names
0,id,id
1,age,age
2,bp,blood pressure
3,sg,specific gravity
4,al,albumin
5,su,sugar
6,rbc,red blood cells
7,pc,pus cell
8,pcc,pus cell clumps
9,ba,bacteria


In [99]:
# Preview the frame while it still carries the abbreviated column names.
df.head()

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,...,38,6000,,no,no,no,good,no,no,ckd
2,2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,...,31,7500,,no,yes,no,poor,no,yes,ckd
3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,...,35,7300,4.6,no,no,no,good,no,no,ckd


In [100]:
# Replace the abbreviated headers with the descriptive names from the glossary.
# The assignment is positional: it relies on columns_val listing exactly one
# name per df column, in the same order.
# The names are used verbatim, so quirks in the glossary text (e.g. the leading
# space in ' pus cell' and the spelling 'ypertension' visible in the outputs
# below) carry over into df.
df.columns=columns_val['abb_col_names'].values

In [101]:
# Confirm that the descriptive column names were applied.
df.head()

Unnamed: 0,id,age,blood pressure,specific gravity,albumin,sugar,red blood cells,pus cell,pus cell clumps,bacteria,...,packed cell volume,white blood cell count,red blood cell count,ypertension,diabetes mellitus,coronary artery disease,appetite,pedal edema,anemia,class
0,0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,...,38,6000,,no,no,no,good,no,no,ckd
2,2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,...,31,7500,,no,yes,no,poor,no,yes,ckd
3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,...,35,7300,4.6,no,no,no,good,no,no,ckd


In [102]:
# Drop the 'id' column, which only repeats the row index (see the preview above).
# Re-assign instead of mutating in place, and tolerate an already-missing
# column, so the cell can be re-run without raising KeyError.
df = df.drop(columns='id', errors='ignore')

In [103]:
# Check the dtype pandas inferred for each column. In the output below,
# 'packed cell volume', 'white blood cell count' and 'red blood cell count'
# are reported as object even though the preview shows numeric-looking values.
df.dtypes

age                        float64
blood pressure             float64
specific gravity           float64
albumin                    float64
sugar                      float64
red blood cells             object
 pus cell                   object
pus cell clumps             object
bacteria                    object
blood glucose random       float64
blood urea                 float64
serum creatinine           float64
sodium                     float64
potassium                  float64
haemoglobin                float64
packed cell volume          object
white blood cell count      object
red blood cell count        object
ypertension                 object
diabetes mellitus           object
coronary artery disease     object
appetite                    object
pedal edema                 object
anemia                      object
class                       object
dtype: object

In [104]:
# Helper that converts an object-dtype column to a numeric dtype.
def convert_dtype(df, feature):
    """Overwrite one column of ``df`` with its numeric conversion.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; the column is replaced on this very object.
    feature : hashable
        Label of the column to convert.

    Returns
    -------
    None
        The frame is changed in place.

    Notes
    -----
    ``errors='coerce'`` is used, so entries that cannot be parsed as a
    number become NaN instead of raising.
    """
    as_numeric = pd.to_numeric(df[feature], errors='coerce')
    df[feature] = as_numeric

In [105]:
# Columns reported as object by df.dtypes that should hold numbers;
# coerce each of them to a numeric dtype.
features = [
    'packed cell volume',
    'white blood cell count',
    'red blood cell count',
]
for feature in features:
    convert_dtype(df, feature)

In [106]:
# Re-check the dtypes after the numeric conversion of the three columns above.
df.dtypes

age                        float64
blood pressure             float64
specific gravity           float64
albumin                    float64
sugar                      float64
red blood cells             object
 pus cell                   object
pus cell clumps             object
bacteria                    object
blood glucose random       float64
blood urea                 float64
serum creatinine           float64
sodium                     float64
potassium                  float64
haemoglobin                float64
packed cell volume         float64
white blood cell count     float64
red blood cell count       float64
ypertension                 object
diabetes mellitus           object
coronary artery disease     object
appetite                    object
pedal edema                 object
anemia                      object
class                       object
dtype: object

In [107]:
# Labels of the categorical columns, i.e. those stored as object or bool.
cat_columns = df.select_dtypes(
    include=['object', 'bool'],
).columns
cat_columns

Index(['red blood cells', ' pus cell', 'pus cell clumps', 'bacteria',
       'ypertension', 'diabetes mellitus', 'coronary artery disease',
       'appetite', 'pedal edema', 'anemia', 'class'],
      dtype='object')

In [108]:
# Labels of the numeric columns, i.e. those stored as int64 or float64.
num_columns = df.select_dtypes(
    include=['int64', 'float64'],
).columns
num_columns

Index(['age', 'blood pressure', 'specific gravity', 'albumin', 'sugar',
       'blood glucose random', 'blood urea', 'serum creatinine', 'sodium',
       'potassium', 'haemoglobin', 'packed cell volume',
       'white blood cell count', 'red blood cell count'],
      dtype='object')

In [109]:
# List the distinct values of every categorical column so that dirty labels
# (stray tabs or spaces, inconsistent spellings) stand out.
for col in cat_columns:
    distinct_values = df[col].unique()
    print('{} HAS {} values'.format(col, distinct_values))
    print('\n')

red blood cells HAS [nan 'normal' 'abnormal'] values


 pus cell HAS ['normal' 'abnormal' nan] values


pus cell clumps HAS ['notpresent' 'present' nan] values


bacteria HAS ['notpresent' 'present' nan] values


ypertension HAS ['yes' 'no' nan] values


diabetes mellitus HAS ['yes' 'no' ' yes' '\tno' '\tyes' nan] values


coronary artery disease HAS ['no' 'yes' '\tno' nan] values


appetite HAS ['good' 'poor' nan] values


pedal edema HAS ['no' 'yes' nan] values


anemia HAS ['no' 'yes' nan] values


class HAS ['ckd' 'ckd\t' 'notckd'] values




In [114]:
# Normalise labels that differ only by stray whitespace/tab characters, as
# revealed by the unique-value listing above: ' yes', '\tyes' and '\tno' in
# 'diabetes mellitus', '\tno' in 'coronary artery disease', 'ckd\t' in 'class'.
label_fixes = {
    'diabetes mellitus': {'\tno': 'no', '\tyes': 'yes', ' yes': 'yes'},
    'coronary artery disease': {'\tno': 'no'},
    'class': {'ckd\t': 'ckd'},
}
for column, mapping in label_fixes.items():
    # Assign the result back to the frame. Calling
    # df[column].replace(..., inplace=True) acts on an intermediate Series and
    # does not update df under pandas Copy-on-Write. Re-running this cell is
    # harmless because already-clean labels match none of the keys.
    df[column] = df[column].replace(mapping)

In [115]:
# List the distinct values of every numeric column to eyeball ranges and
# spot missing entries (nan).
for col in num_columns:
    distinct_values = df[col].unique()
    print('{} HAS {} values'.format(col, distinct_values))
    print('\n')

age HAS [48.  7. 62. 51. 60. 68. 24. 52. 53. 50. 63. 40. 47. 61. 21. 42. 75. 69.
 nan 73. 70. 65. 76. 72. 82. 46. 45. 35. 54. 11. 59. 67. 15. 55. 44. 26.
 64. 56.  5. 74. 38. 58. 71. 34. 17. 12. 43. 41. 57.  8. 39. 66. 81. 14.
 27. 83. 30.  4.  3.  6. 32. 80. 49. 90. 78. 19.  2. 33. 36. 37. 23. 25.
 20. 29. 28. 22. 79.] values


blood pressure HAS [ 80.  50.  70.  90.  nan 100.  60. 110. 140. 180. 120.] values


specific gravity HAS [1.02  1.01  1.005 1.015   nan 1.025] values


albumin HAS [ 1.  4.  2.  3.  0. nan  5.] values


sugar HAS [ 0.  3.  4.  1. nan  2.  5.] values


blood glucose random HAS [121.  nan 423. 117. 106.  74. 100. 410. 138.  70. 490. 380. 208.  98.
 157.  76.  99. 114. 263. 173.  95. 108. 156. 264. 123.  93. 107. 159.
 140. 171. 270.  92. 137. 204.  79. 207. 124. 144.  91. 162. 246. 253.
 141. 182.  86. 150. 146. 425. 112. 250. 360. 163. 129. 133. 102. 158.
 165. 132. 104. 127. 415. 169. 251. 109. 280. 210. 219. 295.  94. 172.
 101. 298. 153.  88. 226. 143. 115. 