In [99]:
import arff
import numpy as np
import pandas as pd

In [115]:
# Remove pandas' column and row display limits so frames render untruncated.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

In [86]:
# Relative location of the ARFF data file.  Forward slashes are used because
# they are accepted as path separators on Windows as well as on POSIX systems,
# whereas a backslash-separated literal only resolves on Windows.
FILE_PATH = "./data/chronic_kidney_disease_full.arff"

In [90]:
# Parse the ARFF file into `dataset`.  The context manager closes the file
# handle as soon as parsing finishes instead of leaving it open.
with open(FILE_PATH, mode="r") as arff_file:
    dataset = arff.load(fp=arff_file)

In [57]:
# NOTE(review): `pprint` is not among this notebook's visible imports, and the
# recorded output ("Pretty printing has been turned ON") indicates IPython ran
# this line as its `%pprint` display toggle rather than printing `dataset`.
# TODO confirm intent: import `pprint` explicitly or remove this cell.
pprint(dataset)

Pretty printing has been turned ON


In [368]:
# Print the free-text description stored under the 'description' key of the loaded dataset.
print(dataset['description'])

1. Title: Early stage of Indians Chronic Kidney Disease(CKD)

2. Source Information:
  (a) Source:
			Dr.P.Soundarapandian.M.D.,D.M
		     (Senior Consultant Nephrologist),
			Apollo  Hospitals,
			Managiri,
			Madurai Main Road,
			Karaikudi,
			Tamilnadu,
			India.
  (b) Creator:
			L.Jerlin Rubini(Research Scholar)
			Alagappa University
			EmailId   :jel.jerlin@gmail.com
			ContactNo :+91-9597231281
  (c) Guided by:
			Dr.P.Eswaran Assistant Professor,
			Department of Computer Science and Engineering,
			Alagappa University,
			Karaikudi,
			Tamilnadu,
			India.
			Emailid:eswaranperumal@gmail.com
  (d) Date     : july 2015

3.Relevant Information:
			age		-	age	
			bp		-	blood pressure
			sg		-	specific gravity
			al		-   	albumin
			su		-	sugar
			rbc		-	red blood cells
			pc		-	pus cell
			pcc		-	pus cell clumps
			ba		-	bacteria
			bgr		-	blood glucose random
			bu		-	blood urea
			sc		-	serum creatinine
			sod		-	sodium
			pot		-	potassium
			hemo		-	hemoglobin
			pcv		-	pack

In [172]:
# Convert the rows stored under dataset["data"] into a NumPy array and display it.
# The recorded output shows dtype=object: each row mixes floats and strings.
data = np.array(dataset["data"])
data

array([[48.0, 80.0, '1.020', ..., 'no', 'no', 'ckd'],
       [7.0, 50.0, '1.020', ..., 'no', 'no', 'ckd'],
       [62.0, 80.0, '1.010', ..., 'no', 'yes', 'ckd'],
       ...,
       [12.0, 80.0, '1.020', ..., 'no', 'no', 'notckd'],
       [17.0, 60.0, '1.025', ..., 'no', 'no', 'notckd'],
       [58.0, 80.0, '1.025', ..., 'no', 'no', 'notckd']], dtype=object)

In [102]:
# Column names: the first element of every (name, type) attribute pair.
HEADERS = np.array([attribute[0] for attribute in dataset["attributes"]])
HEADERS

array(['age', 'bp', 'sg', 'al', 'su', 'rbc', 'pc', 'pcc', 'ba', 'bgr',
       'bu', 'sc', 'sod', 'pot', 'hemo', 'pcv', 'wbcc', 'rbcc', 'htn',
       'dm', 'cad', 'appet', 'pe', 'ane', 'class'], dtype='<U5')

In [334]:
# Declared type of every attribute: the second element of each
# (name, type) pair.  The recorded output shows these are either the string
# 'NUMERIC' or a list of nominal categories, i.e. ragged input, so
# dtype=object is requested explicitly -- NumPy >= 1.24 raises ValueError when
# asked to infer an array shape from such data.
TYPE_NAMES = np.array(list(zip(*dataset["attributes"]))[1], dtype=object)

# Attribute name -> declared type, for quick lookup; displayed as the cell output.
TYPE_DICT = dict(zip(HEADERS, TYPE_NAMES))
TYPE_DICT

{'age': 'NUMERIC',
 'bp': 'NUMERIC',
 'sg': ['1.005', '1.010', '1.015', '1.020', '1.025'],
 'al': ['0', '1', '2', '3', '4', '5'],
 'su': ['0', '1', '2', '3', '4', '5'],
 'rbc': ['normal', 'abnormal'],
 'pc': ['normal', 'abnormal'],
 'pcc': ['present', 'notpresent'],
 'ba': ['present', 'notpresent'],
 'bgr': 'NUMERIC',
 'bu': 'NUMERIC',
 'sc': 'NUMERIC',
 'sod': 'NUMERIC',
 'pot': 'NUMERIC',
 'hemo': 'NUMERIC',
 'pcv': 'NUMERIC',
 'wbcc': 'NUMERIC',
 'rbcc': 'NUMERIC',
 'htn': ['yes', 'no'],
 'dm': ['yes', 'no'],
 'cad': ['yes', 'no'],
 'appet': ['good', 'poor'],
 'pe': ['yes', 'no'],
 'ane': ['yes', 'no'],
 'class': ['ckd', 'notckd']}

In [209]:
# Target types per column, in HEADERS order, chosen manually by inspecting
# the attribute declarations.

# Ideal types, ignoring missing values.
TYPES = [
    int, int, float, int, int,         # age, bp, sg, al, su
    bool, bool, bool, bool, int,       # rbc, pc, pcc, ba, bgr
    int, float, int, float, float,     # bu, sc, sod, pot, hemo
    int, int, float, bool, bool,       # pcv, wbcc, rbcc, htn, dm
    bool, bool, bool, bool, bool,      # cad, appet, pe, ane, class
]

# Types actually applied: integer-valued columns use float because NaN is a
# float, and the categorical columns stay as object until they are converted
# later.
TYPES2 = (
    [float] * 5       # age, bp, sg, al, su
    + [object] * 4    # rbc, pc, pcc, ba
    + [float] * 9     # bgr, bu, sc, sod, pot, hemo, pcv, wbcc, rbcc
    + [object] * 7    # htn, dm, cad, appet, pe, ane, class
)

# Column name -> dtype, in the form DataFrame.astype accepts.
TYPE_MAPPINGS = dict(zip(HEADERS, TYPES2))

## Data Cleaning

In [348]:
# Build the working DataFrame from the raw `data` array, labelling the columns
# with HEADERS, then preview the first 10 rows.
ckd = pd.DataFrame(data=data, columns=HEADERS)
ckd.head(10)

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,bu,sc,sod,pot,hemo,pcv,wbcc,rbcc,htn,dm,cad,appet,pe,ane,class
0,48,80.0,1.02,1,0,,normal,notpresent,notpresent,121.0,36,1.2,,,15.4,44,7800.0,5.2,yes,yes,no,good,no,no,ckd
1,7,50.0,1.02,4,0,,normal,notpresent,notpresent,,18,0.8,,,11.3,38,6000.0,,no,no,no,good,no,no,ckd
2,62,80.0,1.01,2,3,normal,normal,notpresent,notpresent,423.0,53,1.8,,,9.6,31,7500.0,,no,yes,no,poor,no,yes,ckd
3,48,70.0,1.005,4,0,normal,abnormal,present,notpresent,117.0,56,3.8,111.0,2.5,11.2,32,6700.0,3.9,yes,no,no,poor,yes,yes,ckd
4,51,80.0,1.01,2,0,normal,normal,notpresent,notpresent,106.0,26,1.4,,,11.6,35,7300.0,4.6,no,no,no,good,no,no,ckd
5,60,90.0,1.015,3,0,,,notpresent,notpresent,74.0,25,1.1,142.0,3.2,12.2,39,7800.0,4.4,yes,yes,no,good,yes,no,ckd
6,68,70.0,1.01,0,0,,normal,notpresent,notpresent,100.0,54,24.0,104.0,4.0,12.4,36,,,no,no,no,good,no,no,ckd
7,24,,1.015,2,4,normal,abnormal,notpresent,notpresent,410.0,31,1.1,,,12.4,44,6900.0,5.0,no,yes,no,good,yes,no,ckd
8,52,100.0,1.015,3,0,normal,abnormal,present,notpresent,138.0,60,1.9,,,10.8,33,9600.0,4.0,yes,yes,no,good,no,yes,ckd
9,53,90.0,1.02,2,0,abnormal,abnormal,present,notpresent,70.0,107,7.2,114.0,3.7,9.5,29,12100.0,3.7,yes,yes,no,poor,no,yes,ckd


In [349]:
# Inspect dtypes and non-null counts before casting (recorded output: every column is object).
ckd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 25 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   age     391 non-null    object
 1   bp      388 non-null    object
 2   sg      353 non-null    object
 3   al      354 non-null    object
 4   su      351 non-null    object
 5   rbc     248 non-null    object
 6   pc      335 non-null    object
 7   pcc     396 non-null    object
 8   ba      396 non-null    object
 9   bgr     356 non-null    object
 10  bu      381 non-null    object
 11  sc      383 non-null    object
 12  sod     313 non-null    object
 13  pot     312 non-null    object
 14  hemo    348 non-null    object
 15  pcv     329 non-null    object
 16  wbcc    294 non-null    object
 17  rbcc    269 non-null    object
 18  htn     398 non-null    object
 19  dm      398 non-null    object
 20  cad     398 non-null    object
 21  appet   399 non-null    object
 22  pe      399 non-null    ob

In [350]:
# Cast every column to the dtype assigned to it in TYPE_MAPPINGS.
ckd = ckd.astype(dtype=TYPE_MAPPINGS)

In [351]:
# Re-check dtypes after the cast (recorded output: numeric columns are now float64).
ckd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 25 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   age     391 non-null    float64
 1   bp      388 non-null    float64
 2   sg      353 non-null    float64
 3   al      354 non-null    float64
 4   su      351 non-null    float64
 5   rbc     248 non-null    object 
 6   pc      335 non-null    object 
 7   pcc     396 non-null    object 
 8   ba      396 non-null    object 
 9   bgr     356 non-null    float64
 10  bu      381 non-null    float64
 11  sc      383 non-null    float64
 12  sod     313 non-null    float64
 13  pot     312 non-null    float64
 14  hemo    348 non-null    float64
 15  pcv     329 non-null    float64
 16  wbcc    294 non-null    float64
 17  rbcc    269 non-null    float64
 18  htn     398 non-null    object 
 19  dm      398 non-null    object 
 20  cad     398 non-null    object 
 21  appet   399 non-null    object 
 22  pe

In [352]:
# Preview the first 20 rows of the cast frame.
ckd.head(20)

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,bu,sc,sod,pot,hemo,pcv,wbcc,rbcc,htn,dm,cad,appet,pe,ane,class
0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,121.0,36.0,1.2,,,15.4,44.0,7800.0,5.2,yes,yes,no,good,no,no,ckd
1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,,18.0,0.8,,,11.3,38.0,6000.0,,no,no,no,good,no,no,ckd
2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,423.0,53.0,1.8,,,9.6,31.0,7500.0,,no,yes,no,poor,no,yes,ckd
3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,117.0,56.0,3.8,111.0,2.5,11.2,32.0,6700.0,3.9,yes,no,no,poor,yes,yes,ckd
4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,106.0,26.0,1.4,,,11.6,35.0,7300.0,4.6,no,no,no,good,no,no,ckd
5,60.0,90.0,1.015,3.0,0.0,,,notpresent,notpresent,74.0,25.0,1.1,142.0,3.2,12.2,39.0,7800.0,4.4,yes,yes,no,good,yes,no,ckd
6,68.0,70.0,1.01,0.0,0.0,,normal,notpresent,notpresent,100.0,54.0,24.0,104.0,4.0,12.4,36.0,,,no,no,no,good,no,no,ckd
7,24.0,,1.015,2.0,4.0,normal,abnormal,notpresent,notpresent,410.0,31.0,1.1,,,12.4,44.0,6900.0,5.0,no,yes,no,good,yes,no,ckd
8,52.0,100.0,1.015,3.0,0.0,normal,abnormal,present,notpresent,138.0,60.0,1.9,,,10.8,33.0,9600.0,4.0,yes,yes,no,good,no,yes,ckd
9,53.0,90.0,1.02,2.0,0.0,abnormal,abnormal,present,notpresent,70.0,107.0,7.2,114.0,3.7,9.5,29.0,12100.0,3.7,yes,yes,no,poor,no,yes,ckd


In [353]:
# Split the target column ('class') off from the feature columns.
# The membership guard keeps this cell re-runnable: once 'class' has been
# removed, running the cell again is a no-op instead of raising KeyError.
if 'class' in ckd.columns:
    ckd_labels = ckd['class']
    # Rebind to a new frame rather than mutating in place.
    ckd = ckd.drop(columns='class')

In [354]:
# Preview the feature frame after the 'class' column has been removed.
ckd.head()

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,bu,sc,sod,pot,hemo,pcv,wbcc,rbcc,htn,dm,cad,appet,pe,ane
0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,121.0,36.0,1.2,,,15.4,44.0,7800.0,5.2,yes,yes,no,good,no,no
1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,,18.0,0.8,,,11.3,38.0,6000.0,,no,no,no,good,no,no
2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,423.0,53.0,1.8,,,9.6,31.0,7500.0,,no,yes,no,poor,no,yes
3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,117.0,56.0,3.8,111.0,2.5,11.2,32.0,6700.0,3.9,yes,no,no,poor,yes,yes
4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,106.0,26.0,1.4,,,11.6,35.0,7300.0,4.6,no,no,no,good,no,no


In [355]:
# Preview the first five target labels.
ckd_labels[:5]

0    ckd
1    ckd
2    ckd
3    ckd
4    ckd
Name: class, dtype: object

In [364]:
# Display the attribute name -> declared type mapping taken from the ARFF attribute pairs.
dict(dataset['attributes'])

{'age': 'NUMERIC',
 'bp': 'NUMERIC',
 'sg': ['1.005', '1.010', '1.015', '1.020', '1.025'],
 'al': ['0', '1', '2', '3', '4', '5'],
 'su': ['0', '1', '2', '3', '4', '5'],
 'rbc': ['normal', 'abnormal'],
 'pc': ['normal', 'abnormal'],
 'pcc': ['present', 'notpresent'],
 'ba': ['present', 'notpresent'],
 'bgr': 'NUMERIC',
 'bu': 'NUMERIC',
 'sc': 'NUMERIC',
 'sod': 'NUMERIC',
 'pot': 'NUMERIC',
 'hemo': 'NUMERIC',
 'pcv': 'NUMERIC',
 'wbcc': 'NUMERIC',
 'rbcc': 'NUMERIC',
 'htn': ['yes', 'no'],
 'dm': ['yes', 'no'],
 'cad': ['yes', 'no'],
 'appet': ['good', 'poor'],
 'pe': ['yes', 'no'],
 'ane': ['yes', 'no'],
 'class': ['ckd', 'notckd']}

In [360]:
# Encode the binary categorical columns as 0/1 floats.
#
# Each remaining object-dtype column is looked up in the ARFF attribute
# declarations, which list its two categories.  The first category is encoded
# as 1.0 and the second as 0.0; missing entries stay NaN (hence float rather
# than bool).  The encoded column is appended as "<col>_<first category>" and
# the original column is removed.

atts_dict = dict(dataset['attributes'])
for col in ckd.select_dtypes(include=['object']).columns:

    # Unpacking fails fast if an attribute does not declare exactly two categories.
    positive, negative = atts_dict[col]

    # Vectorised lookup instead of a per-row Python loop; values absent from
    # the mapping come back as NaN.
    encoded = ckd[col].map({positive: 1.0, negative: 0.0})

    # A non-missing value that matches neither category would otherwise be
    # silently turned into NaN, so surface it explicitly.
    unexpected = ckd.loc[ckd[col].notna() & encoded.isna(), col]
    if not unexpected.empty:
        raise ValueError(
            f"Column {col!r} contains values outside {[positive, negative]}: "
            f"{unexpected.unique().tolist()}"
        )

    # `encoded` carries ckd's own index, so the assignment aligns row-for-row.
    ckd[col + "_" + positive] = encoded.astype(float)
    ckd = ckd.drop(columns=col)


In [365]:
# Preview the encoded frame.  A bounded head() is used because the notebook sets
# display.max_rows to None, so a bare `ckd` would render every row.
ckd.head(10)

Unnamed: 0,age,bp,sg,al,su,bgr,bu,sc,sod,pot,hemo,pcv,wbcc,rbcc,rbc_normal,pc_normal,pcc_present,ba_present,htn_yes,dm_yes,cad_yes,appet_good,pe_yes,ane_yes
0,48.0,80.0,1.02,1.0,0.0,121.0,36.0,1.2,,,15.4,44.0,7800.0,5.2,,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0
1,7.0,50.0,1.02,4.0,0.0,,18.0,0.8,,,11.3,38.0,6000.0,,,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,62.0,80.0,1.01,2.0,3.0,423.0,53.0,1.8,,,9.6,31.0,7500.0,,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,48.0,70.0,1.005,4.0,0.0,117.0,56.0,3.8,111.0,2.5,11.2,32.0,6700.0,3.9,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0
4,51.0,80.0,1.01,2.0,0.0,106.0,26.0,1.4,,,11.6,35.0,7300.0,4.6,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
5,60.0,90.0,1.015,3.0,0.0,74.0,25.0,1.1,142.0,3.2,12.2,39.0,7800.0,4.4,,,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0
6,68.0,70.0,1.01,0.0,0.0,100.0,54.0,24.0,104.0,4.0,12.4,36.0,,,,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
7,24.0,,1.015,2.0,4.0,410.0,31.0,1.1,,,12.4,44.0,6900.0,5.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
8,52.0,100.0,1.015,3.0,0.0,138.0,60.0,1.9,,,10.8,33.0,9600.0,4.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0
9,53.0,90.0,1.02,2.0,0.0,70.0,107.0,7.2,114.0,3.7,9.5,29.0,12100.0,3.7,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0


In [362]:
# Confirm that every column is numeric after encoding (recorded output: float64 columns).
ckd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 24 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   age          391 non-null    float64
 1   bp           388 non-null    float64
 2   sg           353 non-null    float64
 3   al           354 non-null    float64
 4   su           351 non-null    float64
 5   bgr          356 non-null    float64
 6   bu           381 non-null    float64
 7   sc           383 non-null    float64
 8   sod          313 non-null    float64
 9   pot          312 non-null    float64
 10  hemo         348 non-null    float64
 11  pcv          329 non-null    float64
 12  wbcc         294 non-null    float64
 13  rbcc         269 non-null    float64
 14  rbc_normal   248 non-null    float64
 15  pc_normal    335 non-null    float64
 16  pcc_present  396 non-null    float64
 17  ba_present   396 non-null    float64
 18  htn_yes      398 non-null    float64
 19  dm_yes  

In [366]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column.
ckd.describe()

Unnamed: 0,age,bp,sg,al,su,bgr,bu,sc,sod,pot,hemo,pcv,wbcc,rbcc,rbc_normal,pc_normal,pcc_present,ba_present,htn_yes,dm_yes,cad_yes,appet_good,pe_yes,ane_yes
count,391.0,388.0,353.0,354.0,351.0,356.0,381.0,383.0,313.0,312.0,348.0,329.0,294.0,269.0,248.0,335.0,396.0,396.0,398.0,398.0,398.0,399.0,399.0,399.0
mean,51.483376,76.469072,1.017408,1.016949,0.450142,148.036517,57.425722,3.072454,137.528754,4.627244,12.526437,38.884498,8406.122449,4.707435,0.810484,0.773134,0.106061,0.055556,0.369347,0.344221,0.085427,0.794486,0.190476,0.150376
std,17.169714,13.683637,0.005717,1.352679,1.099191,79.281714,50.503006,5.741126,10.408752,3.193904,2.912587,8.990105,2944.47419,1.025323,0.392711,0.419431,0.308305,0.229351,0.483235,0.475712,0.279868,0.404584,0.39317,0.357888
min,2.0,50.0,1.005,0.0,0.0,22.0,1.5,0.4,4.5,2.5,3.1,9.0,2200.0,2.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,42.0,70.0,1.01,0.0,0.0,99.0,27.0,0.9,135.0,3.8,10.3,32.0,6500.0,3.9,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
50%,55.0,80.0,1.02,0.0,0.0,121.0,42.0,1.3,138.0,4.4,12.65,40.0,8000.0,4.8,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
75%,64.5,80.0,1.02,2.0,0.0,163.0,66.0,2.8,142.0,4.9,15.0,45.0,9800.0,5.4,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0
max,90.0,180.0,1.025,5.0,5.0,490.0,391.0,76.0,163.0,47.0,17.8,54.0,26400.0,8.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
