In [99]:
import arff
import numpy as np
import pandas as pd

In [115]:
# Lift pandas' display truncation so wide and long frames render in full.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [86]:
# Forward slashes are accepted by open() on Windows as well as on POSIX,
# whereas the previous backslash-separated literal only resolved on Windows.
FILE_PATH = "./data/chronic_kidney_disease_full.arff"

In [90]:
# Parse the ARFF file inside a context manager so the file handle is closed as
# soon as loading finishes (the bare open() call never closed it explicitly).
with open(FILE_PATH, mode="r") as arff_file:
    dataset = arff.load(fp=arff_file)

In [57]:
# NOTE(review): `pprint` is not imported in any visible cell, and the saved
# output of this cell is the message of IPython's `%pprint` toggle rather than
# a dump of `dataset` -- presumably automagic resolved this line to the magic.
# If a pretty-printed dump was intended, import `pprint` from the stdlib first.
pprint(dataset)

Pretty printing has been turned ON


In [91]:
# Inspect the parsed ARFF object: a dict whose 'description' entry holds the
# dataset's provenance notes and attribute glossary.
dataset

{'description': '1. Title: Early stage of Indians Chronic Kidney Disease(CKD)\n\n2. Source Information:\n  (a) Source:\n\t\t\tDr.P.Soundarapandian.M.D.,D.M\n\t\t     (Senior Consultant Nephrologist),\n\t\t\tApollo  Hospitals,\n\t\t\tManagiri,\n\t\t\tMadurai Main Road,\n\t\t\tKaraikudi,\n\t\t\tTamilnadu,\n\t\t\tIndia.\n  (b) Creator:\n\t\t\tL.Jerlin Rubini(Research Scholar)\n\t\t\tAlagappa University\n\t\t\tEmailId   :jel.jerlin@gmail.com\n\t\t\tContactNo :+91-9597231281\n  (c) Guided by:\n\t\t\tDr.P.Eswaran Assistant Professor,\n\t\t\tDepartment of Computer Science and Engineering,\n\t\t\tAlagappa University,\n\t\t\tKaraikudi,\n\t\t\tTamilnadu,\n\t\t\tIndia.\n\t\t\tEmailid:eswaranperumal@gmail.com\n  (d) Date     : july 2015\n\n3.Relevant Information:\n\t\t\tage\t\t-\tage\t\n\t\t\tbp\t\t-\tblood pressure\n\t\t\tsg\t\t-\tspecific gravity\n\t\t\tal\t\t-   \talbumin\n\t\t\tsu\t\t-\tsugar\n\t\t\trbc\t\t-\tred blood cells\n\t\t\tpc\t\t-\tpus cell\n\t\t\tpcc\t\t-\tpus cell clumps\n\t\t\tba\t

In [172]:
# Force an object array: the rows mix floats, nominal strings and missing
# values. Left to infer the dtype, NumPy would coerce every cell to a string
# whenever a load happened to contain no missing values.
data = np.array(dataset["data"], dtype=object)
data

array([[48.0, 80.0, '1.020', ..., 'no', 'no', 'ckd'],
       [7.0, 50.0, '1.020', ..., 'no', 'no', 'ckd'],
       [62.0, 80.0, '1.010', ..., 'no', 'yes', 'ckd'],
       ...,
       [12.0, 80.0, '1.020', ..., 'no', 'no', 'notckd'],
       [17.0, 60.0, '1.025', ..., 'no', 'no', 'notckd'],
       [58.0, 80.0, '1.025', ..., 'no', 'no', 'notckd']], dtype=object)

In [102]:
# Column names are the first element of each (name, type) attribute pair.
HEADERS = np.array([attr_name for attr_name, _attr_type in dataset["attributes"]])
HEADERS

array(['age', 'bp', 'sg', 'al', 'su', 'rbc', 'pc', 'pcc', 'ba', 'bgr',
       'bu', 'sc', 'sod', 'pot', 'hemo', 'pcv', 'wbcc', 'rbcc', 'htn',
       'dm', 'cad', 'appet', 'pe', 'ane', 'class'], dtype='<U5')

In [334]:
# Each attribute type is either the string 'NUMERIC' or a list of nominal
# levels, so the sequence is ragged. np.array() on a ragged sequence raises
# ValueError on NumPy >= 1.24, so fill a pre-sized object array element by
# element instead; the result is a 1-D object array with one entry per column.
attribute_types = [attr_type for _attr_name, attr_type in dataset["attributes"]]
TYPE_NAMES = np.empty(len(attribute_types), dtype=object)
for position, attr_type in enumerate(attribute_types):
    TYPE_NAMES[position] = attr_type

# Column name -> declared ARFF type, keyed by plain Python strings.
TYPE_DICT = dict(dataset["attributes"])
TYPE_DICT

{'age': 'NUMERIC',
 'bp': 'NUMERIC',
 'sg': ['1.005', '1.010', '1.015', '1.020', '1.025'],
 'al': ['0', '1', '2', '3', '4', '5'],
 'su': ['0', '1', '2', '3', '4', '5'],
 'rbc': ['normal', 'abnormal'],
 'pc': ['normal', 'abnormal'],
 'pcc': ['present', 'notpresent'],
 'ba': ['present', 'notpresent'],
 'bgr': 'NUMERIC',
 'bu': 'NUMERIC',
 'sc': 'NUMERIC',
 'sod': 'NUMERIC',
 'pot': 'NUMERIC',
 'hemo': 'NUMERIC',
 'pcv': 'NUMERIC',
 'wbcc': 'NUMERIC',
 'rbcc': 'NUMERIC',
 'htn': ['yes', 'no'],
 'dm': ['yes', 'no'],
 'cad': ['yes', 'no'],
 'appet': ['good', 'poor'],
 'pe': ['yes', 'no'],
 'ane': ['yes', 'no'],
 'class': ['ckd', 'notckd']}

In [209]:
# Hand-assigned Python types per column, chosen by inspecting the ARFF
# attribute declarations. Order follows HEADERS.
TYPES = (
    [int, int, float, int, int]           # age, bp, sg, al, su
    + [bool] * 4 + [int]                  # rbc, pc, pcc, ba | bgr
    + [int, float, int, float, float]     # bu, sc, sod, pot, hemo
    + [int, int, float] + [bool] * 2      # pcv, wbcc, rbcc | htn, dm
    + [bool] * 5                          # cad, appet, pe, ane, class
)

# Types actually used for casting: integer-valued columns are stored as float
# because NaN is a float, and nominal columns stay as object until they are
# converted to booleans later on.
TYPES2 = (
    [float] * 5                           # age, bp, sg, al, su
    + [object] * 4 + [float]              # rbc, pc, pcc, ba | bgr
    + [float] * 5                         # bu, sc, sod, pot, hemo
    + [float] * 3 + [object] * 2          # pcv, wbcc, rbcc | htn, dm
    + [object] * 5                        # cad, appet, pe, ane, class
)

# Column name -> dtype, in the form DataFrame.astype accepts.
TYPE_MAPPINGS = dict(zip(HEADERS, TYPES2))

## Data Cleaning

In [246]:
# Wrap the raw object array in a DataFrame labelled with the ARFF attribute names.
ckd = pd.DataFrame(data, columns=HEADERS)
ckd.head(10)

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,bu,sc,sod,pot,hemo,pcv,wbcc,rbcc,htn,dm,cad,appet,pe,ane,class
0,48,80.0,1.02,1,0,,normal,notpresent,notpresent,121.0,36,1.2,,,15.4,44,7800.0,5.2,yes,yes,no,good,no,no,ckd
1,7,50.0,1.02,4,0,,normal,notpresent,notpresent,,18,0.8,,,11.3,38,6000.0,,no,no,no,good,no,no,ckd
2,62,80.0,1.01,2,3,normal,normal,notpresent,notpresent,423.0,53,1.8,,,9.6,31,7500.0,,no,yes,no,poor,no,yes,ckd
3,48,70.0,1.005,4,0,normal,abnormal,present,notpresent,117.0,56,3.8,111.0,2.5,11.2,32,6700.0,3.9,yes,no,no,poor,yes,yes,ckd
4,51,80.0,1.01,2,0,normal,normal,notpresent,notpresent,106.0,26,1.4,,,11.6,35,7300.0,4.6,no,no,no,good,no,no,ckd
5,60,90.0,1.015,3,0,,,notpresent,notpresent,74.0,25,1.1,142.0,3.2,12.2,39,7800.0,4.4,yes,yes,no,good,yes,no,ckd
6,68,70.0,1.01,0,0,,normal,notpresent,notpresent,100.0,54,24.0,104.0,4.0,12.4,36,,,no,no,no,good,no,no,ckd
7,24,,1.015,2,4,normal,abnormal,notpresent,notpresent,410.0,31,1.1,,,12.4,44,6900.0,5.0,no,yes,no,good,yes,no,ckd
8,52,100.0,1.015,3,0,normal,abnormal,present,notpresent,138.0,60,1.9,,,10.8,33,9600.0,4.0,yes,yes,no,good,no,yes,ckd
9,53,90.0,1.02,2,0,abnormal,abnormal,present,notpresent,70.0,107,7.2,114.0,3.7,9.5,29,12100.0,3.7,yes,yes,no,poor,no,yes,ckd


In [247]:
# Before casting: every column is object dtype; the non-null counts show how
# many of the 400 rows are missing a value in each column.
ckd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 25 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   age     391 non-null    object
 1   bp      388 non-null    object
 2   sg      353 non-null    object
 3   al      354 non-null    object
 4   su      351 non-null    object
 5   rbc     248 non-null    object
 6   pc      335 non-null    object
 7   pcc     396 non-null    object
 8   ba      396 non-null    object
 9   bgr     356 non-null    object
 10  bu      381 non-null    object
 11  sc      383 non-null    object
 12  sod     313 non-null    object
 13  pot     312 non-null    object
 14  hemo    348 non-null    object
 15  pcv     329 non-null    object
 16  wbcc    294 non-null    object
 17  rbcc    269 non-null    object
 18  htn     398 non-null    object
 19  dm      398 non-null    object
 20  cad     398 non-null    object
 21  appet   399 non-null    object
 22  pe      399 non-null    ob

In [248]:
# Cast each column to the dtype chosen for it in TYPE_MAPPINGS.
ckd = ckd.astype(TYPE_MAPPINGS)

In [249]:
# After casting: numeric columns are float64, nominal columns remain object.
ckd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 25 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   age     391 non-null    float64
 1   bp      388 non-null    float64
 2   sg      353 non-null    float64
 3   al      354 non-null    float64
 4   su      351 non-null    float64
 5   rbc     248 non-null    object 
 6   pc      335 non-null    object 
 7   pcc     396 non-null    object 
 8   ba      396 non-null    object 
 9   bgr     356 non-null    float64
 10  bu      381 non-null    float64
 11  sc      383 non-null    float64
 12  sod     313 non-null    float64
 13  pot     312 non-null    float64
 14  hemo    348 non-null    float64
 15  pcv     329 non-null    float64
 16  wbcc    294 non-null    float64
 17  rbcc    269 non-null    float64
 18  htn     398 non-null    object 
 19  dm      398 non-null    object 
 20  cad     398 non-null    object 
 21  appet   399 non-null    object 
 22  pe

In [250]:
# Spot-check the cast values against the raw preview shown earlier.
ckd.head(20)

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,bu,sc,sod,pot,hemo,pcv,wbcc,rbcc,htn,dm,cad,appet,pe,ane,class
0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,121.0,36.0,1.2,,,15.4,44.0,7800.0,5.2,yes,yes,no,good,no,no,ckd
1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,,18.0,0.8,,,11.3,38.0,6000.0,,no,no,no,good,no,no,ckd
2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,423.0,53.0,1.8,,,9.6,31.0,7500.0,,no,yes,no,poor,no,yes,ckd
3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,117.0,56.0,3.8,111.0,2.5,11.2,32.0,6700.0,3.9,yes,no,no,poor,yes,yes,ckd
4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,106.0,26.0,1.4,,,11.6,35.0,7300.0,4.6,no,no,no,good,no,no,ckd
5,60.0,90.0,1.015,3.0,0.0,,,notpresent,notpresent,74.0,25.0,1.1,142.0,3.2,12.2,39.0,7800.0,4.4,yes,yes,no,good,yes,no,ckd
6,68.0,70.0,1.01,0.0,0.0,,normal,notpresent,notpresent,100.0,54.0,24.0,104.0,4.0,12.4,36.0,,,no,no,no,good,no,no,ckd
7,24.0,,1.015,2.0,4.0,normal,abnormal,notpresent,notpresent,410.0,31.0,1.1,,,12.4,44.0,6900.0,5.0,no,yes,no,good,yes,no,ckd
8,52.0,100.0,1.015,3.0,0.0,normal,abnormal,present,notpresent,138.0,60.0,1.9,,,10.8,33.0,9600.0,4.0,yes,yes,no,good,no,yes,ckd
9,53.0,90.0,1.02,2.0,0.0,abnormal,abnormal,present,notpresent,70.0,107.0,7.2,114.0,3.7,9.5,29.0,12100.0,3.7,yes,yes,no,poor,no,yes,ckd


In [251]:
# Split the target column off from the features. Guarding on the column's
# presence keeps the cell re-runnable: the unguarded in-place drop raised
# KeyError on a second run because 'class' had already been removed.
if "class" in ckd.columns:
    ckd_labels = ckd["class"]
    ckd = ckd.drop(columns="class")

In [252]:
# Confirm that 'class' is no longer among the feature columns.
ckd.head()

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,bu,sc,sod,pot,hemo,pcv,wbcc,rbcc,htn,dm,cad,appet,pe,ane
0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,121.0,36.0,1.2,,,15.4,44.0,7800.0,5.2,yes,yes,no,good,no,no
1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,,18.0,0.8,,,11.3,38.0,6000.0,,no,no,no,good,no,no
2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,423.0,53.0,1.8,,,9.6,31.0,7500.0,,no,yes,no,poor,no,yes
3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,117.0,56.0,3.8,111.0,2.5,11.2,32.0,6700.0,3.9,yes,no,no,poor,yes,yes
4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,106.0,26.0,1.4,,,11.6,35.0,7300.0,4.6,no,no,no,good,no,no


In [254]:
# Peek at the first five target labels.
ckd_labels[:5]

0    ckd
1    ckd
2    ckd
3    ckd
4    ckd
Name: class, dtype: object

In [291]:
# Scratch frame holding only the hypertension column, used to prototype the
# yes/no encoding without touching `ckd`.
test = ckd['htn'].to_frame(name='htn')
test.head(10)

Unnamed: 0,htn
0,yes
1,no
2,no
3,yes
4,no
5,yes
6,no
7,no
8,yes
9,yes


In [296]:
# Encode hypertension as the strings "1" (yes) / "0" (no); any other value,
# such as a missing entry, is left untouched.
# Series.replace keeps the frame's own index. The earlier list -> pd.Series
# round trip was aligned on a fresh 0..n-1 index and would have produced NaNs
# for a frame with any other index.
# The guard keeps the cell re-runnable once 'htn' has already been replaced.
if "htn" in test.columns:
    htn_encoded = test["htn"].replace({"yes": "1", "no": "0"})
    test = test.assign(htn_yes=htn_encoded).drop(columns="htn")

In [301]:
# Check the encoded column against the original yes/no preview.
test.head(10)

Unnamed: 0,htn_yes
0,1
1,0
2,0
3,1
4,0
5,1
6,0
7,0
8,1
9,1


In [294]:
# Compare value counts before and after encoding.
# NOTE(review): the saved output lists both 'htn' and 'htn_yes', so it appears
# to predate the drop of 'htn' in the encoding cell -- re-run to refresh it.
test.describe()

Unnamed: 0,htn,htn_yes
count,398,398
unique,2,2
top,no,0
freq,251,251


In [326]:
# List the feature columns still stored as object dtype (the nominal ones).
ckd.select_dtypes(include=['object']).head()

Unnamed: 0,rbc,pc,pcc,ba,htn,dm,cad,appet,pe,ane
0,,normal,notpresent,notpresent,yes,yes,no,good,no,no
1,,normal,notpresent,notpresent,no,no,no,good,no,no
2,normal,normal,notpresent,notpresent,no,yes,no,poor,no,yes
3,normal,abnormal,present,notpresent,yes,no,no,poor,yes,yes
4,normal,normal,notpresent,notpresent,no,no,no,good,no,no


In [335]:
# Attribute name -> declared ARFF type ('NUMERIC' or the list of nominal
# levels); same mapping as TYPE_DICT, built directly from the attribute pairs.
dict(dataset['attributes'])

{'age': 'NUMERIC',
 'bp': 'NUMERIC',
 'sg': ['1.005', '1.010', '1.015', '1.020', '1.025'],
 'al': ['0', '1', '2', '3', '4', '5'],
 'su': ['0', '1', '2', '3', '4', '5'],
 'rbc': ['normal', 'abnormal'],
 'pc': ['normal', 'abnormal'],
 'pcc': ['present', 'notpresent'],
 'ba': ['present', 'notpresent'],
 'bgr': 'NUMERIC',
 'bu': 'NUMERIC',
 'sc': 'NUMERIC',
 'sod': 'NUMERIC',
 'pot': 'NUMERIC',
 'hemo': 'NUMERIC',
 'pcv': 'NUMERIC',
 'wbcc': 'NUMERIC',
 'rbcc': 'NUMERIC',
 'htn': ['yes', 'no'],
 'dm': ['yes', 'no'],
 'cad': ['yes', 'no'],
 'appet': ['good', 'poor'],
 'pe': ['yes', 'no'],
 'ane': ['yes', 'no'],
 'class': ['ckd', 'notckd']}