In [3]:
import pandas as pd
import numpy as np
from numpy import set_printoptions
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.impute import KNNImputer
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

In [4]:
# Read the chronic kidney disease dataset from a CSV file in the working directory.
csv_path = 'chronic_kidney_disease2.csv'
df = pd.read_csv(csv_path)

In [5]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wbcc,rbcc,htn,dm,cad,appet,pe,ane,class
0,48.0,80.0,1.02,1.0,0.0,0,0,1,1,121.0,...,44,7800,5.2,0,0,1,1,1,1,ckd
1,7.0,50.0,1.02,4.0,0.0,0,0,1,1,,...,38,6000,0.0,1,1,1,1,1,1,ckd
2,62.0,80.0,1.01,2.0,3.0,0,0,1,1,423.0,...,31,7500,0.0,1,0,1,0,1,0,ckd
3,48.0,70.0,1.005,4.0,0.0,0,1,0,1,117.0,...,32,6700,3.9,0,1,1,0,0,0,ckd
4,51.0,80.0,1.01,2.0,0.0,0,0,1,1,106.0,...,35,7300,4.6,1,1,1,1,1,1,ckd


In [6]:
# Count the missing entries in each column before any imputation.
df.isna().sum()

age       9
bp       12
sg       47
al       46
su       49
rbc       0
pc        0
pcc       0
ba        0
bgr      44
bu       19
sc       17
sod      87
pot      88
hemo     52
pcv       0
wbcc      0
rbcc      0
htn       0
dm        0
cad       0
appet     0
pe        0
ane       0
class     0
dtype: int64

# Categorical Encoding

In [7]:
# Encode the target label as integers: 'notckd' -> 0, 'ckd' -> 1.
# Only the 'class' column is touched; any other label value is left unchanged.
class_codes = {'notckd': 0, 'ckd': 1}
df.replace({'class': class_codes}, inplace=True)

In [8]:
# Preview the first five rows again to check the encoded 'class' column.
df.head(n=5)

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wbcc,rbcc,htn,dm,cad,appet,pe,ane,class
0,48.0,80.0,1.02,1.0,0.0,0,0,1,1,121.0,...,44,7800,5.2,0,0,1,1,1,1,1
1,7.0,50.0,1.02,4.0,0.0,0,0,1,1,,...,38,6000,0.0,1,1,1,1,1,1,1
2,62.0,80.0,1.01,2.0,3.0,0,0,1,1,423.0,...,31,7500,0.0,1,0,1,0,1,0,1
3,48.0,70.0,1.005,4.0,0.0,0,1,0,1,117.0,...,32,6700,3.9,0,1,1,0,0,0,1
4,51.0,80.0,1.01,2.0,0.0,0,0,1,1,106.0,...,35,7300,4.6,1,1,1,1,1,1,1


# Missing Value Imputation using Mean Value

In [9]:
# Mean-impute the numeric columns that contain missing values.
#
# The filled values are assigned back to the frame instead of calling
# `df[col].fillna(..., inplace=True)`: that chained in-place form operates on a
# temporary Series, raises a FutureWarning in pandas 2.x, and leaves `df`
# unchanged once Copy-on-Write is the default (pandas 3.0).
mean_imputed_cols = ['age', 'bp', 'sg', 'al', 'su', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hemo']
# `DataFrame.mean()` yields one mean per column; `fillna` with that Series fills
# each column with its own mean.
df[mean_imputed_cols] = df[mean_imputed_cols].fillna(df[mean_imputed_cols].mean())

In [10]:
# Re-count the missing entries per column to confirm the imputation left none.
df.isna().sum()

age      0
bp       0
sg       0
al       0
su       0
rbc      0
pc       0
pcc      0
ba       0
bgr      0
bu       0
sc       0
sod      0
pot      0
hemo     0
pcv      0
wbcc     0
rbcc     0
htn      0
dm       0
cad      0
appet    0
pe       0
ane      0
class    0
dtype: int64

# Normalization

In [11]:
from sklearn.preprocessing import MinMaxScaler

In [12]:
# Separate the target label from the predictor columns.
Y = df['class']
X = df.drop(columns=['class'])

In [13]:
# Fit a min-max scaler on the feature matrix; the transform itself is applied
# in a later cell. (0, 1) is the scaler's default output range, spelled out here.
# NOTE(review): the scaler is fit on all rows of X; if a train/test split follows,
# confirm that fitting on the full data is intended.
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(X)

MinMaxScaler()

In [14]:
# Apply the fitted scaler and wrap the resulting array back into a DataFrame.
# Column labels and the row index are taken from X instead of being retyped by
# hand, so they always match the features that were actually scaled and the
# rows stay aligned with Y.
scaled_data = pd.DataFrame(data=scaler.transform(X), columns=X.columns, index=X.index)

In [15]:
# Preview the first five rows of the scaled feature table.
scaled_data.head(n=5)

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,hemo,pcv,wbcc,rbcc,htn,dm,cad,appet,pe,ane
0,0.522727,0.230769,0.75,0.2,0.0,0.0,0.0,1.0,1.0,0.211538,...,0.836735,0.814815,0.295455,0.65,0.0,0.0,1.0,1.0,1.0,1.0
1,0.056818,0.0,0.75,0.8,0.0,0.0,0.0,1.0,1.0,0.269309,...,0.557823,0.703704,0.227273,0.0,1.0,1.0,1.0,1.0,1.0,1.0
2,0.681818,0.230769,0.25,0.4,0.6,0.0,0.0,1.0,1.0,0.856838,...,0.442177,0.574074,0.284091,0.0,1.0,0.0,1.0,0.0,1.0,0.0
3,0.522727,0.153846,0.0,0.8,0.0,0.0,1.0,0.0,1.0,0.202991,...,0.55102,0.592593,0.253788,0.4875,0.0,1.0,1.0,0.0,0.0,0.0
4,0.556818,0.230769,0.25,0.4,0.0,0.0,0.0,1.0,1.0,0.179487,...,0.578231,0.648148,0.276515,0.575,1.0,1.0,1.0,1.0,1.0,1.0


# Feature Selection

In [16]:
# Rank the scaled features by their chi-squared statistic against the class label.
# K_BEST is shared by the selector and the printed report so the two counts
# cannot drift apart.
K_BEST = 14
bestfeatures = SelectKBest(score_func=chi2, k=K_BEST)
fit = bestfeatures.fit(scaled_data, Y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(scaled_data.columns)
# Put feature names and scores side by side in one table.
featureScores = pd.concat([dfcolumns, dfscores], axis=1)
featureScores.columns = ['Specs', 'Score']  # name the two columns
# Show the K_BEST highest-scoring features, best first.
print(featureScores.nlargest(K_BEST, 'Score'))

    Specs      Score
18    htn  49.338911
3      al  45.613559
6      pc  45.600000
19     dm  43.436693
17   rbcc  36.488274
5     rbc  28.200000
15    pcv  23.911522
2      sg  22.646074
4      su  20.190389
21  appet  12.214721
14   hemo  11.309043
22     pe  10.263983
10     bu   6.158310
9     bgr   6.094381
