In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
%matplotlib inline
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.impute import KNNImputer

In [2]:
# Read kidney.csv and discard its 'id' column in one step, without mutating in place.
df = pd.read_csv('kidney.csv').drop(columns='id')

In [3]:
# Show each column's dtype and non-null count right after loading.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 25 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             391 non-null    float64
 1   bp              388 non-null    float64
 2   sg              353 non-null    float64
 3   al              354 non-null    float64
 4   su              351 non-null    float64
 5   rbc             248 non-null    object 
 6   pc              335 non-null    object 
 7   pcc             396 non-null    object 
 8   ba              396 non-null    object 
 9   bgr             356 non-null    float64
 10  bu              381 non-null    float64
 11  sc              383 non-null    float64
 12  sod             313 non-null    float64
 13  pot             312 non-null    float64
 14  hemo            348 non-null    float64
 15  pcv             330 non-null    object 
 16  wc              295 non-null    object 
 17  rc              270 non-null    obj

In [4]:
# pcv, wc and rc are loaded as object dtype; parse them as numbers and turn
# any entry that cannot be parsed into NaN.
for numeric_col in ('pcv', 'wc', 'rc'):
    df[numeric_col] = pd.to_numeric(df[numeric_col], errors='coerce')

In [5]:
# Collapse label variants that carry a stray tab or leading space onto their clean form.
df['classification'] = df['classification'].replace({'ckd\t': 'ckd'})
df['dm'] = df['dm'].replace({' yes': 'yes', '\tno': 'no', '\tyes': 'yes'})
df['cad'] = df['cad'].replace({'\tno': 'no'})

In [6]:
# Count the missing values in every column.
df.isnull().sum()

age                 9
bp                 12
sg                 47
al                 46
su                 49
rbc               152
pc                 65
pcc                 4
ba                  4
bgr                44
bu                 19
sc                 17
sod                87
pot                88
hemo               52
pcv                71
wc                106
rc                131
htn                 2
dm                  2
cad                 2
appet               1
pe                  1
ane                 1
classification      0
dtype: int64

In [7]:
# Preview the first 15 rows of the frame.
df.head(15)

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,121.0,...,44.0,7800.0,5.2,yes,yes,no,good,no,no,ckd
1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,,...,38.0,6000.0,,no,no,no,good,no,no,ckd
2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,423.0,...,31.0,7500.0,,no,yes,no,poor,no,yes,ckd
3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,117.0,...,32.0,6700.0,3.9,yes,no,no,poor,yes,yes,ckd
4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,106.0,...,35.0,7300.0,4.6,no,no,no,good,no,no,ckd
5,60.0,90.0,1.015,3.0,0.0,,,notpresent,notpresent,74.0,...,39.0,7800.0,4.4,yes,yes,no,good,yes,no,ckd
6,68.0,70.0,1.01,0.0,0.0,,normal,notpresent,notpresent,100.0,...,36.0,,,no,no,no,good,no,no,ckd
7,24.0,,1.015,2.0,4.0,normal,abnormal,notpresent,notpresent,410.0,...,44.0,6900.0,5.0,no,yes,no,good,yes,no,ckd
8,52.0,100.0,1.015,3.0,0.0,normal,abnormal,present,notpresent,138.0,...,33.0,9600.0,4.0,yes,yes,no,good,no,yes,ckd
9,53.0,90.0,1.02,2.0,0.0,abnormal,abnormal,present,notpresent,70.0,...,29.0,12100.0,3.7,yes,yes,no,poor,no,yes,ckd


In [8]:
# Most frequent value of every column that is later filled with its mode,
# listed in the same order as the columns below.
mode_columns = ["sg", "al", "su", "rbc", "pc", "pcc", "ba",
                "htn", "dm", "cad", "appet", "pe", "ane"]
modus = [df[name].value_counts().idxmax() for name in mode_columns]
modus

[1.02,
 0.0,
 0.0,
 'normal',
 'normal',
 'notpresent',
 'notpresent',
 'no',
 'no',
 'no',
 'good',
 'no',
 'no']

In [9]:
# Replace the missing entries of each listed column with that column's most
# frequent value (first label returned by value_counts().idxmax()).
for name in ("sg", "al", "su", "rbc", "pc", "pcc", "ba",
             "htn", "dm", "cad", "appet", "pe", "ane"):
    most_common = df[name].value_counts().idxmax()
    df[name] = df[name].fillna(most_common)

In [10]:
# Re-check the non-null counts after the most-frequent-value fill.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 25 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             391 non-null    float64
 1   bp              388 non-null    float64
 2   sg              400 non-null    float64
 3   al              400 non-null    float64
 4   su              400 non-null    float64
 5   rbc             400 non-null    object 
 6   pc              400 non-null    object 
 7   pcc             400 non-null    object 
 8   ba              400 non-null    object 
 9   bgr             356 non-null    float64
 10  bu              381 non-null    float64
 11  sc              383 non-null    float64
 12  sod             313 non-null    float64
 13  pot             312 non-null    float64
 14  hemo            348 non-null    float64
 15  pcv             329 non-null    float64
 16  wc              294 non-null    float64
 17  rc              269 non-null    flo

In [11]:
# Integer codes shared by several categorical columns.
abnormal_codes = {'abnormal': 1, 'normal': 0}
present_codes = {'notpresent': 0, 'present': 1}
yes_no_codes = {'yes': 1, 'no': 0}

# Per-column lookup handed to DataFrame.replace: {column: {label: code}}.
# Each column gets its own copy so no two entries share the same dict object.
dictionary = {
    'rbc': dict(abnormal_codes),
    'pc': dict(abnormal_codes),
    'pcc': dict(present_codes),
    'ba': dict(present_codes),
    'htn': dict(yes_no_codes),
    'dm': dict(yes_no_codes),
    'cad': dict(yes_no_codes),
    'appet': {'good': 1, 'poor': 0},
    'pe': dict(yes_no_codes),
    'ane': dict(yes_no_codes),
    'classification': {'ckd': 1, 'notckd': 0},
}

# Swap every listed label for its code; labels not listed are left unchanged.
df = df.replace(dictionary)

In [12]:
# Preview the first rows after the categorical labels were replaced by codes.
df.head()

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,48.0,80.0,1.02,1.0,0.0,0,0,0,0,121.0,...,44.0,7800.0,5.2,1,1,0,1,0,0,1
1,7.0,50.0,1.02,4.0,0.0,0,0,0,0,,...,38.0,6000.0,,0,0,0,1,0,0,1
2,62.0,80.0,1.01,2.0,3.0,0,0,0,0,423.0,...,31.0,7500.0,,0,1,0,0,0,1,1
3,48.0,70.0,1.005,4.0,0.0,0,1,1,0,117.0,...,32.0,6700.0,3.9,1,0,0,0,1,1,1
4,51.0,80.0,1.01,2.0,0.0,0,0,0,0,106.0,...,35.0,7300.0,4.6,0,0,0,1,0,0,1


In [13]:
# Re-check dtypes and non-null counts now that the categorical columns are encoded.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 25 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             391 non-null    float64
 1   bp              388 non-null    float64
 2   sg              400 non-null    float64
 3   al              400 non-null    float64
 4   su              400 non-null    float64
 5   rbc             400 non-null    int64  
 6   pc              400 non-null    int64  
 7   pcc             400 non-null    int64  
 8   ba              400 non-null    int64  
 9   bgr             356 non-null    float64
 10  bu              381 non-null    float64
 11  sc              383 non-null    float64
 12  sod             313 non-null    float64
 13  pot             312 non-null    float64
 14  hemo            348 non-null    float64
 15  pcv             329 non-null    float64
 16  wc              294 non-null    float64
 17  rc              269 non-null    flo

In [14]:
# Fill the remaining NaNs with a k-nearest-neighbours imputer (k = 2), fitted on
# and applied to the same frame.
# NOTE(review): the whole frame is passed in, so the 'classification' target and
# the unscaled measurements both take part in the neighbour distances — confirm
# this is intended before the output is used for model evaluation.
imputer = KNNImputer(n_neighbors=2)
imputer.fit(df)
df_filled = imputer.transform(df)

In [15]:
# Wrap the imputed values in a new DataFrame; no labels are passed, so the
# columns get default positional names.
df2 = pd.DataFrame(df_filled)

In [16]:
# Check the dtypes and non-null counts of the imputed frame.
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 25 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       400 non-null    float64
 1   1       400 non-null    float64
 2   2       400 non-null    float64
 3   3       400 non-null    float64
 4   4       400 non-null    float64
 5   5       400 non-null    float64
 6   6       400 non-null    float64
 7   7       400 non-null    float64
 8   8       400 non-null    float64
 9   9       400 non-null    float64
 10  10      400 non-null    float64
 11  11      400 non-null    float64
 12  12      400 non-null    float64
 13  13      400 non-null    float64
 14  14      400 non-null    float64
 15  15      400 non-null    float64
 16  16      400 non-null    float64
 17  17      400 non-null    float64
 18  18      400 non-null    float64
 19  19      400 non-null    float64
 20  20      400 non-null    float64
 21  21      400 non-null    float64
 22  22

In [17]:
# Inspect the column labels of the imputed frame before they are renamed.
df2.columns

RangeIndex(start=0, stop=25, step=1)

In [18]:
# Column names to put back on the imputed frame. They mirror the columns of df,
# except that the target is stored as 'class' instead of 'classification'.
kolom = ['age','bp','sg','al','su','rbc','pc','pcc','ba',
          'bgr','bu','sc','sod','pot','hemo','pcv','wc','rc',
          'htn','dm','cad','appet','pe','ane','class']

# The names are attached purely by position, so verify that they still line up
# with the frame that was fed to the imputer; otherwise df2 would be silently
# mislabelled.
assert kolom[:-1] == list(df.columns[:-1]), (
    "kolom does not match the column order of df; df2 would be mislabelled"
)

df2.columns=kolom
df2.columns

Index(['age', 'bp', 'sg', 'al', 'su', 'rbc', 'pc', 'pcc', 'ba', 'bgr', 'bu',
       'sc', 'sod', 'pot', 'hemo', 'pcv', 'wc', 'rc', 'htn', 'dm', 'cad',
       'appet', 'pe', 'ane', 'class'],
      dtype='object')

In [19]:
# Preview the first rows of the imputed, relabelled frame.
df2.head()

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,class
0,48.0,80.0,1.02,1.0,0.0,0.0,0.0,0.0,0.0,121.0,...,44.0,7800.0,5.2,1.0,1.0,0.0,1.0,0.0,0.0,1.0
1,7.0,50.0,1.02,4.0,0.0,0.0,0.0,0.0,0.0,101.5,...,38.0,6000.0,5.5,0.0,0.0,0.0,1.0,0.0,0.0,1.0
2,62.0,80.0,1.01,2.0,3.0,0.0,0.0,0.0,0.0,423.0,...,31.0,7500.0,3.2,0.0,1.0,0.0,0.0,0.0,1.0,1.0
3,48.0,70.0,1.005,4.0,0.0,0.0,1.0,1.0,0.0,117.0,...,32.0,6700.0,3.9,1.0,0.0,0.0,0.0,1.0,1.0,1.0
4,51.0,80.0,1.01,2.0,0.0,0.0,0.0,0.0,0.0,106.0,...,35.0,7300.0,4.6,0.0,0.0,0.0,1.0,0.0,0.0,1.0


# Save the cleaned, fully imputed table as CSV and as an Excel workbook.
# The files are written relative to the working directory (the same place
# 'kidney.csv' is read from) rather than to a user-specific absolute Windows
# path, so the notebook also runs on other machines. index=False keeps the
# row index out of both files.
df2.to_csv('CleanD.csv', index=False)
df2.to_excel('CleanDX.xlsx', index=False)