In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score , train_test_split
from sklearn.metrics import classification_report

In [2]:
data = pd.read_csv('kidney_disease.csv')

In [3]:
data.head()

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,...,38,6000,,no,no,no,good,no,no,ckd
2,2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,...,31,7500,,no,yes,no,poor,no,yes,ckd
3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,...,35,7300,4.6,no,no,no,good,no,no,ckd


In [4]:
data.columns

Index(['id', 'age', 'bp', 'sg', 'al', 'su', 'rbc', 'pc', 'pcc', 'ba', 'bgr',
       'bu', 'sc', 'sod', 'pot', 'hemo', 'pcv', 'wc', 'rc', 'htn', 'dm', 'cad',
       'appet', 'pe', 'ane', 'classification'],
      dtype='object')

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 26 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              400 non-null    int64  
 1   age             391 non-null    float64
 2   bp              388 non-null    float64
 3   sg              353 non-null    float64
 4   al              354 non-null    float64
 5   su              351 non-null    float64
 6   rbc             248 non-null    object 
 7   pc              335 non-null    object 
 8   pcc             396 non-null    object 
 9   ba              396 non-null    object 
 10  bgr             356 non-null    float64
 11  bu              381 non-null    float64
 12  sc              383 non-null    float64
 13  sod             313 non-null    float64
 14  pot             312 non-null    float64
 15  hemo            348 non-null    float64
 16  pcv             330 non-null    object 
 17  wc              295 non-null    obj

In [6]:
data.describe()

Unnamed: 0,id,age,bp,sg,al,su,bgr,bu,sc,sod,pot,hemo
count,400.0,391.0,388.0,353.0,354.0,351.0,356.0,381.0,383.0,313.0,312.0,348.0
mean,199.5,51.483376,76.469072,1.017408,1.016949,0.450142,148.036517,57.425722,3.072454,137.528754,4.627244,12.526437
std,115.614301,17.169714,13.683637,0.005717,1.352679,1.099191,79.281714,50.503006,5.741126,10.408752,3.193904,2.912587
min,0.0,2.0,50.0,1.005,0.0,0.0,22.0,1.5,0.4,4.5,2.5,3.1
25%,99.75,42.0,70.0,1.01,0.0,0.0,99.0,27.0,0.9,135.0,3.8,10.3
50%,199.5,55.0,80.0,1.02,0.0,0.0,121.0,42.0,1.3,138.0,4.4,12.65
75%,299.25,64.5,80.0,1.02,2.0,0.0,163.0,66.0,2.8,142.0,4.9,15.0
max,399.0,90.0,180.0,1.025,5.0,5.0,490.0,391.0,76.0,163.0,47.0,17.8


In [7]:
NumericalColumns = ['age','bp','al','su','bgr','bu','sc','sod','pot','hemo']
CategoricalColumns = ['rbc','sg','pc','pcc','ba','pcv','wc','rc','htn','dm','cad','appet','pe','ane']

# Missing Data

In [9]:
data.isnull().sum()

id                  0
age                 9
bp                 12
sg                 47
al                 46
su                 49
rbc               152
pc                 65
pcc                 4
ba                  4
bgr                44
bu                 19
sc                 17
sod                87
pot                88
hemo               52
pcv                70
wc                105
rc                130
htn                 2
dm                  2
cad                 2
appet               1
pe                  1
ane                 1
classification      0
dtype: int64

In [10]:
data['classification'].replace("ckd\t","ckd",inplace=True)

data['dm'].replace(["\tno","\tyes"," yes"],["no","yes","yes"],inplace=True)

data['cad'].replace(["\tno"],["no"],inplace=True)

In [11]:
data['rc'].replace("\t?",np.nan, inplace=True)

data.wc.replace("\t?",np.nan, inplace=True)

data['pcv'].replace(["\t?","\t43"],np.nan, inplace=True)

In [12]:
for columnName in CategoricalColumns:
  data[columnName].fillna(data[columnName].mode()[0], inplace=True)

In [13]:
for columnName in NumericalColumns:
  data[columnName].fillna(data[columnName].mean(), inplace=True)

In [14]:
data.isnull().sum()

id                0
age               0
bp                0
sg                0
al                0
su                0
rbc               0
pc                0
pcc               0
ba                0
bgr               0
bu                0
sc                0
sod               0
pot               0
hemo              0
pcv               0
wc                0
rc                0
htn               0
dm                0
cad               0
appet             0
pe                0
ane               0
classification    0
dtype: int64

# Dummy encode

In [15]:
encodeColumn = ['rbc','pc' ,'pcc' ,'ba' ,'htn' ,'dm' ,'cad' ,'appet' ,'pe' ,'ane']
data = pd.get_dummies(data , columns=encodeColumn , prefix=encodeColumn , drop_first=True)

In [16]:
data['classification'].replace(["ckd","notckd"],[1,0], inplace=True)

In [17]:
data.head()

Unnamed: 0,id,age,bp,sg,al,su,bgr,bu,sc,sod,...,rbc_normal,pc_normal,pcc_present,ba_present,htn_yes,dm_yes,cad_yes,appet_poor,pe_yes,ane_yes
0,0,48.0,80.0,1.02,1.0,0.0,121.0,36.0,1.2,137.528754,...,1,1,0,0,1,1,0,0,0,0
1,1,7.0,50.0,1.02,4.0,0.0,148.036517,18.0,0.8,137.528754,...,1,1,0,0,0,0,0,0,0,0
2,2,62.0,80.0,1.01,2.0,3.0,423.0,53.0,1.8,137.528754,...,1,1,0,0,0,1,0,1,0,1
3,3,48.0,70.0,1.005,4.0,0.0,117.0,56.0,3.8,111.0,...,1,0,1,0,1,0,0,1,1,1
4,4,51.0,80.0,1.01,2.0,0.0,106.0,26.0,1.4,137.528754,...,1,1,0,0,0,0,0,0,0,0


In [18]:
data.classification.value_counts()

1    250
0    150
Name: classification, dtype: int64

# MinMax

In [19]:
X = data.loc[:, data.columns != 'classification']
y = data['classification']

In [20]:
min_max = MinMaxScaler()
min_max.fit(X)
X = min_max.transform(X)

# Train Test Split

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [22]:
log = LogisticRegression()
cv_v = cross_val_score(log, X_train, y_train, cv=4)
print(cv_v)
print(cv_v.std())
print(cv_v.mean())

[1.    0.975 1.    1.   ]
0.010825317547305492
0.99375


In [23]:
log.fit(X_train,y_train)

LogisticRegression()

In [24]:
y_pred = log.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.97      1.00      0.98        28
           1       1.00      0.98      0.99        52

    accuracy                           0.99        80
   macro avg       0.98      0.99      0.99        80
weighted avg       0.99      0.99      0.99        80

