<h1>Importing Libraries</h1>

In [38]:

import pandas as pd #used for data manipulation
import numpy as np #used for numerical analysis
from collections import Counter as c #returns counts of classes
import matplotlib.pyplot as plt #used for data visualization
import seaborn as sns #used for data visualization
import missingno as msno #finding missing values
from sklearn.metrics import accuracy_score, confusion_matrix #model performance
from sklearn.model_selection import train_test_split #Splits data in random train and test array
from sklearn.preprocessing import LabelEncoder #encoding the levels of categorical featues
from sklearn.linear_model import LogisticRegression #Classification ML  Algorithm
import pickle #python object hirearchy is converted into a byte stream


<h1>Loading Dataset</h1>

In [39]:
# Loading the dataset
data = pd.read_csv(r"Datasets/chronickidneydisease.csv")

# returns first 10 rows
data.head(10)

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,...,44,7800.0,5.2,yes,yes,no,good,no,no,ckd
1,1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,...,38,6000.0,,no,no,no,good,no,no,ckd
2,2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,...,31,7500.0,,no,yes,no,poor,no,yes,ckd
3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,...,32,6700.0,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,...,35,7300.0,4.6,no,no,no,good,no,no,ckd
5,5,60.0,90.0,1.015,3.0,0.0,,,notpresent,notpresent,...,39,7800.0,4.4,yes,yes,no,good,yes,no,ckd
6,6,68.0,70.0,1.01,0.0,0.0,,normal,notpresent,notpresent,...,36,,,no,no,no,good,no,no,ckd
7,7,24.0,,1.015,2.0,4.0,normal,abnormal,notpresent,notpresent,...,44,6900.0,5.0,no,yes,no,good,yes,no,ckd
8,8,52.0,100.0,1.015,3.0,0.0,normal,abnormal,present,notpresent,...,33,9600.0,4.0,yes,yes,no,good,no,yes,ckd
9,9,53.0,90.0,1.02,2.0,0.0,abnormal,abnormal,present,notpresent,...,29,12100.0,3.7,yes,yes,no,poor,no,yes,ckd


In [40]:
#Drop is used for dropping the column
data.drop(['id'],axis=1,inplace=True)

<h2>Renaming the columns</h2>

In [41]:
#return all the column names
data.columns

Index(['age', 'bp', 'sg', 'al', 'su', 'rbc', 'pc', 'pcc', 'ba', 'bgr', 'bu',
       'sc', 'sod', 'pot', 'hemo', 'pcv', 'wc', 'rc', 'htn', 'dm', 'cad',
       'appet', 'pe', 'ane', 'classification'],
      dtype='object')

In [42]:
#manually giving the names of the columns
data.columns = ['age','blood_pressure','specific_gravity','albumin','sugar','red_blood_cells','pus_cell',
                'pus_cell_clumps','bacteria','blood glucose random','blood_urea','serum_creatinine','sodium','potassium','hemoglobin','packed_cell_volume','white_blood_cell_count','red_blood_cell_count','hypertension','diabetesmellitus','coronary_artery_disease','appetite','pedal_edema','anemia','class']
data.columns

Index(['age', 'blood_pressure', 'specific_gravity', 'albumin', 'sugar',
       'red_blood_cells', 'pus_cell', 'pus_cell_clumps', 'bacteria',
       'blood glucose random', 'blood_urea', 'serum_creatinine', 'sodium',
       'potassium', 'hemoglobin', 'packed_cell_volume',
       'white_blood_cell_count', 'red_blood_cell_count', 'hypertension',
       'diabetesmellitus', 'coronary_artery_disease', 'appetite',
       'pedal_edema', 'anemia', 'class'],
      dtype='object')

In [43]:
#info will give the summary of the dataset
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 25 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   age                      391 non-null    float64
 1   blood_pressure           388 non-null    float64
 2   specific_gravity         353 non-null    float64
 3   albumin                  354 non-null    float64
 4   sugar                    351 non-null    float64
 5   red_blood_cells          248 non-null    object 
 6   pus_cell                 335 non-null    object 
 7   pus_cell_clumps          396 non-null    object 
 8   bacteria                 396 non-null    object 
 9   blood glucose random     356 non-null    float64
 10  blood_urea               381 non-null    float64
 11  serum_creatinine         383 non-null    float64
 12  sodium                   313 non-null    float64
 13  potassium                312 non-null    float64
 14  hemoglobin               3

<h1>Traget Column</h1>

In [44]:
#find the unique elements of the array
data['class'].unique()


array(['ckd', 'ckd\t', 'notckd'], dtype=object)

<h1>Rectifying the target column</h1>

In [45]:
#replace is used of renaming
data['class'] = data['class'].replace("ckd\t","ckd")
data['class'].unique()

array(['ckd', 'notckd'], dtype=object)

In [46]:
#only fetch the object  type columns
catcols = set(data.dtypes[data.dtypes=='O'].index.values)
print(catcols)

{'diabetesmellitus', 'coronary_artery_disease', 'hypertension', 'anemia', 'class', 'pus_cell_clumps', 'pus_cell', 'bacteria', 'red_blood_cells', 'packed_cell_volume', 'pedal_edema', 'red_blood_cell_count', 'white_blood_cell_count', 'appetite'}


In [47]:
for i in catcols:
    print("Columns :",i)
    print(c(data[i])) #using counter for checking the no of classes in the column
    print('*'*120+'\n')

Columns : diabetesmellitus
Counter({'no': 258, 'yes': 134, '\tno': 3, '\tyes': 2, nan: 2, ' yes': 1})
************************************************************************************************************************

Columns : coronary_artery_disease
Counter({'no': 362, 'yes': 34, '\tno': 2, nan: 2})
************************************************************************************************************************

Columns : hypertension
Counter({'no': 251, 'yes': 147, nan: 2})
************************************************************************************************************************

Columns : anemia
Counter({'no': 339, 'yes': 60, nan: 1})
************************************************************************************************************************

Columns : class
Counter({'ckd': 250, 'notckd': 150})
************************************************************************************************************************

Columns : pus_cell_clumps
Cou

<h1>Removing the columns which are not categorical</h1>

In [48]:
#remove is used for removing the column
catcols.remove('red_blood_cell_count')
catcols.remove('packed_cell_volume')
catcols.remove('white_blood_cell_count')
print(catcols)


{'diabetesmellitus', 'coronary_artery_disease', 'hypertension', 'anemia', 'class', 'pus_cell_clumps', 'pus_cell', 'bacteria', 'red_blood_cells', 'pedal_edema', 'appetite'}


In [49]:
# only fetch the float and int type columns
contcols = set(data.dtypes[data.dtypes!='O'].index.values)
print(contcols)

{'serum_creatinine', 'specific_gravity', 'blood_pressure', 'blood_urea', 'sodium', 'blood glucose random', 'hemoglobin', 'potassium', 'albumin', 'sugar', 'age'}


In [50]:
for i in contcols:
    print("Continous Columns:",i)
    print(c(data[i]))#using counter for checking the number of classes in the column
    print('*'*120+'\n')

Continous Columns: serum_creatinine
Counter({1.2: 40, 1.1: 24, 1.0: 23, 0.5: 23, 0.7: 22, 0.9: 22, 0.6: 18, 0.8: 17, 2.2: 10, 1.5: 9, 1.7: 9, 1.3: 8, 1.6: 8, 1.8: 7, 1.4: 7, 2.5: 7, 2.8: 7, 1.9: 6, 2.7: 5, 2.1: 5, 2.0: 5, 3.2: 5, 3.3: 5, 3.9: 4, 7.3: 4, 4.0: 3, 2.4: 3, 3.4: 3, 2.9: 3, 5.3: 3, 2.3: 3, 7.2: 2, 4.6: 2, 4.1: 2, 5.2: 2, 6.3: 2, 3.0: 2, 6.1: 2, 6.7: 2, 5.6: 2, 6.5: 2, 4.4: 2, 6.0: 2, 3.8: 1, 24.0: 1, 9.6: 1, 76.0: 1, 7.7: 1, nan: 1, 10.8: 1, 5.9: 1, 3.25: 1, nan: 1, 9.7: 1, 6.4: 1, 32.0: 1, nan: 1, nan: 1, 8.5: 1, 15.0: 1, 3.6: 1, 10.2: 1, 11.5: 1, nan: 1, 12.2: 1, 9.2: 1, 13.8: 1, 16.9: 1, 7.1: 1, 18.0: 1, 13.0: 1, 48.1: 1, 14.2: 1, 16.4: 1, nan: 1, nan: 1, 2.6: 1, 7.5: 1, 4.3: 1, 18.1: 1, 11.8: 1, 9.3: 1, 6.8: 1, 13.5: 1, nan: 1, 12.8: 1, 11.9: 1, nan: 1, nan: 1, nan: 1, 12.0: 1, nan: 1, 13.4: 1, 15.2: 1, 13.3: 1, nan: 1, nan: 1, nan: 1, nan: 1, nan: 1, 0.4: 1})
****************************************************************************************************************

<h1>Removing the columns which are not numerical</h1>

In [51]:
contcols.remove('specific_gravity')
contcols.remove('albumin')
contcols.remove('sugar')
print(contcols)

{'serum_creatinine', 'blood_pressure', 'blood_urea', 'sodium', 'blood glucose random', 'hemoglobin', 'potassium', 'age'}


<h1>Adding columns which we found continous</h1>

In [52]:
#using add we can add columns
contcols.add('red_blood_cell_count')
contcols.add('packed_cell_volume')
contcols.add('white_blood_cell_count')
print(contcols)

{'serum_creatinine', 'blood_pressure', 'blood_urea', 'sodium', 'blood glucose random', 'hemoglobin', 'potassium', 'packed_cell_volume', 'red_blood_cell_count', 'white_blood_cell_count', 'age'}


<h1>Adding columns which we found categorical</h1>


In [53]:
catcols.add('specific_gravity')
catcols.add('albumin')
catcols.add('sugar')
print(catcols)

{'diabetesmellitus', 'coronary_artery_disease', 'specific_gravity', 'hypertension', 'anemia', 'class', 'pus_cell_clumps', 'pus_cell', 'bacteria', 'red_blood_cells', 'albumin', 'pedal_edema', 'sugar', 'appetite'}


<h1>Rectifying the categorical column classes</h1>

In [54]:
data['coronary_artery_disease']=data.coronary_artery_disease.replace('\tno','no')
c(data['coronary_artery_disease'])

Counter({'no': 364, 'yes': 34, nan: 2})

In [55]:
data['diabetesmellitus']=data.diabetesmellitus.replace(to_replace={'\tno':'no','\tyes':'yes',' yes':'yes'})
c(data['diabetesmellitus'])

Counter({'yes': 137, 'no': 261, nan: 2})

<h1>Null Values</h1>

In [56]:
# it will return if any null vales values present
data.isnull().any()

age                         True
blood_pressure              True
specific_gravity            True
albumin                     True
sugar                       True
red_blood_cells             True
pus_cell                    True
pus_cell_clumps             True
bacteria                    True
blood glucose random        True
blood_urea                  True
serum_creatinine            True
sodium                      True
potassium                   True
hemoglobin                  True
packed_cell_volume          True
white_blood_cell_count      True
red_blood_cell_count        True
hypertension                True
diabetesmellitus            True
coronary_artery_disease     True
appetite                    True
pedal_edema                 True
anemia                      True
class                      False
dtype: bool

In [57]:
#returns the count
data.isnull().count()

age                        400
blood_pressure             400
specific_gravity           400
albumin                    400
sugar                      400
red_blood_cells            400
pus_cell                   400
pus_cell_clumps            400
bacteria                   400
blood glucose random       400
blood_urea                 400
serum_creatinine           400
sodium                     400
potassium                  400
hemoglobin                 400
packed_cell_volume         400
white_blood_cell_count     400
red_blood_cell_count       400
hypertension               400
diabetesmellitus           400
coronary_artery_disease    400
appetite                   400
pedal_edema                400
anemia                     400
class                      400
dtype: int64

In [58]:
data.packed_cell_volume = pd.to_numeric(data.packed_cell_volume , errors='coerce')
data.red_blood_cell_count = pd.to_numeric(data.red_blood_cell_count  , errors='coerce')
data.white_blood_cell_count = pd.to_numeric(data.white_blood_cell_count , errors='coerce')

<h1>Handling Continous/Numerical Columns Missing Values</h1>

In [59]:
data['blood glucose random'].fillna(data['blood glucose random'].mean(),inplace = True)
data['blood_pressure'].fillna(data['blood_pressure'].mean(),inplace = True)
data['blood_urea'].fillna(data['blood_urea'].mean(),inplace = True)
data['hemoglobin'].fillna(data['hemoglobin'].mean(),inplace = True)
data['packed_cell_volume'].fillna(data['packed_cell_volume'].mean(),inplace = True)
data['potassium'].fillna(data['potassium'].mean(),inplace = True)
data['red_blood_cell_count'].fillna(data['red_blood_cell_count'].mean(),inplace = True)
data['serum_creatinine'].fillna(data['serum_creatinine'].mean(),inplace = True)
data['sodium'].fillna(data['sodium'].mean(),inplace = True)
data['white_blood_cell_count'].fillna(data['white_blood_cell_count'].mean(),inplace = True)


In [60]:
data['age'].fillna(data['age'].mode()[0], inplace=True)
data['specific_gravity'].fillna(data['specific_gravity'].mode()[0], inplace=True)
data['albumin'].fillna(data['albumin'].mode()[0], inplace=True)
data['sugar'].fillna(data['sugar'].mode()[0], inplace=True)
data['red_blood_cells'].fillna(data['red_blood_cells'].mode()[0], inplace=True)
data['pus_cell'].fillna(data['pus_cell'].mode()[0], inplace=True)
data['pus_cell_clumps'].fillna(data['pus_cell_clumps'].mode()[0], inplace=True)
data['bacteria'].fillna(data['bacteria'].mode()[0], inplace=True)
data['blood glucose random'].fillna(data['blood glucose random'].mode()[0], inplace=True)
data['hypertension'].fillna(data['hypertension'].mode()[0], inplace=True)
data['diabetesmellitus'].fillna(data['diabetesmellitus'].mode()[0], inplace=True)
data['coronary_artery_disease'].fillna(data['coronary_artery_disease'].mode()[0], inplace=True)
data['appetite'].fillna(data['appetite'].mode()[0], inplace=True)
data['pedal_edema'].fillna(data['pedal_edema'].mode()[0], inplace=True)
data['anemia'].fillna(data['anemia'].mode()[0], inplace=True)
data['class'].fillna(data['class'].mode()[0], inplace=True)

In [61]:
data.isna().sum()

age                        0
blood_pressure             0
specific_gravity           0
albumin                    0
sugar                      0
red_blood_cells            0
pus_cell                   0
pus_cell_clumps            0
bacteria                   0
blood glucose random       0
blood_urea                 0
serum_creatinine           0
sodium                     0
potassium                  0
hemoglobin                 0
packed_cell_volume         0
white_blood_cell_count     0
red_blood_cell_count       0
hypertension               0
diabetesmellitus           0
coronary_artery_disease    0
appetite                   0
pedal_edema                0
anemia                     0
class                      0
dtype: int64

<h1>Label encoding</h1>

In [62]:
#importing label encoding from sklearn
from sklearn.preprocessing import LabelEncoder

for i in catcols:  #looping through all the categorical columns
    print("LABEL ENCODING OF :",i)
    LEi = LabelEncoder() #creating an object of label encoder
    print(c(data[i]))  #getting the classes values before transformation
    data[i] = LEi.fit_transform(data[i]) #transforming our text classes to numerical values
    print(c(data[i]))  #geting class values after transformation
    print('*'*100)


LABEL ENCODING OF : diabetesmellitus
Counter({'no': 263, 'yes': 137})
Counter({0: 263, 1: 137})
****************************************************************************************************
LABEL ENCODING OF : coronary_artery_disease
Counter({'no': 366, 'yes': 34})
Counter({0: 366, 1: 34})
****************************************************************************************************
LABEL ENCODING OF : specific_gravity
Counter({1.02: 153, 1.01: 84, 1.025: 81, 1.015: 75, 1.005: 7})
Counter({3: 153, 1: 84, 4: 81, 2: 75, 0: 7})
****************************************************************************************************
LABEL ENCODING OF : hypertension
Counter({'no': 253, 'yes': 147})
Counter({0: 253, 1: 147})
****************************************************************************************************
LABEL ENCODING OF : anemia
Counter({'no': 340, 'yes': 60})
Counter({0: 340, 1: 60})
******************************************************************************

In [63]:
features_name = ['blood_urea','blood glucose random','coronary_artery_disease','anemia','pus_cell',
    'red_blood_cells','diabetesmellitus','pedal_edema']
x = pd.DataFrame(data, columns = features_name)
y = pd.DataFrame(data, columns = ['class'])
print(x.shape)
print(y.shape)

(400, 8)
(400, 1)


In [64]:
data.isna().sum()

age                        0
blood_pressure             0
specific_gravity           0
albumin                    0
sugar                      0
red_blood_cells            0
pus_cell                   0
pus_cell_clumps            0
bacteria                   0
blood glucose random       0
blood_urea                 0
serum_creatinine           0
sodium                     0
potassium                  0
hemoglobin                 0
packed_cell_volume         0
white_blood_cell_count     0
red_blood_cell_count       0
hypertension               0
diabetesmellitus           0
coronary_artery_disease    0
appetite                   0
pedal_edema                0
anemia                     0
class                      0
dtype: int64

In [65]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2, random_state=2)
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(320, 8)
(320, 1)
(80, 8)
(80, 1)


In [66]:
from sklearn.linear_model import LogisticRegression

lgr = LogisticRegression(solver='lbfgs', max_iter=1000)

lgr.fit(x_train.values,y_train.values.ravel())

In [67]:
y_pred = lgr.predict(x_test)



In [73]:
y_pred1 =lgr.predict([[90,157,1,0,0,1,1,1]])
print(y_pred1)
c(y_pred)

[0]


Counter({0: 48, 1: 32})

In [69]:
accuracy_score(y_test,y_pred)

0.925

<h1>Confusion matrix of our model</h1>

In [70]:
conf_mat = confusion_matrix(y_test,y_pred)
conf_mat

array([[48,  6],
       [ 0, 26]], dtype=int64)

In [71]:
pickle.dump(lgr, open('CKD.pkl','wb'))