In [1]:
import numpy as np
import pandas as pd
import warnings
from warnings import filterwarnings
filterwarnings('ignore')

In [2]:
df= pd.read_csv('kidney_disease.csv')
df.head()

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,...,38,6000,,no,no,no,good,no,no,ckd
2,2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,...,31,7500,,no,yes,no,poor,no,yes,ckd
3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,...,35,7300,4.6,no,no,no,good,no,no,ckd


In [3]:
columns =pd.read_csv('data_description.txt',sep='-')
columns=columns.reset_index()

In [4]:
columns.columns=(['cols','ab_col_names'])
columns
df.columns = columns['ab_col_names'].values

In [5]:
df.drop('id',axis=1,inplace=True)

In [6]:
nf=[feature for feature in df.columns if df[feature].dtype!='O']
print("Numerical Features:{}".format(nf))
print("Count of Features:",len(nf))

Numerical Features:['age', 'blood pressure', 'specific gravity', 'albumin', 'sugar', 'blood glucose random', 'blood urea', 'serum creatinine', 'sodium', 'potassium', 'haemoglobin']
Count of Features: 11


In [7]:
cf=[feature for feature in df.columns if df[feature].dtype=='O']
print("Categorical Features:{}".format(cf))
print("Count of Features:",len(cf))

Categorical Features:['red blood cells', ' pus cell', 'pus cell clumps', 'bacteria', 'packed cell volume', 'white blood cell count', 'red blood cell count', 'ypertension', 'diabetes mellitus', 'coronary artery disease', 'appetite', 'pedal edema', 'anemia', 'class']
Count of Features: 14


In [8]:
for feature in cf:
    print('{}: {}'.format(feature,df[feature].unique()))

red blood cells: [nan 'normal' 'abnormal']
 pus cell: ['normal' 'abnormal' nan]
pus cell clumps: ['notpresent' 'present' nan]
bacteria: ['notpresent' 'present' nan]
packed cell volume: ['44' '38' '31' '32' '35' '39' '36' '33' '29' '28' nan '16' '24' '37' '30'
 '34' '40' '45' '27' '48' '\t?' '52' '14' '22' '18' '42' '17' '46' '23'
 '19' '25' '41' '26' '15' '21' '43' '20' '\t43' '47' '9' '49' '50' '53'
 '51' '54']
white blood cell count: ['7800' '6000' '7500' '6700' '7300' nan '6900' '9600' '12100' '4500'
 '12200' '11000' '3800' '11400' '5300' '9200' '6200' '8300' '8400' '10300'
 '9800' '9100' '7900' '6400' '8600' '18900' '21600' '4300' '8500' '11300'
 '7200' '7700' '14600' '6300' '\t6200' '7100' '11800' '9400' '5500' '5800'
 '13200' '12500' '5600' '7000' '11900' '10400' '10700' '12700' '6800'
 '6500' '13600' '10200' '9000' '14900' '8200' '15200' '5000' '16300'
 '12400' '\t8400' '10500' '4200' '4700' '10900' '8100' '9500' '2200'
 '12800' '11200' '19100' '\t?' '12300' '16700' '2600' '2640

In [9]:
#Cleaning data
df['diabetes mellitus'].replace(to_replace={'\tno':'no','\tyes':'yes',' yes':'yes'},inplace=True)
df['coronary artery disease'].replace(to_replace={'\tno':'no'},inplace=True)
df['class'].replace(to_replace={'ckd\t':'ckd'},inplace=True)
df['red blood cell count'].replace(to_replace={'\t?':'nan'},inplace=True)
df['white blood cell count'].replace(to_replace={'\t6200':'nan','\t8400':'nan','\t?':'nan'},inplace=True)
df['packed cell volume'].replace(to_replace={'\t?':'nan','\t43':'nan'},inplace=True)

In [10]:
def convert_type(df,feature):
    df[feature] = pd.to_numeric(df[feature],errors='coerce')

for feature in ['packed cell volume','white blood cell count','red blood cell count']:
    convert_type(df,feature)


Handling Missing Values

In [11]:
def replace_mode(feature):   #handling categorical values
    replace_with = df[feature].mode()[0]
    df[feature].fillna(replace_with,inplace=True)

In [12]:
####replacing null values with the most frequent one for the below lists
missing_list=['appetite','pedal edema','anemia','pus cell clumps','bacteria','ypertension','diabetes mellitus','coronary artery disease']

for feature in missing_list:
    replace_mode(feature)

In [13]:
df=df.fillna(df.median())
df

Unnamed: 0,age,blood pressure,specific gravity,albumin,sugar,red blood cells,pus cell,pus cell clumps,bacteria,blood glucose random,...,packed cell volume,white blood cell count,red blood cell count,ypertension,diabetes mellitus,coronary artery disease,appetite,pedal edema,anemia,class
0,48.0,80.0,1.020,1.0,0.0,,normal,notpresent,notpresent,121.0,...,44.0,7800.0,5.2,yes,yes,no,good,no,no,ckd
1,7.0,50.0,1.020,4.0,0.0,,normal,notpresent,notpresent,121.0,...,38.0,6000.0,4.8,no,no,no,good,no,no,ckd
2,62.0,80.0,1.010,2.0,3.0,normal,normal,notpresent,notpresent,423.0,...,31.0,7500.0,4.8,no,yes,no,poor,no,yes,ckd
3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,117.0,...,32.0,6700.0,3.9,yes,no,no,poor,yes,yes,ckd
4,51.0,80.0,1.010,2.0,0.0,normal,normal,notpresent,notpresent,106.0,...,35.0,7300.0,4.6,no,no,no,good,no,no,ckd
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,55.0,80.0,1.020,0.0,0.0,normal,normal,notpresent,notpresent,140.0,...,47.0,6700.0,4.9,no,no,no,good,no,no,notckd
396,42.0,70.0,1.025,0.0,0.0,normal,normal,notpresent,notpresent,75.0,...,54.0,7800.0,6.2,no,no,no,good,no,no,notckd
397,12.0,80.0,1.020,0.0,0.0,normal,normal,notpresent,notpresent,100.0,...,49.0,6600.0,5.4,no,no,no,good,no,no,notckd
398,17.0,60.0,1.025,0.0,0.0,normal,normal,notpresent,notpresent,114.0,...,51.0,7200.0,5.9,no,no,no,good,no,no,notckd


In [14]:
def Random_value_imputation(feature):#all the function is executed line by line above
    random_sample=df[feature].dropna().sample(df[feature].isnull().sum())
    random_sample.index=df[df[feature].isnull()].index
    df.loc[df[feature].isnull(),feature]=random_sample

In [15]:
Random_value_imputation(' pus cell')

Handling Categorical Values

In [16]:
### looking how many categories are there in each feature
for feature in cf:
    print('{}:\n{}\n'.format(feature,df[feature].value_counts()))

red blood cells:
normal      201
abnormal     47
Name: red blood cells, dtype: int64

 pus cell:
normal      310
abnormal     90
Name:  pus cell, dtype: int64

pus cell clumps:
notpresent    358
present        42
Name: pus cell clumps, dtype: int64

bacteria:
notpresent    378
present        22
Name: bacteria, dtype: int64

packed cell volume:
40.0    88
52.0    21
41.0    21
44.0    19
48.0    19
43.0    14
45.0    13
42.0    13
32.0    12
36.0    12
33.0    12
50.0    12
28.0    12
37.0    11
34.0    11
46.0     9
30.0     9
29.0     9
35.0     9
31.0     8
24.0     7
39.0     7
26.0     6
38.0     5
53.0     4
51.0     4
49.0     4
47.0     4
54.0     4
25.0     3
27.0     3
22.0     3
19.0     2
23.0     2
15.0     1
21.0     1
20.0     1
17.0     1
9.0      1
18.0     1
14.0     1
16.0     1
Name: packed cell volume, dtype: int64

white blood cell count:
8000.0     111
9800.0      11
6700.0      10
9200.0       9
9600.0       9
          ... 
12200.0      1
12700.0      1
16300.0 

In [17]:
from sklearn.preprocessing import LabelEncoder
lb=LabelEncoder()

In [18]:
for feature in cf:
    df[feature] = lb.fit_transform(df[feature])

In [19]:
df#0-->ckd, 1-->notckd

Unnamed: 0,age,blood pressure,specific gravity,albumin,sugar,red blood cells,pus cell,pus cell clumps,bacteria,blood glucose random,...,packed cell volume,white blood cell count,red blood cell count,ypertension,diabetes mellitus,coronary artery disease,appetite,pedal edema,anemia,class
0,48.0,80.0,1.020,1.0,0.0,2,1,0,0,121.0,...,31,35,30,1,1,0,0,0,0,0
1,7.0,50.0,1.020,4.0,0.0,2,1,0,0,121.0,...,25,19,26,0,0,0,0,0,0,0
2,62.0,80.0,1.010,2.0,3.0,1,1,0,0,423.0,...,18,33,26,0,1,0,1,0,1,0
3,48.0,70.0,1.005,4.0,0.0,1,0,1,0,117.0,...,19,25,17,1,0,0,1,1,1,0
4,51.0,80.0,1.010,2.0,0.0,1,1,0,0,106.0,...,22,31,24,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,55.0,80.0,1.020,0.0,0.0,1,1,0,0,140.0,...,34,25,27,0,0,0,0,0,0,1
396,42.0,70.0,1.025,0.0,0.0,1,1,0,0,75.0,...,41,35,40,0,0,0,0,0,0,1
397,12.0,80.0,1.020,0.0,0.0,1,1,0,0,100.0,...,36,24,32,0,0,0,0,0,0,1
398,17.0,60.0,1.025,0.0,0.0,1,1,0,0,114.0,...,38,30,37,0,0,0,0,0,0,1


Feature Selection

In [20]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [21]:
ind_col=[col for col in df.columns if col!='class']#independent variable
dep_col='class' #dependent variable

In [22]:
X=df[ind_col]
y=df[dep_col]

In [23]:
ordered_rank_features=SelectKBest(score_func=chi2,k=20)
ordered_feature=ordered_rank_features.fit(X,y)

In [24]:
ordered_feature.scores_

array([1.15859940e+02, 8.17867015e+01, 5.03531613e-03, 2.16000000e+02,
       9.48000000e+01, 7.79524752e+00, 1.38744086e+01, 2.52000000e+01,
       1.32000000e+01, 2.24165129e+03, 2.34309714e+03, 3.57792101e+02,
       2.75587488e+01, 2.95133869e+00, 1.23856342e+02, 4.61414632e+02,
       1.05379734e+02, 3.60743328e+02, 8.82000000e+01, 8.22000000e+01,
       2.04000000e+01, 4.92000000e+01, 4.56000000e+01, 3.60000000e+01])

In [25]:
datascore=pd.DataFrame(ordered_feature.scores_,columns=['Scores'])
dfcols=pd.DataFrame(X.columns)
features_rank=pd.concat([dfcols,datascore],axis=1)
features_rank.columns=['features','Score']
selected_features=features_rank.nlargest(10,'Score')['features'].values

In [26]:
X_new=df[selected_features]

Splitting Dataset

In [27]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.25, random_state=0)

In [28]:
from sklearn.linear_model import LogisticRegression
lr=LogisticRegression()

In [29]:
lr.fit(X_train,y_train)

In [34]:
X_test

Unnamed: 0,blood urea,blood glucose random,packed cell volume,red blood cell count,serum creatinine,albumin,haemoglobin,age,white blood cell count,sugar
132,176.0,219.0,11,5,13.8,0.0,8.60,50.0,77,0.0
309,25.0,129.0,27,37,1.2,0.0,17.20,51.0,38,0.0
341,37.0,130.0,28,25,0.9,0.0,13.40,63.0,31,0.0
196,158.0,129.0,11,13,11.8,3.0,8.10,49.0,51,0.0
246,215.0,106.0,13,3,15.2,3.0,8.60,48.0,9,0.0
...,...,...,...,...,...,...,...,...,...,...
146,23.0,213.0,27,26,1.0,1.0,12.65,53.0,37,3.0
135,24.0,214.0,26,26,1.3,0.0,13.20,48.0,37,2.0
390,25.0,99.0,39,31,0.8,0.0,15.00,52.0,21,0.0
264,24.0,132.0,37,23,0.7,0.0,14.40,42.0,9,0.0


In [30]:
y_pred=lr.predict(X_test)
y_pred

array([0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1,
       1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1])

In [31]:
from sklearn.metrics import confusion_matrix,accuracy_score
confusion_matrix(y_test,y_pred)

array([[60,  2],
       [ 2, 36]], dtype=int64)

In [32]:
accuracy_score(y_test,y_pred)

0.96

In [33]:
import pickle
pickle.dump(lr,open('models/model.pkl','wb'))

In [38]:
X_data =np.array([25,200,30,40,2,2,15,35,50,2])
x_reshaped=X_data[np.newaxis,:] #this line will add a new row vector, it converts to 2D
print(x_reshaped.shape)

(1, 10)


In [36]:
load_model=pickle.load(open('models/model.pkl','rb'))
print(load_model.predict(x_reshaped))

[0]
