In [57]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_curve, auc, confusion_matrix, classification_report,accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
import pickle

In [58]:
df = pd.read_csv("kidney_disease.csv")
df.head()

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,...,38,6000,,no,no,no,good,no,no,ckd
2,2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,...,31,7500,,no,yes,no,poor,no,yes,ckd
3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,...,35,7300,4.6,no,no,no,good,no,no,ckd


In [59]:
df[['htn','dm','cad','pe','ane']] = df[['htn','dm','cad','pe','ane']].replace(to_replace={'yes':1,'no':0})
df[['rbc','pc']] = df[['rbc','pc']].replace(to_replace={'abnormal':1,'normal':0})
df[['pcc','ba']] = df[['pcc','ba']].replace(to_replace={'present':1,'notpresent':0})
df[['appet']] = df[['appet']].replace(to_replace={'good':1,'poor':0,'no':np.nan})
df['classification'] = df['classification'].replace(to_replace={'ckd':1.0,'ckd\t':1.0,'notckd':0.0,'no':0.0})
df.rename(columns={'classification':'class'},inplace=True)
df.head()

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,class
0,0,48.0,80.0,1.02,1.0,0.0,,0.0,0.0,0.0,...,44,7800,5.2,1.0,1,0,1.0,0.0,0.0,1.0
1,1,7.0,50.0,1.02,4.0,0.0,,0.0,0.0,0.0,...,38,6000,,0.0,0,0,1.0,0.0,0.0,1.0
2,2,62.0,80.0,1.01,2.0,3.0,0.0,0.0,0.0,0.0,...,31,7500,,0.0,1,0,0.0,0.0,1.0,1.0
3,3,48.0,70.0,1.005,4.0,0.0,0.0,1.0,1.0,0.0,...,32,6700,3.9,1.0,0,0,0.0,1.0,1.0,1.0
4,4,51.0,80.0,1.01,2.0,0.0,0.0,0.0,0.0,0.0,...,35,7300,4.6,0.0,0,0,1.0,0.0,0.0,1.0


In [60]:
# Further cleaning
df['pe'] = df['pe'].replace(to_replace='good',value=0) # Not having pedal edema is good
df['appet'] = df['appet'].replace(to_replace='no',value=0)
df['cad'] = df['cad'].replace(to_replace='\tno',value=0)
df['dm'] = df['dm'].replace(to_replace={'\tno':0,'\tyes':1,' yes':1, '':np.nan})
df.drop('id',axis=1,inplace=True)

In [61]:
df = df.dropna(axis=0)
df.columns

Index(['age', 'bp', 'sg', 'al', 'su', 'rbc', 'pc', 'pcc', 'ba', 'bgr', 'bu',
       'sc', 'sod', 'pot', 'hemo', 'pcv', 'wc', 'rc', 'htn', 'dm', 'cad',
       'appet', 'pe', 'ane', 'class'],
      dtype='object')

In [62]:
x = df.drop(['class'], axis = 1)
y = df['class']
cols = x.columns
vals = list(x.iloc[0])

In [71]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size = 0.2, random_state = 5)
feature_scaler = MinMaxScaler()
x_train = feature_scaler.fit_transform(x_train)
x_test = feature_scaler.transform(x_test)

In [15]:
tuned_parameters = [{'n_estimators':[7,8,9,10,11,12,13,14,15,16],'max_depth':[2,3,4,5,6,None],
                     'class_weight':[None,{0: 0.33,1:0.67},'balanced'],'random_state':[42]}]
clf = GridSearchCV(RandomForestClassifier(), tuned_parameters, cv=10,scoring='f1')
clf.fit(x_train, y_train)
clf.best_score_

1.0

In [16]:
clf.best_params_

{'class_weight': None, 'max_depth': 2, 'n_estimators': 8, 'random_state': 42}

In [18]:
print("Detailed classification report:")
y_true, lr_pred = y_test, clf.predict(x_test)
print(classification_report(y_true, lr_pred))

Detailed classification report:
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00        23
         1.0       1.00      1.00      1.00         9

    accuracy                           1.00        32
   macro avg       1.00      1.00      1.00        32
weighted avg       1.00      1.00      1.00        32



In [19]:
x_train[2]

array([0.41891892, 0.2       , 0.75      , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.09285714,
       0.18686869, 0.04878049, 0.76923077, 0.05393258, 0.7704918 ,
       0.84210526, 0.12217195, 0.48148148, 0.        , 0.        ,
       0.        , 1.        , 0.        , 0.        ])

In [23]:
pickle.dump(clf, open('model.pkl','wb'))

In [25]:
model = pickle.load(open('model.pkl','rb'))

In [73]:
print(len(cols))

24


In [74]:
a = {}
for i,j in zip(cols,vals):
    a[i] = j
a

{'age': 68.0,
 'bp': 60.0,
 'sg': 1.025,
 'al': 0.0,
 'su': 0.0,
 'rbc': 0.0,
 'pc': 0.0,
 'pcc': 0.0,
 'ba': 0.0,
 'bgr': 125.0,
 'bu': 41.0,
 'sc': 1.1,
 'sod': 139.0,
 'pot': 3.8,
 'hemo': 17.4,
 'pcv': '50',
 'wc': '6700',
 'rc': '6.1',
 'htn': 0.0,
 'dm': 0.0,
 'cad': 0.0,
 'appet': 1.0,
 'pe': 0.0,
 'ane': 0.0}