## Load Data

In [1]:
import pandas as pd
import numpy as np
# Read the raw CSV and discard the `id`, `pcv`, `wc` and `rc` columns in one
# expression, so `data` is always rebuilt from the file when this cell re-runs.
# NOTE(review): why these four columns are excluded is not recorded here - confirm.
data = pd.read_csv('kidney_disease.csv').drop(columns=['id', 'pcv', 'wc', 'rc'])
data.head()

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,sod,pot,hemo,htn,dm,cad,appet,pe,ane,classification
0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,121.0,...,,,15.4,yes,yes,no,good,no,no,ckd
1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,,...,,,11.3,no,no,no,good,no,no,ckd
2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,423.0,...,,,9.6,no,yes,no,poor,no,yes,ckd
3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,117.0,...,111.0,2.5,11.2,yes,no,no,poor,yes,yes,ckd
4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,106.0,...,,,11.6,no,no,no,good,no,no,ckd


In [2]:
# Summarise each column's dtype and non-null count to see where values are missing.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 22 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             391 non-null    float64
 1   bp              388 non-null    float64
 2   sg              353 non-null    float64
 3   al              354 non-null    float64
 4   su              351 non-null    float64
 5   rbc             248 non-null    object 
 6   pc              335 non-null    object 
 7   pcc             396 non-null    object 
 8   ba              396 non-null    object 
 9   bgr             356 non-null    float64
 10  bu              381 non-null    float64
 11  sc              383 non-null    float64
 12  sod             313 non-null    float64
 13  pot             312 non-null    float64
 14  hemo            348 non-null    float64
 15  htn             398 non-null    object 
 16  dm              398 non-null    object 
 17  cad             398 non-null    obj

## Import Models and Train on the Split Data

In [3]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split,StratifiedShuffleSplit

In [4]:
# Encode the target as integers: 0 for 'notckd', 1 for every other value
# (missing labels therefore also end up as 1, exactly as before).
#
# The dtype guard makes the cell safe to re-run: once the column holds 0/1,
# comparing it with 'notckd' again would turn every label into 1.
# Surrounding whitespace is stripped first so a padded 'notckd' is still
# recognised as the negative class.
if not pd.api.types.is_numeric_dtype(data['classification']):
    data['classification'] = (
        data['classification']
        .astype(str)
        .str.strip()
        .ne('notckd')
        .astype('int64')
    )

In [5]:
# Features are every column except the label; the label is copied so that `y`
# does not share storage with `data`.
X = data.drop(columns=["classification"])
y = data["classification"].copy()

# Hold out half of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=6,
)

In [6]:
# List the distinct label values that ended up in the held-out split.
y_test.unique()

array([1, 0], dtype=int64)

In [7]:
# Partition the feature columns by dtype so each group can get its own
# preprocessing. `select_dtypes` is the public pandas API for this; the
# underscore-prefixed `_get_numeric_data` helper is private and may change.
num = X.select_dtypes(include="number").columns
print(num)

# Every column that is not numeric is treated as categorical, in X's column order.
cat = [column for column in X.columns if column not in num]
print(cat)

Index(['age', 'bp', 'sg', 'al', 'su', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hemo'], dtype='object')
['rbc', 'pc', 'pcc', 'ba', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane']


In [8]:
# Numeric columns: fill gaps with the column median, then standardise.
numeric_features = num
numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]
)

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode into a dense array. Categories unseen during fit are ignored at
# transform time instead of raising.
categorical_features = cat
try:
    # Current scikit-learn releases name the dense-output switch `sparse_output`.
    onehot_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:
    # Older releases only accept the original `sparse` keyword.
    onehot_encoder = OneHotEncoder(handle_unknown="ignore", sparse=False)
categorical_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="most_frequent")), ("encoder", onehot_encoder)]
)

# Route each column group through its own transformer and concatenate the results.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)



In [9]:
# Linear-kernel SVM stacked on the shared preprocessing. `fit` returns the
# pipeline itself, so construction and training are chained.
clf = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("classifier", svm.SVC(kernel='linear')),
    ]
).fit(X_train, y_train)

# Mean accuracy on the held-out split.
print("model score: %.3f" % clf.score(X_test, y_test))

model score: 0.980


In [10]:
# Entropy-based decision tree, capped at depth 6, on the shared preprocessing.
# `random_state` is pinned so the fitted tree (and the score printed below) is
# the same on every run; an unseeded tree may break ties between equally good
# splits differently from run to run.
clf = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        (
            "classifier",
            DecisionTreeClassifier(criterion="entropy", max_depth=6, random_state=6),
        ),
    ]
)

clf.fit(X_train, y_train)
# Mean accuracy on the held-out split.
print("model score: %.3f" % clf.score(X_test, y_test))

model score: 0.975


In [11]:
# Random forest with 270 trees on the shared preprocessing.
# `random_state` is pinned because bootstrap sampling and feature sub-sampling
# are random; without a seed the fitted forest and its score change from run
# to run.
clf = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", RandomForestClassifier(n_estimators=270, random_state=6)),
    ]
)

clf.fit(X_train, y_train)
# Mean accuracy on the held-out split.
print("model score: %.3f" % clf.score(X_test, y_test))

model score: 0.995


In [12]:
# Predict labels for the held-out rows with whichever pipeline `clf` currently
# refers to. `clf` is reassigned by each model cell, so this uses the most
# recently fitted one.
predictions = clf.predict(X_test)

## Checking Model Accuracy

In [13]:
# Distinct values of the `htn` column across the full dataset; missing entries
# show up as nan.
data['htn'].unique()

array(['yes', 'no', nan], dtype=object)

In [14]:
# Peek at the first five training rows.
X_train.head()

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,sc,sod,pot,hemo,htn,dm,cad,appet,pe,ane
273,47.0,80.0,1.02,0.0,0.0,normal,normal,notpresent,notpresent,95.0,...,0.9,140.0,4.1,,no,no,no,good,no,no
321,65.0,60.0,1.02,0.0,0.0,normal,normal,notpresent,notpresent,109.0,...,1.0,144.0,3.5,13.9,no,no,no,good,no,no
329,33.0,60.0,1.025,0.0,0.0,normal,normal,notpresent,notpresent,80.0,...,0.9,146.0,3.5,14.1,no,no,no,good,no,no
247,54.0,90.0,1.025,1.0,0.0,normal,abnormal,notpresent,notpresent,150.0,...,1.2,140.0,4.2,,no,no,no,poor,yes,yes
170,66.0,70.0,1.015,2.0,5.0,,normal,notpresent,notpresent,447.0,...,1.7,131.0,3.9,12.5,yes,yes,no,good,no,no


In [15]:
# Distinct values of the `appet` column within the training split only.
X_train['appet'].unique()

array(['good', 'poor'], dtype=object)

In [16]:
# Dtypes and non-null counts for the training features, showing how much
# imputation each column needs.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 200 entries, 273 to 394
Data columns (total 21 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   age     199 non-null    float64
 1   bp      194 non-null    float64
 2   sg      178 non-null    float64
 3   al      180 non-null    float64
 4   su      177 non-null    float64
 5   rbc     122 non-null    object 
 6   pc      170 non-null    object 
 7   pcc     198 non-null    object 
 8   ba      198 non-null    object 
 9   bgr     177 non-null    float64
 10  bu      192 non-null    float64
 11  sc      192 non-null    float64
 12  sod     162 non-null    float64
 13  pot     162 non-null    float64
 14  hemo    176 non-null    float64
 15  htn     199 non-null    object 
 16  dm      199 non-null    object 
 17  cad     199 non-null    object 
 18  appet   200 non-null    object 
 19  pe      200 non-null    object 
 20  ane     200 non-null    object 
dtypes: float64(11), object(10)
memory usa

In [17]:
# Share of held-out rows whose predicted label matches the true label,
# expressed as a percentage.
from sklearn.metrics import accuracy_score
100 * accuracy_score(y_test, predictions)

99.5

## Saving Model

In [18]:
# Persist the fitted pipeline held in `clf` (preprocessor + classifier) to
# 'Kidney.joblib' in the current working directory.
# NOTE: joblib files are pickle-based; only `load` files from a trusted source.
from joblib import dump, load
dump(clf, 'Kidney.joblib')

['Kidney.joblib']