In [38]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [39]:
df =pd.read_csv('kidney_disease.csv')
df.head()

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,...,38,6000,,no,no,no,good,no,no,ckd
2,2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,...,31,7500,,no,yes,no,poor,no,yes,ckd
3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,...,35,7300,4.6,no,no,no,good,no,no,ckd


In [40]:
columns= pd.read_csv('data_description.txt',sep='-')
columns=columns.reset_index()
columns

Unnamed: 0,index,1. Features:
0,id,id
1,age,age
2,bp,blood pressure
3,sg,specific gravity
4,al,albumin
5,su,sugar
6,rbc,red blood cells
7,pc,pus cell
8,pcc,pus cell clumps
9,ba,bacteria


In [41]:
columns.columns=(['cols','ab_col_names'])
columns
df.columns = columns['ab_col_names'].values

In [42]:
df.drop('id',axis=1,inplace=True)

In [43]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 25 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   age                      391 non-null    float64
 1   blood pressure           388 non-null    float64
 2   specific gravity         353 non-null    float64
 3   albumin                  354 non-null    float64
 4   sugar                    351 non-null    float64
 5   red blood cells          248 non-null    object 
 6    pus cell                335 non-null    object 
 7   pus cell clumps          396 non-null    object 
 8   bacteria                 396 non-null    object 
 9   blood glucose random     356 non-null    float64
 10  blood urea               381 non-null    float64
 11  serum creatinine         383 non-null    float64
 12  sodium                   313 non-null    float64
 13  potassium                312 non-null    float64
 14  haemoglobin              3

nf=[feature for feature in df.columns if df[feature].dtype!='O']
print("Numerical Features:{}".format(nf))
print("Count of Features:",len(nf))

cf=[feature for feature in df.columns if df[feature].dtype=='O']
print("Categorical Features:{}".format(cf))
print("Count of Features:",len(cf))

for feature in cf:
    print('{}: {}'.format(feature,df[feature].unique()))

In [44]:
#Cleaning data
df['diabetes mellitus'].replace(to_replace={'\tno':'no','\tyes':'yes',' yes':'yes'},inplace=True)
df['coronary artery disease'].replace(to_replace={'\tno':'no'},inplace=True)
df['class'].replace(to_replace={'ckd\t':'ckd'},inplace=True)
df['red blood cell count'].replace(to_replace={'\t?':'nan'},inplace=True)
df['white blood cell count'].replace(to_replace={'\t6200':'nan','\t8400':'nan','\t?':'nan'},inplace=True)
df['packed cell volume'].replace(to_replace={'\t?':'nan','\t43':'nan'},inplace=True)

In [45]:
def convert_type(df,feature):
    df[feature] = pd.to_numeric(df[feature],errors='coerce')

In [46]:
for feature in ['packed cell volume','white blood cell count','red blood cell count']:
    convert_type(df,feature)

# Handling Missing Values
df.isnull().sum()

In [47]:
def replace_mode(feature):   #handling categorical values
    replace_with = df[feature].mode()[0]
    df[feature].fillna(replace_with,inplace=True)

In [48]:
####replacing null values with the most frequent one for the below lists
missing_list=['appetite','pedal edema','anemia','pus cell clumps','bacteria','ypertension','diabetes mellitus','coronary artery disease']

for feature in missing_list:
    replace_mode(feature)

df.describe()

for feature in nf:
    if 0 in df[feature].unique():
        pass
    else:
#         df[feature]=np.log(df[feature])
        df.boxplot(column=feature)
        plt.xlabel(feature)
        plt.ylabel(feature)
        plt.show()

import seaborn as sns
for feature in nf:
#     df[feature]=np.log(df[feature])
    sns.displot(df[feature])
    plt.xlabel(feature)
    plt.ylabel(feature)
    plt.show()

In [49]:
# we are filling missing value using median as seen above in distplot(the distribution is skewed) and boxplot(features has outliers) so hence considering the conditions we are using median


In [50]:
df=df.fillna(df.median())
df

  df=df.fillna(df.median())


Unnamed: 0,age,blood pressure,specific gravity,albumin,sugar,red blood cells,pus cell,pus cell clumps,bacteria,blood glucose random,...,packed cell volume,white blood cell count,red blood cell count,ypertension,diabetes mellitus,coronary artery disease,appetite,pedal edema,anemia,class
0,48.0,80.0,1.020,1.0,0.0,,normal,notpresent,notpresent,121.0,...,44.0,7800.0,5.2,yes,yes,no,good,no,no,ckd
1,7.0,50.0,1.020,4.0,0.0,,normal,notpresent,notpresent,121.0,...,38.0,6000.0,4.8,no,no,no,good,no,no,ckd
2,62.0,80.0,1.010,2.0,3.0,normal,normal,notpresent,notpresent,423.0,...,31.0,7500.0,4.8,no,yes,no,poor,no,yes,ckd
3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,117.0,...,32.0,6700.0,3.9,yes,no,no,poor,yes,yes,ckd
4,51.0,80.0,1.010,2.0,0.0,normal,normal,notpresent,notpresent,106.0,...,35.0,7300.0,4.6,no,no,no,good,no,no,ckd
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,55.0,80.0,1.020,0.0,0.0,normal,normal,notpresent,notpresent,140.0,...,47.0,6700.0,4.9,no,no,no,good,no,no,notckd
396,42.0,70.0,1.025,0.0,0.0,normal,normal,notpresent,notpresent,75.0,...,54.0,7800.0,6.2,no,no,no,good,no,no,notckd
397,12.0,80.0,1.020,0.0,0.0,normal,normal,notpresent,notpresent,100.0,...,49.0,6600.0,5.4,no,no,no,good,no,no,notckd
398,17.0,60.0,1.025,0.0,0.0,normal,normal,notpresent,notpresent,114.0,...,51.0,7200.0,5.9,no,no,no,good,no,no,notckd


df.isnull().sum()

df['red blood cells'].dropna().sample()

df['red blood cells'].isnull().sum()

# randomily selecting values for nan/missing values 
random_sample=df['red blood cells'].dropna().sample(df['red blood cells'].isnull().sum())
random_sample

df[df['red blood cells'].isnull()].index

random_sample.index

#this step is used so that the index should be same for the both
random_sample.index=df[df['red blood cells'].isnull()].index

random_sample.index

random_sample

df.loc[df['red blood cells'].isnull(),'red blood cells']=random_sample

In [51]:
def Random_value_imputation(feature):#all the function is executed line by line above
    random_sample=df[feature].dropna().sample(df[feature].isnull().sum())
    random_sample.index=df[df[feature].isnull()].index
    df.loc[df[feature].isnull(),feature]=random_sample

In [52]:
Random_value_imputation(' pus cell')

df.isnull().sum()

# Handling Categorical Values
### looking how many categories are there in each feature
for feature in cf:
    print('{}:\n{}\n'.format(feature,df[feature].value_counts()))

In [53]:
from sklearn.preprocessing import LabelEncoder

In [54]:
lb=LabelEncoder()
for feature in ['red blood cells', ' pus cell', 'pus cell clumps', 'bacteria', 'ypertension', 'diabetes mellitus', 'coronary artery disease', 'appetite', 'pedal edema', 'anemia', 'class']:
    df[feature] = lb.fit_transform(df[feature])
#     data= pd.get_dummies(data,columns=['red blood cells', ' pus cell', 'pus cell clumps', 'bacteria', 'ypertension', 'diabetes mellitus', 'coronary artery disease', 'appetite', 'pedal edema', 'anemia', 'class'],drop_first=True)

df   #0-->ckd, 1-->notckd

for feature in df.columns:
#     df[feature] = np.log(df[feature])
    sns.displot(df[feature])
    plt.xlabel(feature)
    plt.ylabel(feature)
    plt.show()

plt.figure(figsize=(18,9))
sns.heatmap(df.corr(),annot=True)

plt.figure(figsize=(18,20))

for i,feature in enumerate(nf):
    plt.subplot(4,4,i+1)
    df[feature].hist()
    plt.title(feature)

plt.figure(figsize=(18,20))

for i,feature in enumerate(cf):
    plt.subplot(5,3,i+1)
    sns.countplot(df[feature])

In [55]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [56]:
ind_col=[col for col in df.columns if col!='class']
dep_col='class'

In [57]:
X=df[ind_col]
y=df[dep_col]

In [58]:
ordered_rank_features=SelectKBest(score_func=chi2,k=20)
ordered_feature=ordered_rank_features.fit(X,y)

In [59]:
ordered_feature

ordered_feature.scores_

In [60]:
datascore=pd.DataFrame(ordered_feature.scores_,columns=['Scores'])

In [61]:
datascore

Unnamed: 0,Scores
0,115.85994
1,81.786701
2,0.005035
3,216.0
4,94.8
5,7.795248
6,12.307692
7,25.2
8,13.2
9,2241.651289


In [62]:
dfcols=pd.DataFrame(X.columns)
dfcols

Unnamed: 0,0
0,age
1,blood pressure
2,specific gravity
3,albumin
4,sugar
5,red blood cells
6,pus cell
7,pus cell clumps
8,bacteria
9,blood glucose random


In [63]:
features_rank=pd.concat([dfcols,datascore],axis=1)
features_rank

Unnamed: 0,0,Scores
0,age,115.85994
1,blood pressure,81.786701
2,specific gravity,0.005035
3,albumin,216.0
4,sugar,94.8
5,red blood cells,7.795248
6,pus cell,12.307692
7,pus cell clumps,25.2
8,bacteria,13.2
9,blood glucose random,2241.651289


In [64]:
features_rank.columns=['features','Score']
features_rank

Unnamed: 0,features,Score
0,age,115.85994
1,blood pressure,81.786701
2,specific gravity,0.005035
3,albumin,216.0
4,sugar,94.8
5,red blood cells,7.795248
6,pus cell,12.307692
7,pus cell clumps,25.2
8,bacteria,13.2
9,blood glucose random,2241.651289


In [65]:
features_rank.nlargest(10,'Score')

Unnamed: 0,features,Score
16,white blood cell count,9814.515378
10,blood urea,2343.097145
9,blood glucose random,2241.651289
11,serum creatinine,357.792101
15,packed cell volume,308.893581
3,albumin,216.0
14,haemoglobin,123.856342
0,age,115.85994
4,sugar,94.8
18,ypertension,88.2


In [66]:
selected_features=features_rank.nlargest(10,'Score')['features'].values
selected_features

array(['white blood cell count', 'blood urea', 'blood glucose random',
       'serum creatinine', 'packed cell volume', 'albumin', 'haemoglobin',
       'age', 'sugar', 'ypertension'], dtype=object)

In [67]:
X_new=df[selected_features]
X_new.head()

Unnamed: 0,white blood cell count,blood urea,blood glucose random,serum creatinine,packed cell volume,albumin,haemoglobin,age,sugar,ypertension
0,7800.0,36.0,121.0,1.2,44.0,1.0,15.4,48.0,0.0,1
1,6000.0,18.0,121.0,0.8,38.0,4.0,11.3,7.0,0.0,0
2,7500.0,53.0,423.0,1.8,31.0,2.0,9.6,62.0,3.0,0
3,6700.0,56.0,117.0,3.8,32.0,4.0,11.2,48.0,0.0,1
4,7300.0,26.0,106.0,1.4,35.0,2.0,11.6,51.0,0.0,0


In [68]:
X_new.shape

(400, 10)

In [69]:
# Splitting Dataset
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.25, random_state=0)

X_train.shape,X_test.shape

y_train.value_counts()

In [70]:
#Training Model
from xgboost import XGBClassifier
xg_classifier=XGBClassifier()

params={
    'learning_rate':[0.05,0.20,0.25],
    'max_depth':[5,8,10],
    'min_child_weight':[1,3,5,7],
    'gamma':[0.0,0.1,0.2,0.4],
    'colsample_bytree':[0.3,0.4,0.7]
    
}

from sklearn.model_selection import RandomizedSearchCV

random_search=RandomizedSearchCV(xg_classifier,n_iter=5,scoring='roc_auc',n_jobs=-1,cv=5,verbose=3)

random_search.fit(X_train,y_train)

print(random_search.best_estimator_)

random_search.best_params_

classifier=XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.4, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=0.0, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.05, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=10, max_leaves=None,
              min_child_weight=1, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None)

In [71]:
xg_classifier.fit(X_train,y_train)

In [72]:
y_pred=xg_classifier.predict(X_test)

In [73]:
print("Mean squared error between test and predicted=",np.mean(y_pred-y_test)**2)

Mean squared error between test and predicted= 0.0001


from sklearn.metrics import confusion_matrix,accuracy_score

confusion_matrix(y_test,y_pred)

accuracy_score(y_test,y_pred)

In [74]:
import pickle
pickle.dump(xg_classifier,open('models/model.pkl','wb'))

X_test

In [75]:
# a=np.arange(5*5).reshape(5,5)
# a.shape
# a5= a[np.newaxis,np.newaxis]
# a5



In [76]:
X_data =np.array([37.0,130.0,28,25,0.9,0.0,13.40,63.0,31,0.0])
x_reshaped=X_data[np.newaxis,:] #this line will add a new row vector, it converts to 2D
print(x_reshaped.shape)


(1, 10)


In [77]:
load_model=pickle.load(open('models/model.pkl','rb'))
print(load_model.predict(x_reshaped))

[0]
