In [None]:
# Import the required libraries

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings ("ignore")
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
sns.set()

In [None]:
# Location of the raw CSV. Set the KIDNEY_DISEASE_CSV environment variable to point the
# notebook at another copy of the file; the fallback is the original hard-coded location,
# so behaviour is unchanged when the variable is not set.
DATA_PATH = os.environ.get(
    'KIDNEY_DISEASE_CSV',
    '/Users/abhisheksenapati/Desktop/Machine Learning & Stats/ML_final_project/Health Care/Data & Data_Dictionary & releted/kidney disease/kidney_disease.csv',
)
df = pd.read_csv(DATA_PATH)

In [None]:
df.head()

In [None]:
df.shape

### Problem Statement-
###### -----------------------------------------------
###### This is a classification problem where the goal is to predict whether a patient has a kidney disease or not.
#### The dataset contains medical records of 400 patients
#### collected from healthcare data sources.
###### -----------------------------------------------
###### Each record represents one patient and includes 25 variables (features), such as:
###### - Age
###### - Blood Pressure (bp)
###### - Specific Gravity (sg)
###### - Albumin (al)
###### - Sugar (su)
###### - Red Blood Cells (rbc)
###### - Pus Cells (pc)
###### - Pus Cell Clumps (pcc)
###### - Bacteria (ba)
###### - and several other clinical and physiological attributes.
###### -----------------------------------------------
#### The objective is to:
##### 1. Analyze the data to understand relationships between features.
##### 2. Identify key factors associated with kidney disease.
##### 3. Build a predictive model that classifies whether a patient is likely to have chronic kidney disease (CKD) or not.


![Screenshot%202568-10-21%20at%2010.03.29%E2%80%AFPM.png](attachment:Screenshot%202568-10-21%20at%2010.03.29%E2%80%AFPM.png)

### Utilise automated EDA packages to comprehend the data->

In [None]:
# Compatibility shim: when this NumPy build has no `VisibleDeprecationWarning` attribute,
# alias it to the built-in `Warning` so the Sweetviz steps below do not fail on the missing
# name (presumably Sweetviz still references it -- TODO confirm against the installed version).
if not hasattr(np, "VisibleDeprecationWarning"):
    np.VisibleDeprecationWarning = Warning

import sweetviz as sv
# Profile the raw frame and write the automated EDA report to an HTML file.
report = sv.analyze(df)
report.show_html("EDA_sweetViz_kidney_eda.html")

In [None]:
df.columns

In [None]:
# Duplicates

df.duplicated().sum()

In [None]:
#missing value checks

df.isnull().sum().sum()

In [None]:
# Percentage of missing entries per column (two decimals), shown from most to least missing.
missing_data = df.isnull().sum().div(len(df)).mul(100).round(2)
missing_data.sort_values(ascending=False)

##### Note
- This is a healthcare dataset, and every record is valuable, so we do not drop rows or columns outright,
even when 25% or more of a column's values are missing. Instead, we handle the missing values
with an imputation method.

In [None]:
df.info()

In [None]:
# finding the unique value

for column_name in df.columns:
    header = f"**************************** {column_name}****************************"
    print(header)
    # Short star rule followed by one blank line.
    print('*'*25, end='\n\n')
    # Distinct raw values of the column, de-duplicated through a Python set.
    print(set(df[column_name].tolist()))

- The unique values above show that some features contain stray characters, such as a leading tab (`\t`) or a tab followed by a question mark (`\t?`). These values must be cleaned before modelling.
- Let's do it.

In [None]:
# cleaning all those features by removing unwanted object:

def _strip_if_text(value):
    """Return `value` without surrounding whitespace when it is a string; otherwise unchanged."""
    return value.strip() if isinstance(value, str) else value


# Category labels arrive with stray tabs/spaces ('ckd\t', '\tno', '\tyes', ' yes').
# Stripping whitespace normalises every such variant instead of listing them one by one.
for label_col in ('classification', 'cad', 'dm'):
    df[label_col] = df[label_col].map(_strip_if_text)

# rc / wc / pcv hold numbers stored as text ('\t8400', '\t6200', '\t43') plus a '\t?'
# placeholder. Strip the whitespace first, then replace the bare '?' placeholder with the
# mode of the remaining values, so dirty variants can neither skew nor become the mode.
# Non-string values pass through untouched, which keeps this cell safe to re-run after
# the columns have been cast to numeric dtypes.
text_number_modes = {}
for number_col in ('rc', 'wc', 'pcv'):
    stripped = df[number_col].map(_strip_if_text)
    valid_values = stripped[stripped != '?']
    text_number_modes[number_col] = valid_values.mode()[0]
    df[number_col] = stripped.replace('?', text_number_modes[number_col])

# Kept under their original names for any other cell that reads them.
mode_rc = text_number_modes['rc']
mode_wc = text_number_modes['wc']
mode_pcv = text_number_modes['pcv']

In [None]:
# recheck now whether it exists or not:

for column_name in df.columns:
    header = f"**************************** {column_name}****************************"
    print(header)
    # Short star rule followed by one blank line.
    print('*'*25, end='\n\n')
    # Distinct values after cleaning, de-duplicated through a Python set.
    print(set(df[column_name].tolist()))

In [None]:
df.dtypes

In [None]:
# Converting to appropriate data types:

# Target numeric dtype for each column that was read in as text.
target_dtypes = {'pcv': 'int64', 'wc': 'int64', 'rc': 'float64'}

# First pass: fill the gaps in each column with that column's most frequent value.
for col_name in target_dtypes:
    df[col_name] = df[col_name].fillna(df[col_name].mode()[0])

# Second pass: cast each (now gap-free) column to its numeric dtype.
for col_name, dtype_name in target_dtypes.items():
    df[col_name] = df[col_name].astype(dtype_name)

In [None]:
# separating object and int columns

# Column labels grouped by dtype: object (text) vs. int64/float64 (numeric).
object_col = df.select_dtypes(include=['object']).columns
numerical_col = df.select_dtypes(include=['int64', 'float64']).columns

# Heading, the object columns, then one blank line.
print('\nobject types columns', object_col, '', sep='\n')

# Heading followed by the numeric columns.
print('numerical types columns', numerical_col, sep='\n')

In [None]:
# handle missing value using simple imputer

from sklearn.impute import SimpleImputer

# Two imputers: most-frequent value for the object (categorical) columns,
# median for the numeric columns.
imp_mode = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imp_median = SimpleImputer(missing_values=np.nan, strategy='median')

# index=df.index keeps the original row labels, so the two imputed frames stay
# aligned with each other (and with df) when they are concatenated later.
df_imp1 = pd.DataFrame(imp_mode.fit_transform(df[object_col]),
                       columns=df[object_col].columns,
                       index=df.index)

# Fix: numeric columns are imputed with the median imputer. The mode imputer was
# applied here before, which left imp_median defined but unused.
df_imp2 = pd.DataFrame(imp_median.fit_transform(df[numerical_col]),
                       columns=df[numerical_col].columns,
                       index=df.index)

In [None]:
df_imp1

In [None]:
df_imp1.isnull().sum()

In [None]:
df_imp2

In [None]:
df_imp2.isnull().sum()

In [None]:
#remove id col

# Drop the identifier column by name rather than by position: re-running this cell is then
# a no-op, whereas the positional slice removed one more column on every run.
# NOTE(review): assumes the identifier column is named 'id' -- confirm against df.columns.
df_imp2 = df_imp2.drop(columns=['id'], errors='ignore')
df_imp2

In [None]:
# checking outlier

def distplot(col):
    """Show the distribution of one `df_imp2` column as a density histogram with a KDE curve.

    Uses `sns.histplot`, the replacement for the deprecated `sns.distplot`.

    Parameters
    ----------
    col : label of the `df_imp2` column to plot.
    """
    sns.histplot(df_imp2[col], kde=True, stat='density')
    plt.show()


# One figure per non-object column of df_imp2.
for i in df_imp2.select_dtypes(exclude='object').columns:
    distplot(i)

In [None]:
def boxplot(col):
    """Draw a red box plot with a thick yellow median line for one `df_imp2` column."""
    median_style = {'color': 'yellow', 'linewidth': 2.6}
    sns.boxplot(df_imp2[col], color='red', medianprops=median_style)
    plt.show()


# One figure per non-object column of df_imp2.
for column_name in df_imp2.select_dtypes(exclude='object').columns:
    boxplot(column_name)

In [None]:
#merging both table

# Show the columns of both imputed frames. The second label used to read `df_imp1_col`
# although it lists the columns of df_imp2.
print(f"df_imp1_col:{df_imp1.columns}\n{'-----'*20}\ndf_imp2_col:{df_imp2.columns}")

In [None]:
# The two frames share no column, so there is no key to merge on. No helper column is
# created here: pd.concat with axis=1 places the frames side by side, aligning rows on
# their index.

table_df=pd.concat([df_imp1,df_imp2],axis=1)
table_df

In [None]:
table_df.shape

In [None]:
table_df.columns

In [None]:
# Split the data into the target (y) and the feature matrix (x).

y = table_df['classification']
x = table_df.drop(columns=['classification'])

In [None]:
print(f"x-> shape : {x.shape}\ny-> shape{y.shape}")

In [None]:
x.columns

In [None]:
y

In [None]:
(y.value_counts(normalize=True))*100

In [None]:
# handling encoding concept

def classify_features(x, max_unique=10):
    """Group the columns of `x` by dtype and by number of distinct values.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame whose columns are to be grouped.
    max_unique : int, default 10
        Cardinality cut-off. A column with fewer than `max_unique` distinct values
        (as counted by `Series.nunique`) is treated as low-cardinality. The default
        reproduces the previously hard-coded threshold.

    Returns
    -------
    tuple of four lists of column labels, each in the column order of `x`:
        categorical_feature      -- object columns below the cut-off
        non_categorical_features -- object columns at or above the cut-off
        discrete_features        -- int64/float64 columns below the cut-off
        continuous_features      -- int64/float64 columns at or above the cut-off

    Columns of any other dtype are not placed in any list.
    """
    categorical_feature = []
    non_categorical_features = []
    discrete_features = []
    continuous_features = []

    for column in x.columns:
        column_dtype = x[column].dtypes

        if column_dtype == 'object':
            low_bucket, high_bucket = categorical_feature, non_categorical_features
        elif column_dtype in ['int64', 'float64']:
            low_bucket, high_bucket = discrete_features, continuous_features
        else:
            # Neither text nor int64/float64: left out, as before.
            continue

        if x[column].nunique() < max_unique:
            low_bucket.append(column)
        else:
            high_bucket.append(column)

    return categorical_feature, non_categorical_features, discrete_features, continuous_features

In [None]:
categorical,non_categorical,discreate,continous=classify_features(x)

In [None]:
# Print each feature group under its heading, followed by a 30-star rule.
feature_groups = (
    ("Categorical Features:", categorical),
    ("Non-Categorical Features :", non_categorical),
    ("Discrete Features:", discreate),
    ("Continuous Features:", continous),
)
for heading, names in feature_groups:
    print(f"{heading}\n{names}\n{'*'*30}")

In [None]:
# Frequency of every level of each categorical feature, separated by a star rule.
for feature_name in categorical:
    level_counts = x[feature_name].value_counts()
    print(level_counts)
    print('************************')

In [None]:
df_dummies=pd.get_dummies(x[categorical],
                         drop_first=True)

In [None]:
df_dummies

In [None]:
# replace categorical column with df dummies

df1=pd.concat([x.drop(columns=categorical),df_dummies],axis=1)

In [None]:
df1

In [None]:
# Convert the dummy indicators to integers (True -> 1, False -> 0).
# astype(int) keeps the frame's index and column labels, so the later column-wise
# concat with the remaining features stays row-aligned.

df_dummies1 = df_dummies.astype(int)

In [None]:
df_dummies1

In [None]:
df1=pd.concat([x.drop(columns=categorical),df_dummies1],axis=1)
df1.head()

In [None]:
df1.describe()

In [None]:
#1. pre processing done
#2. missing value done
#3. encoding done
#4. outlier not req. (as these are healthcare data ->),removing such records would erase genuine,
#   high risk patient cases->which are often the most important for prediction or diagnosis.
#5. feature scaling
#6. imbalance treatment

In [None]:
# splitting the data into train and test

In [None]:
x=df1
x

In [None]:
y

In [None]:
(y.value_counts(normalize=True))*100

In [None]:
# Encode the classification label as binary: 'notckd' -> 0, anything else -> 1.
# The encoding is derived from table_df rather than from `y` itself, so re-running this
# cell gives the same result. Comparing the already-encoded integer array with 'notckd'
# would match nothing and turn every label into 1.

y = np.where(table_df['classification'] == 'notckd', 0, 1)
y

In [None]:
pd.DataFrame(y).value_counts()

In [None]:
#split the data train and test:

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing, stratified on the label, with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
x_train.shape

In [None]:
x_test.shape

In [None]:
y_train.shape

In [None]:
y_test.shape

# Building Model:

In [None]:
# Import all candidate models; we will then check which one gives the best accuracy:

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,VotingClassifier,BaggingClassifier,AdaBoostClassifier,GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB,BernoulliNB

In [None]:
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report

In [None]:
list_model=[]
list_accuracy=[]

# Fixed seed for every estimator with a stochastic component, so that a fresh
# "Restart & Run All" reproduces the same accuracy table.
MODEL_SEED = 42


def _fit_and_score(estimator):
    """Fit `estimator` on the training split and score it on the test split.

    Reads x_train, y_train, x_test and y_test from the notebook namespace.

    Returns
    -------
    (fitted_estimator, test_predictions, test_accuracy)
    """
    fitted = estimator.fit(x_train, y_train)
    predictions = estimator.predict(x_test)
    return fitted, predictions, accuracy_score(y_test, predictions)


# Logistic regression
logit = LogisticRegression()
lr, y_pred_lr, accuracy_lr = _fit_and_score(logit)

# Decision tree
dtree = DecisionTreeClassifier(random_state=MODEL_SEED)
dt, y_pred_dt, accuracy_dt = _fit_and_score(dtree)

# Bagging classifier
bagging = BaggingClassifier(random_state=MODEL_SEED)
bg, y_pred_bg, accuracy_bg = _fit_and_score(bagging)

# Random forest
rforest = RandomForestClassifier(random_state=MODEL_SEED)
rf, y_pred_rf, accuracy_rf = _fit_and_score(rforest)

# AdaBoost
ada = AdaBoostClassifier(random_state=MODEL_SEED)
ab, y_pred_ab, accuracy_ab = _fit_and_score(ada)

# Gradient boosting
gdboost = GradientBoostingClassifier(random_state=MODEL_SEED)
gdb, y_pred_gdb, accuracy_gdb = _fit_and_score(gdboost)

# XGBoost
xgboost = XGBClassifier(random_state=MODEL_SEED)
xgb, y_pred_xgb, accuracy_xgb = _fit_and_score(xgboost)

# Support vector machine
svm = SVC()
sv, y_pred_svm, accuracy_svm = _fit_and_score(svm)

# k-nearest neighbours
knn = KNeighborsClassifier()
kn, y_pred_knn, accuracy_knn = _fit_and_score(knn)

# Gaussian naive Bayes
gnn = GaussianNB()
gn, y_pred_gnn, accuracy_gnn = _fit_and_score(gnn)

# Bernoulli naive Bayes
bnb = BernoulliNB()
bn, y_pred_bnb, accuracy_bnb = _fit_and_score(bnb)

# Voting classifier: hard (majority) vote over all eleven models above.
voting = VotingClassifier(
    estimators=[
        ('lr', logit),
        ('dt', dtree),
        ('bg', bagging),
        ('rf', rforest),
        ('ada', ada),
        ('gdb', gdboost),
        ('xgboost', xgboost),
        ('svm', svm),
        ('knn', knn),
        ('gnn', gnn),
        ('bnb', bnb),
    ],
    voting='hard',
)
model_voting, y_pred_voting, accuracy_voting = _fit_and_score(voting)

In [None]:
# Display names, test accuracies and estimator objects, all in the same model order.
models_name = [
    'LogisticRegg', 'DecisionTree', 'Bagging', 'Randomforest', 'Adaboost', 'GradBoosting',
    'XGBoost', 'SVM', 'KNN', 'GaussianNB', 'BernoulliNB', 'VotingClassifier',
]

accuracies = [
    accuracy_lr, accuracy_dt, accuracy_bg, accuracy_rf, accuracy_ab, accuracy_gdb,
    accuracy_xgb, accuracy_svm, accuracy_knn, accuracy_gnn, accuracy_bnb, accuracy_voting,
]

model = [logit, dtree, bagging, rforest, ada, gdboost, xgboost, svm, knn, gnn, bnb, voting]

# Leaderboard sorted from the most to the least accurate model.
df_accuracy = (
    pd.DataFrame({'MODEL_USED': models_name, 'ACCURACY': accuracies})
    .sort_values(by='ACCURACY', ascending=False)
)
df_accuracy.reset_index(drop=True)

In [None]:
# Bar chart of test accuracy per model, with the model names rotated to stay readable.
chart = sns.barplot(data=df_accuracy, x='MODEL_USED', y='ACCURACY')

tick_labels = chart.get_xticklabels()
chart.set_xticklabels(tick_labels, rotation=90)
plt.show()
chart

## **Choose the model and check for underfitting and overfitting**

**1. VOTING:**

In [None]:
# Predictions of the fitted voting ensemble on both splits, and their accuracies.
voting_modelEvaluation_train = voting.predict(x_train)
voting_modelEvaluation_test = voting.predict(x_test)
accuracy_voting_train = accuracy_score(y_train, voting_modelEvaluation_train)
accuracy_voting_test = accuracy_score(y_test, voting_modelEvaluation_test)

star_rule = '*'*50
print('\n')  # two blank lines
print(star_rule)
for score_label, score_value in (('Training accuracy: ', accuracy_voting_train),
                                 ('Test accuracy: ', accuracy_voting_test)):
    print(score_label, score_value)
    print()
    print(star_rule)

In [None]:
# Per-class precision / recall / F1 for the voting ensemble on both splits.
# The headings name what is printed (a classification report, not an accuracy), and the
# report starts on its own line so its column header stays aligned.
print('Training classification report:', classification_report(y_train, voting_modelEvaluation_train), sep='\n')
print('*'*50)
print('Test classification report:', classification_report(y_test, voting_modelEvaluation_test), sep='\n')

In [None]:
#cross validation for voting:

from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the voting ensemble on the training split.
# The mean is a cross-validated score measured on held-out folds, so it is labelled as
# such rather than as "training accuracy".
training = cross_val_score(voting, x_train, y_train, cv=10)
print('CV mean accuracy: ', training.mean())
print('*'*25)
print('cv std: ',training.std())
print('*'*25)
print('Test accuracy: ',accuracy_voting_test)


In [None]:
# Confusion matrix of the voting ensemble on the test split.
plt.figure(figsize=(3, 3))
cm = confusion_matrix(y_test, voting_modelEvaluation_test)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False,
            xticklabels=['Not CKD (0)', 'CKD (1)'], yticklabels=['Not CKD (0)', 'CKD (1)'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix-test')
plt.show()
print('*****************************************')
# Confusion matrix of the voting ensemble on the training split.
plt.figure(figsize=(3, 3))
cm2 = confusion_matrix(y_train,voting_modelEvaluation_train)
# Fix: draw the training matrix (cm2). The test matrix (cm) was plotted here before,
# so the "train" figure was a copy of the test figure.
sns.heatmap(cm2, annot=True, fmt='d', cmap='Blues', cbar=False,
            xticklabels=['Not CKD (0)', 'CKD (1)'], yticklabels=['Not CKD (0)', 'CKD (1)'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix-train')
plt.show()

### **2. GaussianNB:**

In [None]:
# Predictions of the fitted Gaussian NB model on both splits, and their accuracies.
gussianModeleval_train = gnn.predict(x_train)
gussianModeleval_test = gnn.predict(x_test)
accuracy_gussianModel_train = accuracy_score(y_train, gussianModeleval_train)
accuracy_gussianModel_test = accuracy_score(y_test, gussianModeleval_test)

star_rule = '*'*50
print('\n')  # two blank lines
print(star_rule)
for score_label, score_value in (('Training accuracy: ', accuracy_gussianModel_train),
                                 ('Test accuracy: ', accuracy_gussianModel_test)):
    print(score_label, score_value)
    print()
    print(star_rule)

In [None]:
# 10-fold cross-validation (cv=10) of the Gaussian NB estimator on the training split;
# report the mean and the standard deviation of the fold scores.
cv_gnb = cross_val_score(gnn, x_train, y_train, cv=10)
print("CV mean accuracy:", cv_gnb.mean())
print("CV std deviation:", cv_gnb.std())

In [None]:
# Per-class precision / recall / F1 for Gaussian NB on both splits.
# The headings name what is printed (a classification report, not an accuracy), and the
# report starts on its own line so its column header stays aligned.
print('Training classification report:', classification_report(y_train, gussianModeleval_train), sep='\n')
print('*'*50)
print('Test classification report:', classification_report(y_test, gussianModeleval_test), sep='\n')

In [None]:
# Confusion matrix of Gaussian NB on the test split.
plt.figure(figsize=(3, 3))
cm = confusion_matrix(y_test,gussianModeleval_test)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False,
            xticklabels=['Not CKD (0)', 'CKD (1)'], yticklabels=['Not CKD (0)', 'CKD (1)'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix-test')
plt.show()
print('*****************************************')
# Confusion matrix of Gaussian NB on the training split.
plt.figure(figsize=(3, 3))
cm2 = confusion_matrix(y_train,gussianModeleval_train)
# Fix: draw the training matrix (cm2). The test matrix (cm) was plotted here before,
# so the "train" figure was a copy of the test figure.
sns.heatmap(cm2, annot=True, fmt='d', cmap='Blues', cbar=False,
            xticklabels=['Not CKD (0)', 'CKD (1)'], yticklabels=['Not CKD (0)', 'CKD (1)'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix-train')
plt.show()

In [None]:
# Record the library versions used for this run.
import numpy as np
import sklearn
# Imported under an alias: the name `xgboost` already refers to the XGBClassifier
# instance created in the model-building cell, and a plain `import xgboost` would
# rebind it to the package.
import xgboost as xgboost_pkg

print(np.__version__)
print(sklearn.__version__)
print(xgboost_pkg.__version__)


- I have chosen the top two models in terms of accuracy: the voting classifier and Gaussian Naive Bayes. The voting classifier is preferred because it achieves 100% recall for CKD, meaning no CKD patients are missed, and this was confirmed with cross-validation. I will therefore take the voting classifier forward as the preferred model.

# **Prepare the processed data for the automated approach (PyCaret)**

In [None]:
df1

In [None]:
target_df1 = pd.DataFrame(y, columns=['classification'])

In [None]:
# Combine df1 and classification for PyCaret_data
cleaned_dataset = pd.concat([df1, target_df1], axis=1)
cleaned_dataset

In [None]:
cleaned_dataset.shape

In [None]:
# index=False: the row index is not data. Writing it would add an extra unnamed
# first column to the CSV, which a later reader would pick up as a feature.
cleaned_dataset.to_csv("kidney_dis_cleaneddata_pycarat.csv", index=False)