In [54]:
# Importing necessary libraries
# NumPy provides support for large, multi-dimensional arrays and matrices.
import numpy as np
# Pandas provides support for data manipulation and analysis.
import pandas as pd
# Matplotlib.pyplot is a plotting library in Python that provides a MATLAB-like interface for creating visualizations.
import matplotlib.pyplot as plt

In [55]:
# Load the dataset
data = pd.read_csv("CKD.csv")

In [56]:
# Convert categorical variables into dummy/indicator variables
data = pd.get_dummies(data, drop_first=True)
print(data.head(4))

   age         bp   al   su         bgr         bu        sc         sod  \
0  2.0  76.459948  3.0  0.0  148.112676  57.482105  3.077356  137.528754   
1  3.0  76.459948  2.0  0.0  148.112676  22.000000  0.700000  137.528754   
2  4.0  76.459948  1.0  0.0   99.000000  23.000000  0.600000  138.000000   
3  5.0  76.459948  1.0  0.0  148.112676  16.000000  0.700000  138.000000   

        pot       hrmo  ...  pc_normal  pcc_present  ba_present  htn_yes  \
0  4.627244  12.518156  ...          0            0           0        0   
1  4.627244  10.700000  ...          1            0           0        0   
2  4.400000  12.000000  ...          1            0           0        0   
3  3.200000   8.100000  ...          1            0           0        0   

   dm_yes  cad_yes  appet_yes  pe_yes  ane_yes  classification_yes  
0       0        0          1       1        0                   1  
1       0        0          1       0        0                   1  
2       0        0          1  

In [57]:
# Create the dependent variable (target)
dependent = data[["classification_yes"]]

In [58]:
# Check for missing values
print(data.isnull().sum())

age                   0
bp                    0
al                    0
su                    0
bgr                   0
bu                    0
sc                    0
sod                   0
pot                   0
hrmo                  0
pcv                   0
wc                    0
rc                    0
sg_b                  0
sg_c                  0
sg_d                  0
sg_e                  0
rbc_normal            0
pc_normal             0
pcc_present           0
ba_present            0
htn_yes               0
dm_yes                0
cad_yes               0
appet_yes             0
pe_yes                0
ane_yes               0
classification_yes    0
dtype: int64


In [59]:
# Create the independent variables (features)
independent = data[['age', 'bp', 'al', 'su', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hrmo', 'pcv',
       'wc', 'rc', 'sg_b', 'sg_c', 'sg_d', 'sg_e', 'rbc_normal', 'pc_normal',
       'pcc_present', 'ba_present', 'htn_yes', 'dm_yes', 'cad_yes',
       'appet_yes', 'pe_yes', 'ane_yes']]
independent.columns

Index(['age', 'bp', 'al', 'su', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hrmo', 'pcv',
       'wc', 'rc', 'sg_b', 'sg_c', 'sg_d', 'sg_e', 'rbc_normal', 'pc_normal',
       'pcc_present', 'ba_present', 'htn_yes', 'dm_yes', 'cad_yes',
       'appet_yes', 'pe_yes', 'ane_yes'],
      dtype='object')

In [60]:
# Importing train_test_split from sklearn.model_selection
# This function is used for splitting the data into training and testing sets
from sklearn.model_selection import train_test_split

In [61]:
# Splitting data into training and testing sets
# Train and test split ratio is 70% and 30% respectively
X_train, X_test, y_train, y_test = train_test_split(independent, dependent, test_size=0.30, random_state=0)

In [62]:
# Importing Gaussian Naive Bayes classifier from scikit-learn
from sklearn.naive_bayes import GaussianNB  

# Initializing the Gaussian Naive Bayes classifier
classifier = GaussianNB()

# Training the classifier using training data
classifier.fit(X_train, y_train)

# Making predictions on the test data
y_pred = classifier.predict(X_test)

# Importing confusion matrix metric from scikit-learn
from sklearn.metrics import confusion_matrix  

# Calculating the confusion matrix to evaluate the performance of the classifier
cm = confusion_matrix(y_test, y_pred)

# Printing the confusion matrix
print(cm)

# Importing classification report metric from scikit-learn
from sklearn.metrics import classification_report  

# Generating a classification report to summarize the performance of the classifier
clf_report = classification_report(y_test, y_pred)

# Printing the classification report
print(clf_report)

# Importing roc_curve and roc_auc_score metrics from scikit-learn
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt

# Calculating the probabilities of positive class (class 1) predictions
y_prob = classifier.predict_proba(X_test)[:, 1]

# Calculating the false positive rate, true positive rate, and thresholds for the ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_prob)

# Calculating the Area Under the Curve (AUC) score
roc_auc= roc_auc_score(y_test, y_prob)

# Printing the AUC score
print("ROC AUC Score::", roc_auc)


[[45  0]
 [ 2 73]]
              precision    recall  f1-score   support

           0       0.96      1.00      0.98        45
           1       1.00      0.97      0.99        75

    accuracy                           0.98       120
   macro avg       0.98      0.99      0.98       120
weighted avg       0.98      0.98      0.98       120

ROC AUC Score:: 1.0


  y = column_or_1d(y, warn=True)


In [63]:
# Importing Multinomial Naive Bayes classifier from scikit-learn
from sklearn.naive_bayes import MultinomialNB  

# Initializing the Multinomial Naive Bayes classifier
classifier = MultinomialNB()

# Training the classifier using training data
classifier.fit(X_train, y_train)

# Making predictions on the test data
y_pred = classifier.predict(X_test)

# Importing confusion matrix metric from scikit-learn
from sklearn.metrics import confusion_matrix  

# Calculating the confusion matrix to evaluate the performance of the classifier
cm = confusion_matrix(y_test, y_pred)

# Printing the confusion matrix
print(cm)

# Importing classification report metric from scikit-learn
from sklearn.metrics import classification_report  

# Generating a classification report to summarize the performance of the classifier
clf_report = classification_report(y_test, y_pred)

# Printing the classification report
print(clf_report)

# Calculating the Area Under the Curve (AUC) score
roc_auc= roc_auc_score(y_test, y_prob)

# Printing the AUC score
print("ROC AUC Score::", roc_auc)


[[44  1]
 [22 53]]
              precision    recall  f1-score   support

           0       0.67      0.98      0.79        45
           1       0.98      0.71      0.82        75

    accuracy                           0.81       120
   macro avg       0.82      0.84      0.81       120
weighted avg       0.86      0.81      0.81       120

ROC AUC Score:: 1.0


  y = column_or_1d(y, warn=True)


In [64]:
# Importing Bernoulli Naive Bayes classifier from scikit-learn
from sklearn.naive_bayes import BernoulliNB  

# Initializing the Bernoulli Naive Bayes classifier
classifier = BernoulliNB()

# Training the classifier using training data
classifier.fit(X_train, y_train)

# Making predictions on the test data
y_pred = classifier.predict(X_test)

# Importing confusion matrix metric from scikit-learn
from sklearn.metrics import confusion_matrix  

# Calculating the confusion matrix to evaluate the performance of the classifier
cm = confusion_matrix(y_test, y_pred)

# Printing the confusion matrix
print(cm)

# Importing classification report metric from scikit-learn
from sklearn.metrics import classification_report  

# Generating a classification report to summarize the performance of the classifier
clf_report = classification_report(y_test, y_pred)

# Printing the classification report
print(clf_report)

# Calculating the Area Under the Curve (AUC) score
roc_auc= roc_auc_score(y_test, y_prob)

# Printing the AUC score
print("ROC AUC Score::", roc_auc)



[[45  0]
 [ 8 67]]
              precision    recall  f1-score   support

           0       0.85      1.00      0.92        45
           1       1.00      0.89      0.94        75

    accuracy                           0.93       120
   macro avg       0.92      0.95      0.93       120
weighted avg       0.94      0.93      0.93       120

ROC AUC Score:: 1.0


  y = column_or_1d(y, warn=True)


In [65]:
# Importing Complement Naive Bayes classifier from scikit-learn
from sklearn.naive_bayes import ComplementNB  

# Initializing the Complement Naive Bayes classifier
classifier = ComplementNB()

# Training the classifier using training data
classifier.fit(X_train, y_train)

# Making predictions on the test data
y_pred = classifier.predict(X_test)

# Importing confusion matrix metric from scikit-learn
from sklearn.metrics import confusion_matrix  

# Calculating the confusion matrix to evaluate the performance of the classifier
cm = confusion_matrix(y_test, y_pred)

# Printing the confusion matrix
print(cm)

# Importing classification report metric from scikit-learn
from sklearn.metrics import classification_report  

# Generating a classification report to summarize the performance of the classifier
clf_report = classification_report(y_test, y_pred)

# Printing the classification report
print(clf_report)

# Calculating the Area Under the Curve (AUC) score
roc_auc= roc_auc_score(y_test, y_prob)

# Printing the AUC score
print("ROC AUC Score::", roc_auc)



[[44  1]
 [22 53]]
              precision    recall  f1-score   support

           0       0.67      0.98      0.79        45
           1       0.98      0.71      0.82        75

    accuracy                           0.81       120
   macro avg       0.82      0.84      0.81       120
weighted avg       0.86      0.81      0.81       120

ROC AUC Score:: 1.0


  y = column_or_1d(y, warn=True)


In [66]:
# Importing Categorical Naive Bayes classifier from scikit-learn
from sklearn.naive_bayes import CategoricalNB  

# Initializing the Categorical Naive Bayes classifier
classifier = CategoricalNB()

# Training the classifier using training data
classifier.fit(X_train, y_train)

# Making predictions on the test data
#y_pred = classifier.predict(X_test)

# Importing confusion matrix metric from scikit-learn
from sklearn.metrics import confusion_matrix  

# Calculating the confusion matrix to evaluate the performance of the classifier
cm = confusion_matrix(y_test, y_pred)

# Printing the confusion matrix
print(cm)

# Importing classification report metric from scikit-learn
from sklearn.metrics import classification_report  

# Generating a classification report to summarize the performance of the classifier
clf_report = classification_report(y_test, y_pred)

# Printing the classification report
print(clf_report)

# Calculating the Area Under the Curve (AUC) score
roc_auc= roc_auc_score(y_test, y_prob)

# Printing the AUC score
print("ROC AUC Score::", roc_auc)



[[44  1]
 [22 53]]
              precision    recall  f1-score   support

           0       0.67      0.98      0.79        45
           1       0.98      0.71      0.82        75

    accuracy                           0.81       120
   macro avg       0.82      0.84      0.81       120
weighted avg       0.86      0.81      0.81       120

ROC AUC Score:: 1.0


  y = column_or_1d(y, warn=True)


In [67]:
from sklearn.preprocessing import KBinsDiscretizer

# Initialize the KBinsDiscretizer to discretize continuous features
kbd = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='uniform')  # Adjust n_bins as needed

# Fit and transform the training and test data
X_train_discrete = kbd.fit_transform(X_train)
X_test_discrete = kbd.transform(X_test)

# Import CategoricalNB
from sklearn.naive_bayes import CategoricalNB

# Initialize and train the Categorical Naive Bayes classifier
classifier = CategoricalNB()
classifier.fit(X_train_discrete, y_train)

# Make predictions on the test data
y_pred = classifier.predict(X_test_discrete)

# Calculate confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Print confusion matrix
print(cm)

# Import classification_report
from sklearn.metrics import classification_report

# Generate classification report
clf_report = classification_report(y_test, y_pred)

# Print classification report
print(clf_report)

# Calculating the Area Under the Curve (AUC) score
roc_auc= roc_auc_score(y_test, y_prob)

# Printing the AUC score
print("ROC AUC Score::", roc_auc)



[[45  0]
 [ 3 72]]
              precision    recall  f1-score   support

           0       0.94      1.00      0.97        45
           1       1.00      0.96      0.98        75

    accuracy                           0.97       120
   macro avg       0.97      0.98      0.97       120
weighted avg       0.98      0.97      0.98       120

ROC AUC Score:: 1.0


  y = column_or_1d(y, warn=True)
