In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
df = pd.read_csv('breast.csv')
df

Unnamed: 0,clump,size,shape,adhesion,epithsize,bare,bland,nucleoli,mitoses,class
0,1,0,0,0,0,0,1,0,0,2
1,1,1,1,1,1,1,1,1,0,2
2,0,0,0,0,0,1,1,0,0,2
3,1,1,1,0,1,1,1,1,0,2
4,0,0,0,1,0,0,1,0,0,2
...,...,...,...,...,...,...,...,...,...,...
678,0,0,0,0,1,1,0,0,0,2
679,0,0,0,0,0,0,0,0,0,2
680,1,1,1,1,1,1,1,1,1,4
681,0,1,1,1,1,1,1,1,0,4


In [None]:
df.describe()

Unnamed: 0,clump,size,shape,adhesion,epithsize,bare,bland,nucleoli,mitoses,class
count,683.0,683.0,683.0,683.0,683.0,683.0,683.0,683.0,683.0,683.0
mean,0.455344,0.45388,0.408492,0.424597,0.385066,0.41142,0.54612,0.367496,0.175695,2.699854
std,0.498367,0.498233,0.491915,0.494644,0.486968,0.492452,0.498233,0.482477,0.38084,0.954592
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0
75%,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,4.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,4.0


In [None]:
X = df.drop('class', axis=1).values
y = df['class'].values

In [None]:
# Function to train a Bernoulli Naive Bayes model
def train_naive_bayes(X_train, y_train):
        class_probs = {
            2: np.sum(y_train == 2) / len(y_train),
            4: np.sum(y_train == 4) / len(y_train)
        }

        feature_probs = {}
        for label in [2, 4]:
            feature_probs[label] = (np.sum(X_train[y_train == label], axis=0) + 1) / (np.sum(y_train == label) + 2)

        return class_probs, feature_probs


In [None]:
    # Function to predict using the trained model
def predict_naive_bayes(X_test, class_probs, feature_probs):
        predictions = []
        for row in X_test:
            probabilities = {label: np.log(class_probs[label]) + np.sum(np.log(feature_probs[label][row == 1])) for label in [2, 4]}
            predictions.append(max(probabilities, key=probabilities.get))
        return np.array(predictions)

In [None]:
# Number of repetitions for cross-validation
num_repetitions = 20

In [None]:
# Arrays to store individual misclassification rates
misclassification_rates = np.zeros(num_repetitions)

In [None]:
# Perform cross-validation
for i in range(num_repetitions):
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=i)



    # Train the model
    class_probs, feature_probs = train_naive_bayes(X_train, y_train)

    # Make predictions on the test set
    y_pred = predict_naive_bayes(X_test, class_probs, feature_probs)

    # Calculate misclassification rate
    misclassification_rate = 1 - accuracy_score(y_test, y_pred)
    misclassification_rates[i] = misclassification_rate

    # Print the misclassification rate for each repetition
    print(f"Iteration {i + 1}: Misclassification Rate = {misclassification_rate}")

# Calculate and print the mean misclassification rate
mean_misclassification_rate = np.mean(misclassification_rates)
print(f"\nMean Misclassification Rate across {num_repetitions} iterations: {mean_misclassification_rate}")


Iteration 1: Misclassification Rate = 0.4233576642335767
Iteration 2: Misclassification Rate = 0.44525547445255476
Iteration 3: Misclassification Rate = 0.46715328467153283
Iteration 4: Misclassification Rate = 0.3722627737226277
Iteration 5: Misclassification Rate = 0.4744525547445255
Iteration 6: Misclassification Rate = 0.43795620437956206
Iteration 7: Misclassification Rate = 0.3868613138686131
Iteration 8: Misclassification Rate = 0.43795620437956206
Iteration 9: Misclassification Rate = 0.3868613138686131
Iteration 10: Misclassification Rate = 0.4233576642335767
Iteration 11: Misclassification Rate = 0.4233576642335767
Iteration 12: Misclassification Rate = 0.4233576642335767
Iteration 13: Misclassification Rate = 0.4014598540145985
Iteration 14: Misclassification Rate = 0.44525547445255476
Iteration 15: Misclassification Rate = 0.416058394160584
Iteration 16: Misclassification Rate = 0.3941605839416058
Iteration 17: Misclassification Rate = 0.45255474452554745
Iteration 18: Misc

In [None]:
# Arrays to store individual misclassification rates
misclassification_rates1 = np.zeros(num_repetitions)

In [None]:
# Define functions for Bayesian Naive Bayes
def train_bayesian_naive_bayes(X_train, y_train):
    # Prior hyperparameters for Beta distribution
    alpha_prior = 1
    beta_prior = 1

    # MLE estimates as initial parameters
    class_probs = {
        2: np.sum(y_train == 2) / len(y_train),
        4: np.sum(y_train == 4) / len(y_train)
    }

    feature_probs = {}
    for label in [2, 4]:
        feature_probs[label] = (np.sum(X_train[y_train == label], axis=0) + 1) / (np.sum(y_train == label) + 2)

    # Bayesian updating using Beta prior
    class_probs[0] = (np.sum(y_train == 2) + alpha_prior - 1) / (len(y_train) + alpha_prior + beta_prior - 2)
    class_probs[1] = (np.sum(y_train == 4) + alpha_prior - 1) / (len(y_train) + alpha_prior + beta_prior - 2)

    for label in [0, 1]:
        feature_probs[label] = (np.sum(X_train[y_train == label], axis=0) + alpha_prior - 1) / (np.sum(y_train == label) + alpha_prior + beta_prior - 2)

    return class_probs, feature_probs

In [None]:
# Perform cross-validation
for i in range(num_repetitions):
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=i)

    # Train the Bayesian Naive Bayes model
    class_probs, feature_probs = train_bayesian_naive_bayes(X_train, y_train)

    # Make predictions on the test set
    y_pred = predict_naive_bayes(X_test, class_probs, feature_probs)

    # Calculate misclassification rate
    misclassification_rate = 1 - accuracy_score(y_test, y_pred)
    misclassification_rates[i] = misclassification_rate

    # Print the misclassification rate for each repetition
    print(f"Iteration {i + 1}: Misclassification Rate = {misclassification_rate}")

# Calculate and print the mean misclassification rate
mean_misclassification_rate = np.mean(misclassification_rates)
print(f"\nMean Misclassification Rate across {num_repetitions} iterations: {mean_misclassification_rate}")

Iteration 1: Misclassification Rate = 0.4233576642335767
Iteration 2: Misclassification Rate = 0.44525547445255476
Iteration 3: Misclassification Rate = 0.46715328467153283
Iteration 4: Misclassification Rate = 0.3722627737226277
Iteration 5: Misclassification Rate = 0.4744525547445255
Iteration 6: Misclassification Rate = 0.43795620437956206
Iteration 7: Misclassification Rate = 0.3868613138686131
Iteration 8: Misclassification Rate = 0.43795620437956206
Iteration 9: Misclassification Rate = 0.3868613138686131
Iteration 10: Misclassification Rate = 0.4233576642335767
Iteration 11: Misclassification Rate = 0.4233576642335767
Iteration 12: Misclassification Rate = 0.4233576642335767
Iteration 13: Misclassification Rate = 0.4014598540145985
Iteration 14: Misclassification Rate = 0.44525547445255476
Iteration 15: Misclassification Rate = 0.416058394160584
Iteration 16: Misclassification Rate = 0.3941605839416058
Iteration 17: Misclassification Rate = 0.45255474452554745
Iteration 18: Misc

  feature_probs[label] = (np.sum(X_train[y_train == label], axis=0) + alpha_prior - 1) / (np.sum(y_train == label) + alpha_prior + beta_prior - 2)
  feature_probs[label] = (np.sum(X_train[y_train == label], axis=0) + alpha_prior - 1) / (np.sum(y_train == label) + alpha_prior + beta_prior - 2)
  feature_probs[label] = (np.sum(X_train[y_train == label], axis=0) + alpha_prior - 1) / (np.sum(y_train == label) + alpha_prior + beta_prior - 2)
  feature_probs[label] = (np.sum(X_train[y_train == label], axis=0) + alpha_prior - 1) / (np.sum(y_train == label) + alpha_prior + beta_prior - 2)
  feature_probs[label] = (np.sum(X_train[y_train == label], axis=0) + alpha_prior - 1) / (np.sum(y_train == label) + alpha_prior + beta_prior - 2)
  feature_probs[label] = (np.sum(X_train[y_train == label], axis=0) + alpha_prior - 1) / (np.sum(y_train == label) + alpha_prior + beta_prior - 2)
  feature_probs[label] = (np.sum(X_train[y_train == label], axis=0) + alpha_prior - 1) / (np.sum(y_train == label) + a