# Implementing our novel approach with Naive Bayes

> Preprocessing: Clean and preprocess your dataset. This may include handling missing values, encoding categorical variables, and scaling features.

## Load dataset

In [121]:
from libs import data
from libs import kde_lib
import numpy as np
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from libs.exp_lib import Density_model

X0, y0 = data.load_data("banana")

X_train, X_test, y_train, y_test = train_test_split(X0, y0, test_size=0.3, random_state=42)

/Users/biad/Desktop/THESIS/Tests/RKDE_HHO/libs/../data/banana/banana
Loaded banana data: 5300 samples, 2 dimensions, 2 labels
classes = {1.0, -1.0}


In [122]:
# Get the dimension of our data
dim = X_train.ndim

# Calculate the number of classes and features
classes = np.unique(y_train)
n_features = X_train.shape[1]

# Initialize robust densities
robust_densities = np.zeros((len(X_train), dim))

print(robust_densities.shape)
kernel = 'gaussian'
#h = .5


(3710, 2)


## Step2 - step3 : Robust Kernel Density Estimation (RKDE) & Bandwidth Selection using HHO:

- Implement the RKDE algorithm with IRLS and Robust M-estimation (Hampel function) for each class.
- Calculate robust densities for each class based on the RKDE algorithm.


In [123]:

for class_label in classes:
    # GET for each class
    class_indices = np.where(y_train==class_label)[0]
    class_X = X_train[class_indices]
    h = kde_lib.hho_bandwith_selection(class_X,class_X)
    #h = .5
    # Get for each feature
    rkde_feature =[]
    for d in range(dim):
        X = class_X[:,d] 
        X = X[:,np.newaxis]
        #print(X.shape)
        model = Density_model("rkde", "banana", 0,kernel,h)
        model.fit(X,X,grid=None)
        rkde = model.density
        
        robust_densities[class_indices,d] = rkde[:,0]

    


Stop at 7 iterations
Stop at 100 iterations
Stop at 7 iterations
Stop at 100 iterations
Stop at 6 iterations
Stop at 100 iterations
Stop at 7 iterations
Stop at 100 iterations


In [124]:
print(robust_densities[:10])

[[0.35634059 0.31197553]
 [0.34275537 0.24223146]
 [0.21389602 0.25945782]
 [0.35184567 0.28811403]
 [0.39364133 0.27045237]
 [0.18732134 0.24181388]
 [0.14516289 0.32839426]
 [0.3084838  0.29030089]
 [0.36525692 0.19846938]
 [0.29885404 0.28221212]]


In [125]:
print(robust_densities[:5])

[[0.35634059 0.31197553]
 [0.34275537 0.24223146]
 [0.21389602 0.25945782]
 [0.35184567 0.28811403]
 [0.39364133 0.27045237]]


In [126]:
print(robust_densities_per_class.shape)

(3710, 1)


## Step4: Incorporating RKDE into Naive Bayes:

> - Modify Naive Bayes classifier to use the RKDE densities instead of traditional Gaussian densities.
> - For prediction, calculate the likelihood using the robust densities obtained from RKDE.

In [127]:
class RobustNaiveBayes:

    def __init__(self) -> None:
        self.gnb = GaussianNB()
        self.kde_estimators = []
    
    def fit(self, X, y, robust_densities):
        """
        Fit the robust Naive Bayes model with RKDE densities.

        Parameters:
        X (array-like): Training data features.
        y (array-like): Training data labels.
        robust_densities (list of KernelDensity objects): List of KDE estimators for each class.
        """
        self.gnb.fit(X,y)
        self.kde_estimators = robust_densities

    def predict(self, X):
        """
        Predict class labels and RKDE likelihoods for input data.

        Parameters:
        X (array-like): Input data features.

        Returns:
        y_pred (array-like): Predicted class labels.
        rkde_likelihoods (array-like): RKDE likelihoods for each class.
        """
