# Implementing our novel approach with Naive Bayes

> Preprocessing: Clean and preprocess your dataset. This may include handling missing values, encoding categorical variables, and scaling features.

## Load dataset

In [80]:
from libs import data
from libs import kde_lib
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from libs.exp_lib import Density_model
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.base import BaseEstimator, ClassifierMixin
from scipy.stats import norm
from scipy import stats
from sklearn.datasets import make_circles, make_moons, make_classification
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_iris,load_breast_cancer
from libs.RNB import RobustNaiveBayes


In [81]:
def generate_outliers(X, y, outlier_proportion=.1, outlier_label=1):
    """Append uniformly distributed outlier points to a dataset.

    The outliers are drawn uniformly inside the per-feature bounding box of
    ``X`` (using NumPy's global random state, so ``np.random.seed`` controls
    reproducibility) and are all given the same class label.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix. Any number of features is supported.
    y : array-like of shape (n_samples,)
        Labels matching the rows of ``X``.
    outlier_proportion : float, default=.1
        Number of outliers to add, as a fraction of ``len(X)`` (truncated to
        an integer).
    outlier_label : default=1
        Label assigned to every generated outlier.

    Returns
    -------
    X_out : ndarray of shape (n_samples + n_outliers, n_features)
        The original rows followed by the generated outliers.
    y_out : ndarray of shape (n_samples + n_outliers,)
        The original labels followed by the outlier labels.
    """
    X = np.asarray(X)
    y = np.asarray(y)

    # Calculate the number of outliers to add
    num_outliers = int(outlier_proportion * len(X))

    # Generate random outlier points within the range of the dataset.
    # The column count follows X instead of being fixed to 2 features.
    feature_min = np.min(X, axis=0)
    feature_range = np.max(X, axis=0) - feature_min
    outliers_X = np.random.rand(num_outliers, X.shape[1]) * feature_range + feature_min

    # Use y's dtype so that adding zero outliers does not silently turn the
    # labels into floats (an empty default array would be float64).
    outliers_y = np.full(num_outliers, outlier_label, dtype=y.dtype)

    # Concatenate outliers with the original dataset
    X = np.vstack((X, outliers_X))
    y = np.concatenate((y, outliers_y))
    return X, y

# =======================================================
#   Generate synthetic data with outliers
# =======================================================
# Data sources that are currently switched off:
#   X0, y0 = data.load_data_outlier("banana") # OK
#   X0, y0 = make_circles(1500, noise=.1, random_state=42)
#   X0, y0 = make_classification(n_samples=2000, n_features=num_dimensions, n_informative=2, n_redundant=0,random_state=1, n_clusters_per_class=1)
#   dataset = load_iris()
#   X0, y0 = dataset.data, dataset.target
#   data = load_breast_cancer()
#   X0, y0 = data.data, data.target
#   selected_features = [0, 3]
#   X0 = X0[:,selected_features]
num_dimensions = 2        # Number of feature columns
outlier_proportion = .2   # Fraction of rows that get replaced by outliers

###### T distribution #########
degrees_of_freedom = 1    # Degrees of freedom for the T-distribution
sample_size = 3000        # Number of data points

# Fix the global NumPy seed so every draw below is reproducible
np.random.seed(42)

# Draw the feature matrix from a Student-t distribution
X0 = np.random.standard_t(degrees_of_freedom, size=(sample_size, num_dimensions))

# Binary label: 1 when the first feature exceeds the threshold, else 0
threshold = 1.0  # You can adjust this threshold as needed
y0 = (X0[:, 0] > threshold).astype(int)  # You can choose a different dimension for comparison

# Optional sanity print (disabled):
#   print("Data:")
#   print(X0[:5])
#   print("\nLabels:")
#   print(y0[:5])
###### End #########
# X0, y0 = generate_outliers(X0, y0)

# Introduce outliers by overwriting a random subset of rows with points drawn
# uniformly from the data's bounding box widened by 10 on every side
num_outliers = int(outlier_proportion * X0.shape[0])
outliers_indices = np.random.choice(X0.shape[0], num_outliers, replace=False)
outliers = np.random.uniform(
    low=X0.min(axis=0) - 10,
    high=X0.max(axis=0) + 10,
    size=(num_outliers, num_dimensions),
)
X0[outliers_indices] = outliers

# Label the first half of the outliers as class 1 and the rest as class 0
sep = len(outliers_indices) // 2
y0[outliers_indices[:sep]] = 1
y0[outliers_indices[sep:]] = 0

# Optional uniform jitter (disabled):
#   rng = np.random.RandomState(2)
#   X0 += 2 * rng.uniform(size=X0.shape)
linearly_separable = (X0, y0)
# =======================================================
#   Done Generate  the synthetic data
# =======================================================
# =======================================================
#   Done Generate  the synthetic data
# =======================================================

## Generate data with different distributions

In [82]:
# NOTE: this whole cell is one bare string literal, so none of the code inside it
# runs. Because the string is the cell's only expression, Jupyter echoes it back
# as the cell output.
# NOTE(review): the braces inside the disabled snippet do not balance (the
# 'Cauchy'/'Laplace' and 'T'/'Cauchy'/'Laplace' entries sit outside their dicts),
# so it would need repair before being re-enabled.
""" # Set the random seed for reproducibility
np.random.seed(0)

# Define the number of data points
num_samples = 1000

# Define the dimensions for data generation
dimensions = [2]

# Parameters for the distributions
distribution_params = {
    2 : {
    'Gaussian': {
        0: {'loc': [0, 0], 'scale': [3, 1.2]},
        1: { 'loc': [1, 1], 'scale': [3, 1.5]}
    }, 
    'T': {
        0: {'df': 5, 'loc': [0, 0], 'scale': [2, .1]}},
        1: {'df': 5, 'loc': [2, 2], 'scale': [2, .1]}},
    
}
 'Cauchy': {
        1: {'loc': 0, 'scale': 1},
        2: {'loc': 0, 'scale': 1}},
    'Laplace': {'loc': 0, 'scale': 1}
# Distributions to generate data
distributions = {
    'Gaussian': stats.norm,  
}
'T': stats.t,
    'Cauchy': stats.cauchy,
    'Laplace': stats.laplace  """

" # Set the random seed for reproducibility\nnp.random.seed(0)\n\n# Define the number of data points\nnum_samples = 1000\n\n# Define the dimensions for data generation\ndimensions = [2]\n\n# Parameters for the distributions\ndistribution_params = {\n    2 : {\n    'Gaussian': {\n        0: {'loc': [0, 0], 'scale': [3, 1.2]},\n        1: { 'loc': [1, 1], 'scale': [3, 1.5]}\n    }, \n    'T': {\n        0: {'df': 5, 'loc': [0, 0], 'scale': [2, .1]}},\n        1: {'df': 5, 'loc': [2, 2], 'scale': [2, .1]}},\n    \n}\n 'Cauchy': {\n        1: {'loc': 0, 'scale': 1},\n        2: {'loc': 0, 'scale': 1}},\n    'Laplace': {'loc': 0, 'scale': 1}\n# Distributions to generate data\ndistributions = {\n    'Gaussian': stats.norm,  \n}\n'T': stats.t,\n    'Cauchy': stats.cauchy,\n    'Laplace': stats.laplace  "

In [83]:
# NOTE: this whole cell is one bare string literal, so none of the code inside it
# runs; Jupyter only echoes the string back as the cell output.
# NOTE(review): if re-enabled, the snippet assigns `data = []`, which would shadow
# the `data` module imported from `libs` at the top of the notebook; it also reads
# `dimensions`, `distributions`, `distribution_params` and `num_samples`, which are
# only defined inside another disabled string cell.
""" # Generate data for each dimension and distribution
for dim in dimensions:
    for name, distribution in distributions.items():
        # Get distribution parameters
        dict_params = distribution_params[dim][name]
        data = []
        # Generate data
        for index, params in dict_params.items():
            #data = (np.vstack((data,distribution.rvs(size=(num_samples, dim)))), distribution.rvs(size=(num_samples, dim), **params))[len(data) == 0]
            data = (np.vstack((data,distribution.rvs(size=(num_samples, dim)))), distribution.rvs(size=(num_samples, dim), **params))[len(data) == 0]
            print(data)
            #data = np.row_stack(data,distribution.rvs(size=(num_samples, dim), **params))
        # Create labels for binary classification
        labels = np.random.randint(2, size=num_samples)
        
        # Combine data and labels
        labeled_data = np.column_stack((data, labels))
        print(labeled_data) """
        
        # Save or use the generated data for experiments
        # For example, save it to a file or use it in your experiments
        # np.savetxt(f'data_{name}_{dim}D_binary.csv', labeled_data, delimiter=',')


' # Generate data for each dimension and distribution\nfor dim in dimensions:\n    for name, distribution in distributions.items():\n        # Get distribution parameters\n        dict_params = distribution_params[dim][name]\n        data = []\n        # Generate data\n        for index, params in dict_params.items():\n            #data = (np.vstack((data,distribution.rvs(size=(num_samples, dim)))), distribution.rvs(size=(num_samples, dim), **params))[len(data) == 0]\n            data = (np.vstack((data,distribution.rvs(size=(num_samples, dim)))), distribution.rvs(size=(num_samples, dim), **params))[len(data) == 0]\n            print(data)\n            #data = np.row_stack(data,distribution.rvs(size=(num_samples, dim), **params))\n        # Create labels for binary classification\n        labels = np.random.randint(2, size=num_samples)\n        \n        # Combine data and labels\n        labeled_data = np.column_stack((data, labels))\n        print(labeled_data) '

## Prepare the data

In [84]:
# Hold out 30% of the samples for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X0, y0, test_size=0.3, random_state=42
)

# Learn the scaling statistics on the training split only, then apply that same
# fitted transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## Naive Bayes


In [85]:
# Baseline: fit a Gaussian Naive Bayes model on the training split and score it
# on the held-out test split.
model = GaussianNB()
model.fit(X_train, y_train)

predictions = model.predict(X_test)
# Print the distinct predicted labels to check that the model does not collapse
# onto a single class.
print(np.unique(predictions))

print("\n")
accuracy = accuracy_score(y_test, predictions)
precision = precision_score(y_test, predictions)
recall = recall_score(y_test, predictions)
f1 = f1_score(y_test, predictions )
print("Accuracy 2:", accuracy)
# The remaining metrics are computed above but their printing is switched off.
# They are kept as real comments: a bare triple-quoted string at the end of a
# cell is an expression, so Jupyter would echo it as the cell's output.
# print("precision 2:", precision)
# print("recall 2:", recall)
# print("f1 2:", f1)


[0 1]


Accuracy 2: 0.6877777777777778


' print("precision 2:", precision)\nprint("recall 2:", recall)\nprint("f1 2:", f1) '

## Robust Naive Bayes with HHO

In [79]:
# Create and fit the RobustNaiveBayes classifier with its default constructor
# arguments, then score it on the held-out test split.
# NOTE(review): the recorded run of this cell logs a mealpy OriginalHHO
# optimisation and ends in KeyboardInterrupt, so fitting appears to be slow —
# confirm the expected runtime before including it in a full re-run.
model = RobustNaiveBayes()
model.fit(X_train, y_train)

predictions = model.predict(X_test)
# Print the distinct predicted labels to check that the model does not collapse
# onto a single class.
print(np.unique(predictions))

print("\n")
accuracy = accuracy_score(y_test, predictions)
precision = precision_score(y_test, predictions)
recall = recall_score(y_test, predictions)
f1 = f1_score(y_test, predictions )
print("Accuracy 2:", accuracy)
# The remaining metrics are computed above but their printing is switched off.
# They are kept as real comments: a bare triple-quoted string at the end of a
# cell is an expression, so Jupyter would echo it as the cell's output.
# print("precision 2:", precision)
# print("recall 2:", recall)
# print("f1 2:", f1)

2023/11/03 06:11:21 PM, INFO, mealpy.swarm_based.HHO.OriginalHHO: Solving 2-objective optimization problem with weights: [1 1].


KeyboardInterrupt: 

## Our model VS other classification models
Compare the Naive Bayes classifier with Optimized Robust Kernel Density Estimation against other classification models.

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline

In [None]:
# Define classifiers (evaluated in this order on the same train/test split)
classifiers = [
    ('Naive Bayes', GaussianNB()),
    ('Naive Bayes with RKDE',RobustNaiveBayes()),
    ('K-Nearest Neighbors', KNeighborsClassifier(n_neighbors=5)),
    ('Support Vector Machine', SVC(kernel='linear')),
    ('Decision Tree', DecisionTreeClassifier(max_depth=3)),
    ('Random Forest', RandomForestClassifier(n_estimators=100)),
    ('K-Means Clustering', KMeans(n_clusters=3))
]


def _most_common(labels):
    """Return the most frequent value in ``labels`` (first one on ties)."""
    values, counts = np.unique(labels, return_counts=True)
    return values[np.argmax(counts)]


# Compare classifier performances
results = []
for name, classifier in classifiers:
    # A single-step Pipeline adds nothing, so the estimator is fitted directly.
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)

    # KMeans ignores y and predicts arbitrary cluster ids (0..n_clusters-1), which
    # cannot be compared with class labels directly. Translate each cluster into
    # the majority training class of its members before scoring.
    if hasattr(classifier, "cluster_centers_"):
        train_labels = np.asarray(y_train)
        train_clusters = classifier.predict(X_train)
        fallback_class = _most_common(train_labels)
        cluster_to_class = {
            cluster: _most_common(train_labels[train_clusters == cluster])
            for cluster in np.unique(train_clusters)
        }
        # A cluster with no training members falls back to the overall majority class.
        y_pred = np.array([cluster_to_class.get(cluster, fallback_class) for cluster in y_pred])

    accuracy = accuracy_score(y_test, y_pred)
    results.append((name, accuracy))

for name, accuracy in results:
    print(f'{name}: Accuracy = {accuracy:.2f}')

In [None]:
# Create and fit the RobustNaiveBayes classifier
# NOTE: everything below is a bare string literal, so this experiment (constructing
# RobustNaiveBayes with the argument "pso" and printing accuracy, precision, recall
# and f1) is disabled and does not run. As the cell's last expression, the string
# is echoed as the cell output when the cell is executed.
""" model = RobustNaiveBayes("pso")
model.fit(X_train, y_train)

predictions = model.predict(X_test)
print(np.unique(predictions))

print("\n")
accuracy = accuracy_score(y_test, predictions)
precision = precision_score(y_test, predictions)
recall = recall_score(y_test, predictions)
f1 = f1_score(y_test, predictions )
print("Accuracy 2:", accuracy)
print("precision 2:", precision)
print("recall 2:", recall)
print("f1 2:", f1) """

## Classifier Comparison

In [None]:
# =======================================================
#   Generate synthetic data with outliers
# =======================================================
# This cell reuses the X0 / y0 arrays that are already in the kernel namespace.
# Alternative data sources and tweaks that are currently switched off:
#   make_moons = make_moons(500, noise=.2, random_state=42)
#   X0, y0 = make_classification(n_samples=500, n_features=2, n_informative=2, n_redundant=0,random_state=1, n_clusters_per_class=1)
#
#   # Introduce outliers by modifying some data points
#   outlier_proportion = .1
#   num_outliers = int(outlier_proportion * len(X0))
#   outliers_indices = np.random.choice(len(X0), num_outliers, replace=False)
#   outliers = np.random.uniform(low=np.min(X0, axis=0)-1, high=np.max(X0, axis=0)+1, size=(num_outliers, 2))
#   X0[outliers_indices] = outliers
#   # Set labels for the outliers
#   y0[outliers_indices] = 1
#
#   X0, y0 = generate_outliers(X0, y0)
#
#   data = load_breast_cancer()
#   X0, y0 = data.data, data.target
#   selected_features = [0, 3]
#   X0 = X0[:,selected_features]
#
#   rng = np.random.RandomState(2)
#   X += 2 * rng.uniform(size=X.shape)
#
#   X, y = data.load_data("banana")
#   X0, y0 = data.load_data_outlier("banana")
#   iris = load_iris()
# =======================================================
#   Done Generate  the synthetic data
# =======================================================
linearly_separable = (X0, y0)

# Models to draw, keyed by the title shown above each panel
classifiers = dict([
    ("Naive Bayes", GaussianNB()),
    ("RNB with HHO", RobustNaiveBayes()),
    # ("RNB with PSO", RobustNaiveBayes("pso")),
])

# Datasets to plot (one row per dataset) and their display names
datasets = [linearly_separable]
datasets_name = [
    # "make_moons",
    # "make_circles ",
    "Synthetic",
]

# Wide canvas for the panels, plus the running 1-based subplot index
figure = plt.figure(figsize=(27, 9))
i = 1

In [None]:
# Create the figure and reset the subplot counter inside the plotting cell itself.
# With the default inline backend, figures are closed at the end of each cell, so
# a figure opened in an earlier cell is not the one these subplots would land on.
# Resetting `i` here also keeps the subplot index in range when the cell is re-run.
figure = plt.figure(figsize=(27, 9))
i = 1

# Number of grid points per axis for the decision surface. A fixed count keeps the
# grid size bounded however wide the data range is; a fixed step size (e.g. 0.02)
# grows quadratically with the range and becomes infeasible for heavy-tailed data.
GRID_POINTS = 300

for ds_cnt, ds in enumerate(datasets):
    # preprocess dataset, split into training and test part
    # NOTE: this rebinds X_train / X_test / y_train / y_test in the notebook
    # namespace, and no feature scaling is applied in this cell.
    X, y = ds
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=42)
    x_min, x_max = X[:, 0].min() - 0.5, X[:, 0].max() + 0.5
    y_min, y_max = X[:, 1].min() - 0.5, X[:, 1].max() + 0.5

    # The evaluation grid depends only on the dataset, so build it once per
    # dataset instead of once per classifier.
    xx, yy = np.meshgrid(
        np.linspace(x_min, x_max, GRID_POINTS),
        np.linspace(y_min, y_max, GRID_POINTS),
    )
    grid_points = np.c_[xx.ravel(), yy.ravel()]

    # just plot the dataset first
    cm = plt.cm.RdBu  # only needed by the disabled DecisionBoundaryDisplay variant below
    cm_bright = ListedColormap(["#2ca02c", "#0000FF"])
    ax = figure.add_subplot(len(datasets), len(classifiers) + 1, i)
    if ds_cnt == 0:
        ax.set_title("Input data")
    # Plot the training points
    ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright, edgecolors="k")
    # Plot the testing points
    ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6, edgecolors="k")
    # Plot the Outliers points (rows listed in `outliers_indices`, defined in the
    # data-generation cell)
    ax.scatter(X[outliers_indices, 0], X[outliers_indices, 1],marker="X", c=y[outliers_indices], edgecolors="r")

    ax.set_xlim(x_min, x_max)
    ax.set_ylim(y_min, y_max)
    ax.set_xticks(())
    ax.set_yticks(())
    i += 1

    # iterate over classifiers
    for name, clf in classifiers.items():
        ax = figure.add_subplot(len(datasets), len(classifiers) + 1, i)

        #clf = make_pipeline(StandardScaler(), clf)
        clf.fit(X_train, y_train)

        # Predict a class for every grid point and draw the regions as filled contours
        Z = clf.predict(grid_points)
        Z = Z.reshape(xx.shape)
        ax.contourf(xx, yy, Z, alpha=.5)

        y_pred = clf.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        # Alternative rendering (disabled):
        # DecisionBoundaryDisplay.from_estimator(
        #     clf, X, cmap=cm, alpha=.8, ax=ax, eps=.5
        # )

        # Plot the training points
        ax.scatter(
            X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright, edgecolors="k"
        )
        # Plot the testing points
        ax.scatter(
            X_test[:, 0],
            X_test[:, 1],
            c=y_test,
            cmap=cm_bright,
            edgecolors="k",
            alpha=0.6,
        )
        # Plot the Outliers points
        ax.scatter(X[outliers_indices, 0], X[outliers_indices, 1],marker="X", c=y[outliers_indices], edgecolors="r")

        ax.set_xlim(x_min, x_max)
        ax.set_ylim(y_min, y_max)
        ax.set_xticks(())
        ax.set_yticks(())
        if ds_cnt == 0:
            ax.set_title(name)
        # Test accuracy in the lower-right corner, without the leading zero
        ax.text(
            x_max - 0.3,
            y_min + 0.3,
            ("Ac: {}".format(("%.2f" % accuracy).lstrip("0"))),
            size=10,
            horizontalalignment="right",
        )
        i += 1
figure.tight_layout()
plt.show()