# Selective Naïve Bayes Algorithm Project

Implementation of Selective Naïve Bayes Algorithm presented in the article and comparing its effectiveness with the classical Bayesian algorithm on several datasets.

Article: https://www.researchgate.net/publication/337968750_A_novel_selective_naive_Bayes_algorithm

# Imports

In [None]:
from sklearn.datasets import load_iris, load_breast_cancer, load_digits, load_wine, load_boston
from sklearn.naive_bayes import CategoricalNB, GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import sklearn
import numpy as np
import math
import warnings
import pandas as pd

warnings.filterwarnings('ignore')

# Creating Selective Naïve Bayes Algorithm


In [None]:
def get_column(dataset, column_number):
  return [item[column_number] for item in dataset]       

def generate_sublists(list_to_use):
  result = []
  for i in range(len(list_to_use)):
    result.append(list(list_to_use[:i+1]))
  return result

def get_columns_of_dataset(dataset, list_of_indexes):
  new_dataset = []
  for i in range(len(dataset)):
    row = []
    for index in list_of_indexes:
      row.append(dataset[i][index])
    new_dataset.append(row)

  return new_dataset

In [None]:
def selective_naive_bayes(X, y, split_parameter):
  mutual_items = {}
  for i in range(X.shape[1]):
      mutual_items[i] = sklearn.metrics.mutual_info_score(get_column(X, i), y)

  sortedMutualInfoScore = dict(sorted(mutual_items.items(), key=lambda x: x[1]))
  mutualToList = list(sortedMutualInfoScore.keys())
  subGroups = generate_sublists(mutualToList)
  

  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=split_parameter, random_state=0)

  RMSEs, results, models = [], [], []
  for group in subGroups:
    new_X_train = get_columns_of_dataset(X_train, group)
    new_X_test = get_columns_of_dataset(X_test, group)
    cnb = GaussianNB()
    y_pred = cnb.fit(new_X_train, y_train).predict(new_X_test)
    results.append(accuracy_score(y_pred, y_test))
    models.append(cnb)

    MSE = np.square(np.subtract(y_test, y_pred)).mean() 
    RMSE = math.sqrt(MSE)
    RMSEs.append(RMSE)

  max_result = max(results)
  max_index = results.index(max_result)
  min_RMSE = RMSEs.index(min(RMSEs))

  maxGroup = subGroups[min_RMSE]
  
  return max_result, models[min_RMSE], maxGroup

# Functions to test both algorithms

In [None]:
def compare_algorithms(X, y, split_parameter):
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=split_parameter, random_state=0)
  naive_bayes = GaussianNB()
  y_pred = naive_bayes.fit(X_train, y_train).predict(X_test)

  score, model, subGroup = selective_naive_bayes(X, y, split_parameter)

  y_pred_2 = model.predict(get_columns_of_dataset(X_test, subGroup))

  print("Split parameter: " + str(split_parameter))
  print(f"Naive Bayes accuracy:           {accuracy_score(y_pred, y_test)}")
  print(f"Selective Naive Bayes accuracy: {accuracy_score(y_pred_2, y_test)}")
  print(subGroup)

def check_algorithms(X, y):
  compare_algorithms(X, y, 0.5)
  print()
  compare_algorithms(X, y, 0.6)
  print()
  compare_algorithms(X, y, 0.7)
  print()
  compare_algorithms(X, y, 0.8)
  print()
  compare_algorithms(X, y, 0.9)

# Iris dataset

In [None]:
X_iris, y_iris = load_iris(return_X_y=True)
check_algorithms(X_iris, y_iris)

Split parameter: 0.5
Naive Bayes accuracy:           0.9466666666666667
Selective Naive Bayes accuracy: 0.9466666666666667
[1, 0, 3, 2]

Split parameter: 0.6
Naive Bayes accuracy:           0.9444444444444444
Selective Naive Bayes accuracy: 0.9555555555555556
[1, 0, 3]

Split parameter: 0.7
Naive Bayes accuracy:           0.9333333333333333
Selective Naive Bayes accuracy: 0.9333333333333333
[1, 0, 3]

Split parameter: 0.8
Naive Bayes accuracy:           0.9333333333333333
Selective Naive Bayes accuracy: 0.9333333333333333
[1, 0, 3, 2]

Split parameter: 0.9
Naive Bayes accuracy:           0.9481481481481482
Selective Naive Bayes accuracy: 0.9481481481481482
[1, 0, 3, 2]


# Breast cancer dataset

In [None]:
X_breast_cancer, y_breast_cancer = load_breast_cancer(return_X_y=True)
check_algorithms(X_breast_cancer, y_breast_cancer)

Split parameter: 0.5
Naive Bayes accuracy:           0.9368421052631579
Selective Naive Bayes accuracy: 0.9578947368421052
[24, 8, 4, 18, 1, 9, 28, 17, 0, 21, 11, 29, 22, 20]

Split parameter: 0.6
Naive Bayes accuracy:           0.935672514619883
Selective Naive Bayes accuracy: 0.9444444444444444
[24, 8, 4, 18, 1, 9, 28, 17, 0, 21, 11, 29, 22, 20]

Split parameter: 0.7
Naive Bayes accuracy:           0.9373433583959899
Selective Naive Bayes accuracy: 0.9448621553884712
[24, 8, 4, 18, 1, 9, 28, 17, 0, 21, 11, 29, 22, 20]

Split parameter: 0.8
Naive Bayes accuracy:           0.9210526315789473
Selective Naive Bayes accuracy: 0.9429824561403509
[24, 8, 4, 18, 1, 9, 28, 17, 0, 21, 11, 29, 22, 20, 5, 27, 19, 25, 15, 13, 2, 16, 3]

Split parameter: 0.9
Naive Bayes accuracy:           0.9337231968810916
Selective Naive Bayes accuracy: 0.9571150097465887
[24, 8, 4, 18, 1, 9, 28, 17, 0, 21, 11, 29, 22, 20]


# Digits dataset

In [None]:
X_digits, y_digits = load_digits(return_X_y=True)
check_algorithms(X_digits, y_digits)

Split parameter: 0.5
Naive Bayes accuracy:           0.8342602892102335
Selective Naive Bayes accuracy: 0.8342602892102335
[0, 32, 39, 56, 24, 16, 31, 48, 8, 40, 47, 23, 15, 7, 55, 49, 63, 57, 1, 11, 14, 4, 6, 17, 12, 45, 52, 3, 59, 22, 9, 51, 25, 41, 18, 5, 19, 37, 62, 50, 60, 27, 35, 29, 44, 46, 53, 10, 2, 38, 54, 13, 58, 20, 36, 28, 61, 30, 43, 42, 26, 33, 34, 21]

Split parameter: 0.6
Naive Bayes accuracy:           0.830398517145505
Selective Naive Bayes accuracy: 0.830398517145505
[0, 32, 39, 56, 24, 16, 31, 48, 8, 40, 47, 23, 15, 7, 55, 49, 63, 57, 1, 11, 14, 4, 6, 17, 12, 45, 52, 3, 59, 22, 9, 51, 25, 41, 18, 5, 19, 37, 62, 50, 60, 27, 35, 29, 44, 46, 53, 10, 2, 38, 54, 13, 58, 20, 36, 28, 61, 30, 43, 42, 26, 33, 34, 21]

Split parameter: 0.7
Naive Bayes accuracy:           0.8251192368839427
Selective Naive Bayes accuracy: 0.8251192368839427
[0, 32, 39, 56, 24, 16, 31, 48, 8, 40, 47, 23, 15, 7, 55, 49, 63, 57, 1, 11, 14, 4, 6, 17, 12, 45, 52, 3, 59, 22, 9, 51, 25, 41, 18, 5, 1

# Wine dataset

In [None]:
X_wine, y_wine = load_wine(return_X_y=True)
check_algorithms(X_wine, y_wine)

Split parameter: 0.5
Naive Bayes accuracy:           0.9438202247191011
Selective Naive Bayes accuracy: 0.9775280898876404
[7, 4, 3, 2, 10, 8, 5, 1, 12, 0, 11, 9]

Split parameter: 0.6
Naive Bayes accuracy:           0.9345794392523364
Selective Naive Bayes accuracy: 0.9532710280373832
[7, 4, 3, 2, 10, 8, 5, 1, 12, 0, 11, 9]

Split parameter: 0.7
Naive Bayes accuracy:           0.968
Selective Naive Bayes accuracy: 0.968
[7, 4, 3, 2, 10, 8, 5, 1, 12, 0, 11]

Split parameter: 0.8
Naive Bayes accuracy:           0.965034965034965
Selective Naive Bayes accuracy: 0.9790209790209791
[7, 4, 3, 2, 10, 8, 5, 1, 12, 0, 11, 9]

Split parameter: 0.9
Naive Bayes accuracy:           0.906832298136646
Selective Naive Bayes accuracy: 0.9130434782608695
[7, 4, 3, 2, 10, 8, 5, 1, 12, 0, 11, 9]


# Students dataset

In [None]:
link = "https://storage.googleapis.com/kagglesdsdata/datasets/1811753/2955046/student_prediction.csv?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20220112%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20220112T231855Z&X-Goog-Expires=259199&X-Goog-SignedHeaders=host&X-Goog-Signature=a60ba818da224d1c5cecf95c505e142eb603feb5a28f0ce27edfc730a06909dd3afd09b0b18195a10d691019df52e00e4896e9bc1b145b21a361211defba3d53453d71261a7975c8a4941a6ccf549558e9db3bc673be7e4f5e55358d25a7458ca31a8efd611d1e52d898b2bb6014da678421871d2c06799940ef28a51223bc4fe22e6c356b6b23087af3b445c3a90962719d60241c6600327ca8420aeec3309001cf79f72db3751bb2b2fe67db4f94bc6efc71b5fce85dfc559575ee1df2a42bc7d3b078807d6b8328a678d150892bbd3497bd7458b74ca1bf23645dda05071c4637cfb33a857ed2dcd1ed648a8a3735d8da8ab1c73f39f018a9abc4f9e608f7"

df = pd.read_csv(link)
students = df.to_numpy()

X_students = np.array([x[1:-2] for x in students])
y_students = np.array([x[-1] for x in students])

In [None]:
check_algorithms(X_students, y_students)

Split parameter: 0.5
Naive Bayes accuracy:           0.1780821917808219
Selective Naive Bayes accuracy: 0.136986301369863
[19, 23]

Split parameter: 0.6
Naive Bayes accuracy:           0.21839080459770116
Selective Naive Bayes accuracy: 0.1839080459770115
[19]

Split parameter: 0.7
Naive Bayes accuracy:           0.13725490196078433
Selective Naive Bayes accuracy: 0.09803921568627451
[19, 23, 27, 5, 4, 6]

Split parameter: 0.8
Naive Bayes accuracy:           0.1724137931034483
Selective Naive Bayes accuracy: 0.1724137931034483
[19]

Split parameter: 0.9
Naive Bayes accuracy:           0.1984732824427481
Selective Naive Bayes accuracy: 0.16793893129770993
[19]


# Boston housing dataset

In [None]:
boston = load_boston(return_X_y=False).data

X_boston = np.array([np.concatenate((x[:8], x[9:])) for x in boston])
Y_boston = np.array([int(x[8]) for x in boston])

check_algorithms(X_boston, Y_boston)

Split parameter: 0.5
Naive Bayes accuracy:           0.6205533596837944
Selective Naive Bayes accuracy: 0.6205533596837944
[3, 1, 10, 9, 6, 8, 5, 11, 4, 2, 7, 0]

Split parameter: 0.6
Naive Bayes accuracy:           0.5888157894736842
Selective Naive Bayes accuracy: 0.5888157894736842
[3, 1, 10, 9, 6, 8, 5, 11, 4, 2, 7, 0]

Split parameter: 0.7
Naive Bayes accuracy:           0.5380281690140845
Selective Naive Bayes accuracy: 0.5380281690140845
[3, 1, 10, 9, 6, 8, 5, 11, 4, 2, 7, 0]

Split parameter: 0.8
Naive Bayes accuracy:           0.5185185185185185
Selective Naive Bayes accuracy: 0.5185185185185185
[3, 1, 10, 9, 6, 8, 5, 11, 4, 2, 7, 0]

Split parameter: 0.9
Naive Bayes accuracy:           0.48464912280701755
Selective Naive Bayes accuracy: 0.5
[3, 1, 10, 9, 6, 8]
