<a href="https://colab.research.google.com/github/aakhterov/ML_algorithms_from_scratch/blob/master/Naive_Bayes_Classifier_from_scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Naive Bayes Classifiers

[Naive Bayes Classifiers](https://www.geeksforgeeks.org/naive-bayes-classifiers/)

[Categorical Naive Bayes in the scikit-learn documentation](https://scikit-learn.org/stable/modules/naive_bayes.html#categorical-naive-bayes)

[Naive Bayes Classifiers](https://www.geeksforgeeks.org/naive-bayes-classifiers/)

In [79]:
import numpy as np
from typing import Dict
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.naive_bayes import CategoricalNB

In [80]:
from pandas.tseries.frequencies import unique
from pandas.io.parsers.base_parser import Iterable
class CustomNaiveBayesClassifier:
  """
    Naive Bayes classifier for categorical features with additive (Laplace) smoothing.

    State built by fit():
      features_distr      - {feature name: distribution table}. Every table is indexed by the
                            feature values seen in training and holds, for each target class c,
                            a column c with raw counts and a column "p_<c>" with the smoothed
                            estimate of P(feature value | c).
      target_distribution - {class label: prior probability of the class}.
  """

  def __init__(self, alpha=1):
    self.alpha = alpha # additive smoothing strength (alpha=1 is Laplace smoothing)
    self.features_distr = {} # dict with distribution tables by every feature
    self.target_distribution = {} # target variable distribution

  def fit(self, X, y):
    """
      Calculate probabilities of feature values with respect to dataset X and target value y.

      X - anything accepted by pd.DataFrame (one row per sample, one column per feature)
      y - target labels, one per row of X. X and y are paired by position, not by index label.

      Raises ValueError if X and y have different lengths.
      Calling fit again discards everything learned by the previous call.
    """
    # Work on re-indexed copies: rows are paired with labels by position (so a list/array y works
    # with any index on X) and the caller's objects are never modified.
    features_df = pd.DataFrame(X).reset_index(drop=True)
    target = pd.Series(y).reset_index(drop=True).rename("target")
    if len(features_df) != len(target):
      raise ValueError(f"X and y must have the same length, got {len(features_df)} and {len(target)}")

    features_distr = {}
    for feature in features_df.columns:
      # Cross-tabulate feature values against target classes.
      # We get the following dataframe for each feature:
      #      	          target_var_class_1	target_var_class_2 ....
      # feature_val_1	          0                 4     	<= count of feature_val_1, given target_var_class_1, target_var_class_2 and so on
      # feature_val_2	          5	                2
      # feature_val_3	          2	                3
      # feature_val_4	          3	                2
      distribution_table = pd.crosstab(index=features_df[feature], columns=target)
      count_unique_values = len(distribution_table.index)

      # add to distribution_table columns with probabilities (frequencies)
      for target_var_value in list(distribution_table.columns):
        class_counts = distribution_table[target_var_value]
        # https://scikit-learn.org/stable/modules/naive_bayes.html#categorical-naive-bayes
        # Use Laplace smoothing to avoid Zero Probability problem
        distribution_table[f"p_{target_var_value}"] = \
          (class_counts + self.alpha) / (class_counts.sum() + self.alpha*count_unique_values)

      features_distr[feature] = distribution_table

    # Calculate target variable (y) distribution:
    # {"class_1": probability_of_class_1, ...}, most frequent class first
    value_counts = target.value_counts()
    total = value_counts.sum()

    # Replace (not update) the fitted state so a refit leaves no stale features or classes behind
    self.features_distr = features_distr
    self.target_distribution = {class_: float(count/total) for class_, count in value_counts.items()}

  def predict_proba(self, X):
    """
      Calculate probabilities of target classes for every sample in X.

      Returns a list with one dict per sample. It looks like {"p_yes": 0.84, "p_no": 0.16},
      where yes and no are the target classes; the values of each dict sum to one.
      Every column of X must be a feature seen by fit (KeyError otherwise).
    """
    df = pd.DataFrame(X)
    classes = list(self.target_distribution) # target classes

    probabilities = []
    for _, sample in df.iterrows(): # Iterate over rows (samples)

      classes_probabilities = {}
      for class_ in classes: # Iterate over target classes

        # Start from the prior probability of the class ...
        probability = self.target_distribution[class_]
        for feature in df.columns: # ... and multiply by P(value | class) of every feature

          feature_distr = self.features_distr[feature] # current feature distribution table
          value = sample[feature]
          if value in feature_distr.index: # if current value was in the train dataset ...
            value_prob = feature_distr.loc[value, f"p_{class_}"] # then take the calculated probability
          else:
            # ... otherwise treat it as a value with zero count and apply the same smoothing as fit:
            # alpha / (N_class + alpha * n_values). The denominator needs the class COUNT column,
            # not the probability column (whose sum is always 1).
            count_unique_values = len(feature_distr.index)
            value_prob = self.alpha/(feature_distr[class_].sum() + self.alpha*count_unique_values)
          probability *= value_prob

        classes_probabilities[f"p_{class_}"] = probability

      # We should normalize the calculated probabilities. (sum should be equal to one)
      prob_sum = np.sum(list(classes_probabilities.values()))
      classes_probabilities = {key: prob/prob_sum for key, prob in classes_probabilities.items()}

      probabilities.append(classes_probabilities)

    return probabilities

  def predict(self, X):
    """
      Calculate predicted class for every sample in X.

      Returns a list with one label per sample. Labels are the original values passed to fit
      (not their string form); on ties the class listed first in target_distribution wins.
    """
    classes = list(self.target_distribution)
    y_hat = []
    for classes_prob in self.predict_proba(X):
      # Look the probability up by the original label instead of slicing the "p_" prefix off the key,
      # so non-string labels keep their type and exactly one prediction is produced per sample.
      y_hat.append(max(classes, key=lambda class_: classes_prob[f"p_{class_}"]))
    return y_hat

Let's test the classifier on a toy dataset.

In [81]:
# Toy dataset: four categorical features plus the binary target column 'default'
data = {
    'age': [
        'youth', 'youth', 'middle_age', 'senior', 'senior', 'senior', 'middle_age',
        'youth', 'youth', 'senior', 'youth', 'middle_age', 'middle_age', 'senior',
    ],
    'income': [
        'high', 'high', 'high', 'medium', 'low', 'low', 'low',
        'medium', 'low', 'medium', 'medium', 'medium', 'high', 'medium',
    ],
    'student': [
        'no', 'no', 'no', 'no', 'yes', 'yes', 'yes',
        'no', 'yes', 'yes', 'yes', 'no', 'yes', 'no',
    ],
    'credit_rate': [
        'fair', 'excellent', 'fair', 'fair', 'fair', 'excellent', 'excellent',
        'fair', 'fair', 'fair', 'excellent', 'excellent', 'fair', 'excellent',
    ],
    'default': [
        'no', 'no', 'yes', 'yes', 'yes', 'no', 'yes',
        'no', 'yes', 'yes', 'yes', 'yes', 'yes', 'no',
    ],
}
# The dict keys become the columns, in insertion order
df = pd.DataFrame(data)
df

Unnamed: 0,age,income,student,credit_rate,default
0,youth,high,no,fair,no
1,youth,high,no,excellent,no
2,middle_age,high,no,fair,yes
3,senior,medium,no,fair,yes
4,senior,low,yes,fair,yes
5,senior,low,yes,excellent,no
6,middle_age,low,yes,excellent,yes
7,youth,medium,no,fair,no
8,youth,low,yes,fair,yes
9,senior,medium,yes,fair,yes


In [82]:
# Train the classifier: every column except 'default' is a feature, 'default' is the target
cnb = CustomNaiveBayesClassifier()
cnb.fit(df.drop(columns='default'), df['default'])

In [83]:
# Prior probability of each target class, estimated by fit() from the label frequencies
print("Target class distribution", cnb.target_distribution)

Target class distribution {'yes': 0.6428571428571429, 'no': 0.35714285714285715}


In [84]:
# Distribution table for feature 'age': per-class counts of every value and,
# in the p_<class> columns, the Laplace-smoothed conditional probabilities
cnb.features_distr["age"]

target,no,yes,p_no,p_yes
age,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
middle_age,0,4,0.125,0.416667
senior,2,3,0.375,0.333333
youth,3,2,0.5,0.25


In [85]:
# Distribution table for feature 'income': per-class counts of every value and,
# in the p_<class> columns, the Laplace-smoothed conditional probabilities
cnb.features_distr["income"]

target,no,yes,p_no,p_yes
income,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
high,2,2,0.375,0.25
low,1,3,0.25,0.333333
medium,2,4,0.375,0.416667


In [86]:
# Distribution table for feature 'student': per-class counts of every value and,
# in the p_<class> columns, the Laplace-smoothed conditional probabilities
cnb.features_distr['student']

target,no,yes,p_no,p_yes
student,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,4,3,0.714286,0.363636
yes,1,6,0.285714,0.636364


In [87]:
# Distribution table for feature 'credit_rate': per-class counts of every value and,
# in the p_<class> columns, the Laplace-smoothed conditional probabilities
cnb.features_distr['credit_rate']

target,no,yes,p_no,p_yes
credit_rate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
excellent,3,3,0.571429,0.363636
fair,2,6,0.428571,0.636364


In [88]:
# Test dataset: two unlabeled samples with the same feature columns as the training data
data_new = {
    'age': ['senior', 'middle_age'],
    'income': ['low', 'medium'],
    'student': ['yes', 'no'],
    'credit_rate': ['fair', 'excellent'],
}
# The dict keys become the columns, in insertion order
df_new = pd.DataFrame(data_new)
df_new

Unnamed: 0,age,income,student,credit_rate
0,senior,low,yes,fair
1,middle_age,medium,no,excellent


In [89]:
# Probabilities of target class for every sample in test dataset
# (one dict per row of df_new, keyed by "p_<class>", normalized to sum to one)
probs = cnb.predict_proba(df_new)
probs

[{'p_yes': 0.8758578235790339, 'p_no': 0.12414217642096623},
 {'p_yes': 0.6835222319093288, 'p_no': 0.31647776809067124}]

In [90]:
# Predicted class for every sample in the test dataset (the class with the highest probability)
y_hat = cnb.predict(df_new)
y_hat

['yes', 'yes']