# Implementing Naive Bayes from scratch using an example

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import os
import sklearn
import matplotlib as mpl

In [2]:
mpl.rcParams["figure.figsize"] = (8,6)
mpl.rcParams["axes.grid"] = False

In [3]:
%matplotlib inline

In [4]:
X_train = np.array([
    [0,1,1],
    [0,0,1],
    [0,0,0],
    [1,1,0]
])
Y_train = ["Y", "N", "Y", "Y"]
X_test = np.array([[1,1,0]])

In [5]:
from collections import defaultdict

def get_label_indices(labels):
    
    label_indices = defaultdict(list)
    
    for index, label in enumerate(labels):
        label_indices[label].append(index)
    return label_indices

In [6]:
label_indices = get_label_indices(Y_train)
label_indices

defaultdict(list, {'Y': [0, 2, 3], 'N': [1]})

In [7]:
def get_prior(label_indices):
    prior = {label: len(indices) for label, indices in label_indices.items()}
    total_count = sum(prior.values())
    for label in prior.keys():
        prior[label] /= total_count
    return prior

In [8]:
prior = get_prior(label_indices)
prior

{'Y': 0.75, 'N': 0.25}

In [9]:
def get_likelihood(features, label_indices, smoothing = 1):
    likelihood = {}
    for label, indices in label_indices.items():
        likelihood[label] = (np.sum(features[indices, :], axis = 0) + smoothing)/(len(indices) + 2*smoothing)
    return likelihood

In [10]:
likelihood = get_likelihood(X_train, label_indices)
likelihood

{'Y': array([0.4, 0.6, 0.4]), 'N': array([0.33333333, 0.33333333, 0.66666667])}

In [11]:
def get_posterior(X, prior, likelihood):
    posterior = prior.copy()
    for x in X:
        for label, likelihood_label in likelihood.items():
            for index, bool_value in enumerate(x):
                posterior[label] *= likelihood_label[index] if bool_value else (1-likelihood_label[index])
    total_sum = sum(posterior.values())
    posterior = {label: value/total_sum for label, value in posterior.items()}
    return posterior

In [12]:
posterior = get_posterior(X_test, prior, likelihood)
posterior

{'Y': 0.9210360075805433, 'N': 0.07896399241945673}

# Using the BernouliiNB class of naive bayes module from sklearn package

In [13]:
from sklearn.naive_bayes import BernoulliNB

In [14]:
bernoulli_nb = BernoulliNB(alpha = 1, fit_prior = True)

In [15]:
bernoulli_nb.fit(X_train, Y_train)

In [16]:
bernoulli_nb.predict_proba(X_test)

array([[0.07896399, 0.92103601]])

In [17]:
bernoulli_nb.predict(X_test)

array(['Y'], dtype='<U1')

## Implimenting the movie preference predictor on Movie Lens Dataset

In [22]:
import os
import urllib
import tarfile
from pathlib import Path

folder_path = Path(os.getcwd()) / "Dataset"
download_root = "http://files.grouplens.org/datasets/movielens/ml-latest-small.zip"
path = urllib.request.urlretrieve(download_root, os.getcwd())
tar_file = tarfile.open(path)

PermissionError: [Errno 13] Permission denied: 'C:\\Users\\spect\\OneDrive\\Desktop\\ML_conda_projects\\spectre_lab\\Python_ML_by_example\\1. Building a movie recommendation system with Naive Bayes'

In [20]:
path

('C:\\Users\\spect\\AppData\\Local\\Temp\\tmp789kxgi1',
 <http.client.HTTPMessage at 0x17631d4e7a0>)