Caelan Osman

November 14, 2021

Homework 6.2

In [1]:
import numpy as np
import scipy as sp
import pandas as pd
import time
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import  GaussianNB
from sklearn.neighbors import KNeighborsClassifier

# Exercise 6.6
We prove the formulas (6.15), (6.18), and (6.19) in the text for training the naive Bayes classifer with maximum Likelihood estimators.
We want to find the minimizer of

\begin{align}
    &-\sum_{c\in\mathscr{Y}} N_c \log(\pi_c) - \sum_{j=1}^d\sum_{c\in\mathscr{Y}}\sum_{i:y_i = c} \log(p(x_{ij}|\boldsymbol{\theta}_{j, c}))\\
    &\text{subject to} \sum \pi_c - 1 = 0 \\
    &-\pi_c \leq 0
\end{align}

Using the KKT conditions this leads the Lagrangian

\begin{align}
    \mathscr{L} = &-\sum_{c\in\mathscr{Y}} N_c \log(\pi_c) - \sum_{j=1}^d\sum_{c\in\mathscr{Y}}\sum_{i:y_i = c} \log(p(x_{ij}|\boldsymbol{\theta}_{j, c})) + \lambda\left(\sum \pi_c - 1\right) - \boldsymbol{\mu}^T\boldsymbol{\pi}
\end{align}
Taking derivatives tells us $\boldsymbol{\mu} = \boldsymbol{0}$.
Furthermore $\frac{N_k}{\pi_k} = \lambda$ also that $\lambda = N$ this gives us

\begin{align}
    \widehat{\pi}_c = \frac{N_c}{N}.
\end{align}

Similarly, we want to find the argmin of the following

\begin{align}
    \min_{\sigma^2, \mu}\sum_{i:y_i =c} \log(2\pi \sigma^2) + \frac{1}{\sigma^2} (x_{ij} - \mu)^2
\end{align}
Taking the derivative with respect to $\mu$ gives

\begin{align}
    \sum_{i:y_i = c} (x_{ij} - \mu) = 0 \implies \widehat{\mu}_{j, c} = \frac{1}{N_c}\sum_{i: y_i = c}x_{ij}
\end{align}

Taking the derivative with respect to $\sigma^2$ gives

\begin{align}
    &\sum \frac1{\sigma^2} + \frac{1}{(\sigma^2)^2}(x_{ij} -\mu)^2 = 0\\
    &\sum \frac1{\sigma^2}\left(1 - \frac1{\sigma^2}(x_{ij} - \mu)^2    \right) = 0\\
    &\implies N_c = \frac{1}{\sigma^2}\sum(x_{ij} - \mu)^2\\
    &\implies \widehat{\sigma^2}_{j, c} = \frac{1}{N_c}\sum_{i:y_i = c}(x_{ij} - \mu_{j, c})^2
\end{align}
as desired.

Finally we have one more thing to minimize
\begin{align}
    &\sum - \log\left(\theta_{j, c}^{x_j}(1 - \theta_{j, c})^{1 - x_j}\right)\\
    &-\sum  x_j\log(\theta_{j, c}) + (1 - x_j)\log(1 - \theta_{j, c})\\
    &\implies -\sum \frac{x_j}{\theta_{j, c}} + \sum \frac{x_j}{\theta_{j, c}} = 0\\
    &\implies \sum \frac{x_j}{\theta_{j, c}} = \sum\frac{(1 - x_j)}{1 - \theta_{j, c}}\\
    &\implies \widehat{\theta}_{j, c} = \frac{N_{j, c}}{N_c}
\end{align}
as desired.

# Exercise 6.7

In [4]:
class GNBC(object):
    ''' This is a Gaussian naive Bayes classifier for
        normally distributed features.
        Paramters:
            :param eps: the minimum variance
        Methods:
            :method __init__: the initizlizer function
            :method fit_: the fitting function
            :method predict_: the predict function
    '''

    def __init__(self, eps):
        '''
        :param eps: (float) the minimum variance
        '''
        self.eps = eps
        return

    def fit_(self,X, y):
        '''
        Parameters:
            :param X: ((N, d), np.ndarray) training set, d is the number of features and
                                           N is the number of data points
            :param y: ((N, ), np.ndarray) the labels of the data points

        Sets As Attributes:
            :param n_features: (int) the total number of features
            :param n_classes: (int) the total number of classes
            :param class_counts: ((n_classes, ), np.ndarray) corresponds to the number N_c of
                                                             training samples observed in each class c
            :param pis: ((n_classes, ), np.ndarray) corresponding to the estimated probability Pi_c
                                                    of each class c
            :param classes: ((n_classes, ), np.ndarray) consisting of the class labels Y known to the classifier
            :param sigmas: ((n_classes, n_features)) corresponding to the variance sigma_{j, c}^2 of each feature
                                                     j per class c (also adding epsilon so no variance is ever zero)
            :param mus:  ((n_classes, n_features)) corresponding to the mean mu_{j, c} of each feature j per class
                                                   c.
        '''
        N, d = X.shape
        #get a list of the unique_classes
        self.n_classes = len(set(y))
        #number of features is number of columns of X
        self.n_features = d
        #number of classes is the number of unique inputs in y
        self.classes, self.class_counts = np.unique(y, return_counts=True)
        #get pis
        self.pis = self.class_counts / N
        #now we calculate the mans and variances
        self.sigmas = np.zeros((self.n_classes, self.n_features))
        self.mus = np.zeros_like(self.sigmas)
        for i in range(self.n_classes):
            mask = y == self.classes[i]
            for j in range(self.n_features):
                self.mus[i, j] = np.mean(X[mask, j])
                self.sigmas[i, j] = np.var(X[mask, j]) + self.eps

        return

    def predict_(self, xs):
        '''predicts the labels for the given array
        :param x: ((d, ), np.ndarray): the values to predict for
        :return: ((d, ), np.ndarray): the predictions given by the maximizer
        '''
        def scalar(x):
            product = []
            for c in range(self.n_classes):
                p_x = np.exp(-(x - self.mus[c])**2/
                             (2*self.sigmas[c])) \
                      /np.sqrt(2*np.pi*self.sigmas[c])
                product.append(np.prod(p_x) * self.pis[c])
            return np.argmax(product)
        xs = np.atleast_1d(xs)
        return np.array([scalar(x) for x in xs])

# Exercise 6.8

In [5]:
def problem6_8():
    iris = datasets.load_iris()
    X = iris.data
    y = iris.target
    #part 1
    X_train, X_test, y_train, y_test\
        = train_test_split(X, y, test_size = 0.3)
    #part 2
    start = time.time()
    model = GNBC(1e-9)
    model.fit_(X_train, y_train)
    ans = model.predict_(X_test)
    end = time.time()
    print('My Code:')
    print('time:', end-start)
    mask = y_test == ans
    print('Misclassification rate:', 1 - sum(mask)/ len(mask))

    #part 3
    start = time.time()
    model = GaussianNB(var_smoothing=1e-9)
    model.fit(X_train, y_train)
    ans = model.predict(X_test)
    end = time.time()

    print()
    print('Sklearn Naive Bayes:')
    print('time:', end-start)
    mask = y_test == ans
    print('Misclassification rate:', 1 - sum(mask)/ len(mask))

    return
problem6_8()


My Code:
time: 0.013268470764160156
Misclassification rate: 0.022222222222222254

Sklearn Naive Bayes:
time: 0.010965347290039062
Misclassification rate: 0.022222222222222254


# Exercise 6.9

In [6]:
def problem6_9():
    digits = datasets.load_digits()
    X = digits.data
    y = digits.target
    #part 1
    X_train, X_test, y_train, y_test =\
        train_test_split(X, y, train_size=0.3)
    #part 2
    start = time.time()
    model = GNBC(1e-9)
    model.fit_(X_train, y_train)
    ans = model.predict_(X_test)
    end = time.time()
    print('My Code:')
    print('time:', end-start)
    mask = y_test == ans
    print('Misclassification rate:', 1 - sum(mask)/ len(mask))

    #part 3
    start = time.time()
    model = GaussianNB(var_smoothing=1e-9)
    model.fit(X_train, y_train)
    ans = model.predict(X_test)
    end = time.time()

    print()
    print('Sklearn Naive Bayes:')
    print('time:', end-start)
    mask = y_test == ans
    print('Misclassification rate:', 1 - sum(mask)/ len(mask))

    start = time.time()
    model = KNeighborsClassifier()
    model.fit(X_train, y_train)
    ans = model.predict(X_test)
    end = time.time()

    print()
    print('Sklearn KNN:')
    print('time:', end-start)
    mask = y_test == ans
    print('Misclassification rate:', 1 - sum(mask)/ len(mask))

    return
problem6_9()

My Code:
time: 0.53261399269104
Misclassification rate: 0.21462639109697934

Sklearn Naive Bayes:
time: 0.008588790893554688
Misclassification rate: 0.18044515103338632

Sklearn KNN:
time: 0.16542768478393555
Misclassification rate: 0.027027027027026973
