In [1]:
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from scipy.stats import multivariate_normal

## Naive Bayes Classification
#### Ni√±o Paul Batanay
------
### Problem 1

In [2]:
#load the dataset
data = np.loadtxt("seeds_dataset.txt")

#split it to train and test set
test_ind = np.random.choice(np.arange(len(data)), size=40)
train_ind = np.array([i for i in np.arange(len(data)) if i not in test_ind ])

train = data[train_ind]
test = data[test_ind]

X_train, y_train = train[:,:-1], train[:,-1]
X_test, y_test = test[:,:-1], test[:,-1]

In [3]:
#get the mean and variance for each feature of each class label in the training set
mean1, covar1 = np.mean(X_train[y_train==1], axis=0), np.diag(np.var(X_train[y_train==1], axis=0))
mean2, covar2 = np.mean(X_train[y_train==2], axis=0), np.diag(np.var(X_train[y_train==2], axis=0))
mean3, covar3 = np.mean(X_train[y_train==3], axis=0), np.diag(np.var(X_train[y_train==3], axis=0))

In [4]:
#make a function for the log_likelihood, we omit the uniform prior p(c_i) since it is constant across classes
log_lik = lambda x,mean,covar: multivariate_normal(mean=mean, cov=covar).logpdf(x)
log_like_test = np.column_stack((log_lik(X_test, mean1, covar1),
                                 log_lik(X_test, mean2, covar2),
                                 log_lik(X_test, mean3, covar3)))

preds = np.argmax(log_like_test, axis=1)+1 #since indexing starts at 0

#get the accuracy
acc = np.mean(preds==y_test)

In [5]:
acc

0.94999999999999996

### Problem 2

In [6]:
#equivalent naive bayes in sklearn
nb_classifier = GaussianNB()
nb_classifier.fit(X_train, y_train)
preds_sk = nb_classifier.predict(X_test)
acc_sk = np.mean(preds_sk==y_test)

In [7]:
acc_sk

0.94999999999999996

In [8]:
np.all(preds==preds_sk) #same prediction

True

### Problem 3

In [9]:
class naiveBayes(object):
    """
    This class performs Naive Bayes classification for word-count document -
    features.
    """
    def __init__(self, train_data=None, train_labels=None):
        """
        Initialize a Naive Bayes classifier.
        """
        self.X = train_data
        self.y = train_labels
        
    def fit(self,X,Y):
        """
        Fit the parameters according to the labeled training data (X,Y).
        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)
        Each row is the word-count vector for one of the documents
        Y : ndarray of shape (n_samples,)
        Gives the class label for each instance of training data. -
        Assume class labels are in {0,1,...,k-1} where k is the -
        number of classes.
        """
        # get prior class probabilities P(c_i)
        # (you may wish to store these as a length k vector as a class attribute)
        # get (smoothed) word-class probabilities
        # (you may wish to store these in a (k, n_features) matrix as a class attribute)
        self.classes = np.unique(Y)
        self.prob_c = np.array([np.mean(Y==i) for i in self.classes])
        self.prob_word = np.array([(np.sum(X[Y==i], axis=0)+1)/(np.sum(X[Y==i])+X.shape[1])
                                   for i in self.classes])
        
        
    def predict(self, X):
        """
        Predict the class labels of a set of test data.
        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)
        The test data
        Returns
        -------
        Y : ndarray of shape (n_samples,)
        Gives the classification of each row in X
        """
        log_lik = np.log(self.prob_c) + np.dot(X, np.log(self.prob_word).T)
        preds = np.argmax(log_lik, axis=1).astype(float)
        return preds

### Problem 4

In [10]:
data = np.loadtxt("SpamFeatures.txt")
labels = np.loadtxt("SpamLabels.txt")

In [11]:
#split it to train and test set
test_ind = np.random.choice(np.arange(len(data)), size=500)
train_ind = np.array([i for i in np.arange(len(data)) if i not in test_ind])

X_train, y_train = data[train_ind], labels[train_ind]
X_test, y_test = data[test_ind], labels[test_ind]

In [12]:
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(4691, 8167) (4691,)
(500, 8167) (500,)


In [13]:
nb = naiveBayes(X_train, y_train)
nb.fit(X_train, y_train)
preds = nb.predict(X_test)

acc = np.mean(preds == y_test)

In [14]:
acc

0.95599999999999996

In [15]:
#sklearn naive bayes
nb = MultinomialNB()
nb.fit(X_train, y_train)
preds_sk = nb.predict(X_test)
acc_sk = np.mean(preds_sk==y_test)

In [16]:
acc_sk

0.95599999999999996

In [17]:
np.all(preds==preds_sk) #same prediction 

True