# Bernoulli Naive Bayes

## Imports

In [2]:
import numpy as np
import pandas as pd
import scipy.sparse as sp
from collections import defaultdict

In [35]:
cleaned_data = np.load('./Data/cleaned_data_train.npy', allow_pickle=True)
feature_matrix = sp.load_npz('./Data/feature_matrix_train.npz')

## Naive-Bayes Implementation

In [185]:
class BernoulliNaiveBayes():
    def __init__(self, alpha = 1):
        self.alpha = alpha # used for Laplace smoothing
        self.classes = None
        self.classes_log_priors = None
    
    def __group_samples(self, X,Y):
        # append X|Y arrays
        XY = np.column_stack((X,Y))
        
        # initialize array of empty arrays with length of number of classes
        group_by_class = [[] for _ in range(len(self.classes))]
        
        # for each class, append an X|y sample into array index i if y == classes[i]
        for class_index in range(len(self.classes)):
            for sample in XY:
                if sample[-1] == self.classes[class_index]:
                    group_by_class[class_index].append(sample)
        return group_by_class
    
    def __predict(self, X):
#         return [(np.log(self.features_probs[i])) for i in range(len(self.classes))]
        scores = np.zeros(len(self.classes))
        for i in range(len(self.classes)):
            sum = 0
            for j in range(len(X)):
                if (X[j] == 0):
                    sum += np.log(1 - self.features_probs[i][j])
                else:
                    sum += np.log(self.features_probs[i][j])
            sum += self.classes_log_priors[i]
            scores[i] = sum
            
        
        return np.argmax(scores)
        
    def fit(self, X, Y):
        self.classes = np.unique(Y)
        num_samples = len(X)
        groups = self.__group_samples(X,Y)

        self.classes_log_priors = np.array(list(map(lambda g: np.log(len(g)/num_samples), groups)))

        # get count per group, number of samples per group, and divide.
        word_count = np.array(list(map(lambda g: np.array(g).sum(axis=0)[:-1] + self.alpha, groups)))
        group_count = np.array(list(map(lambda g: len(g), groups)))
        self.features_probs = word_count/((group_count + 2*self.alpha)[:,None])
        
    def predict(self, X):
        return self.__predict(X)
#         return np.argmax(self.__predict(X), axis=1);
#         return np.argmax(self.__predict(X), axis=1)

In [188]:
X = np.array([[0,1,0],[1,0,1],[0,0,1],[1,1,1]])
Y = np.array([6,7,6,6])
b = BernoulliNaiveBayes()

b.fit(X,Y)
print(b.predict([1,1,1]))

0


In [55]:


num_samples = len(X)
XY = np.column_stack((X,Y))
group_by_class = [[] for _ in range(len(classes))]
for class_index in range(len(classes)):
    for sample in XY:
        if sample[-1] == classes[class_index]:
            group_by_class[class_index].append(sample)


# group_by_class[0][0][:-1]

list(map(lambda c: np.array(c).sum(axis=0)[:-1] + 1, group_by_class))

            
# z = zip(X,Y)
# print(list(z))

[array([11, 13, 15]), array([4, 5, 6])]

## Data Loader

In [4]:
# filepaths
train_data = './Data/reddit_train.csv'
test_path = './Data/reddit_test.csv'

#load
comment_data = pd.read_csv(train_data)

#clean
comment_data['prep'] = comment_data['comments'].str.replace(r'[^\w\s]+', '')
comment_data['prep'] = comment_data['prep'].str.lower()
comment_data['prep'] = comment_data['prep'].str.replace('(\d+)', ' num ')
comment_data['prep'] = comment_data['prep'].str.replace(r'http(?<=http).*', ' wasurl ')
comment_data['prep'] = comment_data['prep'].str.replace(r'\s+', " ")
comment_data['prep'] = comment_data['prep'].str.replace(" +", " ")

#load
test_data = pd.read_csv(test_path)

#clean
test_data['prep'] = test_data['comments'].str.replace(r'[^\w\s]+', '')
test_data['prep'] = test_data['prep'].str.lower()
test_data['prep'] = test_data['prep'].str.replace('(\d+)', ' num ')
test_data['prep'] = test_data['prep'].str.replace(r'http(?<=http).*', ' wasurl ')
test_data['prep'] = test_data['prep'].str.replace(r'\s+', " ")
test_data['prep'] = test_data['prep'].str.replace(" +", " ")

## K-Fold Cross Validation

In [1]:
# leverages pandas for fast csv load but operates in numpy
class kFold():
    def __init__(self, data, numFolds=5):
        self.data = data
        self.numFolds = numFolds
        self.splits = []
        
    def generateSplits(self):
        #np.random.shuffle(self.data)
        
        folds = []
        splitPoint = self.data.shape[0] // (self.numFolds)  #breakpoint index jump
        
        for i in range(self.numFolds - 1):
            folds.append(self.data[i*splitPoint:(i+1)*splitPoint, :])
            
        folds.append(self.data[(i+1)*splitPoint:,:]) #get extra points in last batch
        
        # create split permutations 80/10/10
        foldDivisor = len(folds[0]) // 2
        for i in range(self.numFolds):
            train = []
            for k in range(self.numFolds):
                if i == k:
                    validation = folds[i][:foldDivisor] 
                    test = folds[i][foldDivisor:] 
                else:
                    train.append(folds[k])
            
            train = np.vstack(train) # adapt dims
            self.splits.append((train, validation, test))