# Bernoulli Naive Bayes

## Imports

In [2]:
import numpy as np
import pandas as pd
import scipy.sparse as sp

# natural language toolkit
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.tag import pos_tag_sents
from nltk.stem import WordNetLemmatizer

# SciKit-Learn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

## Naive-Bayes Implementation

In [3]:
class BernoulliNaiveBayes():
    def __init__(self, alpha = 1):
        self.alpha = alpha # used for Laplace smoothing
        self.classes = None
        self.priors = None
    
    def __group_samples(self, X,Y):
        # labels -> numbers
        labels = []
        for label in Y:
            labels.append(list(self.classes).index(label))
        print(labels[:10])
        # append X|Y arrays
        XY = np.zeros((X.shape[0], X.shape[1]+1))
        XY[:,:X.shape[1]] = X
        XY[:,-1:] = np.array(labels).reshape(-1,1)
        
        
        # initialize array of empty arrays with length of number of classes
        group_by_class = [[] for _ in range(self.classes.shape[0])]
        
        # for each class, append an X|y sample into array index i if y == classes[i]
        for class_index in range(self.classes.shape[0]):
            for sample in XY:
                if sample[-1] == class_index:
                    group_by_class[class_index].append(sample)
        return group_by_class
        
    def fit(self, X, Y):
        self.classes = np.unique(Y)
        num_samples = X.shape[0]
        groups = self.__group_samples(X,Y)

        self.priors = np.array(list(map(lambda g: np.log(len(g)/num_samples), groups)))

        # get count per group, number of samples per group, and divide.
        word_count = np.array(list(map(lambda g: np.array(g).sum(axis=0)[:-1] + self.alpha, groups)))
        group_count = np.array(list(map(lambda g: len(g), groups)))
        
        # get probabilities, apply laplace smoothing
        self.features_probs = word_count/((group_count + 2*self.alpha)[:,None])
        
    def predict(self, X):
        scores = np.zeros(self.classes.shape[0])
        for i in range(self.classes.shape[0]):
            sum = 0
            for j in range(X.shape[0]):
                if (X[j] == 0):
                    sum += np.log(1 - self.features_probs[i][j])
                else:
                    sum += np.log(self.features_probs[i][j])
            sum += self.priors[i]
            scores[i] = sum
        
        return np.argmax(scores)

In [None]:
X = np.array([[0,1,0],[1,0,1],[0,0,1],[1,1,1]])
Y = np.array([6,7,6,6])
b = BernoulliNaiveBayes()

b.fit(X,Y)
# print(b.predict(np.array([1,1,1])))
print(X.shape)
print(Y.shape)

## Data Loader

In [4]:
# filepaths
train_data = './Data/reddit_train.csv'
test_path = './Data/reddit_test.csv'

#load
comment_data = pd.read_csv(train_data)

#clean
comment_data['prep'] = comment_data['comments'].str.replace(r'[^\w\s]+', '')
comment_data['prep'] = comment_data['prep'].str.lower()
comment_data['prep'] = comment_data['prep'].str.replace('(\d+)', ' num ')
comment_data['prep'] = comment_data['prep'].str.replace(r'http(?<=http).*', ' ')
comment_data['prep'] = comment_data['prep'].str.replace(r'\s+', " ")
comment_data['prep'] = comment_data['prep'].str.replace(" +", " ")

#load
test_data = pd.read_csv(test_path)

#clean
test_data['prep'] = test_data['comments'].str.replace(r'[^\w\s]+', '')
test_data['prep'] = test_data['prep'].str.lower()
test_data['prep'] = test_data['prep'].str.replace('(\d+)', ' num ')
test_data['prep'] = test_data['prep'].str.replace(r'http(?<=http).*', ' ')
test_data['prep'] = test_data['prep'].str.replace(r'\s+', " ")
test_data['prep'] = test_data['prep'].str.replace(" +", " ")

In [5]:
lemmatizer = WordNetLemmatizer()
tt = TweetTokenizer()
def lemmatize_col(row):
    row = tt.tokenize(row)
    return ' '.join([lemmatizer.lemmatize(w) for w in row])

comment_data['prep'] = comment_data['prep'].apply(lemmatize_col)
test_data['prep'] = comment_data['prep'].apply(lemmatize_col)

# stopwords
stop = stopwords.words('english')
comment_data['prep'] = comment_data['prep'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
test_data['prep'] = test_data['prep'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

In [6]:
clean_data = comment_data['prep'].to_numpy()
clean_labels = comment_data['subreddits'].to_numpy()

train_comments = []
for idx in range(clean_data.shape[0]):
    item = (clean_data[idx], clean_labels[idx])
    train_comments.append(item)
train_comments = np.asarray(train_comments)
print(train_comments.shape)

(70000, 2)


In [8]:
# # 60000/10000
# training_data = clean_data[:60000]
# testing_data = clean_data[60000:]
# training_labels = clean_labels[:60000]
# testing_labels = clean_labels[60000:]

commentFolds = kFold(train_comments)
commentFolds.generateSplits()
splits = commentFolds.splits
x, y, z = splits[0]
print(x.shape, y.shape, z.shape)


(56000, 2) (7000, 2) (7000, 2)


In [15]:
def unpack(subset):
    data = []
    labels = []

    for x,y in subset:
        data.append(x)
        labels.append(y)

    data = np.array(X)
    labels = np.array(Y)
    
    return (data, labels)

In [18]:
nb = BernoulliNaiveBayes()
tt = TweetTokenizer()

for split in splits:
    train, val, test = split
    
    training_data, training_labels = unpack(train)
    validation_data, validation_labels = unpack(val)
    testing_data, testing_labels = unpack(test)
    
    # tokenize and remove min words on "training set"
    tfidf_vectorizer = TfidfVectorizer(tokenizer=tt.tokenize, ngram_range=(1,1), min_df=3)
    tfidf = tfidf_vectorizer.fit(training_data)
    
    # filter out bad words
    training_vec = tfidf_vectorizer.transform(training_data).astype(np.float32).toarray()
    validation_vec = tfidf_vectorizer.transform(validation_data).astype(np.float32).toarray()
    testing_vec = tfidf_vectorizer.transform(testing_data).astype(np.float32).toarray()
    
    nb.fit(training_vec,training_labels)
    
    num_correct = 0
    for idx, vec in enumerate(validation_vec):
        result = (nb.predict(vec))
        if nb.classes[result] == validation_labels[idx]:
            num_correct += 1
    print("Validation accuracy is: " , num_correct/(validation_vec.shape[0]))
    
    num_correct = 0
    for idx, vec in enumerate(testing_vec):
        result = (nb.predict(vec))
        if nb.classes[result] == testing_labels[idx]:
            num_correct += 1
    print("Testing accuracy is: " , num_correct/(testing_vec.shape[0]))

[11, 14, 12, 16, 9, 9, 13, 4, 13, 3]


KeyboardInterrupt: 

In [13]:
# tokenize and remove min words on "training set"
tfidf_vectorizer = TfidfVectorizer(tokenizer=tt.tokenize, ngram_range=(1,1), min_df=3)
tfidf = tfidf_vectorizer.fit(training_data)

In [None]:
# filter out bad words
training_vec = tfidf_vectorizer.transform(training_data).astype(np.float32)
testing_vec = tfidf_vectorizer.transform(testing_data).astype(np.float32)

In [None]:
nb = BernoulliNaiveBayes()

nb.fit(training_vec.toarray(),training_labels)


In [None]:
results = []
num_correct = 0
for idx, vec in enumerate(testing_vec.toarray()):
    result = (nb.predict(vec))
    if nb.classes[result] == testing_labels[idx]:
        num_correct += 1
    print(num_correct/(idx+1))
    


In [7]:

# leverages pandas for fast csv load but operates in numpy
class kFold():
    def __init__(self, data, numFolds=5):
        self.data = data
        self.numFolds = numFolds
        self.splits = []
        
    def generateSplits(self):
        #np.random.shuffle(self.data)
        
        folds = []
        splitPoint = self.data.shape[0] // (self.numFolds)  #breakpoint index jump
        
        for i in range(self.numFolds - 1):
            folds.append(self.data[i*splitPoint:(i+1)*splitPoint, :])
            
        folds.append(self.data[(i+1)*splitPoint:,:]) #get extra points in last batch
        
        # create split permutations 80/10/10
        foldDivisor = len(folds[0]) // 2
        for i in range(self.numFolds):
            train = []
            for k in range(self.numFolds):
                if i == k:
                    validation = folds[i][:foldDivisor] 
                    test = folds[i][foldDivisor:] 
                else:
                    train.append(folds[k])
            
            train = np.vstack(train) # adapt dims
            self.splits.append((train, validation, test))