# Naive Bayes from scratch


In [2]:
import numpy as np
import pandas as pd
import heapq
import math

In [4]:
class BayesModel:
    """Two-class Bernoulli Naive Bayes classifier over word-presence features.

    Each document is a collection of word ids; the feature for word ``w`` is
    "the document contains ``w``".  Labels are ``1`` (atheism) and ``2``
    (books).  In the attribute names, ``c1`` refers to label 1 and ``c0`` to
    label 2.
    """

    # Label values expected in ``train_labels`` and returned by ``predict``.
    ATHEISM_LABEL = 1
    BOOKS_LABEL = 2

    def __init__(self):
        """Load the vocabulary from ``data/words.txt`` and reset model state."""
        # Vocabulary: 1-based line number -> word text.
        self.words = {}
        with open('data/words.txt', 'r') as f:
            for i, j in enumerate(f, 1):
                self.words[i] = j.strip()

        self.train_labels = {}
        self.train_data = {}
        self.atheism = []   # ids of training docs labelled 1
        self.books = []     # ids of training docs labelled 2
        self.pw1c1 = {}     # word id -> P(word present | label 1)
        self.pw1c0 = {}     # word id -> P(word present | label 2)
        self.pc1 = None     # P(label 1); set by train()

    @staticmethod
    def _log(p):
        """Return the natural log of ``p``, mapping a zero probability to -inf."""
        return math.log(p) if p > 0 else -math.inf

    def _word_likelihoods(self, doc_ids):
        """Return Laplace-smoothed P(word present | class) for every vocabulary word.

        ``doc_ids`` are the training documents belonging to the class.
        """
        # One pass over the documents: count, per word, how many docs contain it.
        docs_containing = {}
        for doc in doc_ids:
            for word in set(self.train_data[doc]):
                docs_containing[word] = docs_containing.get(word, 0) + 1

        # Add-one smoothing over the two outcomes (present / absent).
        denominator = len(doc_ids) + 2
        return {
            word: (docs_containing.get(word, 0) + 1) / denominator
            for word in self.words
        }

    def train(self, train_labels, train_data):
        """Estimate the class prior and the per-word likelihoods.

        Args:
            train_labels: mapping of doc id -> label (1 or 2).  Docs with any
                other label, or with no entry in ``train_data``, are ignored.
            train_data: mapping of doc id -> collection of word ids in the doc.

        Raises:
            ZeroDivisionError: if no usable document has label 1 or 2.
        """
        self.train_labels = train_labels
        self.train_data = train_data

        self.atheism = []
        self.books = []
        for key, label in train_labels.items():
            if key not in train_data:
                continue
            if label == self.ATHEISM_LABEL:
                self.atheism.append(key)
            elif label == self.BOOKS_LABEL:
                self.books.append(key)

        # P(C = 1): fraction of usable training docs with label 1.
        self.pc1 = len(self.atheism) / (len(self.atheism) + len(self.books))

        self.pw1c1 = self._word_likelihoods(self.atheism)
        self.pw1c0 = self._word_likelihoods(self.books)

    def predict(self, test_data: dict) -> dict:
        """Classify each document by comparing class log-posteriors.

        Args:
            test_data: mapping of doc id -> collection of word ids in the doc.

        Returns:
            Mapping of doc id -> predicted label (1 or 2).  Ties go to 2.
        """
        res = {}
        if not test_data:
            return res

        # Prior and likelihoods must use the SAME log base; mixing bases
        # re-weights the likelihood against the prior and can flip the argmax.
        log_prior_c1 = self._log(self.pc1)
        log_prior_c0 = self._log(1 - self.pc1)

        # Per-word log terms do not depend on the document: compute them once.
        word_terms = [
            (
                word,
                self._log(self.pw1c1[word]),
                self._log(1 - self.pw1c1[word]),
                self._log(self.pw1c0[word]),
                self._log(1 - self.pw1c0[word]),
            )
            for word in self.words
        ]

        for doc, doc_words in test_data.items():
            score_c1 = log_prior_c1
            score_c0 = log_prior_c0
            for word, present_c1, absent_c1, present_c0, absent_c0 in word_terms:
                if word in doc_words:
                    score_c1 += present_c1
                    score_c0 += present_c0
                else:
                    score_c1 += absent_c1
                    score_c0 += absent_c0

            if score_c1 > score_c0:
                res[doc] = self.ATHEISM_LABEL
            else:
                res[doc] = self.BOOKS_LABEL

        return res

# Instantiate the classifier; the constructor reads the vocabulary from data/words.txt.
b = BayesModel()

In [5]:
# Parse the input files under data/.
# Label files hold one integer label per line; the 1-based line number is the doc id.
with open('data/trainLabel.txt', 'r') as label_file:
    train_labels = {
        doc_id: int(line) for doc_id, line in enumerate(label_file, 1)
    }

with open('data/testLabel.txt', 'r') as label_file:
    test_labels = {
        doc_id: int(line) for doc_id, line in enumerate(label_file, 1)
    }

# Data files hold one "<doc id>\t<word id>" pair per line.
# train_data: doc id -> set of word ids; word_to_doc is the inverse index.
train_data = {}
word_to_doc = {}
with open('data/trainData.txt', 'r') as data_file:
    for line in data_file:
        doc_field, word_field = line.split('\t')
        doc_id = int(doc_field)
        word_id = int(word_field.strip())
        train_data.setdefault(doc_id, set()).add(word_id)
        word_to_doc.setdefault(word_id, set()).add(doc_id)

# test_data: doc id -> set of word ids.
test_data = {}
with open('data/testData.txt', 'r') as data_file:
    for line in data_file:
        doc_field, word_field = line.split('\t')
        test_data.setdefault(int(doc_field), set()).add(int(word_field.strip()))

In [6]:
# Fit the class prior and per-word likelihoods on the training split.
b.train(train_labels, train_data)

In [7]:
# Classify both splits; each result maps doc id -> predicted label (1 or 2).
train_preds = b.predict(train_data)
test_preds = b.predict(test_data)

In [8]:
def calc_acc(preds, labels):
    """Return the fraction of predictions that match their true label.

    Args:
        preds: mapping of doc id -> predicted label.
        labels: mapping of doc id -> true label.

    Returns:
        Accuracy in [0, 1] as a float.  A predicted doc with no entry in
        ``labels`` counts as incorrect.  Returns 0.0 when ``preds`` is empty
        (there is nothing to score) instead of failing.
    """
    if not preds:
        return 0.0

    correct = sum(
        1
        for doc, predicted in preds.items()
        if doc in labels and labels[doc] == predicted
    )
    return correct / len(preds)

# Report accuracy on each split, scoring and printing one split at a time.
train_accuracy = calc_acc(train_preds, train_labels)
print(f"Train accuracy: {train_accuracy}")
test_accuracy = calc_acc(test_preds, test_labels)
print(f"Test accuracy: {test_accuracy}")


Train accuracy: 0.9283018867924528
Test accuracy: 0.8896746817538896
