In [6]:
from os import listdir
import re
from typing import List, Counter
import numpy as np
import collections
import operator
from sklearn.model_selection import train_test_split

In [7]:
def strip(word: str, chars_to_strip: List[str]) -> str:
  for char in chars_to_strip:
    word = word.strip(char)
      
  return word

def format_word(word: str) -> str:
  return strip(word, ['"', "'", "(", ")"])

def tokenize(doc: str) -> List[str]:
    doc=doc.lower()
    doc=re.sub('\n',' ',doc)
    doc=re.sub('[,.+=]',' ',doc)
    doc=re.sub('[0-9]','',doc)
    doc=re.sub('\s+',' ',doc)
    doc=doc.strip()
    return [w for w in [format_word(word) for word in doc.split()] if len(w) > 0]

def read_docs(category: str, target: str):
  categories = ['business', 'entertainment', 'politics', 'sport', 'tech']

  if category not in categories:
    raise NameError(f'Category {category} not found')

  directory = f'./bbc/{category}/'
  filenames = [f'{directory}{f}' for f in listdir(directory) if f.endswith('.txt')]

  for file in filenames:
    with open(file, encoding='latin-1') as f:
      yield Document(f.read(), target)

In [8]:
class Document:
  def __init__(self, text, target):
    self.text = text
    self.target = target
    self.words = tokenize(self.text)

  def get_vocabulary(self):
    return collections.Counter(self.words)

In [9]:
class NaiveBayes:
  def __init__(self) -> None:
      self.log_prior = {}
      self.log_likelihood = {}
      self.targets = []
      self.vocabulary = {}

  def train(self, documents: List[Document]):
    docs_count = len(documents)
    self.targets = self.get_targets(documents)
    self.vocabulary = collections.Counter()

    for doc in documents: 
      self.vocabulary += doc.get_vocabulary()

    for target in self.targets:
      target_docs = self.get_target_documents(documents, target)
      target_docs_count = len(target_docs)

      self.log_prior[target] = np.log(target_docs_count/docs_count)
      
      target_vocabulary = self.get_docs_vocabulary(target_docs)
      target_words_count = sum(target_vocabulary.values())

      for word in self.vocabulary.keys():
        word_count = target_vocabulary[word]
        self.log_likelihood[f'{target}-{word}'] = np.log( (word_count + 1) / (target_words_count + 1) )

    return self

  def test(self, document: Document, stop_words_count=50) -> str:
    target_sum  = {}
    stop_words = [word for (word, _) in self.vocabulary.most_common(stop_words_count)]

    for target in self.targets:
      target_sum[target] = self.log_prior[target]

      for word in document.words:
        if word not in stop_words:
          try:
            target_sum[target] += self.log_likelihood[f'{target}-{word}']
          except KeyError:
            pass

    (max_index, _) = max(enumerate(list(target_sum.values())), key=operator.itemgetter(1))
    return list(target_sum)[max_index]

  def score(self, documents: List[Document]) -> float:
    success_count = 0
    documents_count = len(documents)

    for document in documents:
      prediction = self.test(document)
      if prediction == document.target:
        success_count += 1

    return {
      'accuracy': success_count / documents_count,
    }

  def get_targets(self, documents: List[Document]) -> List[str]:
    return list(collections.Counter([doc.target for doc in documents]).keys())

  def get_target_documents(self, documents: List[Document], target: str) -> List[Document]:
    target_docs = [doc for doc in documents if doc.target == target]
    return target_docs

  def get_docs_vocabulary(self, documents: List[Document]) -> Counter:
    vocabulary = collections.Counter()
    for doc in documents:
      vocabulary += doc.get_vocabulary()

    return vocabulary


In [10]:
dataset = [doc for doc in read_docs('tech', 'true')] + [doc for doc in read_docs('business', 'false')] + [doc for doc in read_docs('politics', 'false')] + [doc for doc in read_docs('entertainment', 'false')]

X_train, X_test, y_train, y_test = train_test_split(dataset, dataset, test_size=0.3, random_state=42)
classifier = NaiveBayes().train(X_train)
result = classifier.score(X_test)
print(result)


{'accuracy': 0.9669902912621359}
