In [1]:
import math
from collections import defaultdict

# Training dataset with (text, label)
# Each text is a ", "-separated list of words; a word that appears several
# times in one text (e.g. "love, love") is counted once per occurrence by the
# training loop. Note that every text also contains its own label word in
# lowercase ("comedy" / "action"), which is counted like any other token.
# Two documents are labeled "Comedy" and three are labeled "Action".
data = [
    ("fun, couple, love, love, comedy", "Comedy"),
    ("fast, furious, shoot, action", "Action"),
    ("couple, fly, fast, fun, fun, comedy", "Comedy"),
    ("furious, shoot, shoot, fun, action", "Action"),
    ("fly, fast, shoot, love, action", "Action")
]

# Unlabeled document to classify, in the same ", "-separated format as the
# training texts.
test_doc = "fast, couple, shoot, fly"

# Tokenization function
def tokenize(text):
    """Split a comma-separated string into lowercase tokens.

    Whitespace around each token is ignored and empty fields are dropped, so
    "a, b", "a,b" and " a ,  b, " all yield ["a", "b"].

    Args:
        text: String of comma-separated words.

    Returns:
        List of lowercase tokens in their original order, duplicates kept.
        The list is empty when text contains no non-blank field.
    """
    # Split on the bare comma and strip afterwards: a literal ", " separator
    # silently glues tokens together when a space is missing and leaves
    # stray whitespace when there is more than one.
    fields = (field.strip() for field in text.lower().split(","))
    # Drop empty fields so "" or a trailing comma never produces a "" token
    # that would be counted as a vocabulary word.
    return [field for field in fields if field]

# Naïve Bayes Training Function
def train_naive_bayes(data):
    """Gather the count statistics a Naive Bayes classifier needs.

    Args:
        data: Iterable of (text, label) pairs; each text is split into
            words with tokenize().

    Returns:
        A 3-tuple (word_counts, class_counts, vocab_size) where
        word_counts maps label -> word -> number of occurrences of that word
        in documents with that label, class_counts maps label -> number of
        documents with that label, and vocab_size is the number of distinct
        words seen across all documents.
    """
    docs_per_label = defaultdict(int)
    tokens_per_label = defaultdict(lambda: defaultdict(int))
    seen_words = set()

    for document, category in data:
        docs_per_label[category] += 1
        tokens = tokenize(document)
        seen_words.update(tokens)
        for token in tokens:
            # Index by category inside the loop so a per-label table is only
            # created once a token is actually recorded for it.
            tokens_per_label[category][token] += 1

    vocabulary_size = len(seen_words)
    return tokens_per_label, docs_per_label, vocabulary_size

# Naïve Bayes Classification Function
def classify(text, word_counts, class_counts, vocab_size):
    """Return the most probable class label for text under Naive Bayes.

    Each class c is scored in log space as

        log(docs in c / all docs)
        + sum over tokens w of log((count(w, c) + 1) / (tokens in c + vocab_size))

    where the "+ 1" / "+ vocab_size" terms are Laplace (add-one) smoothing,
    so a word never seen with a class still gets a non-zero probability.

    Args:
        text: Document to classify; split into words with tokenize().
        word_counts: Mapping label -> (mapping word -> occurrence count).
            It is only read, never modified.
        class_counts: Mapping label -> number of training documents.
        vocab_size: Number of distinct words in the training data.

    Returns:
        The label with the highest score. On a tie, the label that comes
        first in class_counts iteration order wins.

    Raises:
        ValueError: If class_counts is empty.
    """
    # Both are independent of the class being scored, so compute them once.
    tokens = tokenize(text)
    total_docs = sum(class_counts.values())

    posteriors = {}
    for label, doc_count in class_counts.items():
        # Use .get() instead of indexing: when the tables are defaultdicts,
        # indexing an unseen label/word would insert a new entry and thereby
        # mutate the trained model on every classification.
        label_counts = word_counts.get(label, {})
        # The smoothed denominator is the same for every token of this class.
        denominator = sum(label_counts.values()) + vocab_size
        log_prior = math.log(doc_count / total_docs)
        posteriors[label] = log_prior + sum(
            math.log((label_counts.get(token, 0) + 1) / denominator)
            for token in tokens
        )

    return max(posteriors, key=posteriors.get)

# Train and test
# Build the count tables from `data`, then classify the single document
# `test_doc` with them.
word_counts, class_counts, vocab_size = train_naive_bayes(data)
predicted_class = classify(test_doc, word_counts, class_counts, vocab_size)

# Echo the input document and the label chosen for it.
print(f"Test Doc: {test_doc}")
print(f"Predicted Class: {predicted_class}")


Test Doc: fast, couple, shoot, fly
Predicted Class: Action
