In [1]:
import math
from collections import Counter

In [2]:
def count_word_occurrences(data):
    """Tally every word across all categories and count the sentences seen.

    Args:
        data: Mapping whose values are iterables of sentences; each sentence
            is itself an iterable of words. The mapping keys are not used.

    Returns:
        A ``(word_counts, total_sentences)`` tuple: a ``Counter`` of word
        frequencies pooled over every category, and the number of sentences
        that were visited.
    """
    tally = Counter()
    seen = 0

    # Only the sentence groups matter here, so walk the values directly.
    for group in data.values():
        for tokens in group:
            tally.update(tokens)
            seen += 1

    return tally, seen

In [3]:
def calculate_probabilities(word_given_chair, total_sentences, vocabulary, query):
    """Compute a category prior and add-one-smoothed per-word values.

    Args:
        word_given_chair: Numeric count for the category; it is the numerator
            of the prior and is also added to the vocabulary size to form the
            smoothing denominator.
        total_sentences: Divisor for the prior.
        vocabulary: Sized iterable of words; one value is produced per word,
            in the vocabulary's iteration order.
        query: Iterable of words. It is consumed exactly once, so a one-shot
            iterator or generator is handled the same way as a list.

    Returns:
        A ``(prior, probabilities)`` tuple where ``prior`` is
        ``word_given_chair / total_sentences`` and ``probabilities[i]`` is
        ``(occurrences of vocabulary word i in query + 1) /
        (word_given_chair + len(vocabulary))``.

    Raises:
        ZeroDivisionError: If ``total_sentences`` is zero, or if the smoothing
            denominator is zero while ``vocabulary`` is non-empty.
    """
    # NOTE(review): the per-word count is taken from `query`, not from the
    # category's own sentences, so two categories differ only through
    # `word_given_chair` — confirm this is the intended model.
    total_words = len(vocabulary)
    prob_word_given_chair = word_given_chair / total_sentences

    # Materialise the query once. Rebuilding it inside the comprehension was
    # wasted work per vocabulary word and, for a one-shot iterator, left every
    # word after the first looking at an already-drained (empty) query.
    query_words = list(query)
    denominator = word_given_chair + total_words
    probabilities = [(query_words.count(word) + 1) / denominator for word in vocabulary]

    return prob_word_given_chair, probabilities

In [4]:
def calculate_final_score(prob_chair, prob_word):
    """Return the base-10 log score for one category.

    Args:
        prob_chair: Positive prior value for the category.
        prob_word: Iterable of positive per-word values.

    Returns:
        ``log10(prob_chair)`` plus the sum of ``log10`` over ``prob_word``.

    Raises:
        ValueError: If any value is zero or negative (``math.log10`` domain).
    """
    log_prior = math.log10(prob_chair)
    # Keep the built-in sum so float accumulation matches the one-liner form.
    log_likelihood = sum(map(math.log10, prob_word))
    return log_prior + log_likelihood

In [5]:
# Labelled example sentences, already tokenised into word tuples.
# "f" and "p" are the two categories being compared; the final cell reports
# them as "Furniture" and "Position" respectively.
data = {
    "f": [
        ("Put", "coat", "back", "Chair", "sat", "down"),
        ("Chair", "made", "timber", "company"),
        ("Type", "different", "Chair", "Award", "fun", "use"),
    ],
    "p": [
        ("Chair", "institute", "best"),
        ("Award", "IT", "Chair"),
    ],
}

# NOTE(review): the vocabulary lists "it" as well as "IT", and omits some
# words that appear in the sentences above ("timber", "Type", "different",
# "fun", "use") — confirm that is intentional.
# Both collections are sets, so their iteration order is not something the
# later cells should rely on.
vocabulary = {
    "Put", "coat", "back", "Chair", "sat", "down", "made",
    "it", "institute", "best", "Award", "IT", "company",
}
query = {"Award", "Chair", "IT", "company"}

In [6]:
# count_word_occurrences returns (word Counter, sentence total) for whatever
# mapping it receives. Unpacking a single call over all of `data` into
# `f_word_count, p_word_count` therefore stored the pooled Counter under the
# "f" name and the integer sentence total under the "p" name. Count each
# category through its own single-entry mapping so the names hold what they say.
f_word_count, f_sentence_count = count_word_occurrences({"f": data["f"]})
p_word_count, p_sentence_count = count_word_occurrences({"p": data["p"]})

# Number of sentences across every category in `data`.
total_sentences = sum(len(sentences) for sentences in data.values())

In [7]:
# Despite the "prob" in their names, these are raw integer tallies: how many
# times the token "Chair" appears across each category's sentences.
f_prob_word_given_chair = sum(tokens.count("Chair") for tokens in data["f"])
p_prob_word_given_chair = sum(tokens.count("Chair") for tokens in data["p"])

In [8]:
# Prior and per-vocabulary-word values for each category. The two calls share
# the sentence total, vocabulary and query and differ only in the "Chair" tally.
f_prob_chair, prob_f = calculate_probabilities(
    word_given_chair=f_prob_word_given_chair,
    total_sentences=total_sentences,
    vocabulary=vocabulary,
    query=list(query),
)
p_prob_chair, prob_p = calculate_probabilities(
    word_given_chair=p_prob_word_given_chair,
    total_sentences=total_sentences,
    vocabulary=vocabulary,
    query=list(query),
)

In [9]:
# Combine each category's prior with its per-word values into one log10 score;
# the higher (less negative) score is the one reported in the final cell.
final_score_f = calculate_final_score(prob_chair=f_prob_chair, prob_word=prob_f)
final_score_p = calculate_final_score(prob_chair=p_prob_chair, prob_word=prob_p)

In [10]:
# Report the "f" category results, rounded to two decimals for display only;
# the unrounded values in prob_f / final_score_f are left untouched.
print()
print("f_prob_chair:", round(f_prob_chair, 2))
prob_f2 = [round(value, 2) for value in prob_f]
print("prob_f:", prob_f2)
print("final_score_f:", round(final_score_f, 2))


f_prob_chair: 0.6
prob_f: [0.06, 0.12, 0.12, 0.12, 0.06, 0.06, 0.06, 0.06, 0.06, 0.06, 0.12, 0.06, 0.06]
final_score_f: -14.67


In [11]:
# Report the "p" category results, rounded to two decimals for display only;
# the unrounded values in prob_p / final_score_p are left untouched.
print()
print("p_prob_chair:", round(p_prob_chair, 2))
prob_p2 = [round(value, 2) for value in prob_p]
print("prob_p:", prob_p2)
print("final_score_p:", round(final_score_p, 2))


p_prob_chair: 0.4
prob_p: [0.07, 0.13, 0.13, 0.13, 0.07, 0.07, 0.07, 0.07, 0.07, 0.07, 0.13, 0.07, 0.07]
final_score_p: -14.48


In [12]:
# Pick the category with the strictly higher log score; a tie falls through
# to "Position".
print()
if final_score_f > final_score_p:
    print("Category: Furniture")
else:
    print("Category: Position")


Category: Position
