In [1]:
import os
import sys
from collections import Counter

import nltk

In [2]:
def load_data(directory):
    """Load and tokenize the two corpus files found in `directory`.

    Reads "positives.txt" and then "negatives.txt"; every line of each
    file is passed through `extract_words`.

    Returns:
        A two-element list `[positives, negatives]`, where each element is
        a list holding one word-set per line of the corresponding file.
    """
    corpora = []
    for corpus_name in ("positives.txt", "negatives.txt"):
        file_path = os.path.join(directory, corpus_name)
        with open(file_path) as corpus_file:
            lines = corpus_file.read().splitlines()
        corpora.append([extract_words(line) for line in lines])
    return corpora


def extract_words(document):
    """Return the distinct lowercased tokens of `document`.

    The text is split with `nltk.word_tokenize`; tokens without a single
    alphabetic character (pure punctuation, numbers) are discarded.
    """
    unique_words = set()
    for token in nltk.word_tokenize(document):
        # Skip tokens made up entirely of non-letters.
        if not any(map(str.isalpha, token)):
            continue
        unique_words.add(token.lower())
    return unique_words

In [3]:
def generate_frequency_map(words, positives, negatives):
    """Count how often each word occurs in the positive and negative documents.

    Args:
        words: Iterable of vocabulary words to include in the map.
        positives: Iterable of documents (each an iterable of tokens)
            labelled positive.
        negatives: Iterable of documents (each an iterable of tokens)
            labelled negative.

    Returns:
        A dict mapping each word to
        `{"positive_frequency": int, "negative_frequency": int}`, plus two
        extra keys, "total_positive_tokens" and "total_negative_tokens",
        holding the number of tokens across all documents of each class.
    """
    # One pass per class; a token is counted once for every document
    # occurrence, exactly as a nested scan would count it.
    positive_counts = Counter(
        token for document in positives for token in document
    )
    negative_counts = Counter(
        token for document in negatives for token in document
    )

    # Counter returns 0 for words that never appear in a class.
    frequency_map = {
        word: {
            "positive_frequency": positive_counts[word],
            "negative_frequency": negative_counts[word],
        }
        for word in words
    }

    # Totals are derived from the counts once, so they do not scale with
    # the size of the vocabulary.
    frequency_map["total_positive_tokens"] = sum(positive_counts.values())
    frequency_map["total_negative_tokens"] = sum(negative_counts.values())
    return frequency_map

In [21]:
def get_probability_value(token, frequency_map, type):
    """Return the add-one-smoothed relative frequency of `token` in a class.

    Args:
        token: The word to score.
        frequency_map: Mapping of word -> {"positive_frequency": int,
            "negative_frequency": int}, plus the keys
            "total_positive_tokens" and "total_negative_tokens".
        type: Class name, "positive" or "negative". The name shadows the
            builtin but is kept so existing keyword callers still work.

    Returns:
        `(frequency + 1) / total_tokens` for the class. For the "negative"
        class the value is additionally multiplied by
        `calculate_negative_modifier(token)`.

    Raises:
        ZeroDivisionError: If the class total in `frequency_map` is 0.
    """
    entry = frequency_map.get(token)
    # A token absent from the training vocabulary has frequency 0 instead
    # of raising KeyError. The isinstance check also covers a token that
    # happens to equal one of the integer "total_*" keys.
    seen = entry[f"{type}_frequency"] if isinstance(entry, dict) else 0
    count_token = seen + 1
    count_total = frequency_map[f"total_{type}_tokens"]
    p = count_token / count_total
    if type == 'negative':
        return p * calculate_negative_modifier(token)
    return p

In [27]:
# Tokens that signal negation or cancellation. "n't" is listed because
# nltk.word_tokenize splits contractions ("don't" -> "do", "n't"), so the
# literal "don't" is never produced by extract_words; it is kept only for
# callers that pass pre-tokenized text.
_NEGATION_TOKENS = frozenset({"cancel", "no", "not", "don't", "n't"})

# Multiplier applied to the negative-class probability of a negation token.
_NEGATION_WEIGHT = 100


def calculate_negative_modifier(token):
    """Return the weight applied to `token`'s negative-class probability.

    Args:
        token: A single lowercased token.

    Returns:
        100 if `token` is one of the negation tokens, otherwise 1.
    """
    if token in _NEGATION_TOKENS:
        return _NEGATION_WEIGHT
    return 1

In [22]:
def get_probability_value_sum(s, frequency_map, type):
    """Sum the per-word probability values of the text `s` for one class.

    `s` is tokenized with `extract_words`, each distinct word is scored
    with `get_probability_value` for the class `type`, and the scores are
    added up. Returns 0 when `s` yields no words.
    """
    return sum(
        get_probability_value(token, frequency_map, type)
        for token in extract_words(s)
    )

In [28]:
## main ##

# Load the tokenized corpus: one list of word-sets per class
corpus_path = "../resources/corpus"
positives, negatives = load_data(corpus_path)

# Vocabulary: every distinct word seen in either class
words = set()
for document in positives + negatives:
    words.update(document)

# Per-word counts for each class, plus the per-class token totals
frequency_map = generate_frequency_map(words, positives, negatives)

# Text to classify (enable the input() line below for interactive use)
s = "please cancel"
# s = input("s: ")

# Score the text against each class by summing its words' probability values
p_positive = get_probability_value_sum(s, frequency_map, "positive")
p_negative = get_probability_value_sum(s, frequency_map, "negative")

# Express the positive score as a percentage of the combined score
adjusted_p_positive = p_positive / (p_positive + p_negative) * 100
result = f"{adjusted_p_positive:.3f} % probability of being positive"
print(result)


17.989 % probability of being positive
