In [9]:
import re
import os
from math import log

In [10]:
def normalize(text):
    """Reduce ``text`` to lowercase ASCII words separated by single spaces.

    Every run of characters outside ``A-Z``/``a-z`` (digits, punctuation,
    whitespace, non-ASCII letters) acts as a word separator; the result has
    no leading or trailing space.
    """
    # Collecting the letter runs and re-joining them is the same as replacing
    # non-letters with spaces, squeezing repeated spaces and stripping.
    words = re.findall("[A-Za-z]+", text)
    return " ".join(words).lower()


def get_occurrence(sets, word):
    """Return how many of the containers in ``sets`` contain ``word``."""
    return sum(1 for candidate in sets if word in candidate)


In [11]:
class Classify:
    def __init__(self):
        self.spam_samples = []
        self.ham_samples = []
        self.size = 0

        self._p_spam = 0.5
        self._p_ham = 0.5

    def _p_word_spam(self, word):
        return (get_occurrence(self.spam_samples, word) + 1) / (self.size + 2)

    def _p_word_ham(self, word):
        return (get_occurrence(self.ham_samples, word) + 1) / (self.size + 2)

    def _p_text_spam(self, text):
        p = 1
        for word in text.split():
            if self._p_word_spam(word) != 0:
                p *= self._p_word_spam(word)

        return p

    def _p_text_ham(self, text):
        p = 1
        for word in text.split():
            if self._p_word_ham(word) != 0:
                p *= self._p_word_ham(word)

        return p

    def train(self, spam_files_folders, ham_files_folders, size):
        for spam_files_folder, ham_files_folder in zip(spam_files_folders, ham_files_folders):
            spam_samples = os.walk(spam_files_folder).__next__()[2][:size]
            ham_samples = os.walk(ham_files_folder).__next__()[2][:size]
    
            self.spam_samples += [
                set(
                    normalize(
                        open(spam_files_folder + '/' + i, errors='ignore').read()
                    ).split()[1:]
                ) for i in spam_samples
            ]
            self.ham_samples += [
                set(
                    normalize(
                        open(ham_files_folder + '/' + i, errors='ignore').read()
                    ).split()[1:]
                ) for i in ham_samples]
    
            self.size += size

    def predict(self, text):
        # print(self._p_text_spam(text) * self._p_spam)
        # print(self._p_text_ham(text) * self._p_ham)

        if self._p_text_spam(text) * self._p_spam == 0 or self._p_text_ham(text) * self._p_ham == 0:
            return 'none'

        if log(self._p_text_spam(text) * self._p_spam) > log(self._p_text_ham(text) * self._p_ham):
            # print(self._p_text_spam(text) * self._p_spam, self._p_text_ham(text) * self._p_ham)
            return 'spam'
        return 'ham'



In [12]:
# Number of messages taken from every folder for training.
train_data_size = 1200

# Fit the classifier on the spam/ham folders of both Enron subsets.
cl = Classify()
cl.train(
    spam_files_folders=['enron1/spam', 'enron2/spam'],
    ham_files_folders=['enron1/ham', 'enron2/ham'],
    size=train_data_size,
)

In [13]:
# Smoke test: classify a single hand-written message.
body = """subject how are you now loading no more messages"""

normalized_body = normalize(body)
print(cl.predict(normalized_body))

spam


In [14]:
test_data_size = 300  # 300 is the maximum (the dataset does not have enough emails for more)


def read_normalized_slice(folder, start, stop):
    """Return the normalized text of the files at positions ``start:stop`` in ``folder``.

    Files are taken in the order ``os.walk`` lists them, the same listing the
    classifier's ``train`` method slices from.
    """
    texts = []
    for file_name in next(os.walk(folder))[2][start:stop]:
        # Context manager so every handle is closed right after reading.
        with open(folder + '/' + file_name, errors='ignore') as handle:
            texts.append(normalize(handle.read()))
    return texts


# Held-out messages: the slice that follows the first `train_data_size` files
# used for training.  This relies on os.walk returning the files in the same
# order as it did during training.
spam_samples = read_normalized_slice(
    'enron1/spam', train_data_size, test_data_size + train_data_size
)

ham_samples = read_normalized_slice(
    'enron1/ham', train_data_size, test_data_size + train_data_size
)


In [15]:
classified = 0  # predictions equal to the true label
err = 0         # messages for which predict returned 'none'
failed = 0      # predictions of the wrong label

for samples, _class in [(spam_samples, 'spam'), (ham_samples, 'ham')]:
    for sample in samples:
        rs = cl.predict(sample)
        if rs == _class:
            classified += 1
        elif rs == 'none':
            err += 1
        else:
            failed += 1

# Report against the number of messages actually evaluated; the previous
# wording ("out of {failed} failed") never showed the total.
total = len(spam_samples) + len(ham_samples)
print(f"Classified {classified} out of {total} correctly, {failed} failed. Error: {err}.")

Classified 427 out of 13 failed. Error: 160.


In [16]:
# Divide by the number of messages actually evaluated: the test slices can come
# up shorter than `test_data_size` when a folder runs out of files.
evaluated = len(spam_samples) + len(ham_samples)
accuracy = classified / evaluated if evaluated else 0.0
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 71.17%
