In [1]:
import re
import string
from functools import reduce

from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from wordcloud import STOPWORDS

In [None]:
class Classificator:
    """Bag-of-words spam/ham scorer with add-one (Laplace) smoothing.

    Messages are sequences of string tokens and labels are the strings
    ``'spam'`` and ``'ham'``.

    Attributes:
        vocab: set of every token seen during training.
        count: ``{'spam': {token: n}, 'ham': {token: n}}`` raw token counts.
        freq: same layout as ``count``, holding smoothed token probabilities.
    """

    # Label treated as the positive class by recall / precision / f1.
    POSITIVE = 'spam'
    NEGATIVE = 'ham'

    def __init__(self):
        self.vocab = set()
        # Stored as ``count`` (not ``word_count``) so the instance attribute
        # does not shadow the ``word_count`` classmethod that ``train`` calls.
        self.count = {
            'spam': {},
            'ham': {}
        }
        self.freq = {
            'spam': {},
            'ham': {}
        }

    @classmethod
    def get_vocab(cls, X):
        """Return the set of distinct tokens over all messages in ``X``.

        An empty ``X`` yields an empty set.
        """
        return set().union(*X)

    @classmethod
    def word_count(cls, X, Y, count=None):
        """Accumulate per-label token counts.

        Args:
            X: iterable of messages, each an iterable of tokens.
            Y: iterable of labels (``'spam'`` / ``'ham'``) aligned with ``X``.
            count: optional existing count dict to update in place; a fresh
                one is created when omitted.

        Returns:
            The (possibly newly created) count dict.

        Raises:
            KeyError: if a label is not a key of ``count``.
        """
        if count is None:
            count = {
                'spam': {},
                'ham': {}
            }
        for x, y in zip(X, Y):
            label_counts = count[y]
            for word in x:
                label_counts[word] = label_counts.get(word, 0) + 1
        return count

    def word_freq(self, count, vocab):
        """Turn raw counts into add-one smoothed token probabilities.

        For every token counted under a label the probability is
        ``(count + 1) / (tokens_in_label + len(vocab))``. Tokens with no
        count under a label are not stored; ``predict`` supplies their
        smoothed value.

        Args:
            count: per-label token counts as produced by ``word_count``.
            vocab: collection of all known tokens.

        Returns:
            Dict with the same layout as ``count`` holding probabilities.
        """
        freq = {
            'spam': {},
            'ham': {}
        }
        V = len(vocab)
        for y, ycount in count.items():
            # sum() copes with a label that has no tokens yet.
            denominator = sum(ycount.values()) + V
            freq[y] = {
                word: (_count + 1) / denominator
                for word, _count in ycount.items()
            }
        return freq

    def train(self, X, Y):
        """Update vocabulary, counts and probabilities from labelled messages.

        May be called repeatedly; counts accumulate across calls.
        """
        self.vocab.update(self.get_vocab(X))
        self.word_count(X, Y, self.count)
        self.freq = self.word_freq(self.count, self.vocab)

    def _unseen_prob(self, label):
        """Smoothed probability of a token never counted under ``label``."""
        denominator = sum(self.count[label].values()) + len(self.vocab)
        # Untrained model: no evidence either way, so use a neutral factor.
        return 1 / denominator if denominator else 1.0

    def predict(self, X):
        """Return the spam/ham likelihood ratio for one or more messages.

        Args:
            X: a single message (sequence of str tokens) or a sequence of
                messages.

        Returns:
            One ratio for a single message, otherwise a list of ratios.
            Values above 1 favour spam. Class priors are not included.
            An empty ``X`` returns an empty list.
        """
        if len(X) == 0:
            return []
        single = isinstance(X[0], str)
        if single:
            X = [X]
        spam_freq = self.freq[self.POSITIVE]
        ham_freq = self.freq[self.NEGATIVE]
        # Tokens missing from a label's table get the add-one smoothed
        # probability instead of raising KeyError.
        spam_default = self._unseen_prob(self.POSITIVE)
        ham_default = self._unseen_prob(self.NEGATIVE)
        Y = []
        for x in X:
            y_value = 1
            for word in x:
                y_value *= (spam_freq.get(word, spam_default)
                            / ham_freq.get(word, ham_default))
            Y.append(y_value)
        if single:
            Y = Y[0]
        return Y

    def test(self, X, Y):
        """Score labelled messages and return evaluation metrics.

        A message is labelled spam when its likelihood ratio from
        ``predict`` is greater than 1.

        Args:
            X: sequence of messages (or a single message).
            Y: true labels aligned with ``X`` (or a single label).

        Returns:
            Dict with keys ``'accuracy'``, ``'recall'``, ``'precision'``
            and ``'f1'``.
        """
        if len(X) > 0 and isinstance(X[0], str):
            X, Y = [X], [Y]
        Y_predicted = [
            self.POSITIVE if ratio > 1 else self.NEGATIVE
            for ratio in self.predict(X)
        ]
        return {
            'accuracy': self.accuracy(Y, Y_predicted),
            'recall': self.recall(Y, Y_predicted),
            'precision': self.precision(Y, Y_predicted),
            'f1': self.f1(Y, Y_predicted),
        }

    @classmethod
    def _positive_counts(cls, Y, Y_predicted):
        """Return (true positives, false positives, false negatives)."""
        tp = fp = fn = 0
        for actual, predicted in zip(Y, Y_predicted):
            if predicted == cls.POSITIVE:
                if actual == cls.POSITIVE:
                    tp += 1
                else:
                    fp += 1
            elif actual == cls.POSITIVE:
                fn += 1
        return tp, fp, fn

    @classmethod
    def accuracy(cls, Y, Y_predicted):
        """Fraction of predictions equal to the true label (0.0 if empty)."""
        pairs = list(zip(Y, Y_predicted))
        if not pairs:
            return 0.0
        return sum(actual == predicted for actual, predicted in pairs) / len(pairs)

    @classmethod
    def recall(cls, Y, Y_predicted):
        """Share of actual spam predicted as spam (0.0 if there is none)."""
        tp, _, fn = cls._positive_counts(Y, Y_predicted)
        return tp / (tp + fn) if tp + fn else 0.0

    @classmethod
    def precision(cls, Y, Y_predicted):
        """Share of spam predictions that are actual spam (0.0 if none)."""
        tp, fp, _ = cls._positive_counts(Y, Y_predicted)
        return tp / (tp + fp) if tp + fp else 0.0

    @classmethod
    def f1(cls, Y, Y_predicted):
        """Harmonic mean of precision and recall (0.0 if both are zero)."""
        p = cls.precision(Y, Y_predicted)
        r = cls.recall(Y, Y_predicted)
        return 2 * p * r / (p + r) if p + r else 0.0

In [None]:
class Model:
    """Pipeline wrapper: load a labelled corpus, preprocess, split, train, test."""

    def __init__(self):
        self.classificator = Classificator()

    @classmethod
    def load(cls, filename):
        """Read a tab-separated ``label<TAB>message`` file.

        Blank lines are skipped. Only the first tab separates label from
        message, so messages may themselves contain tabs.

        Args:
            filename: path of the file to read (opened as UTF-8).

        Returns:
            ``{'data': [message, ...], 'target': [label, ...]}`` with the
            two lists aligned.

        Raises:
            ValueError: if a non-blank line contains no tab.
        """
        msgs = {
            'data': [],
            'target': []
        }

        with open(filename, encoding='utf-8') as f:
            for lineno, line in enumerate(f, start=1):
                line = line.rstrip('\r\n')
                if not line.strip():
                    continue
                target, sep, msg = line.partition('\t')
                if not sep:
                    raise ValueError(
                        f"{filename}:{lineno}: expected 'label<TAB>message'"
                    )
                msgs['data'].append(msg)
                msgs['target'].append(target)

        return msgs

    @classmethod
    def preprocess(cls, X):
        """Convert raw message strings into lists of lemmatized tokens.

        Steps: lowercase, drop ASCII punctuation and curly double quotes,
        turn the curly apostrophe into ``'``, split on whitespace, drop
        tokens of length <= 2 and stopwords, lemmatize.

        Args:
            X: iterable of message strings.

        Returns:
            List of token lists, one per message.
        """
        # One translation table applied in a single pass per message.
        # NOTE(review): ASCII apostrophes are removed while the curly one is
        # mapped to "'" and therefore survives ("don't" -> "dont",
        # "don’t" -> "don't"); confirm whether that asymmetry is intended.
        char_table = {ord(ch): None for ch in string.punctuation + "“”"}
        char_table[ord("’")] = "'"

        stopwords = set(STOPWORDS)
        # Build the lemmatizer once instead of once per token.
        lemmatize = WordNetLemmatizer().lemmatize

        processed = []
        for line in X:
            # split() with no argument also discards leading/trailing and
            # repeated whitespace.
            tokens = line.lower().translate(char_table).split()
            processed.append([
                lemmatize(word)
                for word in tokens
                if len(word) > 2 and word not in stopwords
            ])
        return processed

    @classmethod
    def split(cls, data, target, test_size=0.3, random_state=109):
        """Split data and labels into train and test parts.

        Thin wrapper over ``train_test_split``; the default keeps 70% for
        training and 30% for testing.

        Returns:
            The result of ``train_test_split(data, target, ...)``, unpacked
            by ``train_test`` as ``X_train, X_test, Y_train, Y_test``.
        """
        return train_test_split(data, target, test_size=test_size, random_state=random_state)

    def train_test(self, filename="SMSSpamCollection", test_size=0.3, random_state=109):
        """Run the whole pipeline on ``filename``.

        Loads the corpus, preprocesses the messages, splits them, trains
        the classifier on the training part and evaluates it on the rest.

        Returns:
            Whatever ``self.classificator.test`` returns for the test split.
        """
        msgs = self.load(filename)
        # Keep the loaded dict intact: the labels are still needed below.
        data = self.preprocess(msgs['data'])
        X_train, X_test, Y_train, Y_test = self.split(data, msgs['target'], test_size, random_state)
        self.classificator.train(X_train, Y_train)
        return self.classificator.test(X_test, Y_test)