In [1]:
import math
import re
from collections import defaultdict
from sklearn.model_selection import train_test_split


class SentimentAnalysis:
    """
    A simple Naive Bayes sentiment classifier using regular-expression
    tokenisation instead of spaCy.

    Each line of the two input files is treated as one example. Positive
    examples are labelled ``1`` and negative examples ``-1``. On
    construction the examples are split 80/20 into ``training_set`` and
    ``testing_set`` (lists of ``(tokens, label)`` pairs) and the model is
    trained immediately.

    Attributes set by ``__init__``:
        training_set: list of ``(tokens, label)`` pairs used for training.
        testing_set: list of ``(tokens, label)`` pairs held out for eval.
        word_counts: per-label mapping of word -> count in the training set.
        logpriors: per-label log prior probability.
        loglikelihoods: per-label mapping of word -> smoothed log-likelihood.
    """

    # Compiled once for the whole class; matches runs of ASCII letters only.
    _TOKEN_RE = re.compile(r"[a-zA-Z]+")

    def __init__(self, pos_file, neg_file):
        """Read both files, split into train/test sets and train the model.

        Args:
            pos_file: path of the file holding positive examples.
            neg_file: path of the file holding negative examples.
        """
        # Read and tokenise both files
        pos_lines = self.read(pos_file)
        neg_lines = self.read(neg_file)

        # Combine and create labels
        X = pos_lines + neg_lines
        y = [1] * len(pos_lines) + [-1] * len(neg_lines)

        # Stratified train/test split; fixed seed keeps runs reproducible
        X_train, X_test, y_train, y_test = train_test_split(
            X, y,
            test_size=0.20,
            shuffle=True,
            stratify=y,
            random_state=42
        )

        self.training_set = list(zip(X_train, y_train))
        self.testing_set = list(zip(X_test, y_test))

        # Train NB model
        self.word_counts, self.logpriors, self.loglikelihoods = self.train()

    # -----------------------------------------------------------
    # 1. Read and tokenise file using regular expressions
    # -----------------------------------------------------------
    def read(self, filename):
        """Return one list of lower-cased alphabetic tokens per line of *filename*.

        The file is decoded as latin-1. Lines without any alphabetic
        token yield an empty list (they are not skipped).
        """
        find_tokens = self._TOKEN_RE.findall

        with open(filename, encoding='latin-1') as f:
            return [find_tokens(line.lower()) for line in f]

    # -----------------------------------------------------------
    # 2. Train a Naive Bayes classifier with log-probabilities
    # -----------------------------------------------------------
    def train(self):
        """Fit the Naive Bayes model on ``self.training_set``.

        Returns:
            A ``(word_counts, logpriors, loglikelihoods)`` tuple, each a
            dict keyed by label (``1`` / ``-1``). Likelihoods use add-one
            smoothing over the combined vocabulary of both classes.

        Raises:
            ValueError: if either class has no training example, since
                its log prior would be undefined.
        """
        word_counts = {1: defaultdict(int), -1: defaultdict(int)}
        total_words = {1: 0, -1: 0}

        # Count occurrences
        for tokens, label in self.training_set:
            for w in tokens:
                word_counts[label][w] += 1
                total_words[label] += 1

        # Priors
        n_pos = sum(1 for _, y in self.training_set if y == 1)
        n_neg = sum(1 for _, y in self.training_set if y == -1)
        if n_pos == 0 or n_neg == 0:
            raise ValueError(
                "training set must contain at least one example of each class"
            )
        total_docs = n_pos + n_neg

        logpriors = {
            1: math.log(n_pos / total_docs),
            -1: math.log(n_neg / total_docs)
        }

        # Vocabulary
        vocab = set(word_counts[1]) | set(word_counts[-1])
        V = len(vocab)

        # Log-likelihoods with add-one smoothing.
        # .get() is used instead of indexing so that looking up a word the
        # class never saw does not insert a zero entry into the defaultdict
        # that is returned to the caller.
        loglikelihoods = {1: {}, -1: {}}
        for label in (1, -1):
            counts = word_counts[label]
            denominator = total_words[label] + V
            loglikelihoods[label] = {
                w: math.log((counts.get(w, 0) + 1) / denominator)
                for w in vocab
            }

        return word_counts, logpriors, loglikelihoods

    # -----------------------------------------------------------
    # 3. Predict sentiment of a token list
    # -----------------------------------------------------------
    def predict(self, tokens):
        """Return ``1`` or ``-1`` for the given list of tokens.

        Words that were not seen during training are ignored. Ties are
        resolved in favour of the positive class.
        """
        pos_ll = self.loglikelihoods[1]
        neg_ll = self.loglikelihoods[-1]
        log_pos = self.logpriors[1]
        log_neg = self.logpriors[-1]

        for w in tokens:
            # Both tables share the same vocabulary, so one check suffices.
            if w in pos_ll:
                log_pos += pos_ll[w]
                log_neg += neg_ll[w]

        return 1 if log_pos >= log_neg else -1

    # -----------------------------------------------------------
    # 4. Evaluate classifier accuracy on test set
    # -----------------------------------------------------------
    def eval(self):
        """Print the error rate and accuracy on ``self.testing_set``.

        Returns:
            The accuracy as a float in ``[0, 1]``.

        Raises:
            ValueError: if the test set is empty.
        """
        if not self.testing_set:
            raise ValueError("cannot evaluate on an empty test set")

        wrong = sum(
            self.predict(tokens) != true_label
            for tokens, true_label in self.testing_set
        )

        error = wrong / len(self.testing_set)
        print(f"error rate = {error:.4f}, accuracy = {1 - error:.4f}")
        return 1 - error


# ----------------------------------------------------------------------
# MAIN
# ----------------------------------------------------------------------
def main():
    """Build the classifier from the rt-polarity data files and evaluate it.

    Constructing ``SentimentAnalysis`` reads both files and trains the
    model; ``eval()`` then prints the error rate and accuracy.
    """
    positive_path = 'data/rt-polarity.pos'
    negative_path = 'data/rt-polarity.neg'

    classifier = SentimentAnalysis(positive_path, negative_path)
    classifier.eval()


# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()


error rate = 0.0779, accuracy = 0.9221
