# Naive Bayes Classifier

In [16]:
import nltk
from nltk.corpus import movie_reviews
nltk.download("movie_reviews")


[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


True

In [17]:
from collections import defaultdict, Counter
import math
import random

train_X, train_Y = [], []
test_X, test_Y = [], []

random.seed(0)
for polarity in movie_reviews.categories():
    for fid in movie_reviews.fileids(polarity):
        if random.randrange(5) == 0:
            test_X.append([w for w in movie_reviews.words(fid)])
            test_Y.append(polarity)
        else:
            train_X.append([w for w in movie_reviews.words(fid)])
            train_Y.append(polarity)

print(train_X[0], train_Y[0])

['plot', ':', 'two', 'teen', 'couples', 'go', 'to', 'a', 'church', 'party', ',', 'drink', 'and', 'then', 'drive', '.', 'they', 'get', 'into', 'an', 'accident', '.', 'one', 'of', 'the', 'guys', 'dies', ',', 'but', 'his', 'girlfriend', 'continues', 'to', 'see', 'him', 'in', 'her', 'life', ',', 'and', 'has', 'nightmares', '.', 'what', "'", 's', 'the', 'deal', '?', 'watch', 'the', 'movie', 'and', '"', 'sorta', '"', 'find', 'out', '.', '.', '.', 'critique', ':', 'a', 'mind', '-', 'fuck', 'movie', 'for', 'the', 'teen', 'generation', 'that', 'touches', 'on', 'a', 'very', 'cool', 'idea', ',', 'but', 'presents', 'it', 'in', 'a', 'very', 'bad', 'package', '.', 'which', 'is', 'what', 'makes', 'this', 'review', 'an', 'even', 'harder', 'one', 'to', 'write', ',', 'since', 'i', 'generally', 'applaud', 'films', 'which', 'attempt', 'to', 'break', 'the', 'mold', ',', 'mess', 'with', 'your', 'head', 'and', 'such', '(', 'lost', 'highway', '&', 'memento', ')', ',', 'but', 'there', 'are', 'good', 'and', 'ba

## Model Construction

$\bar{y} = \text{arg}\max_{y \in \mathbf{y}} P(y|x) = \text{arg}\max_{y \in \mathbf{y}} P(y) \prod_{i=1}^n \frac{P(x_i|y)}{P(x_i)} = \text{arg}\max_{y \in \mathbf{y}} P(y) \prod_{i=1}^n P(x_i|y)$

$P(x_i|y)=\frac{C(x_i, y) + k}{C(y) + |\mathbf{y}| \times k}$

$\bar{y} = \textrm{arg} \max_{y \in \mathbf{y}} \log P(y) + \sum_{i=1}^n \log \frac{C(x_i, y) + k}{C(y) + k|\mathbf{y}|}$

     

In [18]:
from collections import Counter, defaultdict
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
import math
import re

nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('wordnet')
class NaiveBayesClassifier:
    def __init__(self, k=1):
        self.k = k
        self.features = set()
        self.class_feature_counts = defaultdict(Counter)
        self.class_counts = Counter()
        self.total = 0
        self.stemmer = PorterStemmer()
        self.total_token_counts = defaultdict(int)

    def lemmatize_tokens(self, tokens):
     lemmatizer = WordNetLemmatizer()
     lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
     return lemmatized_tokens

    def filter_low_frequency_words(self, tokens, min_frequency):
      word_counts = Counter(tokens)  # 統計每個詞彙的出現次數
      filtered_tokens = [token for token in tokens if word_counts[token] > min_frequency]  # 過濾掉出現次數小於 min_frequency 的詞彙
      return filtered_tokens

    def remove_noise(self, tokens):
     stop_words = set(stopwords.words('english'))  # 加載停用詞列表
     filtered_tokens = [token for token in tokens if token.lower() not in stop_words]  # 過濾掉停用詞
     filtered_tokens = [token for token in tokens if re.match(r'^[a-zA-Z]+$', token)]  # 使用正則表達式匹配單字字符
     return filtered_tokens

    def handle_negation(self, text):
     tokens = text.split()
     tagged_tokens = nltk.pos_tag(tokens)  # 對文本進行詞性標註
     i = 0
     while i < len(tagged_tokens) - 1:
        if tagged_tokens[i][0] == "not" and tagged_tokens[i+1][1] == "JJ":  # 檢查是否是 "not" 後面的形容詞
            tokens[i+1] += "_NEG"  # 給形容詞添加 "_NEG" 標記
        i += 1
     return tokens

    def train(self, train_X, train_Y):
        for tokens, label in zip(train_X, train_Y):
            tokens = self.remove_noise(tokens)
            tokens = self.lemmatize_tokens(tokens)
            # tokens = [self.stemmer.stem(token) for token in tokens]
            # tokens = self.filter_low_frequency_words(tokens, 6)
            # text = ' '.join(tokens)
            # tokens = self.handle_negation(text)
            self.class_counts[label] += 1
            self.total += 1
            for token in set(tokens):
                self.features.add(token)
                self.class_feature_counts[label][token] += 1
                self.total_token_counts[label] += 1  # Increment total token count for this class
        # for label, tokens in self.class_feature_counts.items():
        #   for tok in tokens:
        #     if self.class_feature_counts[label][tok] < 6:
        #       self.features.discard(tok)
        #       self.class_feature_counts[label][tok] = 0

    def probabilities(self, token):
        probs = {}
        for cls, cls_cnt in self.class_counts.items():
            token_count = self.class_feature_counts[cls][token]
            total_token_count = self.total_token_counts[cls]
            probs[cls] = (token_count + self.k) / (total_token_count + len(self.features) * self.k)
        return probs

    def predict(self, tokens):
        tokens = self.remove_noise(tokens)
        tokens = self.lemmatize_tokens(tokens)
        # tokens = [self.stemmer.stem(token) for token in tokens]
        # tokens = self.filter_low_frequency_words(tokens, 6)
        # text = ' '.join(tokens)
        # tokens = self.handle_negation(text)
        tokens = set(tokens)
        log_probs = Counter()
        for cls, cls_cnt in self.class_counts.items():
            log_probs[cls] = math.log(cls_cnt / self.total)
        for token in self.features:
            probs = self.probabilities(token)
            if token in tokens:
                for cls, prob in probs.items():
                    log_probs[cls] += math.log(prob)
            else:
                for cls, prob in probs.items():
                    log_probs[cls] += math.log(1.0 - prob)
        return max(log_probs, key=log_probs.get), log_probs


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Using the Model

In [19]:
model = NaiveBayesClassifier()
model.train(train_X, train_Y)

In [20]:
from nltk.tokenize import word_tokenize
nltk.download('punkt')

# Taken from https://www.imdb.com/review/rw0990793/?ref_=tt_urv
review = """A whimsical, often spectacular view of a future in which advances in technology dominate the world. It is well shot and although slow-moving it is intense and enjoyable throughout. The featuring of classical music to establish atmosphere works brilliantly; it provides a feeling of awe, mystery and intrigue  the same aura that Walt Disney worked in creating 'Fantasia'. The special effects, both sound and visual, are still spellbinding by the standards of today's technology. Aside from the technical pluses of the film, it stands strong as it is one of not many films out there that has something important to say about humankind, and where the human race is heading in terms of our increasing reliance on machines and our unquenchable thirst to discover. Despite an ending that is hard to understand, it is even harder to overlook this film a true cinema classic."""

model.predict(word_tokenize(review))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


('pos', Counter({'neg': -747.4105070251276, 'pos': -732.8226860441249}))

In [21]:
correct, total = 0, 0

for x, y in zip(test_X, test_Y):
    prediction, _ = model.predict(x)
    if prediction == y:
        correct += 1
    total += 1

print("%d / %d = %g" % (correct, total, correct / total))

356 / 422 = 0.843602


## Exploring important features

In [22]:
def prob_class_given_feature(feature, cls, model):
    probs = model.probabilities(feature)
    return probs[cls] / sum(probs.values())

print(sorted(model.features, key=lambda t: prob_class_given_feature(t, "pos", model), reverse=True)[:30])
print(sorted(model.features, key=lambda t: prob_class_given_feature(t, "neg", model), reverse=True)[:30])

['thematic', 'dread', 'astounding', 'naval', 'turturro', 'reminder', 'kenobi', 'fascination', 'seamless', 'denial', 'en', 'keen', 'masterfully', 'lovingly', 'burbank', 'balancing', 'downside', 'timeless', 'outstanding', 'lofty', 'uncut', 'criticized', 'dewey', 'meryl', 'splash', 'deliberate', 'vocal', 'gattaca', 'topping', 'fabric']
['hudson', 'vomit', 'illogical', 'sans', 'overwrought', 'tedium', 'pathetically', 'horrid', 'bio', 'undermines', 'plant', 'zellweger', 'schumacher', 'hmmm', 'plodding', 'anthropologist', 'plod', 'stupidly', 'batgirl', 'campiness', 'insulting', 'biologist', 'sphere', 'guinea', 'leaden', 'chevy', 'mug', 'stalk', 'ludicrous', 'lecture']
