## Import libs and downloads



In [39]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer # Text to num feature conversion
from sklearn.naive_bayes import MultinomialNB # Standard multinomial naive Bayes described in the chapter - uses word counts/frequencies as features
from sklearn.naive_bayes import BernoulliNB #  "Multivariate Bernoulli naive Bayes" (different from binary multinomial NB) - estimates P(w|c) as the fraction of documents containing a term and includes probability for term absence
from sklearn.model_selection import train_test_split, cross_val_score # Data splitting and validation
from sklearn.metrics import classification_report, confusion_matrix

import nltk # nat lang tool kit
from nltk.corpus import stopwords

import re


In [40]:
# Download required NLTK data (one-time setup)
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Text sample with ground truth/gold labels

In [140]:
# Each text represents a document to classify
texts = [
    "I love this movie!.... It's sweet, but with satirical humor.",  # Positive sentiment
    "The dialogue is great and the adventure scenes are fun.",   # Positive sentiment
    "It was pathetic. The worst part about it was the boxing scenes.",  # Negative sentiment
    "No plot twists or great scenes. Entirely predictable.",     # Negative sentiment
    "Awesome caramel sauce and sweet toasty almonds. I love this place!",  # Positive sentiment
    "Awful pizza and ridiculously overpriced food.",            # Negative sentiment
    "Very powerful and the most fun film of the summer.",       # Positive sentiment
    "Just plain boring and lacks energy. No surprises."         # Negative sentiment
]

# Corresponding labels for each text (ground truth)
labels = ['positive', 'positive', 'negative', 'negative',
          'positive', 'negative', 'positive', 'negative']


## Basic preprocessing

In [141]:
# Basic preprocessing like the one from the book
def preprocess_text(text):
  """Clean and normalize text for processing"""

  text = text.lower()
  # Removing punctuation but keeping spaces and letters/numbers
  text = re.sub(r'[^\w\s]','',text) ## caret = the opposite(in this case: non words, non spaces); \w = words ; \s = whitespaces
  return text

processed_texts = [preprocess_text(text) for text in texts]
print("Processed texts: " )
print(processed_texts)

Processed texts: 
['i love this movie its sweet but with satirical humor', 'the dialogue is great and the adventure scenes are fun', 'it was pathetic the worst part about it was the boxing scenes', 'no plot twists or great scenes entirely predictable', 'awesome caramel sauce and sweet toasty almonds i love this place', 'awful pizza and ridiculously overpriced food', 'very powerful and the most fun film of the summer', 'just plain boring and lacks energy no surprises']


## Naive Bayes Algorithm

In [43]:
from collections import defaultdict, Counter
import math # for log operations

In [83]:
class NaiveBayesClassifier:
  def __init__(self, smoothing=1):
    self.smoothing = smoothing # default as laplace add-one
    self.class_priors = {} # P(c) Prob of classes
    self.word_likelihoods = defaultdict(dict) # P(w|c) Prob of the word given the class
    self.vocabulary = set() # Cardinality/size of the vocabulary. Unique words

  def train(self, texts, labels):
    """ Train the Naive Bayes classifier"""

    class_counts = Counter(labels) ## for the Prior probability. Would be: Counter({'positive': 4, 'negative': 4})
    total_docs = len(labels)

    # Calc Prior probability count(class)/total docs
    for class_label, count in class_counts.items(): # dict_items([('positive', 4), ('negative', 4)])
      self.class_priors[class_label] = count/total_docs # MLE for the probs of the classes or labels, using just the frequency
      print("self.class_priors: ", str(self.class_priors))

    class_word_counts = defaultdict(Counter) # Counts each word in each class
    class_total_words = defaultdict(int) # Total words in each class

    for text, label in zip(texts, labels):
      words = text.split() # Tokenizing by split on whitespaces
      for word in words:
        self.vocabulary.add(word)
        class_word_counts[label][word] +=1
        class_total_words[label] += 1

    # Calculate the word likelihoods P(w|c) using add-one (laplace) smoothing
    vocab_size = len(self.vocabulary) # |V|
    for class_label in class_counts:
      for word in self.vocabulary:
        count = class_word_counts[class_label][word] # Raw count of word in class
        # (count + 1) / (total_words + |V|): it is the frequency of the word in the specific class, with an adding of smoothing to not have zeros
        self.word_likelihoods[class_label][word] = (
            (count + self.smoothing) /
            (class_total_words[class_label] + vocab_size * self.smoothing)
        )


  def predict(self, text):
    """Predict the class for a given text using Naive Bayes"""

    words = text.split()
    class_scores = {} # For storing log probabilities per class, logs avoid underflow and other benefits

    ## It will go summing up the logs of the probabilities instead of the raw probabilities
    for class_label in self.class_priors:
      score = math.log(self.class_priors[class_label]) # log P(c)

      # Adding log likelihood for each word in the doc
      for word in words:
        if word in self.vocabulary: # Only known words, if not will be a problematic 0
          score += math.log(self.word_likelihoods[class_label][word])

      class_scores[class_label] = score # stores final score for the class

    print("class_scores: ", str(class_scores))
    print("class probs: ")
    print({class_label: math.exp(log_prob) for class_label, log_prob in class_scores.items()} )

    # Predicts/retrieves the class with the max probability scored from the class_scores. Argmax function
    return max(class_scores, key=class_scores.get)



In [None]:
nb_classifier = NaiveBayesClassifier()
nb_classifier.train(processed_texts, labels)


In [101]:
test_text = "This movie... is... great and fun!"
prediction = nb_classifier.predict(preprocess_text(test_text))
print(f"\nPrediction for '{test_text}': {prediction}\n")

class_scores:  {'positive': -22.416275849231017, 'negative': -25.833205929724727}
class probs: 
{'positive': 1.839649530104649e-10, 'negative': 6.036444539388965e-12}

Prediction for 'This movie... is... great and fun!': positive



In [104]:
test_text_2 = "It was boring."
prediction_2 = nb_classifier.predict(preprocess_text(test_text_2))
print(f"\nPrediction for '{test_text_2}': {prediction_2}\n")


class_scores:  {'positive': -14.386191754963454, 'negative': -11.268684531860199}
class probs: 
{'positive': 5.651403356481483e-07, 'negative': 1.2766518811465485e-05}

Prediction for 'It was boring.': negative



## Handling Negation


In [126]:
def handle_negation(text):
  """
  Implementation of negation handling as described in the book, section 4.4
  Add NOT_ prefix to words after negation until punctuation
  """

  # Words that indicate negation (from chapter examples)
  negation_words = ['not', 'no', 'never', 'nothing', 'nowhere', 'nobody',
                    'none', 'neither', 'nor', 'dont', "don't", 'didnt',
                    "didn't", 'wont', "won't", 'cant', "can't"]

  words = text.split() # Simple tonekizer for individual words
  result = []
  negated = False # Flag that sets scope of negation, starts with detected word and closes with punctuation


  for word in words:
    clean_word = re.sub('[^\w]', '', word.lower()) # Cleaned to detect negation from the negation_words: lowercase and remove non-word chars

    if clean_word in negation_words: ## Detected negation words
      print("Detected negation: ", word)
      negated = True # Changing flag
      result.append(word) ## Adding the original negation word as it is

    elif any(char in word for char in '.:,;!?'): ## Detected punctuation: Punctuation chars close the negated flag scope
      print("Detected punctuation: ", word)
      result.append(f"NOT_{word}" if negated else word)
      negated = False # Changing flag


    else: ## Detected common word: applying the _NOT prefic only if the negated flag is true
      result.append(f"NOT_{word}" if negated else word)

  return(' '.join(result))



In [128]:
# Example from the chapter
text = "didn't like this movie , but I"
negated_text = handle_negation(text)
print(f"Original: {text}")
print(f"Negated:  {negated_text}")  # Should show: didn't NOT_like NOT_this NOT_movie , but I

Detected negation:  didn't
Detected punctuation:  ,
Original: didn't like this movie , but I
Negated:  didn't NOT_like NOT_this NOT_movie NOT_, but I


## Binary vs. Multinomial Naive Bayes

In [142]:
# Splitting data
X_train, X_test, y_train, y_test = train_test_split(
    processed_texts, labels, # Documents and their gold labels
    test_size= 0.3,  # Splits by docs, not by words: 5 docs go to train, and 3 to test
    random_state = 42
    )


### Multinomial Naive Bayes (uses actual word counts)


In [None]:

# Multinomial NB
count_vectorizer = CountVectorizer() #  implements the bag of words by ignoring word order, counting frequencies, building a vocabulary, and converting texts to numerical vectors where each dimension represents word counts
X_train_counts = count_vectorizer.fit_transform(X_train) # Creates sparse matrix for train data.  ## Example: "(0, 13) 1 means Document 0, word at vocabulary index 13, appears 1 time
X_test_counts  = count_vectorizer.transform(X_test) # Creates sparse matrix for test data



In [144]:
## sparse matrix
print("X_train_counts:", str(X_train_counts))  ## Example: "(0, 13) 1 means Document 0, word at vocabulary index 13, appears 1 time
print("X_test_counts:", str(X_test_counts))

X_train_counts: <Compressed Sparse Row sparse matrix of dtype 'int64'
	with 44 stored elements and shape (5, 39)>
  Coords	Values
  (0, 13)	1
  (0, 23)	1
  (0, 4)	1
  (0, 2)	1
  (0, 14)	1
  (0, 7)	1
  (0, 17)	1
  (0, 30)	1
  (1, 12)	2
  (1, 37)	2
  (1, 21)	1
  (1, 32)	2
  (1, 38)	1
  (1, 20)	1
  (1, 0)	1
  (1, 5)	1
  (1, 28)	1
  (2, 2)	1
  (2, 3)	1
  (2, 6)	1
  (2, 27)	1
  (2, 31)	1
  (2, 34)	1
  (2, 1)	1
  (2, 15)	1
  (2, 33)	1
  (2, 22)	1
  (3, 17)	1
  (3, 28)	1
  (3, 24)	1
  (3, 35)	1
  (3, 19)	1
  (3, 11)	1
  (3, 8)	1
  (3, 26)	1
  (4, 2)	1
  (4, 32)	2
  (4, 36)	1
  (4, 25)	1
  (4, 16)	1
  (4, 10)	1
  (4, 9)	1
  (4, 18)	1
  (4, 29)	1
X_test_counts: <Compressed Sparse Row sparse matrix of dtype 'int64'
	with 9 stored elements and shape (3, 39)>
  Coords	Values
  (0, 2)	1
  (0, 10)	1
  (0, 11)	1
  (0, 28)	1
  (0, 32)	2
  (1, 2)	1
  (2, 15)	1
  (2, 31)	1
  (2, 33)	1


In [157]:

multinomial_nb = MultinomialNB(alpha=1.0) # Smoothing of 1.0
multinomial_nb.fit(X_train_counts, y_train)

print("multinomial_nb trained:\n")

# Show the key attributes of the trained model
print("Classes:", multinomial_nb.classes_)  # Class labels ['negative', 'positive']

print("\nClass log priors:", multinomial_nb.class_log_prior_)  # log P(c) for each class
print("Class priors (actual):", np.exp(multinomial_nb.class_log_prior_))  # P(c)

print("\nVocabulary size:", len(count_vectorizer.vocabulary_))  # Number of unique words
print("Feature names (first 10):", count_vectorizer.get_feature_names_out()[:10])  # First 10 words

print("\nFeature log probabilities shape:", multinomial_nb.feature_log_prob_.shape)  # (classes, features)
# This is log P(word|class) for each word in each class

# Example: probability of first few words given each class
for i, class_name in enumerate(multinomial_nb.classes_):
    print(f"\n{class_name} class - first word in probability:")
    for j in range(1):
        word = count_vectorizer.get_feature_names_out()[j]
        log_prob = multinomial_nb.feature_log_prob_[i][j]
        actual_prob = np.exp(log_prob)
        print(f"  P('{word}'|{class_name}) = {actual_prob:.6f}")

multinomial_nb trained:

Classes: ['negative' 'positive']

Class log priors: [-0.51082562 -0.91629073]
Class priors (actual): [0.6 0.4]

Vocabulary size: 39
Feature names (first 10): ['about' 'almonds' 'and' 'awesome' 'boring' 'boxing' 'caramel' 'energy'
 'entirely' 'film']

Feature log probabilities shape: (2, 39)

negative class - first word in probability:
  P('about'|negative) = 0.029851

positive class - first word in probability:
  P('about'|positive) = 0.016949


#### Binary Naive Bayes (uses only presence/absence of words)


In [158]:
binary_vectorizer = CountVectorizer(binary=True) # binary=True converts counts to just 0 or 1
X_train_binary = binary_vectorizer.fit_transform(X_train)
X_test_binary = binary_vectorizer.transform(X_test)

In [161]:
print("X_train_binary:", str(X_train_binary))
print("X_test_binary:", str(X_test_binary))

X_train_binary: <Compressed Sparse Row sparse matrix of dtype 'int64'
	with 44 stored elements and shape (5, 39)>
  Coords	Values
  (0, 13)	1
  (0, 23)	1
  (0, 4)	1
  (0, 2)	1
  (0, 14)	1
  (0, 7)	1
  (0, 17)	1
  (0, 30)	1
  (1, 12)	1
  (1, 37)	1
  (1, 21)	1
  (1, 32)	1
  (1, 38)	1
  (1, 20)	1
  (1, 0)	1
  (1, 5)	1
  (1, 28)	1
  (2, 2)	1
  (2, 3)	1
  (2, 6)	1
  (2, 27)	1
  (2, 31)	1
  (2, 34)	1
  (2, 1)	1
  (2, 15)	1
  (2, 33)	1
  (2, 22)	1
  (3, 17)	1
  (3, 28)	1
  (3, 24)	1
  (3, 35)	1
  (3, 19)	1
  (3, 11)	1
  (3, 8)	1
  (3, 26)	1
  (4, 2)	1
  (4, 32)	1
  (4, 36)	1
  (4, 25)	1
  (4, 16)	1
  (4, 10)	1
  (4, 9)	1
  (4, 18)	1
  (4, 29)	1
X_test_binary: <Compressed Sparse Row sparse matrix of dtype 'int64'
	with 9 stored elements and shape (3, 39)>
  Coords	Values
  (0, 2)	1
  (0, 10)	1
  (0, 11)	1
  (0, 28)	1
  (0, 32)	1
  (1, 2)	1
  (2, 15)	1
  (2, 31)	1
  (2, 33)	1


In [166]:
binary_nb = BernoulliNB(alpha=1.0) # Binary variant of NB
binary_nb.fit(X_train_binary, y_train)


print("binary_nb trained:\n")

# Show the key attributes of the trained BernoulliNB model
print("Classes:", binary_nb.classes_)  # Class labels ['negative', 'positive']

print("\nClass log priors:", binary_nb.class_log_prior_)  # log P(c) for each class
print("Class priors (actual):", np.exp(binary_nb.class_log_prior_))  # P(c)

print("\nVocabulary size:", len(binary_vectorizer.vocabulary_))  # Number of unique words
print("Feature names (first word):", binary_vectorizer.get_feature_names_out()[0])  # First word

print("\nFeature log probabilities shape:", binary_nb.feature_log_prob_.shape)  # (classes, features)
# This is log P(word_present|class) for binary features

# Example: probability of first word being present given each class
for i, class_name in enumerate(binary_nb.classes_):
    print(f"\n{class_name} class - first word presence probability:")
    word = binary_vectorizer.get_feature_names_out()[0]
    log_prob = binary_nb.feature_log_prob_[i][0]
    actual_prob = np.exp(log_prob)
    print(f"  P('{word}' present|{class_name}) = {actual_prob:.6f}")

binary_nb trained:

Classes: ['negative' 'positive']

Class log priors: [-0.51082562 -0.91629073]
Class priors (actual): [0.6 0.4]

Vocabulary size: 39
Feature names (first word): about

Feature log probabilities shape: (2, 39)

negative class - first word presence probability:
  P('about' present|negative) = 0.400000

positive class - first word presence probability:
  P('about' present|positive) = 0.250000


### Compare predictions from both approaches


In [169]:
y_test

['positive', 'negative', 'positive']

In [168]:
print("Multinomial NB predictions:", multinomial_nb.predict(X_test_counts))
print("Binary NB predictions:", binary_nb.predict(X_test_binary))

Multinomial NB predictions: ['negative' 'positive' 'positive']
Binary NB predictions: ['negative' 'negative' 'positive']
