In [2]:
from datasets import load_dataset

train_data = load_dataset("stanfordnlp/imdb", split="train")
test_data = load_dataset("stanfordnlp/imdb", split="test")

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def preprocess(text):
    text = text.lower()
    text = re.sub(r"<.*?>", "", text)  # Remove HTML tags
    text = re.sub(r"[^a-zA-Z]", " ", text)  # Remove punctuation and numbers
    tokens = text.split()
    tokens = [word for word in tokens if word not in stop_words]
    return " ".join(tokens)

train_data = train_data.map(lambda x: {'text': preprocess(x['text'])})
test_data = test_data.map(lambda x: {'text': preprocess(x['text'])})



[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/anupampatra/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
train_data['text'][0]

'rented curious yellow video store controversy surrounded first released also heard first seized u customs ever tried enter country therefore fan films considered controversial really see plot centered around young swedish drama student named lena wants learn everything life particular wants focus attentions making sort documentary average swede thought certain political issues vietnam war race issues united states asking politicians ordinary denizens stockholm opinions politics sex drama teacher classmates married men kills curious yellow years ago considered pornographic really sex nudity scenes far even shot like cheaply made porno countrymen mind find shocking reality sex nudity major staple swedish cinema even ingmar bergman arguably answer good old boy john ford sex scenes films commend filmmakers fact sex shown film shown artistic purposes rather shock people make money shown pornographic theaters america curious yellow good film anyone wanting study meat potatoes pun intended s

In [5]:
import numpy as np

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=5000)

In [7]:
X_train = vectorizer.fit_transform(train_data['text'])
X_test = vectorizer.transform(test_data['text'])
y_train = np.array(train_data['label'])
y_test = np.array(test_data['label'])

In [8]:
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(25000, 5000) (25000,)
(25000, 5000) (25000,)


In [9]:
NEGATIVE_CLASS = 0
POSITIVE_CLASS = 1

### Calculate the priors for negative and positive class

In [10]:
negative_prob = len(y_train[y_train == NEGATIVE_CLASS])/len(y_train)
positive_prob = len(y_train[y_train == POSITIVE_CLASS])/len(y_train)
negative_prob, positive_prob

(0.5, 0.5)

In [11]:
VOCAB_SIZE = 5000

### Calculate word probability for each class

In [12]:
SMOOTHING_CONST = 1.0

In [13]:
positive_data = X_train[y_train == POSITIVE_CLASS]
positive_word_counts = np.sum(positive_data, axis=0)
total_positive_word_counts = np.sum(positive_word_counts)
positive_word_prob = (positive_word_counts + SMOOTHING_CONST)/(total_positive_word_counts + SMOOTHING_CONST*VOCAB_SIZE)

In [14]:
positive_word_prob.sum()

np.float64(1.0)

In [15]:
negative_data = X_train[y_train == NEGATIVE_CLASS]
negative_word_counts = np.sum(negative_data, axis=0)
total_negative_word_counts = np.sum(negative_word_counts)
negative_word_prob = (negative_word_counts + SMOOTHING_CONST)/(total_negative_word_counts + SMOOTHING_CONST*VOCAB_SIZE)

In [16]:
negative_word_prob.sum()

np.float64(1.0)

### Calculate the log probabilities

In [17]:
log_postive_prob = np.log(positive_prob)
log_negattive_prob = np.log(negative_prob)
log_positive_word_prob = np.log(positive_word_prob)
log_negative_word_prob = np.log(negative_word_prob)

### Calculate joint log probabilities for each test data

In [28]:
joint_positive_prob = log_postive_prob + X_test @ log_positive_word_prob.T

In [30]:
joint_negative_prob = log_negattive_prob + X_test @ log_negative_word_prob.T

### Predict classes for the test data

In [48]:
predictions = np.array((joint_positive_prob > joint_negative_prob).astype(int)).ravel()

### Compute the confusion matrix

In [52]:
from sklearn.metrics import confusion_matrix


In [53]:
cm = confusion_matrix(y_test, predictions)

In [54]:
cm

array([[10735,  1765],
       [ 2187, 10313]])

In [56]:
tn, fn, tp, fp = cm[0][0], cm[1][0], cm[1][1], cm[0][1]
tn, fn, tp, fp

(np.int64(10735), np.int64(2187), np.int64(10313), np.int64(1765))

### Calculate the metrics

In [59]:
n = cm.sum()

In [62]:
accuracy = (tn + tp)/n
precision = tp/(tp + fp)
recall = tp/(tp + fn)

In [63]:
print(accuracy, precision, recall)

0.84192 0.8538665341944031 0.82504


### Using the sklearn Naive Bayes classifier

In [65]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    ConfusionMatrixDisplay,
)
import matplotlib.pyplot as plt

In [66]:
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)

In [67]:
y_pred = nb_classifier.predict(X_test)

In [68]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Accuracy:  {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1 Score:  {f1:.4f}")

Accuracy:  0.8419
Precision: 0.8539
Recall:    0.8250
F1 Score:  0.8392
