<a href="https://colab.research.google.com/github/aminul01-g/IMDB-Sentiment-Analysis/blob/main/IMDB_Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [28]:
# Install dependencies
!pip install numpy scipy scikit-learn \
  transformers datasets accelerate \
  gensim beautifulsoup4 tqdm




In [29]:
from datasets import load_dataset
import re, string
from bs4 import BeautifulSoup

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
import numpy as np
from tqdm import tqdm

from transformers import AutoTokenizer, AutoModel
import torch
from torch.utils.data import DataLoader


In [30]:
# Set seed and device
SEED = 42

# Seed the global NumPy and PyTorch generators as well. SEED is otherwise only
# honoured by the calls that receive it explicitly (train_test_split,
# LogisticRegression, Word2Vec).
np.random.seed(SEED)
torch.manual_seed(SEED)

# Run BERT on the GPU when one is available, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [31]:
#Load IMDB from Hugging Face Datasets
# The returned DatasetDict is displayed as the cell output; the cells below
# only read its "train" and "test" splits.
ds = load_dataset("imdb")
ds


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [32]:
# preprocessing

# Translation table mapping every character of string.punctuation to None,
# i.e. str.translate() with this table deletes those characters.
PUNCT_TABLE = str.maketrans(dict.fromkeys(string.punctuation))

def basic_clean(text: str) -> str:
    """Normalize one raw review into lowercase, punctuation-free text.

    Steps, in order: strip HTML markup (tag boundaries become spaces),
    lowercase, delete the characters in ``PUNCT_TABLE``, then squeeze every
    whitespace run to a single space and trim both ends.

    Args:
        text: Raw review text, possibly containing HTML such as ``<br />``.

    Returns:
        The cleaned string.
    """
    without_html = BeautifulSoup(text, "html.parser").get_text(separator=" ")
    # Punctuation is deleted, not replaced, so "don't" becomes "dont".
    without_punct = without_html.lower().translate(PUNCT_TABLE)
    return re.sub(r"\s+", " ", without_punct).strip()

In [33]:
# Apply preprocessing to new columns for consistency across methods
def preprocess_dataset(dataset):
    """Run ``basic_clean`` over every example of ``dataset`` via ``dataset.map``.

    The mapped function reads each example's "text" field and returns a dict
    with a single "text_clean" key holding the cleaned text.

    Args:
        dataset: Object exposing ``map(function, desc=...)`` whose examples
            have a "text" field.

    Returns:
        Whatever ``dataset.map`` returns.
    """
    def add_clean_text(example):
        return {"text_clean": basic_clean(example["text"])}

    return dataset.map(add_clean_text, desc="Preprocessing")

# Preprocess the two labelled splits and keep them in a plain dict.
ds_prep = {split: preprocess_dataset(ds[split]) for split in ("train", "test")}
# Show the first example of each split as a quick sanity check.
ds_prep["train"][0], ds_prep["test"][0]

({'text': 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far b

In [34]:
#Train/val Split

# Materialize the columns as plain Python lists. Depending on the installed
# `datasets` version, indexing a Dataset by column name may return a lazy
# column object rather than a list; the later cells slice these sequences and
# hand the slices to gensim and the Hugging Face tokenizer, which expect lists.
X_train_full = list(ds_prep["train"]["text_clean"])
y_train_full = list(ds_prep["train"]["label"])
X_test = list(ds_prep["test"]["text_clean"])
y_test = list(ds_prep["test"]["label"])

# Hold out 20% of the training data for validation, stratified on the label
# so both parts keep the same class balance.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full, test_size=0.2, random_state=SEED, stratify=y_train_full
)

(20000, 5000, 25000)

In [None]:

# Sizes of the train / validation / test splits, in that order.
tuple(len(split) for split in (X_train, X_val, X_test))

In [35]:
#evaluation

# One dict per logged evaluation; converted to a DataFrame in the final cell.
results = []

def evaluate_and_log(y_true, y_pred, method_name, extra=None):
    """Compute binary classification metrics, store them in ``results`` and print them.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        method_name: Name stored under "Method" and printed in the header.
            It also identifies the row: logging the same name again replaces
            the earlier entry, so re-running a cell does not create duplicates.
        extra: Optional mapping of additional columns merged into the row.

    Returns:
        None. The row is appended to the module-level ``results`` list.
    """
    acc = accuracy_score(y_true, y_pred)
    prec, rec, f1, _ = precision_recall_fscore_support(y_true, y_pred, average="binary", zero_division=0)
    row = {
        "Method": method_name,
        "Accuracy": round(acc, 4),
        "Precision": round(prec, 4),
        "Recall": round(rec, 4),
        "F1": round(f1, 4),
    }
    if extra:
        row.update(extra)
    # Drop any stale row for this method before appending. Slice assignment
    # mutates the list in place, so existing references to `results` stay valid.
    results[:] = [logged for logged in results if logged.get("Method") != method_name]
    results.append(row)
    print(f"\n=== {method_name} ===")
    print(f"Accuracy:  {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall:    {rec:.4f}")
    print(f"F1:        {f1:.4f}")

In [36]:
#TF–IDF Vectorization

# Unigrams and bigrams, vocabulary capped at 200k features, with document
# frequency bounds min_df=2 and max_df=0.95.
tfidf = TfidfVectorizer(
    max_features=200_000, ngram_range=(1, 2), min_df=2, max_df=0.95
)
# Fit on the training split only; validation and test reuse the fitted vectorizer.
Xtr_tfidf = tfidf.fit_transform(X_train)
Xval_tfidf, Xte_tfidf = (tfidf.transform(split) for split in (X_val, X_test))

In [37]:
# Logistic Regression

# Classifier trained on the TF–IDF features of the training split.
clf_tfidf = LogisticRegression(random_state=SEED, max_iter=2000)
clf_tfidf.fit(Xtr_tfidf, y_train)


In [38]:
# predict
# Validation predictions first, then test predictions.
pred_val, pred_test = (
    clf_tfidf.predict(features) for features in (Xval_tfidf, Xte_tfidf)
)


In [39]:
# log evaluation
for truth, preds, method in (
    (y_val, pred_val, "TF–IDF + LR (Val)"),
    (y_test, pred_test, "TF–IDF + LR (Test)"),
):
    evaluate_and_log(truth, preds, method)


=== TF–IDF + LR (Val) ===
Accuracy:  0.8932
Precision: 0.8867
Recall:    0.9016
F1:        0.8941

=== TF–IDF + LR (Test) ===
Accuracy:  0.8873
Precision: 0.8819
Recall:    0.8945
F1:        0.8881


In [40]:
# Tokenize for Word2Vec
X_train_tok, X_val_tok, X_test_tok = (
    [simple_preprocess(doc) for doc in split]
    for split in (X_train, X_val, X_test)
)

# Word2Vec hyperparameters
w2v_size, w2v_window, w2v_min_count, w2v_workers = 200, 5, 2, 4

# Word-to-vector conversion: train the embeddings on the training split only.
# NOTE(review): with more than one worker, training is presumably not
# bit-for-bit reproducible even with a fixed seed — confirm against the gensim
# documentation if exact reproducibility matters.
w2v = Word2Vec(
    sentences=X_train_tok,
    sg=1,
    seed=SEED,
    vector_size=w2v_size,
    window=w2v_window,
    min_count=w2v_min_count,
    workers=w2v_workers,
)

In [41]:
# averaged vectors

def avg_vector(tokens, model, size):
    """Average the embeddings of the tokens that ``model.wv`` knows.

    Args:
        tokens: Iterable of token strings for one document.
        model: Object whose ``wv`` attribute supports ``token in wv`` and
            ``wv[token]``.
        size: Length of the zero vector returned when no token is known.

    Returns:
        The element-wise mean of the known tokens' vectors, or a float32 zero
        vector of length ``size`` when none of the tokens is in ``model.wv``.
    """
    keyed_vectors = model.wv
    known = [keyed_vectors[tok] for tok in tokens if tok in keyed_vectors]
    if not known:
        return np.zeros(size, dtype=np.float32)
    return np.mean(known, axis=0)

In [42]:
# define train, validation, test data
# One averaged embedding per document, stacked row-wise into a matrix per split.
Xtr_w2v, Xval_w2v, Xte_w2v = (
    np.vstack([avg_vector(doc_tokens, w2v, w2v_size) for doc_tokens in split_tokens])
    for split_tokens in (X_train_tok, X_val_tok, X_test_tok)
)

In [43]:
# shape
tuple(matrix.shape for matrix in (Xtr_w2v, Xval_w2v, Xte_w2v))

((20000, 200), (5000, 200), (25000, 200))

In [44]:
#Train Logistic Regression on Word2Vec
# Same classifier settings as the TF–IDF model, fitted on the averaged
# Word2Vec document vectors of the training split.
clf_w2v = LogisticRegression(max_iter=2000, random_state=SEED)
clf_w2v.fit(Xtr_w2v, y_train)

In [45]:
#prediction
# Validation predictions first, then test predictions.
pred_val, pred_test = (
    clf_w2v.predict(features) for features in (Xval_w2v, Xte_w2v)
)

In [46]:
# log evaluation
for truth, preds, method in (
    (y_val, pred_val, "Word2Vec + LR (Val)"),
    (y_test, pred_test, "Word2Vec + LR (Test)"),
):
    evaluate_and_log(truth, preds, method)


=== Word2Vec + LR (Val) ===
Accuracy:  0.8650
Precision: 0.8614
Recall:    0.8700
F1:        0.8657

=== Word2Vec + LR (Test) ===
Accuracy:  0.8550
Precision: 0.8596
Recall:    0.8486
F1:        0.8541


In [49]:
#Encode texts with BERT

MODEL_NAME = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
bert = AutoModel.from_pretrained(MODEL_NAME).to(DEVICE)
bert.eval()  # inference mode (no dropout); the model is used as a frozen encoder

# Encoding configuration, read by bert_encode as its defaults.
# 128 tokens per review and 32 reviews per batch are the settings the reported
# BERT results were produced with. Raising MAX_LEN (BERT accepts up to 512
# tokens) lets the model see more of long reviews at the cost of speed/memory.
MAX_LEN = 128
BATCH_SIZE = 32

def bert_encode(texts, batch_size=BATCH_SIZE, max_length=MAX_LEN):
    """Encode texts into CLS embeddings using the frozen BERT model.

    Args:
        texts: Sequence of strings. Any iterable is accepted; it is
            materialized into a list before batching.
        batch_size: Number of texts per forward pass.
        max_length: Texts longer than this many tokens are truncated.

    Returns:
        A NumPy array with one row per input text, holding the last-layer
        hidden state at position 0 (the [CLS] token). For empty input an
        array of shape ``(0, hidden_size)`` is returned.
    """
    # The tokenizer expects a real list of str; this also makes slicing safe
    # for lazy or array-like inputs.
    texts = list(texts)
    all_embeddings = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        inputs = tokenizer(
            batch,
            padding=True,
            truncation=True,
            return_tensors="pt",
            max_length=max_length
        ).to(DEVICE)

        # No gradients are needed for feature extraction.
        with torch.no_grad():
            outputs = bert(**inputs)
        # CLS token embedding, moved to the CPU so GPU memory is freed per batch.
        cls_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        all_embeddings.append(cls_embeddings)

    if not all_embeddings:
        # np.vstack([]) would raise; return an empty, correctly shaped matrix.
        return np.empty((0, bert.config.hidden_size), dtype=np.float32)
    return np.vstack(all_embeddings)


In [50]:
# Encode train/val/test sets
Xtr_bert, Xval_bert, Xte_bert = (
    bert_encode(split) for split in (X_train, X_val, X_test)
)

# The labels are unchanged; these names only mirror the BERT feature names.
y_train_bert = y_train
y_val_bert = y_val
y_test_bert = y_test

In [51]:
#Train Logistic Regression on BERT embeddings
# Only this linear classifier is trained; the BERT embeddings are fixed features.
clf_bert = LogisticRegression(random_state=SEED, max_iter=2000)
clf_bert.fit(Xtr_bert, y_train_bert)

In [52]:
# predictions
# Validation predictions first, then test predictions.
pred_val, pred_test = (
    clf_bert.predict(features) for features in (Xval_bert, Xte_bert)
)



In [53]:
# log evaluation
for truth, preds, method in (
    (y_val_bert, pred_val, "BERT (CLS) + LR (Val)"),
    (y_test_bert, pred_test, "BERT (CLS) + LR (Test)"),
):
    evaluate_and_log(truth, preds, method)


=== BERT (CLS) + LR (Val) ===
Accuracy:  0.8148
Precision: 0.8217
Recall:    0.8040
F1:        0.8128

=== BERT (CLS) + LR (Test) ===
Accuracy:  0.8149
Precision: 0.8221
Recall:    0.8037
F1:        0.8128


In [54]:
#Aggregate results
# One row per logged evaluation, ordered alphabetically by method name.
df_results = (
    pd.DataFrame(results)
    .sort_values(by=["Method"])
    .reset_index(drop=True)
)
df_results


Unnamed: 0,Method,Accuracy,Precision,Recall,F1
0,BERT (CLS) + LR (Test),0.8149,0.8221,0.8037,0.8128
1,BERT (CLS) + LR (Val),0.8148,0.8217,0.804,0.8128
2,TF–IDF + LR (Test),0.8873,0.8819,0.8945,0.8881
3,TF–IDF + LR (Val),0.8932,0.8867,0.9016,0.8941
4,Word2Vec + LR (Test),0.855,0.8596,0.8486,0.8541
5,Word2Vec + LR (Val),0.865,0.8614,0.87,0.8657


## Conclusion

**Which approach worked best and why.**  
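In our experiments, **TF–IDF (unigrams + bigrams) + Logistic Regression** achieved the strongest accuracy/F1 on both validation and test splits (test accuracy 0.8873), followed by **Word2Vec + Logistic Regression** (0.8550); **BERT (CLS embeddings) + Logistic Regression** scored lowest (0.8149). BERT benefits from **contextualized token representations** learned from large-scale pretraining on English corpora. Unlike TF–IDF and Word2Vec, BERT models capture **word meaning conditioned on context**, which is particularly valuable for sentiment where phrases like “**not bad**” invert polarity. However, we do not fine‑tune BERT end‑to‑end here: the fixed CLS embedding is not trained to summarize sentiment, and each review is truncated to 128 tokens, so much of a long review is never seen. TF–IDF, by contrast, covers the whole review, and its bigram features directly encode short polarity phrases, which is enough to win on this task.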

**Trade‑offs in accuracy, training time, and resources.**  
TF–IDF is the **fastest** to train and very **resource‑efficient**: vectorization is linear in corpus size and Logistic Regression converges quickly. TF–IDF ignores word order (beyond the n‑grams used) and semantics; it is usually expected to trail fine‑tuned contextual models, yet in this notebook it was the most accurate method. Word2Vec (averaged embeddings) is a middle ground: it captures some semantic similarity, but averaging **loses word order and compositionality**; training Word2Vec is moderately fast, and classification remains light‑weight. BERT is **computationally heavy** (GPU strongly recommended) due to transformer forward passes and large embedding size, and when used as a frozen feature extractor, as done here, it gave the **lowest accuracy/F1** of the three methods. If runtime is a concern, subsampling or using a compact model like `distilbert-base-uncased` or a sentence‑embedding model can reduce cost with small accuracy trade‑offs.

**Observations about common errors.**  
Across methods, misclassifications often involve **mixed or nuanced sentiment**, **sarcasm**, and **domain‑specific references** (actors/directors/genres) that require background knowledge. TF–IDF/Word2Vec struggle more with **long reviews** where sentiment shifts partway through; BERT helps but can still miss discourse structure, especially because reviews are truncated before encoding. Very **short reviews** with limited evidence can confuse all models. Preprocessing choices also matter: aggressive punctuation removal may drop useful cues.

**Bottom line.**  
For classroom and lightweight production scenarios, a **TF–IDF + Linear** model provides a strong baseline with excellent speed, and it was the best performer in this notebook. Frozen **BERT embeddings** did not beat that baseline in our runs; when accuracy is paramount and GPU resources are available, full fine‑tuning of BERT is the more promising route on IMDB sentiment analysis.
