# **Advanced Sentiment Analysis: Classical ML vs Transformer Fine-Tuning**

This project compares two different NLP approaches for sentiment analysis on the IMDb movie review dataset

# Section 1 – Setup & Install Dependencies

In [1]:

# Install required libraries (latest enough versions for eval_strategy)
!pip install -q "torch" "transformers>=4.46.0" "datasets>=2.20.0" "scikit-learn" "matplotlib" "accelerate>=0.33.0"

In [2]:
import os
import random
import numpy as np

# Section 2 – Imports & Seed Setup

In [3]:
import matplotlib.pyplot as plt

from datasets import load_dataset, Dataset
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

In [4]:
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
)

In [5]:
# Disable Weights & Biases logging (wandb)
import os
os.environ["WANDB_DISABLED"] = "true"

In [7]:
# Global constants
RANDOM_SEED = 42
BASE_OUTPUT_DIR = "./outputs"
os.makedirs(BASE_OUTPUT_DIR, exist_ok=True)

# Set random seed for reproducibility
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed(RANDOM_SEED)

print("Imports loaded, seed set, and outputs directory created.")

Imports loaded, seed set, and outputs directory created.


# Section 3 – Load IMDb Dataset



In [8]:
def load_imdb_subset(train_size=4000, test_size=1000):
    """
    Load the IMDb dataset and select a smaller subset
    to keep training fast on Colab.
    """
    print("Loading IMDb dataset...")
    dataset = load_dataset("imdb")

    train_data = dataset["train"]
    test_data = dataset["test"]

    train_idx = np.random.choice(len(train_data), size=train_size, replace=False)
    test_idx = np.random.choice(len(test_data), size=test_size, replace=False)

    train_subset = train_data.select(train_idx)
    test_subset = test_data.select(test_idx)

    X_train = train_subset["text"]
    y_train = train_subset["label"]
    X_test = test_subset["text"]
    y_test = test_subset["label"]

    return X_train, y_train, X_test, y_test

X_train, y_train, X_test, y_test = load_imdb_subset(
    train_size=4000,
    test_size=1000,
)

print(f"Training samples: {len(X_train)}")
print(f"Test samples: {len(X_test)}")
print("\nExample review:\n")
print(X_train[0][:500])
print("\nLabel:", y_train[0])

Loading IMDb dataset...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Training samples: 4000
Test samples: 1000

Example review:

Dumb is as dumb does, in this thoroughly uninteresting, supposed black comedy. Essentially what starts out as Chris Klein trying to maintain a low profile, eventually morphs into an uninspired version of "The Three Amigos", only without any laughs. In order for black comedy to work, it must be outrageous, which "Play Dead" is not. In order for black comedy to work, it cannot be mean spirited, which "Play Dead" is. What "Play Dead" really is, is a town full of nut jobs. Fred Dunst does however do

Label: 0


 ## Section 4 — Baseline Model (TF-IDF + LinearSVC)

4.1 – Confusion Matrix Helper

In [10]:


def plot_confusion_matrix(cm, classes, title, filename):
    """
    Plot a confusion matrix and save it as a PNG file
    under BASE_OUTPUT_DIR/filename.
    """
    fig, ax = plt.subplots(figsize=(4, 4))
    im = ax.imshow(cm, interpolation="nearest", cmap=plt.cm.Blues)
    ax.figure.colorbar(im, ax=ax)

    ax.set(
        xticks=np.arange(len(classes)),
        yticks=np.arange(len(classes)),
        xticklabels=classes,
        yticklabels=classes,
        ylabel="True label",
        xlabel="Predicted label",
        title=title,
    )

    plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")

    thresh = cm.max() / 2.0
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(
                j,
                i,
                format(cm[i, j], "d"),
                ha="center",
                va="center",
                color="white" if cm[i, j] > thresh else "black",
            )

    fig.tight_layout()

    # Save figure as PNG
    save_path = os.path.join(BASE_OUTPUT_DIR, filename)
    plt.savefig(save_path, bbox_inches="tight", dpi=150)
    plt.close(fig)

    print(f"Saved confusion matrix to: {save_path}")

4.2 – Train & Evaluate TF-IDF + LinearSVC

In [11]:
# Baseline sentiment classifier using TF-IDF + LinearSVC

# 1) Vectorize text using TF-IDF
print("Fitting TF-IDF vectorizer...")
vectorizer = TfidfVectorizer(
    max_features=20000,
    ngram_range=(1, 2),
    stop_words="english",
)

X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# 2) Train LinearSVC classifier
print("Training LinearSVC baseline model...")
baseline_clf = LinearSVC()
baseline_clf.fit(X_train_vec, y_train)

# 3) Evaluate on the test set
print("Evaluating baseline model...")
y_pred_baseline = baseline_clf.predict(X_test_vec)

baseline_acc = accuracy_score(y_test, y_pred_baseline)
baseline_f1 = f1_score(y_test, y_pred_baseline, average="weighted")

print(f"\nBaseline Accuracy: {baseline_acc:.4f}")
print(f"Baseline F1-score: {baseline_f1:.4f}\n")

print("Classification report (baseline):")
print(classification_report(y_test, y_pred_baseline, target_names=["neg", "pos"]))

# 4) Confusion matrix (plot + save)
cm_baseline = confusion_matrix(y_test, y_pred_baseline)
plot_confusion_matrix(
    cm_baseline,
    classes=["neg", "pos"],
    title="Baseline Confusion Matrix",
    filename="cm_baseline.png",
)

# 5) Show a few misclassified examples for error analysis
print("\nSample misclassified reviews (baseline):\n")
errors = []
for text, true_label, pred_label in zip(X_test, y_test, y_pred_baseline):
    if true_label != pred_label:
        errors.append((text, true_label, pred_label))

for text, true_label, pred_label in errors[:5]:
    print("-" * 80)
    print(f"True label:    {'pos' if true_label == 1 else 'neg'}")
    print(f"Predicted as:  {'pos' if pred_label == 1 else 'neg'}")
    print("Review snippet:")
    print(text[:500].replace("\n", " "))
    print()

Fitting TF-IDF vectorizer...
Training LinearSVC baseline model...
Evaluating baseline model...

Baseline Accuracy: 0.8410
Baseline F1-score: 0.8410

Classification report (baseline):
              precision    recall  f1-score   support

         neg       0.84      0.83      0.83       476
         pos       0.84      0.85      0.85       524

    accuracy                           0.84      1000
   macro avg       0.84      0.84      0.84      1000
weighted avg       0.84      0.84      0.84      1000

Saved confusion matrix to: ./outputs/cm_baseline.png

Sample misclassified reviews (baseline):

--------------------------------------------------------------------------------
True label:    neg
Predicted as:  pos
Review snippet:
My brother plays "Moose" in this film. Although most of his scenes were left on the cutting room floor. The funniest line is the movie is "nothing wong with stat." So anyway, this is filmed in Portland, OR, where we grew up. The dance club is/was called "Up F

## Section 5 — Prepare Data for DistilBERT

In [12]:
# Prepare IMDb subset for DistilBERT fine-tuning

model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_batch(batch):
    return tokenizer(
        batch["text"],
        truncation=True,
        padding=False,
        max_length=256,
    )

# Wrap raw lists into HuggingFace Dataset objects
train_dict = {"text": X_train, "label": y_train}
test_dict = {"text": X_test, "label": y_test}

train_ds = Dataset.from_dict(train_dict)
test_ds = Dataset.from_dict(test_dict)

# Tokenize datasets
print("Tokenizing text for DistilBERT...")
train_tokenized = train_ds.map(tokenize_batch, batched=True)
test_tokenized = test_ds.map(tokenize_batch, batched=True)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

print("Example tokenized entry:")
print(train_tokenized[0])

Tokenizing text for DistilBERT...


Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Example tokenized entry:
{'text': 'Dumb is as dumb does, in this thoroughly uninteresting, supposed black comedy. Essentially what starts out as Chris Klein trying to maintain a low profile, eventually morphs into an uninspired version of "The Three Amigos", only without any laughs. In order for black comedy to work, it must be outrageous, which "Play Dead" is not. In order for black comedy to work, it cannot be mean spirited, which "Play Dead" is. What "Play Dead" really is, is a town full of nut jobs. Fred Dunst does however do a pretty fair imitation of Billy Bob Thornton\'s character from "A Simple Plan", while Jake Busey does a pretty fair imitation of, well, Jake Busey. - MERK', 'label': 0, 'input_ids': [101, 12873, 2003, 2004, 12873, 2515, 1010, 1999, 2023, 12246, 4895, 18447, 18702, 3436, 1010, 4011, 2304, 4038, 1012, 7687, 2054, 4627, 2041, 2004, 3782, 12555, 2667, 2000, 5441, 1037, 2659, 6337, 1010, 2776, 22822, 18757, 2046, 2019, 4895, 7076, 21649, 2544, 1997, 1000, 1996, 20

## Section 6 — Fine-Tune DistilBERT

In [13]:
# Fine-tune DistilBERT on the IMDb subset

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average="weighted")
    return {"accuracy": acc, "f1": f1}

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
)

training_args = TrainingArguments(
    output_dir=os.path.join(BASE_OUTPUT_DIR, "distilbert"),
    eval_strategy="epoch",            # new name in recent transformers versions
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=50,
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    num_train_epochs=2,               # you can set to 3 if Colab is okay
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    save_total_limit=1,
    remove_unused_columns=True,
    report_to="none",                 # disable external loggers (wandb, etc.)
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=test_tokenized,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

print("Starting DistilBERT training...")
train_result = trainer.train()

print("\nEvaluating DistilBERT...")
eval_results = trainer.evaluate()

distilbert_acc = eval_results.get("eval_accuracy", None)
distilbert_f1 = eval_results.get("eval_f1", None)

print("\nDistilBERT evaluation results:")
print(eval_results)
print("\nDistilBERT Accuracy:", distilbert_acc)
print("DistilBERT F1-score:", distilbert_f1)



Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Starting DistilBERT training...


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.3848,0.301245,0.883,0.882292
2,0.1499,0.404429,0.882,0.881975



Evaluating DistilBERT...



DistilBERT evaluation results:
{'eval_loss': 0.3012445569038391, 'eval_accuracy': 0.883, 'eval_f1': 0.8822919487348013, 'eval_runtime': 7.321, 'eval_samples_per_second': 136.592, 'eval_steps_per_second': 8.605, 'epoch': 2.0}

DistilBERT Accuracy: 0.883
DistilBERT F1-score: 0.8822919487348013


## Section 7 — DistilBERT Confusion Matrix & Error Analysis

In [14]:
# Evaluate DistilBERT on the test set with confusion matrix and error analysis

print("Running predictions with DistilBERT...")
pred_output = trainer.predict(test_tokenized)
logits = pred_output.predictions
y_pred_distilbert = np.argmax(logits, axis=-1)

# Confusion matrix (plot + save)
cm_distilbert = confusion_matrix(y_test, y_pred_distilbert)
plot_confusion_matrix(
    cm_distilbert,
    classes=["neg", "pos"],
    title="DistilBERT Confusion Matrix",
    filename="cm_distilbert.png",
)

print("\nClassification report (DistilBERT):")
print(classification_report(y_test, y_pred_distilbert, target_names=["neg", "pos"]))

# Show a few misclassified samples for error analysis
print("\nSample misclassified reviews (DistilBERT):\n")
distilbert_errors = []
for text, true_label, pred_label in zip(X_test, y_test, y_pred_distilbert):
    if true_label != pred_label:
        distilbert_errors.append((text, true_label, pred_label))

for text, true_label, pred_label in distilbert_errors[:5]:
    print("-" * 80)
    print(f"True label:    {'pos' if true_label == 1 else 'neg'}")
    print(f"Predicted as:  {'pos' if pred_label == 1 else 'neg'}")
    print("Review snippet:")
    print(text[:500].replace("\n", " "))
    print()

Running predictions with DistilBERT...


Saved confusion matrix to: ./outputs/cm_distilbert.png

Classification report (DistilBERT):
              precision    recall  f1-score   support

         neg       0.93      0.82      0.87       476
         pos       0.85      0.94      0.89       524

    accuracy                           0.88      1000
   macro avg       0.89      0.88      0.88      1000
weighted avg       0.89      0.88      0.88      1000


Sample misclassified reviews (DistilBERT):

--------------------------------------------------------------------------------
True label:    neg
Predicted as:  pos
Review snippet:
My brother plays "Moose" in this film. Although most of his scenes were left on the cutting room floor. The funniest line is the movie is "nothing wong with stat." So anyway, this is filmed in Portland, OR, where we grew up. The dance club is/was called "Up Front FX". What I loved about this movie is that the main character (who is not named on the box because Bolo brings more clout) is supposed to

## Section 8 — Final Comparison (Baseline vs DistilBERT)

In [15]:
# Compare baseline (TF-IDF + LinearSVC) vs DistilBERT fine-tuning

print("Final model comparison:\n")
print(f"Baseline Accuracy:   {baseline_acc:.4f}")
print(f"Baseline F1-score:   {baseline_f1:.4f}")
print(f"DistilBERT Accuracy: {distilbert_acc:.4f}")
print(f"DistilBERT F1-score: {distilbert_f1:.4f}")

Final model comparison:

Baseline Accuracy:   0.8410
Baseline F1-score:   0.8410
DistilBERT Accuracy: 0.8830
DistilBERT F1-score: 0.8823
