In [None]:
import logging

import numpy as np
import plotly.graph_objects as go
import torch
from datasets import load_dataset
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_fscore_support
from torch.utils.data import DataLoader, Dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Raise the threshold of the "transformers" logger to ERROR so that only
# error-level (or more severe) records from that logger are emitted.
logging.getLogger("transformers").setLevel(logging.ERROR)

# Class names, indexed by integer label id (0 -> "Negative", 1 -> "Positive").
# Later cells translate a label or a predicted class id with classes[id].
classes = ["Negative", "Positive"]

In [2]:
# Prefer the GPU whenever CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

Using device: cpu


In [3]:
# Load IMDB dataset from Hugging Face.
# The result is indexed by split name below ("train" and "test").
dataset = load_dataset("stanfordnlp/imdb")

print(f"Training set size: {len(dataset['train'])}")
print(f"Test set size: {len(dataset['test'])}")

# Show a sample: the first training record, which is read through its "text"
# and "label" fields.
sample = dataset["train"][0]
# Only the first 200 characters of the review are printed to keep the output short.
print(f"\nSample review (truncated):\n{sample['text'][:200]}...")
# The integer label doubles as an index into `classes` to get a readable name.
print(f"Label: {sample['label']} ({classes[sample['label']]})")

Training set size: 25000
Test set size: 25000

Sample review (truncated):
I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ev...
Label: 0 (Negative)


In [4]:
# For faster training, we'll use a subset of the data
# Take 1000 samples for training and 200 for testing (optimized for speed)
# Each split is shuffled with the same fixed seed (42) and the first N rows of
# the shuffled split are kept, so the selection is repeatable across runs.
train_subset = dataset["train"].shuffle(seed=42).select(range(1000))
test_subset = dataset["test"].shuffle(seed=42).select(range(200))

print(f"Training subset size: {len(train_subset)}")
print(f"Test subset size: {len(test_subset)}")

Training subset size: 1000
Test subset size: 200


In [5]:
# Analyze class distribution.
# Reading the "label" column directly avoids building every full row
# (review text included) just to pick out one integer per example.
train_labels = list(train_subset["label"])
test_labels = list(test_subset["label"])

# Count labels. `minlength` guarantees one bin per entry in `classes`, even if
# a class happens to be absent from a subset, so the counts always line up
# with the x-axis categories used in the chart below.
train_counts = np.bincount(train_labels, minlength=len(classes))
test_counts = np.bincount(test_labels, minlength=len(classes))

# Create grouped bar chart: one bar group per class, one bar per split.
fig = go.Figure()

fig.add_trace(
    go.Bar(
        name="Training Set",
        x=classes,
        y=train_counts,
        text=train_counts,
        textposition="auto",
        marker_color="lightblue",
    )
)

fig.add_trace(
    go.Bar(
        name="Test Set",
        x=classes,
        y=test_counts,
        text=test_counts,
        textposition="auto",
        marker_color="lightcoral",
    )
)

fig.update_layout(
    title="Class Distribution in Training and Test Sets",
    xaxis_title="Class",
    yaxis_title="Number of Samples",
    barmode="group",
    width=800,
    height=500,
)

fig.show()

# Print statistics (training split only; percentages are relative to the
# size of the training subset).
print("Training set distribution:")
for class_name, count in zip(classes, train_counts):
    percentage = (count / len(train_subset)) * 100
    print(f"{class_name:15} {count:5d} ({percentage:.1f}%)")

print(f"\nTotal training samples: {len(train_subset)}")
print(f"Total test samples: {len(test_subset)}")

Training set distribution:
Negative          512 (51.2%)
Positive          488 (48.8%)

Total training samples: 1000
Total test samples: 200


In [6]:
# Load pre-trained DistilBERT tokenizer.
# `model_name` is reused by later cells when the classification model is loaded.
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Example of tokenization: a single sentence, returned as PyTorch tensors
# (return_tensors="pt") under the "input_ids" and "attention_mask" keys.
sample_text = "This movie was absolutely fantastic! I loved every minute of it."
tokens = tokenizer(sample_text, padding=True, truncation=True, return_tensors="pt")

print(f"Original text: {sample_text}")
print(f"\nTokenized input IDs shape: {tokens['input_ids'].shape}")
print(f"Input IDs: {tokens['input_ids'][0][:20]}...")  # Show at most the first 20 token ids
print(f"\nAttention mask shape: {tokens['attention_mask'].shape}")
print(f"Attention mask: {tokens['attention_mask'][0][:20]}...")  # Same 20-element cap as above

# Decode to see the tokens: turns the ids of the first (only) sequence back into text.
print(f"\nDecoded tokens: {tokenizer.decode(tokens['input_ids'][0])}")

Original text: This movie was absolutely fantastic! I loved every minute of it.

Tokenized input IDs shape: torch.Size([1, 15])
Input IDs: tensor([  101,  2023,  3185,  2001,  7078, 10392,   999,  1045,  3866,  2296,
         3371,  1997,  2009,  1012,   102])...

Attention mask shape: torch.Size([1, 15])
Attention mask: tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])...

Decoded tokens: [CLS] this movie was absolutely fantastic! i loved every minute of it. [SEP]


In [7]:
# Create PyTorch Dataset
class IMDBDataset(Dataset):
    """Map-style dataset that tokenizes one review each time it is indexed.

    Args:
        hf_dataset: indexable collection whose items expose "text" and "label".
        tokenizer: callable invoked with the review text plus the keyword
            arguments ``padding``, ``truncation``, ``max_length`` and
            ``return_tensors``; its result is read through the "input_ids"
            and "attention_mask" keys.
        max_length: length every review is padded/truncated to (default 256).
    """

    def __init__(self, hf_dataset, tokenizer, max_length=256):
        self.dataset = hf_dataset
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of examples in the wrapped dataset."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return the flattened token ids, attention mask and label of example ``idx``."""
        record = self.dataset[idx]
        encoded = self.tokenizer(
            record["text"],
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )

        # Flattening collapses each encoded field to 1-D before batching.
        example = {field: encoded[field].flatten() for field in ("input_ids", "attention_mask")}
        example["label"] = torch.tensor(record["label"], dtype=torch.long)
        return example


# Create datasets
# The explicit 128 passed here takes precedence over IMDBDataset's default of 256.
max_seq_length = 128  # Maximum sequence length for tokenization (reduced for faster training)
train_dataset = IMDBDataset(train_subset, tokenizer, max_seq_length)
test_dataset = IMDBDataset(test_subset, tokenizer, max_seq_length)

print(f"Max sequence length: {max_seq_length}")
print(f"Training dataset size: {len(train_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")

# Show a sample
# NOTE: this rebinds `sample`, which an earlier cell used for a raw dataset
# record; from here on it holds the tokenized dict returned by IMDBDataset.
sample = train_dataset[0]
print(f"\nSample input_ids shape: {sample['input_ids'].shape}")
print(f"Sample attention_mask shape: {sample['attention_mask'].shape}")
print(f"Sample label: {sample['label']}")

Max sequence length: 128
Training dataset size: 1000
Test dataset size: 200

Sample input_ids shape: torch.Size([128])
Sample attention_mask shape: torch.Size([128])
Sample label: 1


In [8]:
# Create DataLoaders
batch_size = 16

# Only the training loader is asked to shuffle; the test loader is created
# without a shuffle argument.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

# Sanity check: inspect the first test batch only (the loop exits via `break`
# after one iteration).
for batch in test_dataloader:
    print(f"Batch input_ids shape: {batch['input_ids'].shape}")
    print(f"Batch attention_mask shape: {batch['attention_mask'].shape}")
    print(f"Batch labels shape: {batch['label'].shape}")
    print(f"Unique labels in batch: {torch.unique(batch['label'])}")
    break

Batch input_ids shape: torch.Size([16, 128])
Batch attention_mask shape: torch.Size([16, 128])
Batch labels shape: torch.Size([16])
Unique labels in batch: tensor([0, 1])


In [9]:
# Load pre-trained DistilBERT model for sequence classification.
# num_labels=2 matches the two entries in `classes`.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
# Move the model to the device chosen earlier ("cuda" or "cpu").
model = model.to(device)

print(model)
# Count only parameters with requires_grad=True, i.e. the ones training can update.
print(
    f"\nTotal number of trainable parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}"
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [10]:
# Loss function (CrossEntropyLoss is used internally by the model)
# We'll use the model's built-in loss computation: the training and evaluation
# functions pass `labels` to the model and read the loss from `outputs.loss`.

# Optimizer - Adam with weight decay (AdamW)
# Only the learning rate is set explicitly; every other AdamW setting
# (including weight decay) is left at the library default.
learning_rate = 2e-5
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

print("Optimizer: AdamW")
print(f"Learning rate: {learning_rate}")

Optimizer: AdamW
Learning rate: 2e-05


In [11]:
def train(dataloader, model, optimizer):
    """Run one training epoch over ``dataloader`` and print progress.

    Args:
        dataloader: iterable of dict batches providing "input_ids",
            "attention_mask" and "label" tensors; ``dataloader.dataset`` must
            support ``len()`` (used only for the progress display).
        model: module whose forward accepts ``input_ids``, ``attention_mask``
            and ``labels`` and returns an object with ``.loss`` and ``.logits``.
        optimizer: optimizer updating ``model``'s parameters.

    Returns:
        ``(avg_loss, accuracy)``: the per-sample average loss and the fraction
        of correct predictions over the samples processed in this epoch.

    Raises:
        ZeroDivisionError: if the dataloader yields no samples.
    """
    size = len(dataloader.dataset)
    # Take the device from the model itself so the function does not depend on
    # a notebook-level `device` variable being defined and in sync.
    model_device = next(model.parameters()).device
    model.train()
    loss_sum = 0.0
    correct = 0
    seen = 0  # samples processed so far in this epoch

    for batch_idx, batch in enumerate(dataloader):
        input_ids = batch["input_ids"].to(model_device)
        attention_mask = batch["attention_mask"].to(model_device)
        labels = batch["label"].to(model_device)
        batch_n = labels.size(0)

        # Forward pass (the model computes the loss itself when given labels)
        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Backward pass
        loss.backward()
        optimizer.step()

        # Track metrics. The batch loss is a mean over the batch, so weight it
        # by the batch size; otherwise a short final batch would count as much
        # as a full one in the epoch average.
        batch_loss = loss.item()
        loss_sum += batch_loss * batch_n
        correct += (outputs.logits.argmax(dim=-1) == labels).sum().item()

        if batch_idx % 10 == 0:  # Print every 10 batches
            # `seen` is the number of samples processed before this batch.
            print(f"loss: {batch_loss:>7f}  [{seen:>5d}/{size:>5d}]")

        seen += batch_n

    # Divide by the samples actually processed, which stays correct even if the
    # dataloader drops or subsamples examples.
    avg_loss = loss_sum / seen
    accuracy = correct / seen
    print(f"Train: Avg loss: {avg_loss:.4f}, Accuracy: {(100 * accuracy):>0.1f}%")
    return avg_loss, accuracy

In [12]:
def test(dataloader, model):
    size = len(dataloader.dataset)
    model.eval()
    test_loss = 0
    correct = 0

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            test_loss += outputs.loss.item()

            predictions = outputs.logits.argmax(dim=-1)
            correct += (predictions == labels).sum().item()

    test_loss /= len(dataloader)
    accuracy = correct / size
    print(f"Test:  Avg loss: {test_loss:.4f}, Accuracy: {(100 * accuracy):>0.1f}%\n")

In [13]:
# Number of full passes over the training dataloader.
epochs = 2

# Each epoch: one training pass followed by an evaluation on the test dataloader.
for t in range(epochs):
    # Header line with a 1-based epoch number and a 60-character rule.
    print(f"Epoch {t + 1}\n" + "-" * 60)
    train(train_dataloader, model, optimizer)
    test(test_dataloader, model)
print("Done!")

Epoch 1
------------------------------------------------------------
loss: 0.706619  [    0/ 1000]
loss: 0.706619  [    0/ 1000]
loss: 0.662974  [  160/ 1000]
loss: 0.662974  [  160/ 1000]
loss: 0.654089  [  320/ 1000]
loss: 0.654089  [  320/ 1000]
loss: 0.648575  [  480/ 1000]
loss: 0.648575  [  480/ 1000]
loss: 0.674853  [  640/ 1000]
loss: 0.674853  [  640/ 1000]
loss: 0.615691  [  800/ 1000]
loss: 0.615691  [  800/ 1000]
loss: 0.577467  [  960/ 1000]
loss: 0.577467  [  960/ 1000]
Train: Avg loss: 0.6429, Accuracy: 62.3%
Train: Avg loss: 0.6429, Accuracy: 62.3%
Test:  Avg loss: 0.5206, Accuracy: 76.0%

Epoch 2
------------------------------------------------------------
Test:  Avg loss: 0.5206, Accuracy: 76.0%

Epoch 2
------------------------------------------------------------
loss: 0.535371  [    0/ 1000]
loss: 0.535371  [    0/ 1000]
loss: 0.275484  [  160/ 1000]
loss: 0.419958  [  320/ 1000]
loss: 0.417217  [  480/ 1000]
loss: 0.228642  [  640/ 1000]
loss: 0.224788  [  800/ 100

In [14]:
# Save the model.
# Only the state dict is written, not the model object itself, so the
# architecture has to be rebuilt before these weights can be loaded again.
torch.save(model.state_dict(), "distilbert_sentiment.pth")
print("Model saved as 'distilbert_sentiment.pth'")

Model saved as 'distilbert_sentiment.pth'


In [15]:
# Load the trained model.
# Rebuild the architecture first, then overwrite its weights with the state
# dict saved earlier.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
# map_location remaps tensors that were saved from another device (e.g. a CUDA
# run) onto the current one, so the checkpoint also loads on CPU-only machines.
# weights_only=True restricts unpickling to tensors and plain containers,
# which is all a state dict contains.
model.load_state_dict(
    torch.load("distilbert_sentiment.pth", map_location=device, weights_only=True)
)
model = model.to(device)
# Switch to evaluation mode for the inference cells that follow.
model.eval()
print("Model loaded successfully")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded successfully


In [16]:
# Get predictions for entire test set
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in test_dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        # Labels are not fed to the model here; they are only collected as
        # NumPy values, so they are not moved to `device`.
        labels = batch["label"]

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        predictions = outputs.logits.argmax(dim=-1)

        all_preds.extend(predictions.cpu().numpy())
        all_labels.extend(labels.numpy())

all_preds = np.array(all_preds)
all_labels = np.array(all_labels)

# Create confusion matrix.
# Passing the label ids explicitly pins the matrix to one row/column per
# entry in `classes`, even if a class never shows up among the true or
# predicted labels; otherwise the matrix could be smaller than the axis
# categories used for the heatmap below.
cm = confusion_matrix(all_labels, all_preds, labels=list(range(len(classes))))

# Plot confusion matrix (rows: true label, columns: predicted label)
fig = go.Figure(
    data=go.Heatmap(
        z=cm,
        x=classes,
        y=classes,
        colorscale="Blues",
        text=cm,
        texttemplate="%{text}",
        textfont={"size": 14},
        hoverongaps=False,
    )
)

fig.update_layout(
    title="Confusion Matrix",
    xaxis_title="Predicted Label",
    yaxis_title="True Label",
    width=600,
    height=550,
)

fig.show()

# Print classification report.
# The same explicit label ids keep `target_names` aligned with the classes
# being reported.
print("\nClassification Report:")
print(
    classification_report(
        all_labels,
        all_preds,
        labels=list(range(len(classes))),
        target_names=classes,
    )
)


Classification Report:
              precision    recall  f1-score   support

    Negative       0.84      0.73      0.78       104
    Positive       0.75      0.85      0.80        96

    accuracy                           0.79       200
   macro avg       0.79      0.79      0.79       200
weighted avg       0.80      0.79      0.79       200



In [17]:
# Calculate metrics per class.
# Passing the label ids explicitly guarantees one entry per name in `classes`,
# in that order, even if a class is missing from the labels or predictions.
precision, recall, f1, support = precision_recall_fscore_support(
    all_labels, all_preds, labels=list(range(len(classes))), average=None
)

# Calculate per-class accuracy: correctly classified samples of a class divided
# by all samples whose true label is that class (rows of `cm` are true labels).
# NOTE(review): with this definition the value coincides with per-class recall,
# so the "Accuracy" and "Recall" bars below are expected to match.
class_correct = np.diag(cm)
class_total = cm.sum(axis=1)
accuracy = class_correct / class_total

# Create grouped bar chart: one trace per metric, all built from the same
# template instead of four copy-pasted add_trace calls.
fig = go.Figure(
    data=[
        go.Bar(
            name=metric_name,
            x=classes,
            y=scores,
            text=[f"{score:.3f}" for score in scores],
            textposition="auto",
        )
        for metric_name, scores in (
            ("Accuracy", accuracy),
            ("Precision", precision),
            ("Recall", recall),
            ("F1-Score", f1),
        )
    ]
)

fig.update_layout(
    title="Classification Metrics by Class",
    xaxis_title="Class",
    yaxis_title="Score",
    yaxis_range=[0, 1],
    barmode="group",
    width=800,
    height=500,
)

fig.show()

# Print overall metrics. The "Average" values are unweighted means over the
# classes.
overall_accuracy = (all_preds == all_labels).mean()
print(f"\nOverall Accuracy: {overall_accuracy:.2%}")
print(f"Average Precision: {precision.mean():.3f}")
print(f"Average Recall: {recall.mean():.3f}")
print(f"Average F1-Score: {f1.mean():.3f}")


Overall Accuracy: 79.00%
Average Precision: 0.795
Average Recall: 0.792
Average F1-Score: 0.790


In [18]:
# Test the model on custom reviews
def predict_sentiment(text, model, tokenizer, max_length=128, class_names=None):
    """Classify a single review and return its label and class probabilities.

    Args:
        text: the review to classify.
        model: module whose forward accepts ``input_ids`` and
            ``attention_mask`` and returns an object with ``.logits``.
        tokenizer: callable invoked with the text plus the keyword arguments
            ``padding``, ``truncation``, ``max_length`` and ``return_tensors``;
            its result is read through "input_ids" and "attention_mask".
        max_length: length the review is padded/truncated to. Defaults to 128,
            the value previously hard-coded here.
        class_names: sequence mapping a predicted class id to its name.
            Defaults to the notebook-level ``classes`` list.

    Returns:
        ``(label, probabilities)``: the predicted class name and a NumPy array
        holding the softmax probability of each class.
    """
    if class_names is None:
        class_names = classes

    # Run on whatever device the model lives on, instead of relying on a
    # notebook-level `device` variable being in sync with the model.
    model_device = next(model.parameters()).device

    model.eval()
    encoding = tokenizer(
        text,
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )

    input_ids = encoding["input_ids"].to(model_device)
    attention_mask = encoding["attention_mask"].to(model_device)

    with torch.no_grad():
        logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
        # A single text is passed in, so the batch holds exactly one row.
        prediction = logits.argmax(dim=-1).item()
        probabilities = torch.softmax(logits, dim=-1)[0]

    return class_names[prediction], probabilities.cpu().numpy()


# Test with custom examples: five short hand-written reviews.
test_reviews = [
    "This movie was absolutely brilliant! Best film I've seen in years.",
    "Terrible waste of time. The plot made no sense and the acting was awful.",
    "It was okay, nothing special but not bad either.",
    "A masterpiece of cinema! Every scene was perfect.",
    "Boring and predictable. I fell asleep halfway through.",
]

print("Custom Review Predictions:")
print("=" * 80)
for review in test_reviews:
    # predict_sentiment returns the predicted class name and the per-class
    # probabilities; index 0 is Negative and index 1 is Positive, matching `classes`.
    sentiment, probs = predict_sentiment(review, model, tokenizer)
    print(f"\nReview: {review}")
    print(f"Predicted: {sentiment}")
    print(f"Probabilities: Negative={probs[0]:.3f}, Positive={probs[1]:.3f}")
    print("-" * 80)

Custom Review Predictions:

Review: This movie was absolutely brilliant! Best film I've seen in years.
Predicted: Positive
Probabilities: Negative=0.072, Positive=0.928
--------------------------------------------------------------------------------

Review: This movie was absolutely brilliant! Best film I've seen in years.
Predicted: Positive
Probabilities: Negative=0.072, Positive=0.928
--------------------------------------------------------------------------------

Review: Terrible waste of time. The plot made no sense and the acting was awful.
Predicted: Negative
Probabilities: Negative=0.955, Positive=0.045
--------------------------------------------------------------------------------

Review: Terrible waste of time. The plot made no sense and the acting was awful.
Predicted: Negative
Probabilities: Negative=0.955, Positive=0.045
--------------------------------------------------------------------------------

Review: It was okay, nothing special but not bad either.
Predicted: 