In [2]:

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

from transformers import BertTokenizer, BertModel
from datasets import load_dataset
from sklearn.model_selection import train_test_split
import numpy as np
import random

# Use MPS on macOS if available, else GPU if available, else CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'mps')
print("Using device:", device)


Using device: mps


In [3]:

# Hugging Face "imdb" dataset loads a dictionary with "train" and "test"
imdb = load_dataset("imdb")
# imdb['train'] has 25k examples; imdb['test'] has 25k examples
print(imdb)


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})


In [4]:
train_texts = imdb['train']['text']
train_labels = imdb['train']['label']

train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts,
    train_labels,
    test_size=0.1,
    random_state=42,
    stratify= train_labels
)

print(f"Train set size: {len(train_texts)}")
print(f"Val   set size: {len(val_texts)}")
print(f"Test  set size: {len(imdb['test'])}")


Train set size: 22500
Val   set size: 2500
Test  set size: 25000


In [5]:

# We'll use the standard "bert-base-uncased" tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)

# Because IMDB reviews can be somewhat long, let's allow up to e.g. 256 tokens
MAX_LENGTH = 256

def tokenize_function(texts):
    return tokenizer(
        texts,
        max_length=MAX_LENGTH,
        padding="max_length",
        truncation=True
    )

# Tokenize each split
train_encodings = tokenize_function(train_texts)
val_encodings   = tokenize_function(val_texts)
test_encodings  = tokenize_function(imdb['test']['text'])


In [6]:
# Convert tokenized outputs into PyTorch Datasets
class IMDbDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item
    def __len__(self):
        return len(self.labels)

train_dataset = IMDbDataset(train_encodings, train_labels)
val_dataset   = IMDbDataset(val_encodings,   val_labels)
test_dataset  = IMDbDataset(test_encodings,  imdb['test']['label'])


In [7]:


class CustomBERTModel(nn.Module):
    """
    BERT encoder (frozen) + 3-layer feedforward classifier on top.
    Output dimension = 2 (positive vs. negative).
    """
    def __init__(self, pretrained_model_name="bert-base-uncased", num_classes=2):
        super().__init__()
        
        # Load pretrained BERT
        self.bert = BertModel.from_pretrained(pretrained_model_name)
        
        # Freeze all BERT weights so we only train the classifier layers
        for param in self.bert.parameters():
            param.requires_grad = False
        # 4-layer classifier: 768 -> 512 -> 256 -> 128 -> 2
        self.classifier = nn.Sequential(
            nn.Linear(768, 512),
            nn.ReLU(),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, num_classes)  # 2 classes for IMDB (pos vs neg)
        )
    def forward(self, input_ids, attention_mask, labels=None):
        # Get last-layer hidden states & pooler from BERT
        # outputs: BaseModelOutputWithPoolingAndNoAttention
        outputs = self.bert(
            input_ids=input_ids, 
            attention_mask=attention_mask
        )
        
        # "pooled_output" is the [CLS] representation
        pooled_output = outputs.pooler_output  # shape [batch_size, 768]
        
        # Pass pooled_output through our classifier
        logits = self.classifier(pooled_output)  # shape [batch_size, 2]
        
        return logits


In [8]:
# Instantiate our custom model
model = CustomBERTModel(pretrained_model_name="bert-base-uncased", num_classes=2)
model.to(device)


CustomBERTModel(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwi

In [9]:

batch_size = 8

train_loader = DataLoader(
    train_dataset, 
    batch_size=batch_size, 
    shuffle=True
)
val_loader = DataLoader(
    val_dataset, 
    batch_size=batch_size, 
    shuffle=False
)
test_loader = DataLoader(
    test_dataset, 
    batch_size=batch_size, 
    shuffle=False
)

print("Dataloaders ready.")


Dataloaders ready.


In [10]:

import torch.optim as optim

# We'll use cross-entropy for 2-class classification
criterion = nn.CrossEntropyLoss()

# Only the classifier params are trainable. Let's confirm:
trainable_params = [p for p in model.parameters() if p.requires_grad]
print(f"Number of trainable parameters = {sum(p.numel() for p in trainable_params)}")

optimizer = optim.Adam(trainable_params, lr=1e-4)

# For reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if device.type == 'cuda':
    torch.cuda.manual_seed_all(SEED)


Number of trainable parameters = 558210


In [11]:


EPOCHS = 2  # For demo; you might want more in practice

for epoch in range(EPOCHS):
    model.train()
    total_train_loss = 0.0
    total_train_correct = 0
    total_examples = 0
    
    for batch in train_loader:
        input_ids      = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels         = batch['labels'].to(device)
        
        optimizer.zero_grad()
        
        # Forward
        logits = model(input_ids, attention_mask)  # [batch_size, 2]
        
        # Loss
        loss = criterion(logits, labels)
        
        # Backprop
        loss.backward()
        optimizer.step()
        
        total_train_loss += loss.item()
        
        # Compute training accuracy
        preds = torch.argmax(logits, dim=1)
        correct = (preds == labels).sum().item()
        total_train_correct += correct
        total_examples += labels.size(0)
    
    avg_train_loss = total_train_loss / len(train_loader)
    train_acc = total_train_correct / total_examples
    
    # ------------------------------
    # Validation
    # ------------------------------
    model.eval()
    total_val_loss = 0.0
    total_val_correct = 0
    total_val_examples = 0
    
    with torch.no_grad():
        for batch in val_loader:
            input_ids      = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels         = batch['labels'].to(device)
            
            logits = model(input_ids, attention_mask)
            loss = criterion(logits, labels)
            
            total_val_loss += loss.item()
            
            preds = torch.argmax(logits, dim=1)
            correct = (preds == labels).sum().item()
            total_val_correct += correct
            total_val_examples += labels.size(0)
    
    avg_val_loss = total_val_loss / len(val_loader)
    val_acc = total_val_correct / total_val_examples
    
    print(f"Epoch [{epoch+1}/{EPOCHS}]")
    print(f"  Train Loss: {avg_train_loss:.4f} | Train Acc: {train_acc:.4f}")
    print(f"  Val   Loss: {avg_val_loss:.4f} | Val   Acc: {val_acc:.4f}")
    print("-------------------------------------------------------")


Epoch [1/2]
  Train Loss: 0.5607 | Train Acc: 0.7029
  Val   Loss: 0.4592 | Val   Acc: 0.7892
-------------------------------------------------------
Epoch [2/2]
  Train Loss: 0.4800 | Train Acc: 0.7719
  Val   Loss: 0.4267 | Val   Acc: 0.8020
-------------------------------------------------------


In [12]:

model.eval()
total_test_correct = 0
total_test_examples = 0

with torch.no_grad():
    for batch in test_loader:
        input_ids      = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels         = batch['labels'].to(device)
        
        logits = model(input_ids, attention_mask)
        preds = torch.argmax(logits, dim=1)
        correct = (preds == labels).sum().item()
        
        total_test_correct += correct
        total_test_examples += labels.size(0)

test_acc = total_test_correct / total_test_examples
print(f"Test Accuracy = {test_acc:.4f}  (on {total_test_examples} test samples)")


Test Accuracy = 0.8050  (on 25000 test samples)


In [13]:
# let's try it
example_texts = [
    "This movie was absolutely wonderful! The acting was incredible and I loved it.",
    "Terrible film. The script was awful and the characters were boring."
]

model.eval()
tokens = tokenizer(
    example_texts, 
    max_length=256, 
    padding="max_length", 
    truncation=True, 
    return_tensors="pt"
).to(device)

with torch.no_grad():
    logits = model(tokens["input_ids"], tokens["attention_mask"])
    predictions = torch.argmax(logits, dim=1).cpu().numpy()

print("Predictions (0=NEG, 1=POS):", predictions)
for text, pred in zip(example_texts, predictions):
    sentiment = "POSITIVE" if pred == 1 else "NEGATIVE"
    print(f"\nReview: {text}\n => Sentiment: {sentiment}")


Predictions (0=NEG, 1=POS): [1 0]

Review: This movie was absolutely wonderful! The acting was incredible and I loved it.
 => Sentiment: POSITIVE

Review: Terrible film. The script was awful and the characters were boring.
 => Sentiment: NEGATIVE
