In [12]:
# ✅ 1. Install Necessary Packages
!pip install -q transformers datasets

# ✅ 2. Import Libraries
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from datasets import load_dataset
import numpy as np

# ✅ 3. Load IMDb Dataset
dataset = load_dataset("imdb")

# ✅ 4. Load BERT Tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# ✅ 5. Custom PyTorch Dataset class
class IMDbDataset(Dataset):
    def __init__(self, split, max_len=256):
        self.data = dataset[split]
        self.max_len = max_len

    def __len__(self):
        return 2000 if len(self.data) > 2000 else len(self.data)  # for quick test

    def __getitem__(self, idx):
        text = self.data[idx]["text"]
        label = self.data[idx]["label"]
        encoding = tokenizer(text, truncation=True, padding="max_length", max_length=self.max_len, return_tensors="pt")
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "label": torch.tensor(label, dtype=torch.long)
        }

# ✅ 6. Prepare Dataloaders
train_dataset = IMDbDataset("train")
test_dataset = IMDbDataset("test")

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8)

# ✅ 7. Load BERT Model
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# ✅ 8. Optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)

# ✅ 9. Training Loop
model.train()
for epoch in range(2):
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    print(f"✅ Epoch {epoch+1}: Loss = {total_loss / len(train_loader):.4f}")

# ✅ 10. Evaluation
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        predictions = torch.argmax(outputs.logits, dim=1)
        correct += (predictions == labels).sum().item()
        total += labels.size(0)

print(f"🎯 Test Accuracy: {correct / total:.4f}")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ Epoch 1: Loss = 0.0183
✅ Epoch 2: Loss = 0.0002
🎯 Test Accuracy: 1.0000


ValueError: Unable to avoid copy while creating an array as requested.
If using `np.array(obj, copy=False)` replace it with `np.asarray(obj)` to allow a copy when needed (no behavior change in NumPy 1.x).
For more details, see https://numpy.org/devdocs/numpy_2_0_migration_guide.html#adapting-to-changes-in-the-copy-keyword.

