<a href="https://colab.research.google.com/github/appliedcode/mthree-c422/blob/mthree-c422-Likhitha/Transformer_From_Scratch_md.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
!pip install -U transformers datasets
!pip install --upgrade transformers

import re
import random
import numpy as np
import pandas as pd
from datasets import load_dataset
from sklearn.metrics import classification_report, accuracy_score
import torch
import torch.nn.functional as F
import transformers
print(transformers.__version__)


from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
)
from transformers.trainer_utils import EvalPrediction
from typing import Dict

# ---------- Settings ----------
# Checkpoint name plus the basic data/model dimensions.
MODEL_NAME = "bert-base-uncased"
NUM_LABELS = 2
MAX_LENGTH = 128
RANDOM_SEED = 42

# Training hyperparameters (tune as needed)
LEARNING_RATE = 2e-5
NUM_EPOCHS = 2
TRAIN_BATCH_SIZE = 8
EVAL_BATCH_SIZE = 16
OUTPUT_DIR = "./imdb_bert_results"

# ---------- Reproducibility ----------
# Seed every RNG the notebook touches (Python, NumPy, PyTorch CPU) with the same value.
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    _seed_rng(RANDOM_SEED)
# CUDA generators are seeded separately, and only when a GPU is present.
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(RANDOM_SEED)

4.54.1


In [19]:
# Fetch the IMDb dataset and keep handles on its train and test splits.
print("Loading IMDb dataset...")
dataset = load_dataset("imdb")
train_raw, test_raw = dataset["train"], dataset["test"]
print(f"Train size: {len(train_raw)}, Test size: {len(test_raw)}")

# Quick sample: show the start of the first training review and its label.
first_example = train_raw[0]
print("\nSample review (raw):")
print(first_example["text"][:400])
print("Label:", first_example["label"])

Loading IMDb dataset...
Train size: 25000, Test size: 25000

Sample review (raw):
I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student name
Label: 0


In [3]:
def clean_text(text: str) -> str:
    """
    Lightly normalise a raw review before tokenisation.

    Steps, in order:
      1. replace HTML tags (e.g. ``<br />``) with a space;
      2. replace URLs that start with ``http://``, ``https://`` or ``www.``
         with a space;
      3. replace every character other than ASCII letters, digits, whitespace
         and the punctuation ``. , ! ? : ; ' " -`` with a space;
      4. collapse runs of whitespace to one space and strip both ends.

    The cleaning is deliberately conservative so sentiment cues survive.

    Args:
        text: Raw review text.

    Returns:
        The cleaned text, or an empty string when ``text`` is not a ``str``.
    """
    if not isinstance(text, str):
        return ""
    # remove HTML tags
    text = re.sub(r"<[^>]+>", " ", text)
    # Replace URLs. The pattern is anchored on a word boundary and requires a
    # real URL prefix: the previous unanchored `www\S+` also matched inside
    # stretched words, turning "awwww" into "a".
    text = re.sub(r"\b(?:https?://|www\.)\S+", " ", text)
    # Drop everything except alphanumerics, whitespace and . , ! ? : ; ' " -
    text = re.sub(r"[^0-9A-Za-z\.\,\!\?\:;\'\"\s\-]", " ", text)
    # A single whitespace collapse at the end covers every gap opened above.
    return re.sub(r"\s+", " ", text).strip()

# Example cleaning: run the cleaner on the first training review and show its start.
print("\nCleaned sample:")
cleaned_preview = clean_text(train_raw[0]["text"])
print(cleaned_preview[:400])


Cleaned sample:
I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself. The plot is centered around a young Swedish drama student named Lena who 


In [4]:
# Load the tokenizer that belongs to the MODEL_NAME checkpoint; on first use its
# files (tokenizer config, vocab, ...) are downloaded, as the progress bars show.
print("\nLoading tokenizer:", MODEL_NAME)
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)


Loading tokenizer: bert-base-uncased


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [5]:
def preprocess_batch(batch):
    """
    Clean and tokenize one batch of examples (used with a batched ``map``).

    Reads the batch's "text" and "label" columns. Each text is passed through
    ``clean_text`` and then tokenized with truncation at ``MAX_LENGTH`` and no
    padding. The labels are attached to the tokenizer output under "labels".
    """
    cleaned_reviews = list(map(clean_text, batch["text"]))
    encoded = tokenizer(
        cleaned_reviews,
        max_length=MAX_LENGTH,
        truncation=True,
        return_attention_mask=True,
        padding=False,  # padding handled later by DataCollator
    )
    encoded["labels"] = batch["label"]
    return encoded

# Tokenize both splits with batched `map` calls that share one set of options.
map_options = dict(batched=True, batch_size=256, remove_columns=["text"])

print("\nTokenizing train dataset (this may take a minute)...")
train_tokenized = train_raw.map(preprocess_batch, **map_options)
print("Tokenized train features:", train_tokenized.column_names)

print("\nTokenizing test dataset...")
test_tokenized = test_raw.map(preprocess_batch, **map_options)
print("Tokenized test features:", test_tokenized.column_names)

# Set format for PyTorch: expose only the model inputs and labels as tensors.
torch_columns = ["input_ids", "attention_mask", "labels"]
for tokenized_split in (train_tokenized, test_tokenized):
    tokenized_split.set_format(type="torch", columns=torch_columns)



Tokenizing train dataset (this may take a minute)...


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Tokenized train features: ['label', 'input_ids', 'token_type_ids', 'attention_mask', 'labels']

Tokenizing test dataset...


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Tokenized test features: ['label', 'input_ids', 'token_type_ids', 'attention_mask', 'labels']


In [6]:
# Load the pretrained checkpoint with a NUM_LABELS-way sequence-classification head.
# Per the load-time warning, `classifier.weight` / `classifier.bias` are not in the
# checkpoint and are newly initialized, so the model must be trained before use.
print("\nLoading model:", MODEL_NAME)
model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=NUM_LABELS)



Loading model: bert-base-uncased


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
def compute_metrics(eval_pred: EvalPrediction) -> Dict:
    """
    Turn raw evaluation outputs into scalar binary-classification metrics.

    Args:
        eval_pred: Object exposing ``predictions`` (logits, or a tuple whose
            first element is the logits) and ``label_ids`` (true labels).

    Returns:
        A dict of scalars: ``accuracy``, ``macro_f1`` (mean of the two
        per-class F1 scores), and precision / recall / F1 for the positive
        class (label 1, ``*_pos``) and the negative class (label 0, ``*_neg``).
    """
    logits, labels = eval_pred.predictions, eval_pred.label_ids
    if isinstance(logits, tuple):
        logits = logits[0]
    preds = np.argmax(logits, axis=-1)
    # Pin the label set explicitly. Without `labels=`, the report only contains
    # classes that occur in `labels` or `preds`, so a single-class evaluation
    # slice made report["0"] / report["1"] raise KeyError. With it, an absent
    # class simply scores 0 (via zero_division=0).
    report = classification_report(
        labels, preds, labels=[0, 1], output_dict=True, zero_division=0
    )
    neg_scores, pos_scores = report["0"], report["1"]
    f1_neg = neg_scores["f1-score"]
    f1_pos = pos_scores["f1-score"]
    # Trainer expects scalar values; accuracy and macro_f1 are the primary metrics.
    return {
        "accuracy": accuracy_score(labels, preds),
        "macro_f1": (f1_neg + f1_pos) / 2.0,
        "precision_pos": pos_scores["precision"],
        "recall_pos": pos_scores["recall"],
        "f1_pos": f1_pos,
        "precision_neg": neg_scores["precision"],
        "recall_neg": neg_scores["recall"],
        "f1_neg": f1_neg,
    }