In [1]:
import re
import torch
import pandas as pd
import numpy as np
from datasets import load_dataset
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from transformers import (
    BertTokenizer, BertForSequenceClassification,
    Trainer, TrainingArguments,
    DataCollatorWithPadding,
    logging
)
import matplotlib.pyplot as plt

In [2]:
import warnings
warnings.filterwarnings("ignore")
logging.set_verbosity_error()

In [3]:
# Check for GPU availability
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


In [4]:
# 1. Load IMDb dataset from Hugging Face
dataset = load_dataset("imdb")
train_df = pd.DataFrame(dataset["train"])
test_df = pd.DataFrame(dataset["test"])
print(f"Training set size: {len(train_df)}")
print(f"Test set size    : {len(test_df)}")

README.md: 0.00B [00:00, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Training set size: 25000
Test set size    : 25000


In [5]:
print("Label distribution (train):\n", train_df["label"].value_counts())
print("Label distribution (test):\n", test_df["label"].value_counts())

Label distribution (train):
 label
0    12500
1    12500
Name: count, dtype: int64
Label distribution (test):
 label
0    12500
1    12500
Name: count, dtype: int64


In [6]:
# 2. Text cleaning function
def clean_text(text):
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Remove non-alphanumeric chars except spaces
    text = re.sub(r'[^A-Za-z0-9\s]', '', text)
    # Replace multiple spaces with a single space
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()
    return text

In [7]:
train_df["cleaned_text"] = train_df["text"].apply(clean_text)
test_df["cleaned_text"] = test_df["text"].apply(clean_text)

In [8]:
# 3. Tokenization and Dataset class
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [9]:
from torch.utils.data import Dataset

In [10]:
class IMDbDataset(Dataset):
    def __init__(self, df):
        self.encodings = tokenizer(
            df["cleaned_text"].tolist(),
            truncation=True,
            padding=True,
            max_length=128
        )
        self.labels = df["label"].tolist()

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

train_dataset = IMDbDataset(train_df)
test_dataset = IMDbDataset(test_df)

KeyboardInterrupt: 

In [11]:
num_labels = 2
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels)
model.to(device);

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [12]:
# 5. Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    eval_strategy="epoch", # Changed from evaluation_strategy
    save_strategy="epoch",
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    logging_dir="./logs",
    logging_steps=500,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to="none",  # disable external logging by default
    seed=42,
    save_total_limit=1,
)

In [13]:
# 6. Data collator (helps dynamic padding in batches)
data_collator = DataCollatorWithPadding(tokenizer)

In [16]:
# 7. Metrics function for evaluation
import evaluate
accuracy_metric = evaluate.load("accuracy")

Downloading builder script: 0.00B [00:00, ?B/s]

In [17]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    acc = accuracy_metric.compute(predictions=preds, references=labels)
    return {"accuracy": acc["accuracy"]}

In [18]:
# 8. Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
print("Starting the training")
trainer.train()

Starting the training
{'loss': 0.4145, 'grad_norm': 10.015922546386719, 'learning_rate': 4.201855406269994e-05, 'epoch': 0.3198976327575176}
{'loss': 0.3339, 'grad_norm': 4.364368915557861, 'learning_rate': 3.4021113243761995e-05, 'epoch': 0.6397952655150352}
{'loss': 0.3028, 'grad_norm': 14.418339729309082, 'learning_rate': 2.602367242482406e-05, 'epoch': 0.9596928982725528}
{'eval_loss': 0.29469093680381775, 'eval_accuracy': 0.878, 'eval_runtime': 177.6336, 'eval_samples_per_second': 140.739, 'eval_steps_per_second': 4.402, 'epoch': 1.0}
{'loss': 0.1916, 'grad_norm': 16.769746780395508, 'learning_rate': 1.8026231605886118e-05, 'epoch': 1.2795905310300704}
{'loss': 0.1766, 'grad_norm': 10.06059741973877, 'learning_rate': 1.0028790786948176e-05, 'epoch': 1.599488163787588}
{'loss': 0.1699, 'grad_norm': 12.416043281555176, 'learning_rate': 2.0313499680102367e-06, 'epoch': 1.9193857965451055}


In [None]:
print("\nEvaluating model on test set...")
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")

In [None]:
print("\nGenerating predictions on test set...")
pred_output = trainer.predict(test_dataset)
preds = np.argmax(pred_output.predictions, axis=1)
labels = pred_output.label_ids

In [None]:
print("\nClassification Report:\n")
print(classification_report(labels, preds, target_names=["negative", "positive"]))

In [None]:
cm = confusion_matrix(labels, preds)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["negative", "positive"])
disp.plot(cmap=plt.cm.Blues)
plt.title("Confusion Matrix")
plt.show()

In [None]:
compare_df = pd.DataFrame({
    "text": test_df["cleaned_text"],
    "actual_label": labels,
    "predicted_label": preds
})

In [None]:
print("\nSample predictions:\n")
print(compare_df.sample(10).to_string(index=False))

In [15]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.5-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.5
