# Below is a comprehensive, runnable solution that implements Transformer-based sentiment classification on the Yelp Polarity Reviews dataset by fine-tuning BERT with Hugging Face’s Transformers and Datasets libraries. It covers:

- Data loading and cleaning

- Dataset preparation with tokenization

- Fine-tuning a BERT classifier

- Evaluation with classification report and confusion matrix

- Sample inspection of predictions

In [None]:
# Imports
import re
import torch
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from datasets import load_dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
import matplotlib.pyplot as plt

# Select the GPU when PyTorch reports one as available; otherwise use the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

# 1. Load the Yelp Polarity dataset and convert both splits to DataFrames.
dataset = load_dataset("yelp_polarity")
train_df, test_df = (dataset[split].to_pandas() for split in ('train', 'test'))

# Report split sizes, then the per-label row counts of each split.
print(f"Train set size: {len(train_df)}, Test set size: {len(test_df)}")
train_label_counts = train_df['label'].value_counts()
test_label_counts = test_df['label'].value_counts()
print("Class distribution in train: \n", train_label_counts)
print("Class distribution in test: \n", test_label_counts)

# 2. Define text cleaning function
# Patterns are compiled once because clean_text is applied to every review.
_ESCAPED_NEWLINE_RE = re.compile(r'\\n')  # the two characters: backslash, "n"
_MENTION_RE = re.compile(r'@\w+')
_URL_RE = re.compile(r'http\S+|www\S+')
_NON_ALNUM_RE = re.compile(r'[^A-Za-z0-9\s]')
_WHITESPACE_RE = re.compile(r'\s+')


def clean_text(text):
    """Return a normalised copy of a raw review string.

    Steps, in order:
      1. Replace escaped line breaks (a literal backslash followed by "n")
         with a space. Without this step the punctuation filter below would
         drop only the backslash and glue a stray "n" between two words, and
         the URL pattern would swallow the word following an escaped break.
      2. Remove ``@mention`` tokens.
      3. Remove URLs starting with ``http`` or ``www``.
      4. Remove every character that is not an ASCII letter, digit or
         whitespace.
      5. Collapse runs of whitespace to a single space and trim both ends.

    Args:
        text: The raw review text (a ``str``).

    Returns:
        The cleaned string; empty if nothing survives the filters.
    """
    text = _ESCAPED_NEWLINE_RE.sub(' ', text)  # must run before steps 3 and 4
    text = _MENTION_RE.sub('', text)  # remove mentions if any
    text = _URL_RE.sub('', text)  # remove URLs
    text = _NON_ALNUM_RE.sub('', text)  # remove punctuation
    return _WHITESPACE_RE.sub(' ', text).strip()  # remove extra spaces

# Store a cleaned copy of every review next to the raw text, for both splits.
train_df['cleaned_text'] = train_df['text'].map(clean_text)
test_df['cleaned_text'] = test_df['text'].map(clean_text)

# 3. Prepare Dataset class for tokenization
from torch.utils.data import Dataset

# Tokenizer loaded from the same 'bert-base-uncased' checkpoint name that the
# classifier is loaded from further down.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

class YelpDataset(Dataset):
    """Map-style dataset of tokenized reviews paired with their labels.

    All reviews are tokenized once in ``__init__`` and truncated to
    ``max_length`` tokens. They are deliberately not padded at this stage:
    each example keeps its own length and must be padded per batch by a
    collator (this script passes ``DataCollatorWithPadding`` to the Trainer).
    Padding here would pad every review against the whole corpus, which costs
    memory and compute and makes the per-batch collator pointless.

    Args:
        df: DataFrame with a ``cleaned_text`` column (strings) and a
            ``label`` column.
        text_tokenizer: Callable used to tokenize the list of texts. When
            omitted, the module-level ``tokenizer`` is used.
        max_length: Maximum number of tokens kept per review.
    """

    def __init__(self, df, text_tokenizer=None, max_length=128):
        if text_tokenizer is None:
            # Fall back to the script-wide tokenizer (original behaviour).
            text_tokenizer = tokenizer
        self.encodings = text_tokenizer(
            df['cleaned_text'].tolist(),
            truncation=True,
            padding=False,  # padding is left to the batch collator
            max_length=max_length,
        )
        self.labels = df['label'].tolist()

    def __len__(self):
        """Return the number of examples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, including ``labels``."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

# Tokenize both splits (train first, then test).
train_dataset, test_dataset = YelpDataset(train_df), YelpDataset(test_df)

# 4. Load BERT for sequence classification (num_labels=2) and place it on the
#    device selected earlier.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model = model.to(device)

# 5. Define training arguments
# NOTE(review): `evaluation_strategy` is reportedly renamed to `eval_strategy`
# in newer transformers releases — confirm against the installed version.
training_args = TrainingArguments(
    # Output locations and logging.
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=500,
    report_to='none',
    # Training length, batch sizes and seed.
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    seed=42,
    # Evaluate and save on the same per-epoch schedule, then reload the
    # checkpoint with the highest 'accuracy'.
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    greater_is_better=True,
)

# 6. Data collator: pads the examples of each batch using the tokenizer.
data_collator = DataCollatorWithPadding(tokenizer)

# 7. Load the accuracy metric that compute_metrics relies on.
import evaluate
accuracy_metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: A pair that unpacks into ``(logits, labels)``.

    Returns:
        The result of ``accuracy_metric.compute`` for the arg-max class of
        each row of logits, compared against ``labels``.
    """
    scores, references = eval_pred
    # The predicted class is the index of the largest score on the last axis.
    predicted_classes = np.argmax(scores, axis=-1)
    return accuracy_metric.compute(predictions=predicted_classes, references=references)

# 8. Initialize Trainer
# Hold out part of the training data for the per-epoch evaluation that drives
# best-checkpoint selection (load_best_model_at_end=True). Selecting the
# checkpoint on the test set would leak test information into the model choice
# and inflate the test metrics reported afterwards.
VALIDATION_FRACTION = 0.05
num_validation = int(len(train_dataset) * VALIDATION_FRACTION)
train_split, validation_split = torch.utils.data.random_split(
    train_dataset,
    [len(train_dataset) - num_validation, num_validation],
    generator=torch.Generator().manual_seed(42),  # reproducible split
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=validation_split,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# 9. Train the model
trainer.train()

# 10. Evaluate on test set
# The test set is passed explicitly because the Trainer's default evaluation
# dataset is now the validation split.
eval_results = trainer.evaluate(eval_dataset=test_dataset)
print(f"\nEvaluation results: {eval_results}")

# 11. Predict on test set and classification report
predictions_output = trainer.predict(test_dataset)
labels = predictions_output.label_ids
# Predicted class = index of the largest value along axis 1 of the predictions.
preds = np.argmax(predictions_output.predictions, axis=1)

print("\nClassification Report:")
report = classification_report(labels, preds, target_names=['Negative', 'Positive'])
print(report)

# 12. Plot confusion matrix
cm = confusion_matrix(labels, preds)
disp = ConfusionMatrixDisplay(cm, display_labels=['Negative', 'Positive'])
disp.plot(cmap=plt.cm.Blues)
# Title the axes the display has just drawn on.
disp.ax_.set_title("Confusion Matrix")
plt.show()

# 13. Inspect some predictions with text
# Cleaned review text alongside the true and predicted labels; the frame keeps
# test_df's index.
results_df = (
    test_df[['cleaned_text']]
    .rename(columns={'cleaned_text': 'text'})
    .assign(actual_label=labels, predicted_label=preds)
)

print("\nSample predictions:")
sample_rows = results_df.sample(10)  # unseeded: a different sample each run
print(sample_rows)



## Notes:
- The model is fine-tuned for 2 epochs; increasing the number of epochs may improve accuracy, at the cost of longer runtime and more compute.

- max_length=128 caps each tokenized review at 128 tokens, trading some context for faster training.

- The compute_metrics function computes accuracy during evaluation.

- You can further extend the analysis by inspecting misclassified examples or experimenting with learning rates and batch sizes. 