<a href="https://colab.research.google.com/github/anis-mselmi/Restaurant-Review-Sentiment-Analysis/blob/main/Restaurant_Review_Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install --upgrade pip
!pip install transformers datasets evaluate accelerate scikit-learn sentencepiece


In [None]:
import inspect

import numpy as np
import pandas as pd
import torch
from datasets import load_dataset, Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    DistilBertForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    pipeline
)
import evaluate
from sklearn.metrics import classification_report, accuracy_score

# Report whether a CUDA device is visible to torch.
if torch.cuda.is_available():
    print("Using device:", "cuda")
else:
    print("Using device:", "cpu")


In [None]:
# Load the dataset by name; the result is indexed by split below.
dataset_name = "yelp_polarity"
raw_datasets = load_dataset(path=dataset_name)

# Show the overall structure, then the first training example as a sanity check.
print(raw_datasets)
first_train_example = raw_datasets["train"][0]
print(first_train_example)


In [None]:
# The number of classes is the number of distinct values in the training
# split's "label" column.
distinct_labels = raw_datasets["train"].unique("label")
num_labels = len(distinct_labels)
print("Number of labels:", num_labels)


In [15]:
model_checkpoint = "distilbert-base-uncased"
max_length = 256
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)


def preprocess_function(examples):
    """Tokenize the "text" field of a batch, truncating to `max_length`.

    No padding is requested here; the Trainer cell supplies a
    DataCollatorWithPadding that pads each batch.
    """
    return tokenizer(examples["text"], max_length=max_length, truncation=True)


# Tokenize every split in batches.
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

# Keep only these three columns; everything else present on the train split
# is removed from the tokenized datasets.
model_columns = {"input_ids", "attention_mask", "label"}
extra_columns = [
    name
    for name in tokenized_datasets["train"].column_names
    if name not in model_columns
]
tokenized_datasets = tokenized_datasets.remove_columns(extra_columns)

# Return torch tensors when the datasets are indexed.
tokenized_datasets.set_format("torch")


KeyboardInterrupt: 

In [None]:
# Attach human-readable class names to the model config so that the saved
# checkpoint (and any pipeline built from it) reports "negative"/"positive"
# instead of the opaque defaults "LABEL_0"/"LABEL_1".
# Per the yelp_polarity dataset card, class id 0 is the negative class and
# class id 1 is the positive class. For any other label count, fall back to
# the same generic names transformers would have generated.
if num_labels == 2:
    id2label = {0: "negative", 1: "positive"}
else:
    id2label = {idx: f"LABEL_{idx}" for idx in range(num_labels)}
label2id = {name: idx for idx, name in id2label.items()}

# Load the pretrained encoder with a classification head of `num_labels` outputs.
model = DistilBertForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)


In [None]:
# Metric objects are loaded once and reused on every evaluation call.
metric_acc = evaluate.load("accuracy")
metric_f1 = evaluate.load("f1")


def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        eval_pred: A pair that unpacks into (logits, labels).

    Returns:
        A dict with keys "accuracy" and "f1".
    """
    scores, gold = eval_pred
    # The predicted class is the index of the largest logit along the last axis.
    predicted = np.argmax(scores, axis=-1)
    accuracy_result = metric_acc.compute(predictions=predicted, references=gold)
    f1_result = metric_f1.compute(
        predictions=predicted, references=gold, average="weighted"
    )
    return {"accuracy": accuracy_result["accuracy"], "f1": f1_result["f1"]}


In [None]:
# Two keyword arguments used here were renamed in newer transformers releases:
#   TrainingArguments(evaluation_strategy=...) -> eval_strategy
#   Trainer(tokenizer=...)                     -> processing_class
# The install cell does not pin a transformers version, so hard-coding either
# spelling can raise "TypeError: unexpected keyword argument" depending on
# what gets installed. Choose the spelling the installed release accepts.
_eval_strategy_kwarg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)
_tokenizer_kwarg = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

training_args = TrainingArguments(
    output_dir="./distilbert-sentiment",
    save_strategy="epoch",
    save_total_limit=2,
    learning_rate=2e-5,
    # Use a smaller per-device batch when no GPU is available.
    per_device_train_batch_size=16 if torch.cuda.is_available() else 8,
    per_device_eval_batch_size=32,
    num_train_epochs=2,
    weight_decay=0.01,
    # Evaluate and checkpoint on the same schedule ("epoch") so the best
    # checkpoint by F1 can be restored when training finishes.
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    push_to_hub=False,
    **{_eval_strategy_kwarg: "epoch"},
)

# NOTE(review): the "test" split is used as eval_dataset, so checkpoint
# selection (load_best_model_at_end) sees the same data that is later reported
# as the final test score. Consider holding out a validation split from
# "train" for model selection instead.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    # Pads each batch to the length of its longest sequence.
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
    **{_tokenizer_kwarg: tokenizer},
)


In [None]:
# Run fine-tuning and keep the returned training summary.
train_result = trainer.train()

# Write the final model to the directory the inference cell loads from.
trainer.save_model(output_dir="./distilbert-sentiment")


In [None]:
# Run a single inference pass over the test split. predict() returns the
# logits, the gold labels and the compute_metrics() results together, so a
# separate trainer.evaluate() call on the same split (a second full pass over
# the data) is unnecessary. metric_key_prefix="eval" keeps the metric keys
# named "eval_*", matching what evaluate() reports.
preds_output = trainer.predict(tokenized_datasets["test"], metric_key_prefix="eval")
metrics = preds_output.metrics
print("Evaluation:", metrics)

# Predicted class = index of the largest logit for each example.
preds = np.argmax(preds_output.predictions, axis=-1)
labels = preds_output.label_ids

print("Accuracy:", accuracy_score(labels, preds))
print("\nClassification Report:\n")
print(classification_report(labels, preds, digits=4))


In [None]:
# Build an inference pipeline from the checkpoint saved after training.
# The deprecated `return_all_scores=False` argument is omitted: returning only
# the top-scoring label as a single dict per input is already the default.
# (Passing `top_k=1` instead would wrap each result in a list and break the
# `res['label']` access below.)
sentiment_pipeline = pipeline(
    "text-classification",
    model="./distilbert-sentiment",
    tokenizer=tokenizer,
    # Pipeline convention: GPU index 0 when CUDA is available, -1 for CPU.
    device=0 if torch.cuda.is_available() else -1
)

custom_reviews = [
    "The tacos were amazing and service was great!",
    "Food was cold and staff were rude. I will not come back.",
    "Decent meal for the price, nothing special."
]

# Use a dedicated name so the evaluation cell's `preds` array is not
# overwritten with a list of dicts.
custom_review_preds = sentiment_pipeline(custom_reviews)
for text, res in zip(custom_reviews, custom_review_preds):
    print(f"Review: {text}\n -> label: {res['label']}, score: {res['score']:.4f}\n")
