In [None]:
import re

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import swifter
import torch
from datasets import Dataset
from datasets import DatasetDict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments

# Download the NLTK "stopwords" corpus, then import its reader; the reader is
# used by clean_text() further down to drop English stop words.
nltk.download('stopwords')
from nltk.corpus import stopwords

In [None]:
# Load dataset: the file is read as line-delimited JSON (lines=True), i.e. one
# JSON record per line. Later cells read the 'overall' and 'reviewText' columns.
df = pd.read_json("data/Electronics_5.json", lines=True)

# Map star ratings to sentiment labels (no rows are filtered out)
def map_sentiment(rating):
    """Translate a numeric star rating into a coarse sentiment label.

    Args:
        rating: Numeric rating value.

    Returns:
        "negative" when the rating is 2 or lower, "neutral" when it equals 3,
        and "positive" for every other value.
    """
    if rating <= 2:
        return "negative"
    return "neutral" if rating == 3 else "positive"

# Derive the sentiment label from the star rating stored in 'overall'.
df['sentiment'] = df['overall'].apply(map_sentiment)
# Replace missing review bodies with an empty string before casting: a bare
# astype(str) turns NaN into the literal text "nan", which would then be
# cleaned, vectorised and tokenised as if it were a real review.
df['reviewText'] = df['reviewText'].fillna("").astype(str)


In [None]:
# Text cleaning
# Stop-word sets already loaded from NLTK, keyed by language name.
_STOPWORD_CACHE = {}


def _stopword_set(language="english"):
    """Return the NLTK stop-word list for `language` as a frozenset, loading it once."""
    if language not in _STOPWORD_CACHE:
        _STOPWORD_CACHE[language] = frozenset(stopwords.words(language))
    return _STOPWORD_CACHE[language]


def clean_text(text, stop_words=None):
    """Normalise one review for the bag-of-words models.

    The text is lower-cased, every character that is neither a word character
    nor whitespace is removed, digit runs are removed, and the remaining
    whitespace-separated tokens that appear in `stop_words` are dropped.

    Args:
        text: The raw review as a string.
        stop_words: Optional container of lower-case words to drop (a set is
            recommended for fast lookups). When omitted, the NLTK English
            stop-word list is used; it is loaded once and cached instead of
            being re-read for every single token.

    Returns:
        The kept tokens joined by single spaces (possibly an empty string).
    """
    if stop_words is None:
        stop_words = _stopword_set()
    text = text.lower()
    text = re.sub(r"[^\w\s]", "", text)  # remove punctuation
    text = re.sub(r"\d+", "", text)  # remove digits
    # NOTE(review): punctuation is stripped before the stop-word lookup, so a
    # contraction such as "don't" is compared as "dont" - confirm this is the
    # intended interaction with the stop-word list.
    return " ".join(word for word in text.split() if word not in stop_words)

# Run clean_text over every review through swifter's .apply accessor and keep
# the result in a separate column, so the raw text stays available.
df['cleaned'] = df['reviewText'].swifter.apply(clean_text)

# Train-test split: hold out 20% of the rows, stratified on the sentiment label.
# random_state is fixed so the same split is produced on every run.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['cleaned'], df['sentiment'], test_size=0.2, stratify=df['sentiment'], random_state=42
)


In [None]:
# Show comparison of original and cleaned reviews
# Raw text, cleaned text and label side by side for a quick sanity check.
comparison_df = df[['reviewText', 'cleaned', 'sentiment']]
# Let the notebook render the frame as the cell's last expression; also
# print()-ing it would show the same ten rows a second time as plain text.
comparison_df.head(10)

In [None]:
# === Traditional Models ===

# TF-IDF features
# Vectorizer limited to at most 5000 vocabulary terms (max_features).
tfidf = TfidfVectorizer(max_features=5000)
# Fit on the training texts only, then reuse the fitted vectorizer for the
# test texts so nothing is learned from the held-out rows.
X_train = tfidf.fit_transform(train_texts)
X_test = tfidf.transform(test_texts)


In [None]:
# Logistic Regression
# Build and fit in one expression; fit() hands back the fitted estimator.
lr = LogisticRegression(max_iter=1000).fit(X_train, train_labels)
# Predicted sentiment labels for the held-out TF-IDF matrix.
lr_preds = lr.predict(X_test)

In [None]:
# Naive Bayes
# Build and fit in one expression; fit() hands back the fitted estimator.
nb = MultinomialNB().fit(X_train, train_labels)
# Predicted sentiment labels for the held-out TF-IDF matrix.
nb_preds = nb.predict(X_test)

In [None]:
def evaluate_model(name, preds, true_labels=None):
    """Print a classification report and plot a confusion matrix for one model.

    Args:
        name: Display name used in the printed heading and the plot title.
        preds: Predicted sentiment labels ("negative" / "neutral" / "positive").
        true_labels: Ground-truth labels aligned with `preds`. When omitted,
            the notebook-level `test_labels` from the train/test split is
            used, which keeps existing two-argument calls working.
    """
    if true_labels is None:
        true_labels = test_labels

    print(f"\n{name} Classification Report:")
    print(classification_report(true_labels, preds))

    # Fix the class order so matrix rows/columns line up with the tick labels.
    cm = confusion_matrix(true_labels, preds, labels=["negative", "neutral", "positive"])

    # Draw on a dedicated figure/axes rather than whatever axes pyplot
    # currently considers "current".
    fig, ax = plt.subplots()
    sns.heatmap(
        cm,
        annot=True,
        fmt="d",
        xticklabels=["Neg", "Neu", "Pos"],
        yticklabels=["Neg", "Neu", "Pos"],
        ax=ax,
    )
    ax.set_title(f"{name} Confusion Matrix")
    ax.set_xlabel("Predicted")
    ax.set_ylabel("True")
    plt.show()

# Report each TF-IDF baseline through the same helper, in the same order.
for model_name, model_preds in (
    ("Logistic Regression", lr_preds),
    ("Naive Bayes", nb_preds),
):
    evaluate_model(model_name, model_preds)

# === Transformer-based Model ===

# One checkpoint id shared by tokenizer and model so the two cannot drift apart.
checkpoint = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizerFast.from_pretrained(checkpoint)
# num_labels=3: one output per sentiment class.
model = DistilBertForSequenceClassification.from_pretrained(checkpoint, num_labels=3)

In [None]:
# Encode labels: integer class ids for the sequence-classification model.
label_map = {'negative': 0, 'neutral': 1, 'positive': 2}
df['label'] = df['sentiment'].map(label_map)


def _tokenize_batch(batch):
    """Tokenize one batch of raw reviews into fixed-length model inputs.

    padding='max_length' pads every example to the same width, so examples
    from different batches can still be stacked by the Trainer's default
    collator; truncation=True cuts reviews that exceed the model limit.
    """
    return tokenizer(batch['reviewText'], padding='max_length', truncation=True)


def _build_split(row_index):
    """Build a tokenized Dataset from the `df` rows selected by `row_index`."""
    rows = df.loc[row_index, ['reviewText', 'label']]
    split = Dataset.from_pandas(rows, preserve_index=False)
    # Tokenize batch by batch instead of encoding the whole corpus into one
    # padded tensor; the raw text column is dropped once it has been encoded.
    return split.map(_tokenize_batch, batched=True, remove_columns=['reviewText'])


# Prepare dataset: reuse the row indices of the stratified, seeded sklearn
# split so DistilBERT is trained and scored on exactly the same rows as the
# TF-IDF baselines. A fresh, unseeded Dataset.train_test_split would give a
# different (and non-reproducible) test set, making the comparison invalid.
dataset = DatasetDict({
    'train': _build_split(train_texts.index),
    'test': _build_split(test_texts.index),
})

In [None]:
# Define Trainer: training configuration first, then the Trainer that ties the
# model to the train/test splits prepared above.
training_args = TrainingArguments(
    output_dir="./results",  # where Trainer writes its outputs
    # NOTE(review): newer transformers releases name this argument
    # `eval_strategy` - confirm against the installed version.
    evaluation_strategy="epoch",  # evaluate on eval_dataset after every epoch
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    logging_dir="./logs",
    logging_steps=100,  # log every 100 steps
)

# No compute_metrics or data_collator is passed, so Trainer falls back to its
# defaults for both (presumably loss-only evaluation - TODO confirm).
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
)

# Fine-tune the model in place.
trainer.train()

# Evaluate BERT model: argmax over the last axis of the prediction scores
# gives the predicted class id for each test example.
bert_preds = trainer.predict(dataset['test']).predictions.argmax(-1)
bert_labels = dataset['test']['label']
print("\nDistilBERT Classification Report:")
# Take both the class ids and their display names from label_map. Passing
# labels= explicitly keeps the report at three rows even if a class is missing
# from the evaluated rows; target_names alone would then raise a size-mismatch
# error, and hard-coded names could silently drift from the encoding.
print(classification_report(
    bert_labels,
    bert_preds,
    labels=list(label_map.values()),
    target_names=list(label_map.keys()),
))