In [1]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import nltk
import spacy
import re
import swifter

nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/carminefa/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Load dataset
# lines=True makes pandas parse the file as one JSON object per line.
df = pd.read_json("data/Electronics_5.json", lines=True)

# Filter & map ratings to sentiment
def map_sentiment(rating):
    """Translate a numeric star rating into a sentiment label.

    A rating equal to 3 is "neutral", a rating of 2 or lower is "negative",
    and every other value is "positive".
    """
    if rating == 3:
        return "neutral"
    return "negative" if rating <= 2 else "positive"

# Derive the sentiment label from the star rating.
df['sentiment'] = df['overall'].apply(map_sentiment)
# Fill missing reviews with an empty string before casting: a bare
# astype(str) would turn NaN/None into the literal text "nan"/"None",
# which would then be tokenized as if it were a real word.
df['reviewText'] = df['reviewText'].fillna("").astype(str)


In [None]:
# Load the small English spaCy model with the NER and parser components
# disabled, since only tokenization, tagging and lemmatization are used below.
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])

# Batch cleaning function
def batch_clean(text_series, batch_size=1000, n_process=2):
    """Lemmatize texts with spaCy and drop uninformative tokens.

    Args:
        text_series: Iterable of raw text strings.
        batch_size: Number of texts spaCy buffers per batch.
        n_process: Number of worker processes used by ``nlp.pipe``.

    Returns:
        A list with one cleaned string per input text: the space-joined
        lemmas of every token that is not a stop word, punctuation,
        number-like, or pure whitespace.
    """
    docs = nlp.pipe(text_series, batch_size=batch_size, n_process=n_process)
    return [
        " ".join(
            t.lemma_
            for t in doc
            # is_space filters the tokens spaCy emits for newlines and runs
            # of spaces, which would otherwise leak into the cleaned text.
            if not (t.is_stop or t.is_punct or t.like_num or t.is_space)
        )
        for doc in docs
    ]

# 4. Text cleaning
df['cleaned'] = batch_clean(df['reviewText'])

# 5. Dataset split (80/20, stratified on the sentiment label, fixed seed)
features, targets = df['cleaned'], df['sentiment']
train_texts, test_texts, train_labels, test_labels = train_test_split(
    features,
    targets,
    test_size=0.2,
    stratify=targets,
    random_state=42,
)

SyntaxError: unterminated string literal (detected at line 19) (2239326290.py, line 19)

In [None]:
# Show comparison of original and cleaned reviews
comparison_df = df[['reviewText', 'cleaned', 'sentiment']]
# Display the rows once, as the cell's last expression (rich table output),
# instead of additionally print()-ing the same ten rows as plain text.
comparison_df.head(10)

In [None]:
# === Traditional Models ===

# TF-IDF features
# The vocabulary is capped at 5000 terms. The vectorizer is fitted on the
# training texts only and then reused, already fitted, to transform the
# test texts.
tfidf = TfidfVectorizer(max_features=5000)
X_train = tfidf.fit_transform(train_texts)
X_test = tfidf.transform(test_texts)


In [None]:
# Logistic Regression
# Baseline classifier trained on the TF-IDF training matrix (iteration cap
# set to 1000); its test-set predictions are kept for later evaluation.
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, train_labels)
lr_preds = lr.predict(X_test)

In [None]:
# Naive Bayes
# Multinomial Naive Bayes trained on the same TF-IDF training matrix; its
# test-set predictions are kept for later evaluation.
nb = MultinomialNB()
nb.fit(X_train, train_labels)
nb_preds = nb.predict(X_test)

In [None]:
def evaluate_model(name, preds, true_labels=None):
    """Print a classification report and plot a confusion-matrix heatmap.

    Args:
        name: Display name used in the printed header and the plot title.
        preds: Predicted sentiment labels ("negative"/"neutral"/"positive").
        true_labels: Ground-truth labels aligned with ``preds``. When omitted,
            the notebook-level ``test_labels`` is used, so existing
            two-argument calls behave as before; passing it explicitly lets
            the helper score models evaluated on a different split.
    """
    if true_labels is None:
        true_labels = test_labels
    print(f"\n{name} Classification Report:")
    print(classification_report(true_labels, preds))
    # Fix the label order so matrix rows/columns line up with the tick labels.
    cm = confusion_matrix(true_labels, preds, labels=["negative", "neutral", "positive"])
    sns.heatmap(cm, annot=True, fmt="d", xticklabels=["Neg", "Neu", "Pos"], yticklabels=["Neg", "Neu", "Pos"])
    plt.title(f"{name} Confusion Matrix")
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.show()

# Report both classical baselines with the shared evaluation helper.
for clf_name, clf_preds in (
    ("Logistic Regression", lr_preds),
    ("Naive Bayes", nb_preds),
):
    evaluate_model(clf_name, clf_preds)

# === Transformer-based Model ===

# Tokenizer and 3-class classification model come from the same checkpoint.
checkpoint = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizerFast.from_pretrained(checkpoint)
model = DistilBertForSequenceClassification.from_pretrained(checkpoint, num_labels=3)

In [None]:
# Encode labels
label_map = {'negative': 0, 'neutral': 1, 'positive': 2}
df['label'] = df['sentiment'].map(label_map)


def _tokenize_batch(batch):
    """Tokenize one batch of reviews, padded/truncated to the model's maximum length."""
    return tokenizer(batch['reviewText'], padding='max_length', truncation=True)


# Prepare dataset
# Tokenize batch by batch through Dataset.map instead of encoding the whole
# corpus in a single call and materialising it as one padded tensor that is
# then copied again into the Dataset. Every example is padded to the same
# fixed length, so the Trainer's default collator can still stack them.
dataset = Dataset.from_pandas(df[['reviewText', 'label']], preserve_index=False)
dataset = dataset.map(_tokenize_batch, batched=True, remove_columns=['reviewText'])
# Seed the split so train/test membership is reproducible across runs.
dataset = dataset.train_test_split(test_size=0.2, seed=42)

In [None]:
# Define Trainer
train_split, eval_split = dataset['train'], dataset['test']

# Two epochs, batches of 8, evaluation after every epoch, logging every 100 steps.
training_args = TrainingArguments(
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=100,
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=eval_split,
)

trainer.train()

# Evaluate BERT model on the held-out split
test_output = trainer.predict(eval_split)
# The predicted class is the index of the largest score along the last axis.
bert_preds = test_output.predictions.argmax(-1)
bert_labels = eval_split['label']
sentiment_names = ["negative", "neutral", "positive"]
print("\nDistilBERT Classification Report:")
print(classification_report(bert_labels, bert_preds, target_names=sentiment_names))

## 🌐 Word2Vec Embedding e Confronto

In questa sezione, rappresentiamo ogni review come un vettore denso (dense embedding vector) generato da Word2Vec. 
Questo consente di catturare significati semantici e relazioni tra le parole, migliorando potenzialmente la capacità del classificatore.

**Passaggi principali:**
1. Preprocessing delle recensioni (tokenizzazione).
2. Addestramento modello Word2Vec sul dataset.
3. Conversione di ogni review in un vettore denso (media degli embedding delle parole).
4. Addestramento e valutazione di un classificatore (es. Logistic Regression) usando questi vettori.
5. Confronto delle metriche con il modello precedente (TF-IDF).


In [None]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

# word_tokenize needs the Punkt tokenizer data, which is not downloaded
# anywhere else in this notebook. Depending on the NLTK release the resource
# is named 'punkt' or 'punkt_tab', so request both.
for resource in ("punkt", "punkt_tab"):
    nltk.download(resource, quiet=True)

# Tokenization (the review text lives in 'reviewText'; there is no 'text' column)
tokenized_reviews = [word_tokenize(review.lower()) for review in df['reviewText']]

# Train the Word2Vec model
w2v_model = Word2Vec(sentences=tokenized_reviews, vector_size=100, window=5, min_count=2, workers=4)


def _mean_vector(tokens):
    """Average the embeddings of the in-vocabulary tokens; zero vector if there are none."""
    known = [tok for tok in tokens if tok in w2v_model.wv]
    if not known:
        # Size the fallback from the model rather than repeating the literal 100.
        return np.zeros(w2v_model.vector_size, dtype=np.float32)
    return np.mean(w2v_model.wv[known], axis=0)


# Mean embedding for each document
def document_vector(doc):
    """Return the mean Word2Vec embedding of a raw review string."""
    return _mean_vector(word_tokenize(doc.lower()))


# Reuse the token lists built above instead of tokenizing every review twice.
X_w2v = np.array([_mean_vector(tokens) for tokens in tokenized_reviews])
y = df['label']

# Train/test split, stratified with the same seed and test size as the TF-IDF split
X_train_w2v, X_test_w2v, y_train, y_test = train_test_split(
    X_w2v, y, test_size=0.2, stratify=y, random_state=42
)

# Classifier
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_w2v, y_train)
y_pred = clf.predict(X_test_w2v)

# Evaluation (class names follow the 0/1/2 order of label_map)
print(classification_report(y_test, y_pred, target_names=["negative", "neutral", "positive"]))