In [2]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score


In [None]:
# Load the dataset and discard rows that have no review text
file_path = 'input/output.csv'
data = pd.read_csv(file_path).dropna(subset=['content'])

# Row count once missing reviews are gone, i.e. before the alphabet filter
print("Numero di righe prima del filtro:", len(data))

# Whitelist of accepted characters: ASCII letters, digits, whitespace (as
# matched by \s) and the punctuation marks . , ! ? ' "
_ENGLISH_ALPHABET_RE = re.compile(r'[A-Za-z0-9\s.,!?\'"]+')


def is_english_alphabet(text):
    """Tell whether ``text`` is made up only of whitelisted characters.

    Parameters
    ----------
    text : object
        Value to check. Anything that is not a ``str`` (``None``, NaN,
        numbers, ...) is rejected instead of raising ``TypeError``, so the
        function is safe to ``apply`` over a column with mixed types.

    Returns
    -------
    bool
        ``True`` if ``text`` is a non-empty string in which every character
        is an ASCII letter, a digit, whitespace or one of ``. , ! ? ' "``;
        ``False`` otherwise (including for the empty string).
    """
    if not isinstance(text, str):
        return False
    # fullmatch requires the whole string to be covered by the character class
    return _ENGLISH_ALPHABET_RE.fullmatch(text) is not None


# Flag every review that passes the alphabet check, then keep only those rows
english_mask = data['content'].apply(is_english_alphabet)
data['is_english'] = english_mask
data = data[english_mask]
print("Numero di righe dopo il filtro:", len(data))

# Label balance of the rows that are left
print("Distribuzione delle classi:\n", data['is_positive'].value_counts())

Numero di righe prima del filtro: 200723
Numero di righe dopo il filtro: 141278
Distribuzione delle classi:
 is_positive
Positive    73178
Negative    68100
Name: count, dtype: int64


In [4]:
# Model input (review text) and target (sentiment label)
X, y = data['content'], data['is_positive']

# Hold out 30% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both partitions; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)


In [5]:
# Convert the text into numeric TF-IDF features: English stop words are
# removed and the vocabulary is capped at 10,000 terms.
tfidf = TfidfVectorizer(max_features=10000, stop_words="english")

# The vectoriser is fitted on the training split only; the test split is
# merely transformed with that fitted vectoriser.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)


## Addestramento del modello

In [6]:
# Fit a multinomial Naive Bayes classifier on the TF-IDF training matrix
# (fit returns the estimator itself, so construction and fitting are chained)
model = MultinomialNB().fit(X_train_tfidf, y_train)

# Predicted labels for the held-out reviews
y_pred = model.predict(X_test_tfidf)


In [7]:
# Evaluation on the held-out split: overall accuracy first...
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)

# ...then the per-class precision / recall / F1 table
per_class_report = classification_report(y_test, y_pred)
print("\nClassification Report:\n", per_class_report)


Accuracy: 0.8236362778406946

Classification Report:
               precision    recall  f1-score   support

    Negative       0.85      0.77      0.81     20430
    Positive       0.80      0.87      0.84     21954

    accuracy                           0.82     42384
   macro avg       0.83      0.82      0.82     42384
weighted avg       0.83      0.82      0.82     42384



In [8]:
# Side-by-side view of a few test reviews with their predicted and true labels
comparison_columns = {
    'review': X_test,
    'predicted': y_pred,
    'actual': y_test,
}
# Replace the index inherited from the split with a fresh 0..n-1 index
test_results = pd.DataFrame(comparison_columns).reset_index(drop=True)

print(test_results.head(10))

                                              review predicted    actual
0                  Source engine is cool and bright.  Positive  Positive
1  Great game I'll recommend the game to others a...  Positive  Positive
2      having aids is more enjoyable then this shite  Negative  Negative
3              Great game to play with your friends!  Positive  Positive
4                           Why is this game dead...  Negative  Positive
5                   one of the best games ever probs  Positive  Positive
6  yep, still fun after all these years. Been pla...  Positive  Positive
7  Old and no players but still better than fortnite  Positive  Positive
8             cs 1.6 is better but css is still good  Positive  Positive
9                     ONE OF THE BEST GAME EVER!!!!!  Positive  Positive


In [9]:
# Quick sanity check on two hand-written reviews
new_reviews = [
    "This game is amazing!",
    "It's a buggy mess.",
]

# Vectorise with the already-fitted TF-IDF vectoriser, then classify
new_reviews_tfidf = tfidf.transform(new_reviews)
predictions = model.predict(new_reviews_tfidf)
print(predictions)


['Positive' 'Negative']


## Predizioni su nuove recensioni

In [11]:

# Score an external batch of reviews with the already-fitted vectoriser and
# classifier.
new_file_path = 'input/new_reviews.csv'

# Drop rows without review text, as is done for the training data:
# TfidfVectorizer raises a ValueError when it is handed a NaN document.
# The explicit copy gives an independent frame, so the column assignment
# further down does not write into a derived slice.
new_data = pd.read_csv(new_file_path).dropna(subset=['content']).copy()

# Reuse the fitted vectoriser (transform only, no re-fitting)
new_reviews_tfidf = tfidf.transform(new_data['content'])

# Predict a label for every remaining review
new_predictions = model.predict(new_reviews_tfidf)

# Attach the predictions to the dataframe
new_data['predicted'] = new_predictions

# Show every review next to its predicted label
print("\nNuove recensioni con predizioni:\n")
print(new_data[['content', 'predicted']].to_string(index=False))



Nuove recensioni con predizioni:

                                                         content predicted
           This product is amazing! It exceeded my expectations.  Positive
                I am very disappointed. The quality is terrible.  Negative
     Fast delivery and great customer service. Highly recommend!  Negative
     The item arrived broken and the support team was unhelpful.  Negative
                     Good value for money. I would buy it again.  Negative
            Terrible experience. Will not order from here again.  Negative
                       Absolutely love it! Perfect for my needs.  Positive
                              Not as described. Very misleading.  Negative
           Fantastic quality and easy to use. Worth every penny.  Positive
        The worst purchase I have ever made. Avoid at all costs.  Negative
       The packaging was great, and the product works perfectly.  Positive
      Awful customer service. They never responded to my emails. 