In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

# Read the tab-separated review file, assigning the two column names
# explicitly, and discard any row that has a missing value.
yelp_data = pd.read_csv(
    'yelp_labelled.txt',
    sep="\t",
    names=['text', 'sentiment'],
).dropna()

# Features are the raw review strings; targets are the sentiment labels.
X, y = yelp_data['text'], yelp_data['sentiment']

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Baseline model: raw token counts (CountVectorizer) fed into a multinomial
# Naive Bayes classifier.
bow_steps = [
    ('vectorizer', CountVectorizer()),
    ('classifier', MultinomialNB()),
]
bow_pipeline = Pipeline(steps=bow_steps)

# Fit on the training split, then predict the held-out reviews in one chain
# (Pipeline.fit returns the fitted pipeline itself).
y_pred_bow = bow_pipeline.fit(X_train, y_train).predict(X_test)

# Report overall accuracy followed by the per-class precision/recall table.
print("Bag-of-Words Naive Bayes")
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred_bow))
print(classification_report(y_true=y_test, y_pred=y_pred_bow))

# TF-IDF with Naive Bayes (GaussianNB requires dense input)
#
# NOTE(review): this model differs from the Bag-of-Words one in BOTH the
# vectorizer (TfidfVectorizer vs CountVectorizer) and the classifier
# (GaussianNB vs MultinomialNB), so the accuracy gap between the two reports
# cannot be attributed to TF-IDF weighting alone. Consider also evaluating
# TfidfVectorizer + MultinomialNB for a like-for-like comparison.
def _to_dense(matrix):
    """Return the dense array form of a sparse matrix via ``toarray()``.

    Defined as a named function rather than a lambda so that the pipeline
    holding it can be pickled (e.g. saved with ``pickle`` or ``joblib``);
    a FunctionTransformer wrapping a lambda cannot be.
    """
    return matrix.toarray()


tfidf_pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    # Densify the vectorizer output before it reaches GaussianNB.
    # accept_sparse=True lets the sparse matrix through to _to_dense.
    ('to_dense', FunctionTransformer(_to_dense, accept_sparse=True)),
    ('classifier', GaussianNB())
])

# Train and evaluate
tfidf_pipeline.fit(X_train, y_train)
y_pred_tfidf = tfidf_pipeline.predict(X_test)
print("TF-IDF Naive Bayes")
print("Accuracy:", accuracy_score(y_test, y_pred_tfidf))
print(classification_report(y_test, y_pred_tfidf))


Bag-of-Words Naive Bayes
Accuracy: 0.795
              precision    recall  f1-score   support

         0.0       0.76      0.84      0.80        96
         1.0       0.84      0.75      0.79       104

    accuracy                           0.80       200
   macro avg       0.80      0.80      0.79       200
weighted avg       0.80      0.80      0.79       200

TF-IDF Naive Bayes
Accuracy: 0.695
              precision    recall  f1-score   support

         0.0       0.73      0.57      0.64        96
         1.0       0.67      0.81      0.73       104

    accuracy                           0.69       200
   macro avg       0.70      0.69      0.69       200
weighted avg       0.70      0.69      0.69       200

