In [1]:
import pandas
import pickle
import cleantext
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
def _load_pickle(path):
    """Return the object deserialized from the pickle file at ``path``.

    The file is opened in binary mode and is closed again before the
    function returns.
    """
    # SECURITY: pickle.load can execute arbitrary code during deserialization.
    # Only point this at artifacts you produced yourself, never at untrusted files.
    with open(path, "rb") as pickle_file:
        return pickle.load(pickle_file)


# All artifacts are read from the relative "data/" directory
# (presumably written by an earlier preprocessing step -- confirm).
df = _load_pickle("data/processed_df.pickle")

# Feature matrices and label vectors for the train / test split.
train_features = _load_pickle("data/train_features.pickle")
test_features = _load_pickle("data/test_features.pickle")
train_labels = _load_pickle("data/train_labels.pickle")
test_labels = _load_pickle("data/test_labels.pickle")

# Vectorizer object; later cells call tfidf.transform() on new review text.
tfidf = _load_pickle("data/tfidf.pickle")

In [3]:
# Multinomial Naive Bayes with scikit-learn's default hyper-parameters,
# fitted on the training feature matrix and its labels.
mnb_classifier = MultinomialNB()
# Left as the cell's last expression so the fitted estimator's repr is displayed.
mnb_classifier.fit(X=train_features, y=train_labels)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [4]:
# Despite its name, `mnb_predictor` holds the predicted labels for the test
# features (one per row); the evaluation cells below reuse it.
mnb_predictor = mnb_classifier.predict(X=test_features)

## Check model accuracy

In [5]:
# Accuracy on the data the model was fitted on; compare it with the test
# accuracy computed in the next cell to spot over-fitting.
train_predictions = mnb_classifier.predict(train_features)
train_accuracy = accuracy_score(y_true=train_labels, y_pred=train_predictions)
print(train_accuracy)

0.82984


In [6]:
# Accuracy of the test-set predictions stored in `mnb_predictor`.
test_accuracy = accuracy_score(y_true=test_labels, y_pred=mnb_predictor)
print(test_accuracy)

0.82824


In [7]:
# Per-class precision, recall, F1 and support for the test-set predictions.
test_report = classification_report(y_true=test_labels, y_pred=mnb_predictor)
print(test_report)

              precision    recall  f1-score   support

           0       0.82      0.83      0.83     12457
           1       0.83      0.82      0.83     12543

    accuracy                           0.83     25000
   macro avg       0.83      0.83      0.83     25000
weighted avg       0.83      0.83      0.83     25000



In [8]:
# scikit-learn convention: rows are the true labels, columns the predicted ones.
test_confusion = confusion_matrix(y_true=test_labels, y_pred=mnb_predictor)
print(test_confusion)

[[10394  2063]
 [ 2231 10312]]


## Test the model on reviews not in the dataset

In [9]:
# Human-readable sentiment name -> numeric class label used by the classifier.
sentiment_map = {"Negative": 0, "Positive": 1}


def get_sentiment(text):
    """Return the sentiment name the classifier predicts for one piece of text.

    Args:
        text: A single review string, vectorized with ``tfidf`` as-is.

    Returns:
        The matching key of ``sentiment_map``, or ``None`` if the predicted
        label is not one of its values.
    """
    # predict() returns one label per input row. Exactly one row is passed in,
    # so take its single element instead of handing the whole array to get_name.
    predicted_ids = mnb_classifier.predict(tfidf.transform([text]))
    return get_name(predicted_ids[0])


def get_name(sentiment_id):
    """Translate a numeric class label into its sentiment name.

    Args:
        sentiment_id: The class label. A plain number, a NumPy scalar, or a
            one-element NumPy array (what ``predict`` returns for one row)
            are all accepted.

    Returns:
        The key of ``sentiment_map`` whose value equals ``sentiment_id``,
        or ``None`` when no entry matches.
    """
    # Unwrap NumPy scalars / size-1 arrays into a plain Python value so the
    # comparison below never depends on the truth value of an array.
    if hasattr(sentiment_id, "item"):
        sentiment_id = sentiment_id.item()
    return next(
        (name for name, id_ in sentiment_map.items() if id_ == sentiment_id),
        None,
    )
    
def sentiment_analysis(text):
    """Clean a raw review, classify it, and print the predicted sentiment.

    The text is first passed through ``cleantext.clean`` with ``all=True``
    and the result is handed to ``get_sentiment``. Nothing is returned; the
    outcome is printed as ``Sentiment: <name>``.
    """
    # NOTE(review): presumably this mirrors the cleaning applied to the
    # training data before the TF-IDF vectorizer was fitted -- confirm.
    cleaned_text = cleantext.clean(text, all=True)
    print(f'Sentiment: {get_sentiment(cleaned_text)}')

In [13]:
# A 1-star review that is not part of the dataset; the model should call it negative.
# Adjacent string literals are concatenated, so the review text is unchanged.
neg_review = (
    "The candy is awful and they totally scam you on the price. "
    "The jar is only the size of a soda can, and looks absolutely nothing like the picture. "
    "I will not be buying this again. "
    "If your gonna sell something that small make the price around $5."
)

sentiment_analysis(neg_review)

Sentiment: Negative


In [11]:
# A 5-star review that is not part of the dataset; the model should call it positive.
# Adjacent string literals are concatenated, so the review text (including the
# raw "<br />" tags) is unchanged.
pos_review = (
    "This candy has a very good flavor. "
    "It is quite unlike anything commonly available from the supermarket candy aisle."
    "<br /><br />"
    "I live in the middle of the corn-belt, so if you have access to a \"real\" candy store "
    "you may not find it so unique."
)

sentiment_analysis(pos_review)

Sentiment: Positive
