In [133]:
# Import Packages
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score
import time

In [134]:
# Load the training and testing splits into pandas.
# Neither file is read with a header row, so the positional columns 0 and 1
# are renamed to "text" (the review) and "good" (the label).
# NOTE(review): "good" looks like a binary sentiment label where 1 means a
# positive review -- confirm against the data files.
train = pd.read_csv('training.txt', header=None).rename(columns={0: "text", 1: "good"})
X_train, y_train = train['text'], train['good']

test = pd.read_csv('testing.txt', header=None).rename(columns={0: "text", 1: "good"})
X_test, y_test = test['text'], test['good']

In [144]:
# Build a stop-word set from scikit-learn's English list, minus the negations.
# "no" and "not" are kept as features so the vectoriser does not discard them.
from sklearn.feature_extraction import text

# Copy into a mutable set so the library's own constant is left untouched.
custom_stop = set(text.ENGLISH_STOP_WORDS)
# remove() (rather than discard()) raises KeyError if a word is ever missing
# from the upstream list, which makes such a change visible immediately.
custom_stop.remove('no')
custom_stop.remove('not')

In [145]:
# Run Naive Bayes algorithm: turn the reviews into word-count vectors, fit a
# multinomial Naive Bayes classifier on the training split and predict the
# test split. Vectorising, fitting and predicting are all inside the timed span.
# perf_counter() is a monotonic clock meant for measuring elapsed intervals;
# only the difference end_nb - start_nb is meaningful.
start_nb = time.perf_counter()
# scikit-learn >= 1.2 validates stop_words strictly and accepts only
# 'english', a list or None -- a set is rejected -- so pass a list.
# sorted() also makes the order deterministic.
cv = CountVectorizer(stop_words = sorted(custom_stop))
# The vocabulary is learned from the training text only ...
train_cv = cv.fit_transform(X_train)
# ... and reused unchanged for the test text.
test_cv = cv.transform(X_test)
mnb = MultinomialNB()
mnb.fit(train_cv, y_train)
predictions = mnb.predict(test_cv)
end_nb = time.perf_counter()

In [146]:
# Report results: score the test-set predictions with each metric in turn,
# then print how long the timed Naive Bayes run took.
print('Accuracy Metrics:')
for label, metric in (
    ('Accuracy:', accuracy_score),
    ('Precision:', precision_score),
    ('Recall:', recall_score),
):
    # Every metric is called the same way: (true labels, predicted labels).
    print(label, metric(y_test, predictions))
print('Time:', end_nb - start_nb, 'seconds')

Accuracy Metrics:
Accuracy: 0.8500100060036022
Precision: 0.8574679354571784
Recall: 0.8365287588294652
Time: 5.8811211585998535 seconds


In [151]:
# Print the n most important negative and positive words.
# A word's importance is the gap between its fitted per-class probabilities:
# exp(feature_log_prob_[k]) turns the log probabilities for class row k back
# into probabilities, and the words with the largest gap in favour of a class
# are listed for that class.
n = 5
class_labels = mnb.classes_
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older installations.
if hasattr(cv, "get_feature_names_out"):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()
# Rows of feature_log_prob_ follow the order of mnb.classes_.
# NOTE(review): assumes row 0 is the negative class and row 1 the positive
# class (labels 0 and 1) -- confirm against class_labels.
prob_gap = np.exp(mnb.feature_log_prob_[1]) - np.exp(mnb.feature_log_prob_[0])
# Sorting (gap, word) tuples in descending order puts the largest gaps first;
# negating the gap ranks the words that favour class row 0 instead.
diff_neg = sorted(zip(-prob_gap, feature_names), reverse=True)[:n]
diff_pos = sorted(zip(prob_gap, feature_names), reverse=True)[:n]
print("Important words in negative reviews")
for coef, feat in diff_neg:
    print(feat)
print("-----------------------------------------")
print("Important words in positive reviews")
for coef, feat in diff_pos:
    print(feat)

Important words in negative reviews
not
no
just
bad
game
-----------------------------------------
Important words in positive reviews
great
fun
best
love
good


In [152]:
# Ask for a free-text review and classify it with the fitted vectoriser and model.
com = input('Enter a new review to predict its sentiment: ')
print('----------------------------------------------------------------------------------')
# Wrap the single string in a Series so the vectoriser receives an iterable of
# documents rather than a bare string.
com_series = pd.Series(com)
user_cv = cv.transform(com_series)
com_prediction = mnb.predict(user_cv)
# A predicted label of 1 is reported as positive; anything else as negative.
print(
    "Prediction is positive review"
    if com_prediction == 1
    else "Prediction is negative review"
)

Enter a new review to predict its sentiment: this game was not good
----------------------------------------------------------------------------------
Prediction is negative review
