In [1]:
# Import Packages
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score
import time

In [2]:
# Load the training and testing splits. Neither file has a header row, so the
# two positional columns are renamed to "text" (the review) and "good" (the label).
train = pd.read_csv('training.txt', header=None).rename(columns={0: "text", 1: "good"})
X_train, y_train = train['text'], train['good']

test = pd.read_csv('testing.txt', header=None).rename(columns={0: "text", 1: "good"})
X_test, y_test = test['text'], test['good']

In [3]:
# Run the Naive Bayes pipeline: vectorise -> fit -> predict.
# The timed span covers all three steps, so the reported time includes
# building the vocabulary, not just fitting the classifier.
# perf_counter() is used instead of time.time() because it is a monotonic,
# high-resolution clock intended for measuring elapsed intervals; time.time()
# follows the wall clock and can jump if the system time is adjusted.
start_nb = time.perf_counter()

# Bag-of-words counts with English stop words removed. The vocabulary is
# learned from the training text only and then reused for the test text.
cv = CountVectorizer(stop_words="english")
train_cv = cv.fit_transform(X_train)
test_cv = cv.transform(X_test)

# Multinomial Naive Bayes on the raw term counts.
mnb = MultinomialNB()
mnb.fit(train_cv, y_train)
predictions = mnb.predict(test_cv)

end_nb = time.perf_counter()

In [4]:
# Report the test-set scores followed by the elapsed time of the previous cell.
for score_label, score_fn in (
    ('Accuracy:', accuracy_score),
    ('Precision:', precision_score),
    ('Recall:', recall_score),
):
    print(score_label, score_fn(y_test, predictions))

elapsed_nb = end_nb - start_nb
print('Time:', elapsed_nb, 'seconds')

Accuracy: 0.8460076045627376
Precision: 0.8505747126436781
Recall: 0.8363269424823411
Time: 5.734030723571777 seconds


In [5]:
# Rank the vocabulary by per-class log-probability, most probable word first.
# Row 0 of feature_log_prob_ is used as the negative class, row 1 as the positive.
neg_class_prob_sorted = mnb.feature_log_prob_[0, :].argsort()[::-1]
pos_class_prob_sorted = mnb.feature_log_prob_[1, :].argsort()[::-1]

# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2. Prefer its replacement and fall back only on old releases.
# The vocabulary is fetched once and reused for both classes.
feature_names = (
    cv.get_feature_names_out()
    if hasattr(cv, "get_feature_names_out")
    else cv.get_feature_names()
)

# Ten most probable words for each class. These lists overlap heavily because
# frequent words are frequent in both classes.
print(np.take(feature_names, neg_class_prob_sorted[:10]))
print(np.take(feature_names, pos_class_prob_sorted[:10]))

['game' 'just' 'like' 'games' 'play' 'time' 'don' 'really' 'good' 'fun']
['game' 'like' 'games' 'play' 'just' 'great' 'good' 'fun' 'time' 'really']


In [6]:
def important_features(vectorizer, classifier, n=10):
    """Print the n words that most distinguish negative from positive reviews.

    For every vocabulary word the per-class probabilities are recovered by
    exponentiating ``classifier.feature_log_prob_`` and the two classes are
    subtracted. Words with the largest (negative - positive) gap are listed as
    negative-review words; words with the largest (positive - negative) gap
    are listed as positive-review words.

    Parameters
    ----------
    vectorizer : fitted vectorizer
        Must expose ``get_feature_names_out()`` (scikit-learn >= 1.0) or the
        legacy ``get_feature_names()``.
    classifier : fitted classifier
        Must expose ``feature_log_prob_``. Row 0 is treated as the negative
        class and row 1 as the positive class.
    n : int, default 10
        Number of words printed per class.

    Returns
    -------
    None
        The result is printed, not returned.
    """
    # get_feature_names() was removed in scikit-learn 1.2; use the replacement
    # when available and fall back to the legacy method on older releases.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    # Per-word probability gap between the two classes, computed once.
    prob_gap = np.exp(classifier.feature_log_prob_[0]) - np.exp(classifier.feature_log_prob_[1])

    # Sorting (gap, word) pairs in descending order puts the widest gaps first.
    # Negating the gap gives the positive-class ranking without recomputing it.
    diff_neg = sorted(zip(prob_gap, feature_names), reverse=True)[:n]
    diff_pos = sorted(zip(-prob_gap, feature_names), reverse=True)[:n]

    print("Important words in negative reviews")

    for _, feat in diff_neg:
        print(feat)

    print("-----------------------------------------")
    print("Important words in positive reviews")

    for _, feat in diff_pos:
        print(feat)


    

In [7]:
# Show the five most class-distinguishing words for the fitted model.
important_features(cv, mnb, n=5)

Important words in negative reviews
just
game
bad
money
don
-----------------------------------------
Important words in positive reviews
great
fun
best
love
good


In [9]:
# Ask for a free-text review and classify it with the fitted vectorizer and model.
com = input('Enter a new review to predict its sentiment: ')
print('----------------------------------------------------------------------------------')

# Wrap the single string in a Series so the vectorizer sees one document.
com_series = pd.Series(com)
user_cv = cv.transform(com_series)
com_prediction = mnb.predict(user_cv)

# com_prediction holds exactly one label; 1 means a positive review.
print(
    "Prediction is positive review"
    if com_prediction == 1
    else "Prediction is negative review"
)

Enter a new review to predict its sentiment: not the best game out there
----------------------------------------------------------------------------------
Prediction is positive review
