In [142]:
from __future__ import print_function, division
from future.utils import iteritems
from builtins import range

In [143]:
import nltk
import numpy as np
from sklearn.utils import shuffle

In [144]:
from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from bs4 import BeautifulSoup

In [145]:
# Single shared lemmatizer instance; my_tokenizer() calls its lemmatize()
# method on every token.
wordnet_lemmatizer = WordNetLemmatizer()

In [146]:
# Load the stopword list (one word per line) into a set for O(1) membership
# tests in my_tokenizer(). The context manager guarantees the file handle is
# closed, which the bare open() call did not.
with open('stopwords.txt') as stopwords_file:
    stopwords = set(line.rstrip() for line in stopwords_file)

In [147]:
# from nltk.corpus import stopwords
# stopwords.words('english')

In [148]:
# Parse the positive-review file and collect every <review_text> element.
# The file is read inside a context manager so the handle is closed, and
# find_all() replaces the deprecated findAll() alias.
with open('electronics/positive.review') as positive_file:
    positive_soup = BeautifulSoup(positive_file.read(), features='html5lib')
positive_reviews = positive_soup.find_all('review_text')

In [149]:
# Parse the negative-review file and collect every <review_text> element.
# The file is read inside a context manager so the handle is closed, and
# find_all() replaces the deprecated findAll() alias.
with open('electronics/negative.review') as negative_file:
    negative_soup = BeautifulSoup(negative_file.read(), features='html5lib')
negative_reviews = negative_soup.find_all('review_text')

In [150]:
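# One-time setup: uncomment and run once to download the 'punkt' tokenizer
# data used by nltk.tokenize.word_tokenize in my_tokenizer().
# import nltk
# nltk.download('punkt')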

In [151]:
# t = positive_reviews[0]
# nltk.tokenize.word_tokenize(t.text)

In [152]:
def my_tokenizer(s):
    """Turn raw review text into a list of cleaned, lemmatized tokens.

    The text is lower-cased and split with nltk's word_tokenize. Tokens of
    two characters or fewer are dropped, the survivors are lemmatized with
    the shared ``wordnet_lemmatizer``, and any lemma found in ``stopwords``
    is removed.

    Parameters
    ----------
    s : str
        Raw text of one review.

    Returns
    -------
    list of str
        The remaining lemmas, in their original order.
    """
    raw_tokens = nltk.tokenize.word_tokenize(s.lower())
    # The length filter runs on the raw token, before lemmatization.
    lemmas = (
        wordnet_lemmatizer.lemmatize(token)
        for token in raw_tokens
        if len(token) > 2
    )
    # The stopword filter runs on the lemma, after lemmatization.
    return [lemma for lemma in lemmas if lemma not in stopwords]

In [153]:
# Vocabulary (token -> feature column) and the next free column index.
word_index_map, current_index = {}, 0
# Per-class lists of token lists, one entry per review.
positive_tokenized, negative_tokenized = [], []
# Raw review texts, kept so misclassified examples can be printed later.
orig_reviews = []

In [154]:
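# One-time setup: uncomment and run once to download the WordNet corpus
# needed by WordNetLemmatizer before the reviews are tokenized below.
# import nltk
# nltk.download('wordnet')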

In [155]:
# Tokenize every review (positives first, then negatives), keep the raw text
# and the token list, and assign each previously unseen token the next free
# column index in the vocabulary.
for reviews, tokenized_bucket in (
    (positive_reviews, positive_tokenized),
    (negative_reviews, negative_tokenized),
):
    for review in reviews:
        orig_reviews.append(review.text)
        tokens = my_tokenizer(review.text)
        tokenized_bucket.append(tokens)
        for token in tokens:
            if token in word_index_map:
                continue
            word_index_map[token] = current_index
            current_index += 1

print("Length of word index map: ", len(word_index_map))

Length of word index map:  10950


In [156]:
def tokens_to_vector(tokens, label):
    """Build one normalized bag-of-words row with the label in the last slot.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of a single review. Every token must be a key of the
        module-level ``word_index_map``.
    label : int or float
        Class label written into the final element of the row.

    Returns
    -------
    numpy.ndarray
        1-D float array of length ``len(word_index_map) + 1``. The first
        ``len(word_index_map)`` entries are token counts divided by the total
        token count (all zeros when ``tokens`` is empty); the last entry is
        ``label``.

    Raises
    ------
    KeyError
        If a token is not present in ``word_index_map``.
    """
    x = np.zeros(len(word_index_map) + 1)
    for t in tokens:
        x[word_index_map[t]] += 1
    # The label slot is still zero here, so the sum is the total token count.
    total = x.sum()
    # A review whose tokens were all filtered out has total == 0; dividing
    # would fill the row with NaN and break model fitting, so leave it as zeros.
    if total > 0:
        x = x / total
    x[-1] = label
    return x

In [157]:
# Total number of reviews across both sentiment classes.
N = sum(len(group) for group in (positive_tokenized, negative_tokenized))

In [158]:
# Assemble the design matrix: one row per review, one column per vocabulary
# token, plus a trailing label column (1 = positive, 0 = negative).
# Positive reviews fill the first rows, negative reviews the rest.
data = np.zeros((N, len(word_index_map) + 1))
i = 0
for label, tokenized_group in ((1, positive_tokenized), (0, negative_tokenized)):
    for tokens in tokenized_group:
        data[i, :] = tokens_to_vector(tokens, label)
        i += 1

In [159]:
# Shuffle the raw texts and the feature rows together so they stay aligned.
# A fixed random_state makes the train/test split, and therefore the reported
# accuracies and "most wrong" examples, reproducible across runs.
orig_reviews, data = shuffle(orig_reviews, data, random_state=42)

In [160]:
# Every column but the last holds features; the last column holds the label.
X, y = data[:, :-1], data[:, -1]

In [161]:
# Hold out the final 100 (already shuffled) rows for testing and train on
# everything before them.
n_test = 100
X_train, X_test = X[:-n_test], X[-n_test:]
y_train, y_test = y[:-n_test], y[-n_test:]

In [162]:
# Fit a logistic-regression classifier with default settings on the training
# rows; fit() returns the estimator itself, so the call can be chained.
model = LogisticRegression().fit(X_train, y_train)
# Bare expression so the cell still displays the fitted estimator.
model

LogisticRegression()

In [163]:
# Mean accuracy on the training rows and on the held-out rows.
train_accuracy = model.score(X_train, y_train)
test_accuracy = model.score(X_test, y_test)
print("Train accuracy: ", train_accuracy)
print("Test accuracy: ", test_accuracy)

Train accuracy:  0.7889473684210526
Test accuracy:  0.72


In [164]:
# List the words whose learned coefficient is far from zero in either
# direction (magnitude above the threshold).
threshold = 0.5
coefficients = model.coef_[0]
for word, index in iteritems(word_index_map):
    weight = coefficients[index]
    if abs(weight) > threshold:
        print(word, weight)

unit -0.6935921505296557
bad -0.7858194467278309
cable 0.7086086684224746
time -0.7098574094388858
've 0.6500090753228062
month -0.698716990803417
sound 1.1094332567567013
lot 0.7173555822879234
you 0.966805563268342
n't -1.9988707616204693
easy 1.8258145082749617
quality 1.4758444161255868
company -0.5685874311539166
item -0.8736159276745926
wa -1.5246736843015476
perfect 0.9475780059370699
fast 0.9365960029534408
ha 0.7104662273086592
price 2.759414665118117
value 0.5561283494330639
money -1.079085979759981
memory 0.9591542351924246
picture 0.6374432009003663
buy -0.9011083429966401
bit 0.6179072265320875
happy 0.6677758379058613
pretty 0.760676965279075
doe -1.3133971480868674
highly 1.06845440270625
recommend 0.7341123317382232
fit 0.5242329315335003
customer -0.6037751948498684
support -0.869308465633903
little 0.9800292445086
returned -0.7955713325302244
excellent 1.3805399092824542
love 1.0648198836359057
home 0.5377267201159748
week -0.7704642373838138
size 0.5346661528567712
u

In [165]:
# Score every review (training and test rows alike): P holds column 1 of
# predict_proba, the probability assigned to the label-1 (positive) class;
# preds holds the predicted labels.
class_probabilities = model.predict_proba(X)
P = class_probabilities[:, 1]
preds = model.predict(X)

In [166]:
# Running extremes for the scan below: the lowest probability seen for a truly
# positive review and the highest seen for a truly negative one.
minP_whenYis1, maxP_whenYis0 = 1, 0
# Text and predicted label of the current worst offender in each class.
wrong_positive_review = wrong_negative_review = None
wrong_positive_prediction = wrong_negative_prediction = None

In [167]:
# Scan every review for the most confidently misclassified example of each
# class: the positive review with the lowest predicted probability below 0.5,
# and the negative review with the highest predicted probability above 0.5.
for i in range(N):
    p, Y = P[i], y[i]
    if Y == 1 and p < 0.5 and p < minP_whenYis1:
        minP_whenYis1 = p
        wrong_positive_review = orig_reviews[i]
        wrong_positive_prediction = preds[i]
    elif Y == 0 and p > 0.5 and p > maxP_whenYis0:
        maxP_whenYis0 = p
        wrong_negative_review = orig_reviews[i]
        wrong_negative_prediction = preds[i]

In [168]:
# Report the worst misclassification in each class together with its text.
positive_report = "Most wrong positive review prob = {}, pred = {}".format(
    minP_whenYis1, wrong_positive_prediction)
negative_report = "Most wrong negative review prob = {}, pred = {}".format(
    maxP_whenYis0, wrong_negative_prediction)
print(positive_report)
print(wrong_positive_review)
print(negative_report)
print(wrong_negative_review)

Most wrong positive review prob = 0.34953533049044305, pred = 0.0

A device like this either works or it doesn't.  This one happens to work

Most wrong negative review prob = 0.602798660943189, pred = 1.0

The Voice recorder meets all my expectations and more
Easy to use, easy to transfer great results

