In [28]:
from __future__ import print_function, division
from future.utils import iteritems
from builtins import range



import nltk
import numpy as np
from sklearn.utils import shuffle

from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from bs4 import BeautifulSoup


### Import the stopwords
Stop words: Stop Words are words which do not contain important significance to be used in Search Queries. 

Mostly they are words that are commonly used in the English language such as 'as, the, be, are' etc.



In [29]:
wordnet_lemmatizer = WordNetLemmatizer()

# from http://www.lextek.com/manuals/onix/stopwords1.html
stopwords = set(w.rstrip() for w in open('stopwords.txt'))

# note: an alternative source of stopwords
# from nltk.corpus import stopwords
# stopwords.words('english')

#### load the data


In [30]:
# data courtesy of http://www.cs.jhu.edu/~mdredze/datasets/sentiment/index2.html
positive_reviews = BeautifulSoup(open('electronics/positive.review').read())
positive_reviews = positive_reviews.findAll('review_text')

negative_reviews = BeautifulSoup(open('electronics/negative.review').read())
negative_reviews = negative_reviews.findAll('review_text')

### Create balanced classes 
Since there are more positive reviews than negative reviews

#### Method 1
cut the positive group

#### Method 2
oversample the negative reviews

In [31]:
# we can also oversample the negative reviews
diff = len(positive_reviews) - len(negative_reviews)
idxs = np.random.choice(len(negative_reviews), size=diff)
extra = [negative_reviews[i] for i in idxs]
negative_reviews += extra

### tokenize the text using nltk's tokenizer

In [32]:

# notice how it doesn't downcase, so It != it
# not only that, but do we really want to include the word "it" anyway?
# you can imagine it wouldn't be any more common in a positive review than a negative review
# so it might only add noise to our model.
# so let's create a function that does all this pre-processing for us

def my_tokenizer(s):
    s = s.lower() # downcase
    tokens = nltk.tokenize.word_tokenize(s) # split string into words (tokens)
    tokens = [t for t in tokens if len(t) > 2] # remove short words, they're probably not useful
    tokens = [wordnet_lemmatizer.lemmatize(t) for t in tokens] # put words into base form
    tokens = [t for t in tokens if t not in stopwords] # remove stopwords
    return tokens

In [33]:

# let's also save the tokenized versions so we don't have to tokenize again later
word_index_map = {}
current_index = 0 # create a word-to-index map so that we can create our word-frequency vectors later
positive_tokenized = []
negative_tokenized = []
orig_reviews = []

# positive_tokenized
for review in positive_reviews:
    orig_reviews.append(review.text)
    tokens = my_tokenizer(review.text)
    positive_tokenized.append(tokens)
    for token in tokens:
        if token not in word_index_map:
            word_index_map[token] = current_index
            current_index += 1

In [45]:
word_index_map

{'purchased': 0,
 'this': 1,
 'unit': 2,
 'due': 3,
 'frequent': 4,
 'blackout': 5,
 'power': 6,
 'supply': 7,
 'bad': 8,
 'run': 9,
 'cable': 10,
 'modem': 11,
 'router': 12,
 'lcd': 13,
 'monitor': 14,
 'minute': 15,
 'time': 16,
 'save': 17,
 'shut': 18,
 'equally': 19,
 'electronics': 20,
 'receiving': 21,
 'clean': 22,
 'feel': 23,
 'investment': 24,
 'minor': 25,
 'compared': 26,
 'loss': 27,
 'valuable': 28,
 'data': 29,
 'failure': 30,
 'equipment': 31,
 'spike': 32,
 'irregular': 33,
 'amazon': 34,
 'business': 35,
 'day': 36,
 'apc': 37,
 'back-ups': 38,
 '500': 39,
 'recommendation': 40,
 'employee': 41,
 'mine': 42,
 "'ve": 43,
 'month': 44,
 'functioned': 45,
 'properly': 46,
 'unexpected': 47,
 'interruption': 48,
 "'ll": 49,
 'gladly': 50,
 'arises': 51,
 'pro': 52,
 'plug': 53,
 'spacing': 54,
 'adapter': 55,
 'simple': 56,
 'design': 57,
 'cord': 58,
 'con': 59,
 'line': 60,
 'conditioning': 61,
 'usually': 62,
 'expensive': 63,
 'option': 64,
 'wish': 65,
 'separate':

In [34]:
# negative_tokenized
for review in negative_reviews:
    orig_reviews.append(review.text)
    tokens = my_tokenizer(review.text)
    negative_tokenized.append(tokens)
    for token in tokens:
        if token not in word_index_map:
            word_index_map[token] = current_index
            current_index += 1


In [50]:
# how many word we take into account for following sentiment analysis
print("len(word_index_map):", len(word_index_map))

len(word_index_map): 11078


### Create Vector

In [36]:
"""
Return a feature vector for each tokens/sentence
"""
# now let's create our input matrices
def tokens_to_vector(tokens, label):
    x = np.zeros(len(word_index_map) + 1) # last element is for the label
    for t in tokens:
        i = word_index_map[t]
        x[i] += 1
    x = x / x.sum() # normalize it before setting label
    x[-1] = label
    return x

In [55]:
#positive_tokenized

In [57]:
#negative_tokenized

In [52]:

N = len(positive_tokenized) + len(negative_tokenized)
# (N x D+1 matrix - keeping them together for now so we can shuffle more easily later
data = np.zeros((N, len(word_index_map) + 1))
i = 0
for tokens in positive_tokenized:
    xy = tokens_to_vector(tokens, 1)
    data[i,:] = xy
    i += 1

for tokens in negative_tokenized:
    xy = tokens_to_vector(tokens, 0)
    data[i,:] = xy
    i += 1

In [53]:
data

array([[0.02272727, 0.06818182, 0.02272727, ..., 0.        , 0.        ,
        1.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ],
       [0.        , 0.        , 0.08333333, ..., 0.        , 0.        ,
        1.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.04545455, 0.        ,
        0.        ],
       [0.        , 0.05769231, 0.        , ..., 0.        , 0.01923077,
        0.        ]])

## Model training

In [38]:
# shuffle the data and create train/test splits
# try it multiple times!
orig_reviews, data = shuffle(orig_reviews, data)

X = data[:,:-1]
Y = data[:,-1]

# last 100 rows will be test
Xtrain = X[:-100,]
Ytrain = Y[:-100,]
Xtest = X[-100:,]
Ytest = Y[-100:,]

model = LogisticRegression()
model.fit(Xtrain, Ytrain)
print("Train accuracy:", model.score(Xtrain, Ytrain))
print("Test accuracy:", model.score(Xtest, Ytest))




Train accuracy: 0.7768421052631579
Test accuracy: 0.72


In [39]:
# let's look at the weights for each word
# try it with different threshold values!
threshold = 0.5
for word, index in iteritems(word_index_map):
    weight = model.coef_[0][index]
    if weight > threshold or weight < -threshold:
        print(word, weight)


unit -0.604562963983653
bad -0.7408214950495857
cable 0.6636096291620306
time -0.7767619490939867
've 0.7052625666438089
month -0.8169243772939156
sound 1.146497246164248
lot 0.7691833911846235
you 0.9303981802399945
n't -1.994263542190409
easy 1.7194660298106978
quality 1.5678467867890533
company -0.572643825463746
card -0.6239550718761896
item -0.8798083370561465
wa -1.6983905181422518
perfect 0.9650828649898215
fast 0.9355294704572702
ha 0.6519210071782662
price 2.7162404525370305
value 0.5665064940899066
money -1.0237412228195157
memory 1.0031773324780202
picture 0.5838180689803475
buy -0.8806020052280938
bit 0.63462903159244
happy 0.6406290810895101
pretty 0.8131824762044009
doe -1.1070775855868595
highly 0.9541976718013944
recommend 0.6653655359639062
customer -0.6552627452594338
support -0.8432377574371589
little 0.9398524764247875
returned -0.7819794372439891
excellent 1.3795699979075915
love 1.1640165965701539
home 0.506451279748155
week -0.7578044487849013
size 0.519776226495

In [40]:
# check misclassified examples
preds = model.predict(X)
P = model.predict_proba(X)[:,1] # p(y = 1 | x)


In [41]:
# since there are many, just print the "most" wrong samples
minP_whenYis1 = 1
maxP_whenYis0 = 0
wrong_positive_review = None
wrong_negative_review = None
wrong_positive_prediction = None
wrong_negative_prediction = None
for i in range(N):
    p = P[i]
    y = Y[i]
    if y == 1 and p < 0.5:
        if p < minP_whenYis1:
            wrong_positive_review = orig_reviews[i]
            wrong_positive_prediction = preds[i]
            minP_whenYis1 = p
    elif y == 0 and p > 0.5:
        if p > maxP_whenYis0:
            wrong_negative_review = orig_reviews[i]
            wrong_negative_prediction = preds[i]
            maxP_whenYis0 = p


In [42]:
print("Most wrong positive review (prob = %s, pred = %s):" % (minP_whenYis1, wrong_positive_prediction))
print(wrong_positive_review)
print("Most wrong negative review (prob = %s, pred = %s):" % (maxP_whenYis0, wrong_negative_prediction))
print(wrong_negative_review)


Most wrong positive review (prob = 0.35921384811374646, pred = 0.0):

I didn't buy this on Amazon but wanted to say this device is great. The only bad thing was MY laptop is old!  Can't go wrong with this one

Most wrong negative review (prob = 0.5955030628185785, pred = 1.0):

The Voice recorder meets all my expectations and more
Easy to use, easy to transfer great results

