# Review Sentiment Analysis using Python(NLTK/SKLEARN/pandas/numpy/matplotlib/seaborn)

# data courtesy of http://www.cs.jhu.edu/~mdredze/datasets/sentiment/index2.html

In [None]:
# Import the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import string

import nltk
from nltk.stem import WordNetLemmatizer

from bs4 import BeautifulSoup

from sklearn.utils import shuffle
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

In [13]:
#Get data
positive_reviews = BeautifulSoup(open('electronics/positive.review').read(),'lxml')
positive_reviews = positive_reviews.findAll('review_text')

negative_reviews = BeautifulSoup(open('electronics/negative.review').read(),'lxml')
negative_reviews = negative_reviews.findAll('review_text')

In [14]:
#Preprocessing text
wordnet_lemmatizer = WordNetLemmatizer()
stopwords = set(w.rstrip() for w in open('stopwords.txt'))

def my_tokenizer(s):
    s = s.lower() # downcase
    tokens = nltk.tokenize.word_tokenize(s) # split string into words (tokens)
    tokens = [t for t in tokens if len(t) > 2] # remove short words, they're probably not useful
    tokens = [wordnet_lemmatizer.lemmatize(t) for t in tokens] # put words into base form
    tokens = [t for t in tokens if t not in stopwords] # remove stopwords
    return tokens
    

In [15]:
#create word to index map
word_index_map = {}
current_index = 0
positive_tokenized = []
negative_tokenized = []
orig_reviews = []

for review in positive_reviews:
    orig_reviews.append(review.text)
    tokens = my_tokenizer(review.text)
    positive_tokenized.append(tokens)
    for token in tokens:
        if token not in word_index_map:
            word_index_map[token] = current_index
            current_index += 1

for review in negative_reviews:
    orig_reviews.append(review.text)
    tokens = my_tokenizer(review.text)
    negative_tokenized.append(tokens)
    for token in tokens:
        if token not in word_index_map:
            word_index_map[token] = current_index
            current_index += 1

print("len(word_index_map):", len(word_index_map))

len(word_index_map): 11088


In [16]:
#Create input matrics
def tokens_to_vector(tokens, label):
    x = np.zeros(len(word_index_map) + 1) # last element is for the label
    for t in tokens:
        i = word_index_map[t]
        x[i] += 1
    x = x / x.sum() # normalize it before setting label
    x[-1] = label
    return x

N = len(positive_tokenized) + len(negative_tokenized)

data = np.zeros((N, len(word_index_map) + 1))
i = 0
for tokens in positive_tokenized:
    xy = tokens_to_vector(tokens, 1)
    data[i,:] = xy
    i += 1

for tokens in negative_tokenized:
    xy = tokens_to_vector(tokens, 0)
    data[i,:] = xy
    i += 1

In [18]:
#Split training and test data
orig_reviews, data = shuffle(orig_reviews, data)

X = data[:,:-1]
Y = data[:,-1]

# last 100 rows will be test
Xtrain = X[:-100,]
Ytrain = Y[:-100,]
Xtest = X[-100:,]
Ytest = Y[-100:,]

In [19]:
#Train the model
model = LogisticRegression()
model.fit(Xtrain, Ytrain)
print("Train accuracy:", model.score(Xtrain, Ytrain))
print("Test accuracy:", model.score(Xtest, Ytest))


Train accuracy: 0.77
Test accuracy: 0.71


In [27]:
#Visualise the weights of each word
threshold = 0.5
for word, index in word_index_map.items():
    weight = model.coef_[0][index]
    if weight > threshold or weight < -threshold:
        print(word, weight)

this -0.5109992334184089
unit -0.742151946098836
bad -0.7565371225048386
cable 0.689348721547398
time -0.6773290237530438
've 0.795374488830696
month -0.7809503613677934
pro 0.5139099816975801
sound 0.9345898200785533
lot 0.7416545573492085
you 1.0658408211963322
n't -2.0815694985020086
easy 1.684462834092636
quality 1.4632943134891379
company -0.5255210786427638
item -1.0529991143373667
wa -1.6419524434130834
perfect 1.0294701308540695
fast 0.8534701271407522
ha 0.7535072290568456
price 2.608298908117422
value 0.5006946733396478
money -1.1566580199365881
memory 0.8651598909608144
picture 0.5630680221882312
buy -0.9922594789801147
... -0.6035680325538164
bit 0.6821438304789723
happy 0.555391260297876
pretty 0.7269573657307102
doe -1.1345663396020595
highly 0.9059579946148338
recommend 0.6147256994393698
customer -0.6994712823175423
support -0.8481823594691484
little 0.8915102108491149
returned -0.783499422448722
excellent 1.3018632489417676
love 1.2183812063511084
useless -0.5040189733