In [104]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')
import numpy as np

from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
#XML parser
from bs4 import BeautifulSoup
from future.utils import iteritems

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ankushmalhotra\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ankushmalhotra\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Lemmatization with the WordNet lemmatizer

In [96]:
# Seed NumPy's global RNG so the subsample below (and any later np.random
# call in the notebook) gives the same result on every Restart & Run All.
np.random.seed(42)

wordnet_lemmatizer = WordNetLemmatizer()

# One stopword per line; the context manager closes the file once the set is built.
with open('stopwords.txt') as stopword_file:
    stopwords = set(w.rstrip() for w in stopword_file)

# Parse each review file and keep only its <review_text> elements.
with open('electronics/positive.review') as review_file:
    positive_reviews = BeautifulSoup(review_file.read(), "lxml")
positive_reviews = positive_reviews.find_all('review_text')

with open('electronics/negative.review') as review_file:
    negative_reviews = BeautifulSoup(review_file.read(), "lxml")
negative_reviews = negative_reviews.find_all('review_text')

# Shuffle the positive reviews, then keep at most as many of them as there
# are negative reviews so neither class dominates the data set.
np.random.shuffle(positive_reviews)
positive_reviews = positive_reviews[:len(negative_reviews)]



In [97]:
#Assign an index to each word
#Build a dictionary mapping each word to its index

def my_tokenizer(s):
    """Turn a raw review string into a list of normalized tokens.

    The text is lowercased and split with NLTK's word tokenizer. Tokens of
    two characters or fewer are dropped, the rest are lemmatized with the
    module-level ``wordnet_lemmatizer``, and any lemma present in the
    module-level ``stopwords`` set is discarded.

    Returns the surviving lemmas as a list, in their original order.
    """
    lemmas = (
        wordnet_lemmatizer.lemmatize(word)
        for word in nltk.tokenize.word_tokenize(s.lower())
        if len(word) > 2  # length filter runs before lemmatization
    )
    # Stopwords are matched against the lemma, not the surface form.
    return [lemma for lemma in lemmas if lemma not in stopwords]

# Vocabulary: every distinct token gets the next unused integer index.
word_index_map = {}

# Token lists are kept per review so later cells do not have to re-tokenize.
positive_tokenized = []
negative_tokenized = []

# Positive reviews are processed first, then negative ones.
for reviews, tokenized in (
    (positive_reviews, positive_tokenized),
    (negative_reviews, negative_tokenized),
):
    for review in reviews:
        tokens = my_tokenizer(review.text)
        tokenized.append(tokens)
        for token in tokens:
            # setdefault inserts only unseen tokens, so the indices stay
            # contiguous and len(word_index_map) is always the next free one.
            word_index_map.setdefault(token, len(word_index_map))

# Next free index, which equals the vocabulary size.
current_index = len(word_index_map)

    

In [98]:
#Take each Token and create a data array using word proportion
# Create Input Matrices
def tokens_to_vector(tokens, label, index_map=None):
    """Convert one tokenized review into a labelled feature row.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of a single review; every token must be a key of the map.
    label : number
        Class label stored in the last element of the returned vector.
    index_map : dict, optional
        Mapping from token to column index. Defaults to the module-level
        ``word_index_map``.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``len(index_map) + 1``. The first
        ``len(index_map)`` entries hold each word's share of the review's
        tokens; the last entry holds ``label``.

    Raises
    ------
    KeyError
        If a token is missing from the map.
    """
    if index_map is None:
        index_map = word_index_map

    x = np.zeros(len(index_map) + 1)
    for t in tokens:
        x[index_map[t]] += 1

    # A review whose tokens were all filtered out has a zero total; dividing
    # would fill the row with NaN, so such rows are left as all zeros.
    total = x.sum()
    if total > 0:
        x = x / total

    # The label slot is still 0 during normalization, so it did not affect total.
    x[-1] = label
    return x
            

In [110]:
# Pair every tokenized review with its class label: 1 = positive, 0 = negative.
labelled_reviews = (
    [(tokens, 1) for tokens in positive_tokenized]
    + [(tokens, 0) for tokens in negative_tokenized]
)

# N is the total number of examples.
N = len(labelled_reviews)

# One row per example: a column per vocabulary word plus a final label column.
data = np.zeros((N, len(word_index_map) + 1))
for row, (tokens, label) in enumerate(labelled_reviews):
    data[row, :] = tokens_to_vector(tokens, label)

In [111]:
# Shuffle the rows in place so the positive and negative examples are mixed
# before the data is divided into train and test sets.
np.random.shuffle(data)
# X is every column except the last; Y is the last column, i.e. the label.
X, Y = data[:, :-1], data[:, -1]




In [112]:
# Hold out the last 100 rows for testing and train on everything before them.
train_rows = slice(None, -100)
test_rows = slice(-100, None)
Xtrain, Ytrain = X[train_rows], Y[train_rows]
Xtest, Ytest = X[test_rows], Y[test_rows]

# fit() returns the estimator itself, so construction and training can be chained.
model = LogisticRegression().fit(Xtrain, Ytrain)
print("Classification rate:",model.score(Xtest, Ytest))

Classification rate: 0.62


In [109]:
#
# Print every vocabulary word whose coefficient magnitude exceeds the threshold.
threshold = 0.5
weights = model.coef_[0]
for word, index in iteritems(word_index_map):
    weight = weights[index]
    if abs(weight) > threshold:
        print(word, weight)

sound 0.9547016023622208
you 1.0835896432702175
easy 1.7791946385959172
doe -1.1170864397838267
n't -2.0286295653171984
price 2.754531257284509
love 1.099817187450529
cable 0.7624926225342726
pretty 0.6841467744831123
ha 0.699579628412729
wa -1.6765371631794004
unit -0.571095426571133
little 0.8853924050901284
home 0.5026170924635789
using 0.6967476826835325
buy -0.852841237164346
quality 1.4398488360859814
speaker 0.7235013943978353
recommend 0.6413041640888376
perfect 1.0052642030063477
laptop 0.5221957882249669
excellent 1.350721409826073
customer -0.67991017327908
support -0.8412880123714355
highly 1.015842601194178
money -0.9602682449010491
comfortable 0.6445728095137714
time -0.7185195099712163
happy 0.6186195727927006
bit 0.5493155031125083
then -1.1136532294399455
look 0.524439303947066
fast 0.9371202440621758
've 0.7208653035316015
expected 0.5588848573484712
month -0.7887835997071287
pro 0.5147404697087933
memory 0.9999661245176761
lot 0.7487920445183717
try -0.61362177713894