## Binary Bag_of_Words

#### Turning raw text into feature vectors

In [1]:
import nltk
import numpy as np
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup
from sklearn.linear_model import LogisticRegression

In [2]:
# The WordNet lemmatizer turns words into their base form, which keeps the
# vocabulary from growing too large.
wordnet_lemmatizer = WordNetLemmatizer()

# Read each review file inside a `with` block so the file handle is closed as
# soon as its contents are parsed (the bare open(...).read() never closed it).
# Only the <review_text> elements are kept from each parsed document.
with open('positive.review') as review_file:
    positive_reviews = BeautifulSoup(review_file.read(), "lxml").find_all("review_text")
with open('negative.review') as review_file:
    negative_reviews = BeautifulSoup(review_file.read(), "lxml").find_all("review_text")

# Balance the classes: shuffle the positive reviews in place, then keep at most
# as many of them as there are negative reviews.
np.random.shuffle(positive_reviews)
positive_reviews = positive_reviews[:len(negative_reviews)]



In [22]:
# stopwords.txt is read as one comma-separated string; every entry is trimmed,
# lower-cased and stripped of double quotes before being stored.
with open('stopwords.txt') as stopword_file:
    raw_entries = stopword_file.read().split(",")

stopwords = []
for entry in raw_entries:
    stopwords.append(entry.strip().lower().replace('"', ""))

## Sanity check: confirm that the stopword list was parsed correctly.

In [5]:
"n't" in stopwords

True

## Function that returns the word tokens of a review

In [6]:
def my_tokenizer(s):
    """Turn a review string into a list of normalised word tokens.

    The text is lower-cased and split with NLTK's word tokenizer. Each token
    is then processed in this order: tokens of two characters or fewer are
    dropped, the remaining ones are lemmatized with the module-level
    ``wordnet_lemmatizer``, and lemmas found in the module-level
    ``stopwords`` are dropped.

    Parameters
    ----------
    s : str
        Raw review text.

    Returns
    -------
    list
        The surviving lemmas, in their original order.
    """
    kept = []
    for raw_token in nltk.tokenize.word_tokenize(s.lower()):
        # The length filter is applied to the raw token, before lemmatizing.
        if len(raw_token) <= 2:
            continue
        lemma = wordnet_lemmatizer.lemmatize(raw_token)
        # The stopword filter is applied to the lemma, not the raw token.
        if lemma not in stopwords:
            kept.append(lemma)
    return kept

In [7]:
# word_index_map gives every distinct token its own index; its size is the
# vocabulary size and therefore fixes the width of the final word vectors.
word_index_map = {}
current_index = 0  # next unused index; advances each time a new token is seen
positive_tokenized = []
negative_tokenized = []

# Tokenize the positive reviews first, then the negative ones, storing each
# token list and registering previously unseen tokens in word_index_map.
for reviews, tokenized in ((positive_reviews, positive_tokenized),
                           (negative_reviews, negative_tokenized)):
    for review in reviews:
        tokens = my_tokenizer(review.text)
        tokenized.append(tokens)
        for token in tokens:
            if token in word_index_map:
                continue
            word_index_map[token] = current_index
            current_index += 1

  


## Converting each review's tokens into a numeric feature vector

In [8]:
# positive_tokenized
def tokens_to_vector(tokens,label):
    """Build one data row: normalised word counts followed by the label.

    The label is stored in the same array as the features so that rows can
    be shuffled without losing track of their labels.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of one review; each must be a key of ``word_index_map``.
    label : number
        Class label written into the last slot of the row.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``len(word_index_map) + 1``. Every slot except
        the last holds that word's count divided by the total token count;
        the last slot holds ``label``. With no tokens, the feature slots are
        all zero.

    Raises
    ------
    KeyError
        If a token is not present in ``word_index_map``.
    """
    x = np.zeros(len(word_index_map)+1) # vocabulary size plus one slot for the label
    for t in tokens:
        x[word_index_map[t]] += 1 # count the word at its vocabulary index
    total = x.sum()
    # An empty token list leaves total at 0; dividing then would fill the row
    # with NaN, so the counts are only normalised when there is something to
    # normalise.
    if total > 0:
        x = x/total
    x[-1] = label
    return x

In [9]:
N_TEST = 100  # number of shuffled rows held out for evaluation

N = len(positive_tokenized) + len(negative_tokenized)
if N <= N_TEST:
    # Without this check the training slice below would be empty.
    raise ValueError("Need more than %d reviews to leave a training set, got %d" % (N_TEST, N))

# One row per review: the feature columns followed by the label in the last column.
data = np.zeros((N, len(word_index_map)+1))
i = 0
for label, tokenized in ((1, positive_tokenized), (0, negative_tokenized)):
    for tokens in tokenized:
        data[i,:] = tokens_to_vector(tokens, label)
        i += 1

# Shuffle the rows in place so positive and negative reviews are mixed before
# the train/test split.
np.random.shuffle(data)
x = data[:, :-1] # all rows, every column except the last (the label)
y = data[:, -1]  # all rows, last column only (the label)
xtrain, ytrain = x[:-N_TEST], y[:-N_TEST] # everything except the last N_TEST rows
xtest, ytest = x[-N_TEST:], y[-N_TEST:]   # the last N_TEST rows

model = LogisticRegression()
model.fit(xtrain, ytrain)
# The parenthesised single-argument form prints the same text on Python 2 and 3.
print("Classification rate: %s" % model.score(xtest, ytest))
    

Classification rate: 0.77


In [10]:
# list(data[1999])

In [11]:
# print len(data)
# print len(word_index_map)
# word_index_map

In [12]:
# Show every vocabulary word whose learned weight is further than `threshold`
# from zero, in either direction.
threshold = 0.5
# .items() exists on both Python 2 and 3, unlike .iteritems().
for word, index in word_index_map.items():
    weight = model.coef_[0][index] # the coefficient stored at this word's index
    if abs(weight) > threshold:
        # The single-argument print form produces the same "word weight" line on Python 2 and 3.
        print("%s %s" % (word, weight))
    

unit -0.5538737696797057
best 0.9792344875129372
worked -0.7581884795813263
easy 1.4439601733015197
happy 0.5347551467198292
time -0.6313886125055916
love 0.848270359705569
case 0.525655789080087
returned -0.6619931124055823
cable 0.618907865723112
small 0.6048746763878722
customer -0.6518549963426044
try -0.5375538439006626
great 3.5093123482606945
perfect 0.8570875694686895
waste -0.8817182223218818
highly 0.8340928016688753
return -0.972467073187257
not -4.1161120327743825
support -0.835128690132794
price 2.2018010341416554
using 0.5073433764523109
lot 0.5155157910497682
poor -0.7152157018103458
month -0.6174339851965599
tried -0.7370379826542071
pretty 0.5274772661671989
used 0.9593442876617211
quality 1.2537249244650759
need 0.632337777756183
problem 0.5272195753756331
speaker 0.7587159827493186
well 1.0011250098231845
even -0.7552539965289927
recommend 0.6699039442494071
bad -0.6490519982322411
item -0.7183917249026279
thing -0.8460307127564552
first -0.6296493227880874
little 0.

## How to improve the sentiment analysis?
#### By using a recursive neural network. Note that accuracy is relative, not absolute.