# Sentiment Analysis of Movie Ratings by Artificial Neural Network (ANN)

by Bingwen Zhang

## First, we load the data into memory.

In [9]:
g = open('reviews.txt', 'r')
reviews = list(map(lambda x:x[:-1], g.readlines()))
g.close()

g = open('labels.txt', 'r')
labels = list(map(lambda x:x[:-1], g.readlines()))
g.close()

print(labels[0]+":\n"+reviews[0])
len(reviews) == len(labels)

positive:
bromwell high is a cartoon comedy . it ran at the same time as some other programs about school life  such as  teachers  . my   years in the teaching profession lead me to believe that bromwell high  s satire is much closer to reality than is  teachers  . the scramble to survive financially  the insightful students who can see right through their pathetic teachers  pomp  the pettiness of the whole situation  all remind me of the schools i knew and their students . when i saw the episode in which a student repeatedly tried to burn down the school  i immediately recalled . . . . . . . . . at . . . . . . . . . . high . a classic line inspector i  m here to sack one of your teachers . student welcome to bromwell high . i expect that many adults of my age think that bromwell high is far fetched . what a pity that it isn  t   


True

## Second, we collect all the words, and frequencies by the way

In [10]:
from collections import Counter
import numpy as np
n = len(labels)
word = Counter()
for i in range(n):
    for s in reviews[i].lower().split(" "):
        word[s] += 1
print(word.most_common(80))
print(len(word))

[('', 1111930), ('the', 336713), ('.', 327192), ('and', 164107), ('a', 163009), ('of', 145864), ('to', 135720), ('is', 107328), ('br', 101872), ('it', 96352), ('in', 93968), ('i', 87623), ('this', 76000), ('that', 73245), ('s', 65361), ('was', 48208), ('as', 46933), ('for', 44343), ('with', 44125), ('movie', 44039), ('but', 42603), ('film', 40155), ('you', 34230), ('on', 34200), ('t', 34081), ('not', 30626), ('he', 30138), ('are', 29430), ('his', 29374), ('have', 27731), ('be', 26957), ('one', 26789), ('all', 23978), ('at', 23513), ('they', 22906), ('by', 22546), ('an', 21560), ('who', 21433), ('so', 20617), ('from', 20498), ('like', 20276), ('there', 18832), ('her', 18421), ('or', 18004), ('just', 17771), ('about', 17374), ('out', 17113), ('if', 16803), ('has', 16790), ('what', 16159), ('some', 15747), ('good', 15143), ('can', 14654), ('more', 14251), ('she', 14223), ('when', 14182), ('very', 14069), ('up', 13291), ('time', 12724), ('no', 12717), ('even', 12651), ('my', 12503), ('woul

## Third, we map the word to index in a vector, to prepare the data for ANN

In [12]:
word2idx = {}
dic = list(word.keys())
for i in range(len(dic)):
    word2idx[dic[i]] = i
dic_size = len(dic)

def review2vec(review):
    vec = np.zeros(dic_size)
    for s in review.lower().split(" "):
        vec[word2idx[s]] += 1
    return vec

def label2num(label):
    if label == 'POSITIVE':
        return 1;
    else:
        return 0;
    
print(review2vec(reviews[0]))
print(label2num(labels[0]))

[ 4.  5.  4. ...,  0.  0.  0.]
0


In [13]:
word = set()
for i in range(n):
    for s in reviews[i].lower().split(" "):
        word.add(s)
print(len(word))

lab = set()
for i in range(n):
    lab.add(labels[i])
print(lab)

word2idx = {}
for i, s in enumerate(word):
    word2idx[s] = i
print(word2idx)

74074
{'positive', 'negative'}


Luckily, Numpy takes care of this for us. If you multiply a row vector array with a column vector array, it will multiply the first element in the column by each element in the row vector and set that as the first row in a new 2D array. This continues for each element in the column vector, so you get a 2D array that has shape (len(column_vector), len(row_vector)).

In [51]:
a = review2vec(reviews[0])
W1 = np.random.normal(0.0, 1.0, (10, dic_size))
b = np.dot(W1, a)
print(b.shape)
c = W1 * a
print(c.shape)
print(1/(1+2)**2)
d = 1.0 / (1.0 + np.exp(-a))
print((d / (1.0 + d)**2).shape)
print(np.dot(a,d).shape)
print(np.dot(a.T, d).shape)
print((a.T * d.T).shape)
e = np.random.normal(0.0, 1.0, 3)
d = np.random.normal(0.0, 1.0, 4)
print((e * d[:,None]))

(10,)
(10, 74074)
0.1111111111111111
(74074,)
()
()
(74074,)
[[-1.03942126  1.31915     0.3005822 ]
 [-0.87067725  1.10499366  0.25178443]
 [ 1.1959818  -1.51784407 -0.34585674]
 [-0.81100448  1.02926176  0.23452812]]


## Fourth, we build a three layer Sentiment Network and plug in the functions we just wrote

In [25]:
class ANN(object):
    def __init__(self, reviews, labels, hidden_nodes, learning_rate):
        assert len(reviews) == len(labels)
        n = len(reviews)      
        np.random.seed(1) #guarantee same results for each run
        #count words
        word = set()
        for i in range(n):
            for s in reviews[i].split(" "):
                word.add(s)               
        #count labels       
        lab = set()
        for i in range(n):
            lab.add(labels[i])
             
        #map words to index
        self.word2idx = {}
        for i, s in enumerate(word):
            self.word2idx[s] = i
        
        #map labels to number:
        self.label2num = {}
        self.num2label = {}
        for i, s in enumerate(lab):
            self.label2num[s] = i
            self.num2label[i] = s
        
        self.input_nodes = len(word)
        self.hidden_nodes = hidden_nodes
        self.output_nodes = 1
        self.learning_rate = learning_rate
        self.W1 = np.random.normal(0.0, (hidden_nodes * self.input_nodes) ** -0.5, (hidden_nodes, self.input_nodes))
        self.W2 = np.random.normal(0.0, (self.output_nodes * hidden_nodes) ** -0.5, (self.output_nodes, hidden_nodes))
        self.input = np.zeros(self.input_nodes)
        self.label = 0
        
    def getInput(self, review):
        self.input *= 0
        for s in review.split(" "):
            self.input[self.word2idx[s]] = 1
            
    def getLabel(self, label):
        self.label = self.label2num[label]

    def sigmoid(self, x):
        return 1.0 / (1.0 + np.exp(-x))
    def sigmoid_deriv(self, x):
        tmp = self.sigmoid(x)
        return x / (1.0 + x)**2
    
    def calc_label(self, output):
        idx = 0
        n_label = len(self.num2label)
        minv = n_label
        for i in range(n_label):
            if (np.abs(output - i * 1.0 / n_label) < minv):
                minv = np.abs(output - i * 1.0 / n_label)
                idx = i
        return self.num2label[idx]
            
    def train(self, reviews, labels):
        assert len(reviews) == len(labels)
        n = len(reviews)
        correct = 0
        start = time.time()
        for i in range(n):
            # forward propogation
            self.getInput(reviews[i])
            self.getLabel(labels[i])
            
            # calculate output for hidden layer an output layer
            layer_1 = np.dot(self.W1, self.input) #no non-linearity in the hidden layer
            
            layer_2 = self.sigmoid(np.dot(self.W2, layer_1))# output layer
            
            ## back propogation
            delta_output = layer_2 * (1 - layer_2) * (self.label - layer_2)
            delta_hidden = np.dot(self.W2.T, delta_output)
        
            self.W2 += self.learning_rate * np.outer(delta_output, layer_2) #hidden_output * delta_output[:,None]
            self.W1 += self.learning_rate * np.outer(delta_hidden, self.input) #self.input * delta_hidden[:,None]
                
            if self.calc_label(layer_2) == labels[i]:
                correct += 1
            reviews_per_second = i / float(time.time() - start)    
            sys.stdout.write("\rProgress:" + str(100 * i/float(n))[:4] + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] + " #Correct:" + str(correct) + " #Trained:" + str(i+1) + " Training Accuracy:" + str(correct * 100 / float(i+1))[:4] + "%")
            if(i % 2500 == 0):
                print("") 
    
    def test(self, reviews, labels):
        assert len(reviews) == len(labels)
        n = len(reviews)
        correct = 0
        start = time.time()
        for i in range(n):
            if self.run(reviews[i]) == labels[i]:
                correct += 1
            reviews_per_second = i / float(time.time() - start + 0.000001)
            sys.stdout.write("\rProgress:" + str(100 * i/float(n))[:4] \
                             + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] \
                            + "% #Correct:" + str(correct) + " #Tested:" + str(i+1) + " Testing Accuracy:" + str(correct * 100 / float(i+1))[:4] + "%")
        
        
    def run(self, review):
        self.getInput(review)#update the second variable is dummy
        # run for the review
        layer_1 = np.dot(self.W1, self.input) #no non-linearity in the hidden layer
            
        layer_2 = self.sigmoid(np.dot(self.W2, layer_1))
        return self.calc_label(layer_2)

# Fifth, we train and test the ANN

Here we first build training data set and testing data set. Instead of K-fold, here we only choose 80% of total data samples randomly as the training data and the rest serves as testing data.

In [18]:
train = np.random.choice(n, size = (int)(0.8 * n), replace=False)
test = [i for i in range(n) if i not in train]
train_reviews = [reviews[i] for i in train]
train_labels = [labels[i] for i in train]
test_reviews = [reviews[i] for i in test]
test_labels = [labels[i] for i in test]

In [26]:
import sys

# set the paramters
hidden_nodes = 20
learning_rate = 0.01
ann = ANN(reviews, labels, hidden_nodes, learning_rate)

In [27]:
import time
ann.train(train_reviews, train_labels)

Progress:0.0% Speed(reviews/sec):0.0 #Correct:1 #Trained:1 Training Accuracy:100.%
Progress:12.5% Speed(reviews/sec):81.68 #Correct:1339 #Trained:2501 Training Accuracy:53.5%
Progress:25.0% Speed(reviews/sec):81.75 #Correct:2921 #Trained:5001 Training Accuracy:58.4%
Progress:37.5% Speed(reviews/sec):81.73 #Correct:4685 #Trained:7501 Training Accuracy:62.4%
Progress:50.0% Speed(reviews/sec):81.50 #Correct:6569 #Trained:10001 Training Accuracy:65.6%
Progress:62.5% Speed(reviews/sec):81.41 #Correct:8540 #Trained:12501 Training Accuracy:68.3%
Progress:75.0% Speed(reviews/sec):81.37 #Correct:10542 #Trained:15001 Training Accuracy:70.2%
Progress:87.5% Speed(reviews/sec):81.04 #Correct:12560 #Trained:17501 Training Accuracy:71.7%
Progress:99.9% Speed(reviews/sec):80.92 #Correct:14579 #Trained:20000 Training Accuracy:72.8%

In [29]:
ann.test(test_reviews, test_labels)

Progress:99.9% Speed(reviews/sec):944.9% #Correct:4084 #Tested:5000 Testing Accuracy:81.6%

Notice that the words with most occurences are 
[('', 1111930), ('the', 336713), ('.', 327192), ('and', 164107), ('a', 163009), ('of', 145864), ('to', 135720), ('is', 107328), ('br', 101872), ('it', 96352), ('in', 93968), ('i', 87623), ('this', 76000), ('that', 73245), ('s', 65361), ('was', 48208), ('as', 46933), ('for', 44343), ('with', 44125), ('movie', 44039), ('but', 42603), ('film', 40155), ('you', 34230), ('on', 34200), ('t', 34081), ('not', 30626), ('he', 30138), ('are', 29430), ('his', 29374), ('have', 27731), ('be', 26957), ('one', 26789), ('all', 23978), ('at', 23513), ('they', 22906), ('by', 22546), ('an', 21560), ('who', 21433), ('so', 20617), ('from', 20498)]
So instead of using weight, we should use an indicator function 1 for exisits 0 for not. Otherwise, these words will be dominant factors in the network and thus leading to meaningless results.

# Speeding Up the optimization

Notice that the we have a large input vector (size > 70000) with most zero entries, we so can speed up the optimization process by using this.

In [30]:
class ANN_speed_up(object):
    def __init__(self, reviews, labels, hidden_nodes, learning_rate):
        assert len(reviews) == len(labels)
        n = len(reviews)      
        np.random.seed(1) #guarantee same results for each run
        #count words
        word = set()
        for i in range(n):
            for s in reviews[i].split(" "):
                word.add(s)               
        #count labels       
        lab = set()
        for i in range(n):
            lab.add(labels[i])
             
        #map words to index
        self.word2idx = {}
        for i, s in enumerate(word):
            self.word2idx[s] = i
        
        #map labels to number:
        self.label2num = {}
        self.num2label = {}
        for i, s in enumerate(lab):
            self.label2num[s] = i
            self.num2label[i] = s
        
        self.input_nodes = len(word)
        self.hidden_nodes = hidden_nodes
        self.output_nodes = 1
        self.learning_rate = learning_rate
        self.W1 = np.random.normal(0.0, (hidden_nodes * self.input_nodes) ** -0.5, (hidden_nodes, self.input_nodes))
        self.W2 = np.random.normal(0.0, (self.output_nodes * hidden_nodes) ** -0.5, (self.output_nodes, hidden_nodes))
        self.input = np.zeros(self.input_nodes)
        self.label = 0
        self.index_set = set()
    def getInput(self, review):
        self.input *= 0
        self.index_set = set()
        for s in review.split(" "):
            self.input[self.word2idx[s]] = 1
            self.index_set.add(self.word2idx[s])
            
    def getLabel(self, label):
        self.label = self.label2num[label]
        
    def sigmoid(self, x):
        return 1.0 / (1.0 + np.exp(-x))
    def sigmoid_deriv(self, x):
        tmp = self.sigmoid(x)
        return x / (1.0 + x)**2
    
    def calc_label(self, output):
        idx = 0
        n_label = len(self.num2label)
        minv = n_label
        for i in range(n_label):
            if (np.abs(output - i * 1.0 / n_label) < minv):
                minv = np.abs(output - i * 1.0 / n_label)
                idx = i
        return self.num2label[idx]
            
    def train(self, reviews, labels):
        assert len(reviews) == len(labels)
        n = len(reviews)
        
        correct = 0
        start = time.time() -0.00001
        layer_1 = np.zeros(self.hidden_nodes)
        for i in range(n):
            # forward propogation
            self.getInput(reviews[i])
            self.getLabel(labels[i])
            
            # calculate output for hidden layer an output layer
            #layer_1 = np.dot(self.W1, self.input) #no non-linearity in the hidden layer
            layer_1 *= 0
            for j in self.index_set:
                layer_1 += self.W1[:,j]
            
            layer_2 = self.sigmoid(np.dot(self.W2, layer_1))# output layer
            
            ## back propogation
            delta_output = layer_2 * (1 - layer_2) * (self.label - layer_2)
            delta_hidden = np.dot(self.W2.T, delta_output)
        
            self.W2 += self.learning_rate * np.outer(delta_output, layer_2) #hidden_output * delta_output[:,None]
            #self.W1 += self.learning_rate * np.outer(delta_hidden, self.input) #self.input * delta_hidden[:,None]
            for j in self.index_set:
                self.W1[:,j] += delta_hidden
            
            if self.calc_label(layer_2) == labels[i]:
                correct += 1
            reviews_per_second = i / float(time.time() - start)    
            sys.stdout.write("\rProgress:" + str(100 * i/float(n))[:4] + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] + " #Correct:" + str(correct) + " #Trained:" + str(i+1) + " Training Accuracy:" + str(correct * 100 / float(i+1))[:4] + "%")
            if(i % 2500 == 0):
                print("") 
    
    def test(self, reviews, labels):
        assert len(reviews) == len(labels)
        n = len(reviews)
        correct = 0
        start = time.time()
        for i in range(n):
            if self.run(reviews[i]) == labels[i]:
                correct += 1
            reviews_per_second = i / float(time.time() - start)
            sys.stdout.write("\rProgress:" + str(100 * i/float(n))[:4] \
                             + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] \
                            + "% #Correct:" + str(correct) + " #Tested:" + str(i+1) + " Testing Accuracy:" + str(correct * 100 / float(i+1))[:4] + "%")
        
        
    def run(self, review):
        self.getInput(review)#update the second variable is dummy
        # run for the review
        layer_1 = np.dot(self.W1, self.input) #no non-linearity in the hidden layer
            
        layer_2 = self.sigmoid(np.dot(self.W2, layer_1))
        return self.calc_label(layer_2)

In [31]:
import sys

# set the paramters
hidden_nodes = 20
learning_rate = 0.01
ann = ANN_speed_up(reviews, labels, hidden_nodes, learning_rate)

In [32]:
import time
ann.train(train_reviews, train_labels)

Progress:0.0% Speed(reviews/sec):0.0 #Correct:1 #Trained:1 Training Accuracy:100.%
Progress:12.5% Speed(reviews/sec):1005. #Correct:1728 #Trained:2501 Training Accuracy:69.0%
Progress:25.0% Speed(reviews/sec):1012. #Correct:3631 #Trained:5001 Training Accuracy:72.6%
Progress:37.5% Speed(reviews/sec):1021. #Correct:5562 #Trained:7501 Training Accuracy:74.1%
Progress:50.0% Speed(reviews/sec):1017. #Correct:7565 #Trained:10001 Training Accuracy:75.6%
Progress:62.5% Speed(reviews/sec):1018. #Correct:9551 #Trained:12501 Training Accuracy:76.4%
Progress:75.0% Speed(reviews/sec):1021. #Correct:11553 #Trained:15001 Training Accuracy:77.0%
Progress:87.5% Speed(reviews/sec):1019. #Correct:13579 #Trained:17501 Training Accuracy:77.5%
Progress:99.9% Speed(reviews/sec):1021. #Correct:15596 #Trained:20000 Training Accuracy:77.9%

In [33]:
ann.test(test_reviews, test_labels)

Progress:99.9% Speed(reviews/sec):888.7% #Correct:3924 #Tested:5000 Testing Accuracy:78.4%

Here we train multiple iterations.

In [29]:
ann.train(train_reviews * 2, train_labels * 2) 

Progress:0.0% Speed(reviews/sec):0.0 #Correct:1 #Trained:1 Training Accuracy:100.%
Progress:6.25% Speed(reviews/sec):857.6 #Correct:2055 #Trained:2501 Training Accuracy:82.1%
Progress:12.5% Speed(reviews/sec):860.1 #Correct:4115 #Trained:5001 Training Accuracy:82.2%
Progress:18.7% Speed(reviews/sec):869.9 #Correct:6191 #Trained:7501 Training Accuracy:82.5%
Progress:25.0% Speed(reviews/sec):860.7 #Correct:8256 #Trained:10001 Training Accuracy:82.5%
Progress:31.2% Speed(reviews/sec):859.1 #Correct:10359 #Trained:12501 Training Accuracy:82.8%
Progress:37.5% Speed(reviews/sec):860.0 #Correct:12425 #Trained:15001 Training Accuracy:82.8%
Progress:43.7% Speed(reviews/sec):859.7 #Correct:14520 #Trained:17501 Training Accuracy:82.9%
Progress:50.0% Speed(reviews/sec):861.8 #Correct:16574 #Trained:20001 Training Accuracy:82.8%
Progress:56.2% Speed(reviews/sec):860.3 #Correct:18653 #Trained:22501 Training Accuracy:82.8%
Progress:62.5% Speed(reviews/sec):861.4 #Correct:20766 #Trained:25001 Training

In [30]:
ann.test(test_reviews, test_labels)

Progress:99.9% Speed(reviews/sec):672.7% #Correct:4198 #Tested:5000 Testing Accuracy:83.9%

# Let us see if some words are useless for classifying reviews

In [44]:
pos = Counter()
neg = Counter()
total = set()

for i in range(len(reviews)):
    if labels[i] == 'positive':
        for s in reviews[i].split(" "):
            pos[s] += 1
            total.add(s)
    else:
        for s in reviews[i].split(" "):
            neg[s] += 1
            total.add(s)

ratio = Counter()
for s in total:
    pos_neg = pos[s] / (neg[s] + 1.0)
    if pos_neg >= 1:
        ratio[s] = np.log(pos_neg)
    else:
        ratio[s] = np.log(pos_neg + 0.01) #0.01 is for dealing with zero ratio

In [55]:
from bokeh.models import ColumnDataSource, LabelSet
from bokeh.plotting import figure, show, output_file
from bokeh.io import output_notebook
output_notebook()

In [62]:
hist, edges = np.histogram(list(map(lambda x:x[1], ratio.most_common())), density=True, bins=100, normed=True)

p = figure(tools="pan,wheel_zoom,reset,save",
           toolbar_location="above",
           title="Word Positive/Negative Affinity Distribution")
p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:], line_color="#555555")
show(p)

This is log-ratio. log(positive count / negative count), if this quantity is close to zero (either > 0 or < 0), it means that positive count $\approx$ negative count. So these word can be ingored. Here we choose to discard the words between [-1, 1].

In [64]:
hist, edges = np.histogram(list(map(lambda x:x[1], word.most_common())), density=True, bins=100, normed=True)

p = figure(tools="pan,wheel_zoom,reset,save",
           toolbar_location="above",
           title="Total Word Frequencies")
p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:], line_color="#555555")
show(p)

In [67]:
dic = set()
for s in total:
    if ratio[s] > 1 or ratio[s] < -1:
        dic.add(s)
print(len(dic))

29916


In [95]:
class ANN_noise_reduction(object):
    def __init__(self, reviews, labels, hidden_nodes, learning_rate, min_count, ratio_thresh):
        assert len(reviews) == len(labels)
        n = len(reviews)      
        np.random.seed(1) #guarantee same results for each run
        
        pos = Counter()
        neg = Counter()
        total = set()
        for i in range(len(reviews)):
            if labels[i] == 'positive':
                for s in reviews[i].split(" "):
                    pos[s] += 1
                    total.add(s)
            else:
                for s in reviews[i].split(" "):
                    neg[s] += 1
                    total.add(s)

        #word is a set containing useful words
        word = set()
        for s in total:
            if pos[s] + neg[s] < min_count:
                continue
            pos_neg = pos[s] / (neg[s] + 1.0)
            if pos_neg >= 1:
                ratio = np.log(pos_neg)
            else:
                ratio = np.log(pos_neg + 0.01) #0.01 is for dealing with zero ratio
            if ratio > ratio_thresh or ratio < -ratio_thresh:
                word.add(s)
        self.word = word
        print(len(self.word))
        
        #count labels       
        lab = set()
        for i in range(n):
            lab.add(labels[i])
             
        #map words to index
        self.word2idx = {}
        for i, s in enumerate(word):
            self.word2idx[s] = i
        
        #map labels to number:
        self.label2num = {}
        self.num2label = {}
        for i, s in enumerate(lab):
            self.label2num[s] = i
            self.num2label[i] = s
        
        self.input_nodes = len(word)
        self.hidden_nodes = hidden_nodes
        self.output_nodes = 1
        self.learning_rate = learning_rate
        self.W1 = np.random.normal(0.0, (hidden_nodes * self.input_nodes) ** -0.5, (hidden_nodes, self.input_nodes))
        self.W2 = np.random.normal(0.0, (self.output_nodes * hidden_nodes) ** -0.5, (self.output_nodes, hidden_nodes))
        self.input = np.zeros(self.input_nodes)
        self.label = 0
        self.index_set = set()
    def getInput(self, review):
        self.input *= 0
        self.index_set = set()
        for s in review.split(" "):
            if s in self.word:
                self.input[self.word2idx[s]] = 1
                self.index_set.add(self.word2idx[s])
            
    def getLabel(self, label):
        self.label = self.label2num[label]
        
    def sigmoid(self, x):
        return 1.0 / (1.0 + np.exp(-x))
    def sigmoid_deriv(self, x):
        tmp = self.sigmoid(x)
        return x / (1.0 + x)**2
    
    def calc_label(self, output):
        idx = 0
        n_label = len(self.num2label)
        minv = n_label
        for i in range(n_label):
            if (np.abs(output - i * 1.0 / n_label) < minv):
                minv = np.abs(output - i * 1.0 / n_label)
                idx = i
        return self.num2label[idx]
            
    def train(self, reviews, labels):
        assert len(reviews) == len(labels)
        n = len(reviews)
        
        correct = 0
        start = time.time() -0.00001
        layer_1 = np.zeros(self.hidden_nodes)
        for i in range(n):
            # forward propogation
            self.getInput(reviews[i])
            self.getLabel(labels[i])
            
            # calculate output for hidden layer an output layer
            #layer_1 = np.dot(self.W1, self.input) #no non-linearity in the hidden layer
            layer_1 *= 0
            for j in self.index_set:
                layer_1 += self.W1[:,j]
            
            layer_2 = self.sigmoid(np.dot(self.W2, layer_1))# output layer
            
            ## back propogation
            delta_output = layer_2 * (1 - layer_2) * (self.label - layer_2)
            delta_hidden = np.dot(self.W2.T, delta_output)
        
            self.W2 += self.learning_rate * np.outer(delta_output, layer_2) #hidden_output * delta_output[:,None]
            #self.W1 += self.learning_rate * np.outer(delta_hidden, self.input) #self.input * delta_hidden[:,None]
            for j in self.index_set:
                self.W1[:,j] += delta_hidden
            
            if self.calc_label(layer_2) == labels[i]:
                correct += 1
            reviews_per_second = i / float(time.time() - start)    
            sys.stdout.write("\rProgress:" + str(100 * i/float(n))[:4] + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] + " #Correct:" + str(correct) + " #Trained:" + str(i+1) + " Training Accuracy:" + str(correct * 100 / float(i+1))[:4] + "%")
            if(i % 2500 == 0):
                print("") 
    
    def test(self, reviews, labels):
        assert len(reviews) == len(labels)
        n = len(reviews)
        correct = 0
        start = time.time()
        for i in range(n):
            if self.run(reviews[i]) == labels[i]:
                correct += 1
            reviews_per_second = i / float(time.time() - start)
            sys.stdout.write("\rProgress:" + str(100 * i/float(n))[:4] \
                             + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] \
                            + "% #Correct:" + str(correct) + " #Tested:" + str(i+1) + " Testing Accuracy:" + str(correct * 100 / float(i+1))[:4] + "%")
        
        
    def run(self, review):
        self.getInput(review)#update the second variable is dummy
        # run for the review
        layer_1 = np.dot(self.W1, self.input) #no non-linearity in the hidden layer
            
        layer_2 = self.sigmoid(np.dot(self.W2, layer_1))
        return self.calc_label(layer_2)

In [107]:
import sys

# set the paramters
hidden_nodes = 20
learning_rate = 0.01
ann = ANN_noise_reduction(reviews, labels, hidden_nodes, learning_rate, min_count = 0.0, ratio_thresh = 1.0)

29916


In [110]:
ann.train(train_reviews, train_labels) 

Progress:0.0% Speed(reviews/sec):0.0 #Correct:0 #Trained:1 Training Accuracy:0.0%
Progress:12.5% Speed(reviews/sec):3430. #Correct:2182 #Trained:2501 Training Accuracy:87.2%
Progress:25.0% Speed(reviews/sec):3587. #Correct:4411 #Trained:5001 Training Accuracy:88.2%
Progress:37.5% Speed(reviews/sec):3591. #Correct:6647 #Trained:7501 Training Accuracy:88.6%
Progress:50.0% Speed(reviews/sec):3597. #Correct:8884 #Trained:10001 Training Accuracy:88.8%
Progress:62.5% Speed(reviews/sec):3613. #Correct:11116 #Trained:12501 Training Accuracy:88.9%
Progress:75.0% Speed(reviews/sec):3600. #Correct:13364 #Trained:15001 Training Accuracy:89.0%
Progress:87.5% Speed(reviews/sec):3624. #Correct:15576 #Trained:17501 Training Accuracy:89.0%
Progress:99.9% Speed(reviews/sec):3641. #Correct:17808 #Trained:20000 Training Accuracy:89.0%

In [111]:
ann.test(test_reviews, test_labels) 

Progress:99.9% Speed(reviews/sec):2176.% #Correct:4154 #Tested:5000 Testing Accuracy:83.0%

# Here we try to rank correlation for a word

In [37]:
def get_close(word):    
    rel = Counter()
    if word in ann.word2idx.keys():       
        widx = ann.word2idx[word]
        for w in ann.word2idx.keys():
            rel[w] = np.dot(ann.W1[:,ann.word2idx[w]], ann.W1[:,widx])
    return rel.most_common()  

In [38]:
get_close('good')

[('best', 3.2620190490269154),
 ('great', 2.6048321281372928),
 ('fun', 2.5217074110235447),
 ('love', 2.5146345625128763),
 ('loved', 2.4675160876550009),
 ('also', 2.4311402746652111),
 ('perfect', 2.3583288247663075),
 ('still', 2.2923023468066255),
 ('excellent', 2.0767564040378561),
 ('may', 2.0311936242243194),
 ('very', 2.0279915690467041),
 ('young', 1.86201151478755),
 ('enjoy', 1.8354066169456642),
 ('heart', 1.7836132367740278),
 ('today', 1.7770942702852508),
 ('will', 1.7731225160047306),
 ('favorite', 1.7622213512770974),
 ('amazing', 1.62755806391811),
 ('different', 1.6124190490071018),
 ('enjoyed', 1.61019024829762),
 ('wonderful', 1.6023455569449268),
 ('job', 1.5700479784885306),
 ('well', 1.5589075717312346),
 ('brilliant', 1.5388336964829321),
 ('masterpiece', 1.5050806981764733),
 ('always', 1.4670057960543756),
 ('before', 1.4280806871593161),
 ('think', 1.4195431840169881),
 ('everyone', 1.4151688762694381),
 ('fantastic', 1.3962741200729611),
 ('strong', 1.3807

In [39]:
get_close('bad')

[('worst', 59.608489528356991),
 ('bad', 48.255815250173654),
 ('awful', 37.62258674404783),
 ('waste', 36.686397062478179),
 ('instead', 34.589384629833049),
 ('nothing', 33.034483798892488),
 ('stupid', 31.120332370882821),
 ('worse', 31.090806557755805),
 ('script', 30.66360942353074),
 ('poor', 30.364253578670816),
 ('boring', 29.754804659485949),
 ('minutes', 26.704794227843816),
 ('no', 26.227697400352383),
 ('unfortunately', 25.775368248591104),
 ('terrible', 24.536384902095506),
 ('money', 24.488526026168262),
 ('oh', 23.741291627270307),
 ('ridiculous', 23.602760865531042),
 ('bunch', 22.635237325321743),
 ('over', 22.593850974683882),
 ('looks', 22.500114533005085),
 ('br', 22.275953089345148),
 ('save', 21.118471677982821),
 ('avoid', 20.94761911136009),
 ('lame', 20.663593991095958),
 ('plot', 20.641501643874076),
 ('was', 20.371493554627662),
 ('horrible', 20.098607432325455),
 ('comes', 19.742236503807632),
 ('dull', 19.731115580610666),
 ('none', 19.696870042135789),
 ('

In [47]:
import matplotlib.colors as colors

words_to_visualize = list()
for word, ra in ratio.most_common(500):
    if(word in ann.word2idx.keys()):
        words_to_visualize.append(word)
    
for word, ra in list(reversed(ratio.most_common()))[0:500]:
    if(word in ann.word2idx.keys()):
        words_to_visualize.append(word)

In [50]:
pos = 0
neg = 0

colors_list = list()
vectors_list = list()
for word in words_to_visualize:
    if word in ratio.keys():
        vectors_list.append(ann.W1[:,ann.word2idx[word]])
        if(ratio[word] > 0):
            pos+=1
            colors_list.append("#00ff00")
        else:
            neg+=1
            colors_list.append("#000000")

In [53]:
from sklearn.manifold import TSNE
tsne = TSNE(n_components=2, random_state=0)
words_top_ted_tsne = tsne.fit_transform(vectors_list)

In [56]:
p = figure(tools="pan,wheel_zoom,reset,save",
           toolbar_location="above",
           title="vector T-SNE for most polarized words")

source = ColumnDataSource(data=dict(x1=words_top_ted_tsne[:,0],
                                    x2=words_top_ted_tsne[:,1],
                                    names=words_to_visualize))

p.scatter(x="x1", y="x2", size=8, source=source,color=colors_list)

word_labels = LabelSet(x="x1", y="x2", text="names", y_offset=6,
                  text_font_size="8pt", text_color="#555555",
                  source=source, text_align='center')
p.add_layout(word_labels)

show(p)

# green indicates positive words, black indicates negative words

Supplying a user-defined data source AND iterable values to glyph methods is deprecated.

See https://github.com/bokeh/bokeh/issues/2056 for more information.

  warn(message)
Supplying a user-defined data source AND iterable values to glyph methods is deprecated.

See https://github.com/bokeh/bokeh/issues/2056 for more information.

  warn(message)
