In [1]:
# Create the dataset
def pretty_print_review_and_label(i):
    print(labels[i] + "\t:\t" + reviews[i][:80] + "...")

g = open('reviews.txt','r') # What we know!
reviews = list(map(lambda x:x[:-1],g.readlines()))
g.close()

g = open('labels.txt','r') # What we WANT to know!
labels = list(map(lambda x:x[:-1].upper(),g.readlines()))
g.close()

In [2]:
len(reviews)

25000

In [3]:
reviews[0]

'bromwell high is a cartoon comedy . it ran at the same time as some other programs about school life  such as  teachers  . my   years in the teaching profession lead me to believe that bromwell high  s satire is much closer to reality than is  teachers  . the scramble to survive financially  the insightful students who can see right through their pathetic teachers  pomp  the pettiness of the whole situation  all remind me of the schools i knew and their students . when i saw the episode in which a student repeatedly tried to burn down the school  i immediately recalled . . . . . . . . . at . . . . . . . . . . high . a classic line inspector i  m here to sack one of your teachers . student welcome to bromwell high . i expect that many adults of my age think that bromwell high is far fetched . what a pity that it isn  t   '

In [4]:
labels[0]

'POSITIVE'

In [5]:
from collections import Counter
import numpy as np

In [6]:
# Create three Counter objects to store positive, negative and total counts
positive_counts = Counter()
negative_counts = Counter()
total_counts = Counter()

In [7]:
# Loop over all the words in all the reviews and increment the counts

for review, label in zip(reviews, labels):
        for word in review.split(' '):
            total_counts[word]+=1
            if(label == 'POSITIVE'):
                positive_counts[word]+=1
            else:
                negative_counts[word]+=1

In [8]:
positive_counts.most_common(20)

[('', 550468),
 ('the', 173324),
 ('.', 159654),
 ('and', 89722),
 ('a', 83688),
 ('of', 76855),
 ('to', 66746),
 ('is', 57245),
 ('in', 50215),
 ('br', 49235),
 ('it', 48025),
 ('i', 40743),
 ('that', 35630),
 ('this', 35080),
 ('s', 33815),
 ('as', 26308),
 ('with', 23247),
 ('for', 22416),
 ('was', 21917),
 ('film', 20937)]

In [9]:
negative_counts.most_common(20)

[('', 561462),
 ('.', 167538),
 ('the', 163389),
 ('a', 79321),
 ('and', 74385),
 ('of', 69009),
 ('to', 68974),
 ('br', 52637),
 ('is', 50083),
 ('it', 48327),
 ('i', 46880),
 ('in', 43753),
 ('this', 40920),
 ('that', 37615),
 ('s', 31546),
 ('was', 26291),
 ('movie', 24965),
 ('for', 21927),
 ('but', 21781),
 ('with', 20878)]

In [10]:
# Create Counter object to store positive/negative ratios
pos_neg_ratios = Counter()

# Examine the counts of the most common words in positive reviews
# positive_counts.most_common()

for word,count in list(total_counts.most_common()):
    if(count>500):
        pos_neg_ratio = positive_counts[word]/float(negative_counts[word]+1)
        pos_neg_ratios[word]=pos_neg_ratio         

In [11]:
for word,ratio in pos_neg_ratios.most_common():
    if(ratio>1):
        pos_neg_ratios[word]=np.log(ratio)
    else:
        pos_neg_ratios[word]=-np.log((1/(ratio+0.01)))

In [12]:
# words most frequently seen in a review with a "POSITIVE" label
pos_neg_ratios.most_common(20)

[('superb', 1.7091514458966952),
 ('wonderful', 1.5645425925262093),
 ('fantastic', 1.5048433868558566),
 ('excellent', 1.4647538505723599),
 ('amazing', 1.3919815802404802),
 ('powerful', 1.2999662776313934),
 ('favorite', 1.2668956297860055),
 ('perfect', 1.246742480713785),
 ('brilliant', 1.2287554137664785),
 ('perfectly', 1.1971931173405572),
 ('loved', 1.1563661500586044),
 ('highly', 1.1420208631618658),
 ('tony', 1.1397491942285991),
 ('today', 1.1050431789984001),
 ('unique', 1.0881409888008142),
 ('beauty', 1.050410186850232),
 ('greatest', 1.0248947127715422),
 ('portrayal', 1.0189810189761024),
 ('incredible', 1.0061677561461084),
 ('sweet', 0.98966110487955483)]

In [13]:
# words most frequently seen in a review with a "NEGATIVE" label
list(reversed(pos_neg_ratios.most_common(20)))

[('sweet', 0.98966110487955483),
 ('incredible', 1.0061677561461084),
 ('portrayal', 1.0189810189761024),
 ('greatest', 1.0248947127715422),
 ('beauty', 1.050410186850232),
 ('unique', 1.0881409888008142),
 ('today', 1.1050431789984001),
 ('tony', 1.1397491942285991),
 ('highly', 1.1420208631618658),
 ('loved', 1.1563661500586044),
 ('perfectly', 1.1971931173405572),
 ('brilliant', 1.2287554137664785),
 ('perfect', 1.246742480713785),
 ('favorite', 1.2668956297860055),
 ('powerful', 1.2999662776313934),
 ('amazing', 1.3919815802404802),
 ('excellent', 1.4647538505723599),
 ('fantastic', 1.5048433868558566),
 ('wonderful', 1.5645425925262093),
 ('superb', 1.7091514458966952)]

In [14]:
# Create the input and label data

# Create set named "vocab" containing all of the words from all of the reviews
vocab=set(total_counts.keys())
vocab_size = len(vocab)
print(vocab_size)

74074


In [15]:
from bokeh.models import ColumnDataSource, LabelSet
from bokeh.plotting import figure, show, output_file
from bokeh.io import output_notebook
output_notebook()

In [16]:
hist, edges = np.histogram(list(map(lambda x:x[1],pos_neg_ratios.most_common())), density=True, bins=100, normed=True)

p = figure(tools="pan,wheel_zoom,reset,save",
           toolbar_location="above",
           title="Word Positive/Negative Affinity Distribution")
p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:], line_color="#555555")
show(p)

In [17]:
frequency_frequency = Counter()

for word, cnt in total_counts.most_common():
    frequency_frequency[cnt] += 1

In [18]:
hist, edges = np.histogram(list(map(lambda x:x[1],frequency_frequency.most_common())), density=True, bins=100, normed=True)

p = figure(tools="pan,wheel_zoom,reset,save",
           toolbar_location="above",
           title="The frequency distribution of the words in our corpus")
p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:], line_color="#555555")
show(p)

In [19]:
import time
import sys
import numpy as np

# Encapsulate our neural network in a class
class SentimentNetwork:
    def __init__(self, reviews, labels, hidden_nodes = 10, learning_rate = 0.1,min_count=100,polarity_cutoff=2.5):
        """Create a SentimenNetwork with the given settings
        Args:
            reviews(list) - List of reviews used for training
            labels(list) - List of POSITIVE/NEGATIVE labels associated with the given reviews
            hidden_nodes(int) - Number of nodes to create in the hidden layer
            learning_rate(float) - Learning rate to use while training
        
        """
        # Assign a seed to our random number generator to ensure we get
        # reproducable results during development 
        np.random.seed(1)

        # process the reviews and their associated labels so that everything
        # is ready for training
        self.pre_process_data(reviews, labels,min_count,polarity_cutoff)
        
        # Build the network to have the number of hidden nodes and the learning rate that
        # were passed into this initializer. Make the same number of input nodes as
        # there are vocabulary words and create a single output node.
        self.init_network(len(self.review_vocab),hidden_nodes, 1, learning_rate)

    def pre_process_data(self, reviews, labels,min_count,polarity_cutoff):
        
        review_vocab = set()
        
        positive_counts = Counter()
        negative_counts = Counter()
        total_counts = Counter()

        for review, label in zip(reviews, labels):
            for word in review.split(' '):
                total_counts[word]+=1
                if(label == 'POSITIVE'):
                    positive_counts[word]+=1
                else:
                    negative_counts[word]+=1
                
        for word,count in list(total_counts.most_common()):
            if(count>50):
                pos_neg_ratio = positive_counts[word]/float(negative_counts[word]+1)
                pos_neg_ratios[word]=pos_neg_ratio
        
        for word,ratio in pos_neg_ratios.most_common():
            if(ratio>1):
                pos_neg_ratios[word]=np.log(ratio)
            else:
                pos_neg_ratios[word]=-np.log((1/(ratio+0.01)))
        # Convert the vocabulary set to a list so we can access words via indices
        for review in reviews:
            for word in review.split(' '):
                if(total_counts[word]>min_count):
                    if(word in pos_neg_ratios.keys()):
                        if(np.abs(pos_neg_ratios[word])>polarity_cutoff):
                            review_vocab.add(word)
                    else:
                        review_vocab.add(word)
        
        self.review_vocab = list(review_vocab)
        
        label_vocab = set()
        for label in labels:
            label_vocab.add(label)
        # TODO: populate label_vocab with all of the words in the given labels.
        #       There is no need to split the labels because each one is a single word.
        
        # Convert the label vocabulary set to a list so we can access labels via indices
        self.label_vocab = list(label_vocab)
        
        # Store the sizes of the review and label vocabularies.
        self.review_vocab_size = len(self.review_vocab)
        self.label_vocab_size = len(self.label_vocab)
        
        # Create a dictionary of words in the vocabulary mapped to index positions
        self.word2index = {}
        for i,word in enumerate(review_vocab):
            self.word2index[word]=i
        # TODO: populate self.word2index with indices for all the words in self.review_vocab
        #       like you saw earlier in the notebook
        
        # Create a dictionary of labels mapped to index positions
        self.label2index = {}
        for i,word in enumerate(label_vocab):
            self.label2index[word]=i
        # TODO: do the same thing you did for self.word2index and self.review_vocab, 
        #       but for self.label2index and self.label_vocab instead
         
        
    def init_network(self, input_nodes, hidden_nodes, output_nodes, learning_rate):
        # Store the number of nodes in input, hidden, and output layers.
        self.input_nodes = input_nodes
        self.hidden_nodes = hidden_nodes
        self.output_nodes = output_nodes

        # Store the learning rate
        self.learning_rate = learning_rate

        # Initialize weights
        
        # TODO: initialize self.weights_0_1 as a matrix of zeros. These are the weights between
        #       the input layer and the hidden layer.
        self.weights_0_1 = np.zeros((self.input_nodes,self.hidden_nodes))
        
        # TODO: initialize self.weights_1_2 as a matrix of random values. 
        #       These are the weights between the hidden layer and the output layer.
        self.weights_1_2 = np.random.normal(0.0,self.output_nodes**-0.5,
                                            (self.hidden_nodes,self.output_nodes))
        
        # TODO: Create the input layer, a two-dimensional matrix with shape 
        #       1 x input_nodes, with all values initialized to zero
        self.layer_1 = np.zeros((1,self.hidden_nodes))
    
        
    def update_input_layer(self,review):
        # TODO: You can copy most of the code you wrote for update_input_layer 
        #       earlier in this notebook. 
        #
        #       However, MAKE SURE YOU CHANGE ALL VARIABLES TO REFERENCE
        #       THE VERSIONS STORED IN THIS OBJECT, NOT THE GLOBAL OBJECTS.
        #       For example, replace "layer_0 *= 0" with "self.layer_0 *= 0"
        # clear out previous state by resetting the layer to be all 0s
        self.layer_0 *= 0
        # TODO: count how many times each word is used in the given review and store the results in layer_0 
        for word in review.split(' '):
            if(word in self.word2index.keys()):
                self.layer_0[0][self.word2index[word]]=1
                
    def get_target_for_label(self,label):
        # TODO: Copy the code you wrote for get_target_for_label 
        #       earlier in this notebook. 
        if(label == 'POSITIVE'):
            return 1
        else:
            return 0
        
    def sigmoid(self,x):
        # TODO: Return the result of calculating the sigmoid activation function
        #       shown in the lectures
        return (1/ (1+np.exp(-x)) )
    
    def sigmoid_output_2_derivative(self,output):
        # TODO: Return the derivative of the sigmoid activation function, 
        #       where "output" is the original output from the sigmoid fucntion 
        return output*(1-output)

    def train(self, training_reviews_raw, training_labels):
        
        training_reviews=list()
        for review in training_reviews_raw:
            a=set()
            for word in review.split(' '):
                if(word in self.word2index.keys()):
                    a.add(self.word2index[word])
            training_reviews.append(list(a))
        # make sure out we have a matching number of reviews and labels
        assert(len(training_reviews) == len(training_labels))
        
        # Keep track of correct predictions to display accuracy during training 
        correct_so_far = 0
        
        # Remember when we started for printing time statistics
        start = time.time()

        # loop through all the given reviews and run a forward and backward pass,
        # updating weights for every item
        for i in range(len(training_reviews)):
            
            # TODO: Get the next review and its correct label
            review=training_reviews[i]
            label=training_labels[i]
            
            # TODO: Implement the forward pass through the network. 
            #       That means use the given review to update the input layer, 
            #       then calculate values for the hidden layer,
            #       and finally calculate the output layer.
            # 
            #       Do not use an activation function for the hidden layer,
            #       but use the sigmoid activation function for the output layer.
            self.layer_1*=0
            for index in review:
                self.layer_1+=self.weights_0_1[index]
            label=self.get_target_for_label(label)
            output=self.sigmoid(self.layer_1.dot(self.weights_1_2))
            
            # TODO: Implement the back propagation pass here. 
            #       That means calculate the error for the forward pass's prediction
            #       and update the weights in the network according to their
            #       contributions toward the error, as calculated via the
            #       gradient descent and back propagation algorithms you 
            #       learned in class.
            output_error=output-label
            output_error_term = output_error*self.sigmoid_output_2_derivative(output)
            hidden_error= output_error_term.dot(self.weights_1_2.T)
            hidden_error_term= hidden_error
            
            for index in review:
                self.weights_0_1[index] -= hidden_error[0]*self.learning_rate
            
            self.weights_1_2 -= self.learning_rate*self.layer_1.T.dot(output_error_term)
            # TODO: Keep track of correct predictions. To determine if the prediction was
            #       correct, check that the absolute value of the output error 
            #       is less than 0.5. If so, add one to the correct_so_far count.
            if( np.abs(output_error)<0.5):
                correct_so_far+=1
            # For debug purposes, print out our prediction accuracy and speed 
            # throughout the training process. 
            
            elapsed_time = float(time.time() - start)
            reviews_per_second = i / elapsed_time if elapsed_time > 0 else 0
            
            if(i % 2500 == 0 or i == len(training_reviews)-1):
                sys.stdout.write("\rProgress:" + str(100 * (i+1)/float(len(training_reviews)))[:4] \
                             + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] \
                             + " #Correct:" + str(correct_so_far) + " #Trained:" + str(i+1) \
                             + " Training Accuracy:" + str(correct_so_far * 100 / float(i+1))[:4] + "%")
    
    def test(self, testing_reviews, testing_labels):
        """
        Attempts to predict the labels for the given testing_reviews,
        and uses the test_labels to calculate the accuracy of those predictions.
        """
        
        # keep track of how many correct predictions we make
        correct = 0

        # we'll time how many predictions per second we make
        start = time.time()

        # Loop through each of the given reviews and call run to predict
        # its label. 
        for i in range(len(testing_reviews)):
            pred = self.run(testing_reviews[i])
            if(pred == testing_labels[i]):
                correct += 1
            
            # For debug purposes, print out our prediction accuracy and speed 
            # throughout the prediction process. 

            elapsed_time = float(time.time() - start)
            reviews_per_second = i / elapsed_time if elapsed_time > 0 else 0
            
            if(i % 2500 == 0 or i == len(testing_reviews)-1):
                sys.stdout.write("\rProgress:" + str(100 * (i+1)/float(len(testing_reviews)))[:4] \
                                 + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] \
                                 + " #Correct:" + str(correct) + " #Tested:" + str(i+1) \
                                 + " Testing Accuracy:" + str(correct * 100 / float(i+1))[:4] + "%")
    
    def run(self, review):
        """
        Returns a POSITIVE or NEGATIVE prediction for the given review.
        """
        # TODO: Run a forward pass through the network, like you did in the
        #       "train" function. That means use the given review to 
        #       update the input layer, then calculate values for the hidden layer,
        #       and finally calculate the output layer.
        #
        #       Note: The review passed into this function for prediction 
        #             might come from anywhere, so you should convert it 
        #             to lower case prior to using it.
        
        self.layer_1*=0
        unique_indices=set()
        for word in review.lower().split(" "):
            if word in self.word2index.keys():
                unique_indices.add(self.word2index[word])
        for index in unique_indices:
            self.layer_1+=self.weights_0_1[index]
        output=self.sigmoid(self.layer_1.dot(self.weights_1_2))
        if(output[0] > 0.5):
            return "POSITIVE"
        else:
            return "NEGATIVE"
        # TODO: The output layer should now contain a prediction. 
        #       Return `POSITIVE` for predictions greater-than-or-equal-to `0.5`, 
        #       and `NEGATIVE` otherwise.

In [20]:
mlp = SentimentNetwork(reviews[:-1000],labels[:-1000],min_count=20,polarity_cutoff=0.05,learning_rate=0.01)

In [21]:
mlp.train(reviews[:-1000],labels[:-1000])

Progress:100.% Speed(reviews/sec):1436. #Correct:20461 #Trained:24000 Training Accuracy:85.2%

In [22]:
mlp.test(reviews[-1000:],labels[-1000:])

Progress:100.% Speed(reviews/sec):2179. #Correct:859 #Tested:1000 Testing Accuracy:85.9%

In [23]:
mlp = SentimentNetwork(reviews[:-1000],labels[:-1000],min_count=20,polarity_cutoff=0.8,learning_rate=0.01)

In [24]:
mlp.train(reviews[:-1000],labels[:-1000])

Progress:100.% Speed(reviews/sec):8768. #Correct:20503 #Trained:24000 Training Accuracy:85.4%

In [25]:
mlp.test(reviews[-1000:],labels[-1000:])

Progress:100.% Speed(reviews/sec):4212. #Correct:822 #Tested:1000 Testing Accuracy:82.2%

In [26]:
mlp = SentimentNetwork(reviews[:-1000],labels[:-1000],min_count=20,polarity_cutoff=0.5,learning_rate=0.01)

In [27]:
mlp.train(reviews[:-1000],labels[:-1000])

Progress:100.% Speed(reviews/sec):5337. #Correct:20870 #Trained:24000 Training Accuracy:86.9%

In [28]:
mlp.test(reviews[-1000:],labels[-1000:])

Progress:100.% Speed(reviews/sec):4248. #Correct:840 #Tested:1000 Testing Accuracy:84.0%

In [29]:
# Analysing the weights of neural network

In [30]:
mlp_full = SentimentNetwork(reviews[:-1000],labels[:-1000],min_count=0,polarity_cutoff=0,learning_rate=0.01)

In [31]:
mlp_full.train(reviews[:-1000],labels[:-1000])

Progress:100.% Speed(reviews/sec):1192. #Correct:20334 #Trained:24000 Training Accuracy:84.7%

In [32]:
def get_most_similar_words(focus = "horrible"):
    most_similar = Counter()

    for word in mlp_full.word2index.keys():
        most_similar[word] = np.dot(mlp_full.weights_0_1[mlp_full.word2index[word]],mlp_full.weights_0_1[mlp_full.word2index[focus]])
    
    return most_similar.most_common()

In [33]:
get_most_similar_words("excellent")[1:20]

[('perfect', 0.12548286087225946),
 ('amazing', 0.091827633925999727),
 ('today', 0.090223662694414203),
 ('wonderful', 0.089355976962214603),
 ('fun', 0.08750446667420686),
 ('great', 0.087141758882292059),
 ('best', 0.085810885617880611),
 ('liked', 0.07769762912384344),
 ('definitely', 0.076628781406966023),
 ('brilliant', 0.073423858769279052),
 ('loved', 0.073285428928122148),
 ('favorite', 0.072781136036160793),
 ('superb', 0.071736207178505054),
 ('fantastic', 0.070922191916266197),
 ('job', 0.069160617207634056),
 ('incredible', 0.066424077952614416),
 ('enjoyable', 0.06563256050288882),
 ('rare', 0.064819212662615089),
 ('highly', 0.063889453350970515)]

In [34]:
get_most_similar_words("terrible")[1:20]

[('awful', 0.12026847019691246),
 ('waste', 0.11945367265311002),
 ('poor', 0.092758887574435497),
 ('terrible', 0.091425387197727942),
 ('dull', 0.084209271678223591),
 ('poorly', 0.081241544516042041),
 ('disappointment', 0.080064759621368692),
 ('fails', 0.078599773723337527),
 ('disappointing', 0.07733948548032335),
 ('boring', 0.077127858748012867),
 ('unfortunately', 0.075502449705859065),
 ('worse', 0.070601835364194648),
 ('mess', 0.070564299623590399),
 ('stupid', 0.069484822832543022),
 ('badly', 0.066888903666228558),
 ('annoying', 0.065687021903374138),
 ('bad', 0.063093814537572138),
 ('save', 0.062880597495865748),
 ('disappointed', 0.062692353812072887)]

In [35]:
import matplotlib.colors as colors

words_to_visualize = list()
for word, ratio in pos_neg_ratios.most_common(500):
    if(word in mlp_full.word2index.keys()):
        words_to_visualize.append(word)
    
for word, ratio in list(reversed(pos_neg_ratios.most_common()))[0:500]:
    if(word in mlp_full.word2index.keys()):
        words_to_visualize.append(word)

In [36]:
pos = 0
neg = 0

colors_list = list()
vectors_list = list()
for word in words_to_visualize:
    if word in pos_neg_ratios.keys():
        vectors_list.append(mlp_full.weights_0_1[mlp_full.word2index[word]])
        if(pos_neg_ratios[word] > 0.5):
            pos+=1
            colors_list.append("#00ff00")
        else:
            neg+=1
            colors_list.append("#000000")

In [37]:
from sklearn.manifold import TSNE
tsne = TSNE(n_components=2, random_state=0)
words_top_ted_tsne = tsne.fit_transform(vectors_list)

In [38]:
p = figure(tools="pan,wheel_zoom,reset,save",
           toolbar_location="above",
           title="vector T-SNE for most polarized words")

source = ColumnDataSource(data=dict(x1=words_top_ted_tsne[:,0],
                                    x2=words_top_ted_tsne[:,1],
                                    names=words_to_visualize,
                                    color=colors_list))

p.scatter(x="x1", y="x2", size=8, source=source,color="color")

word_labels = LabelSet(x="x1", y="x2", text="names", y_offset=6,
                  text_font_size="8pt", text_color="#555555",
                  source=source, text_align='center')
p.add_layout(word_labels)

show(p)

# green indicates positive words, black indicates negative words