# AMC Reveiws Sentiment Classification
The purpose of this project is to use a DNN to classify the sentiments of reviews on movies, TV series, etc.

In [3]:
import numpy as np
import pandas as pd
import matplotlib as plt
from collections import Counter

## Load & Prepare Data

In [4]:
# Load the review fils and labels
with open('reviews.txt', 'r') as file:
    data = list(map(lambda x:x[:-1],file.readlines()))

with open('labels.txt', 'r') as file:
    labels = list(map(lambda x:x[:-1].upper(),file.readlines()))

In [5]:
print("There are ",len(data),"lines of reviews in our data set. Let's look at the first 2 reviews:")
DataSetFrame = pd.DataFrame(labels,data)
DataSetFrame.head(2)

There are  25000 lines of reviews in our data set. Let's look at the first 2 reviews:


Unnamed: 0,0
bromwell high is a cartoon comedy . it ran at the same time as some other programs about school life such as teachers . my years in the teaching profession lead me to believe that bromwell high s satire is much closer to reality than is teachers . the scramble to survive financially the insightful students who can see right through their pathetic teachers pomp the pettiness of the whole situation all remind me of the schools i knew and their students . when i saw the episode in which a student repeatedly tried to burn down the school i immediately recalled . . . . . . . . . at . . . . . . . . . . high . a classic line inspector i m here to sack one of your teachers . student welcome to bromwell high . i expect that many adults of my age think that bromwell high is far fetched . what a pity that it isn t,POSITIVE
story of a man who has unnatural feelings for a pig . starts out with a opening scene that is a terrific example of absurd comedy . a formal orchestra audience is turned into an insane violent mob by the crazy chantings of it s singers . unfortunately it stays absurd the whole time with no general narrative eventually making it just too off putting . even those from the era should be turned off . the cryptic dialogue would make shakespeare seem easy to a third grader . on a technical level it s better than you might think with some good cinematography by future great vilmos zsigmond . future stars sally kirkland and frederic forrest can be seen briefly .,NEGATIVE


# Data Investigation
### How can we make a corelation between the reviews and the labels? 
Let's take a look at few of our data set

In [6]:
for i in range(7):
    print(labels[i]+':\t'+data[i][:90]+'...')

POSITIVE:	bromwell high is a cartoon comedy . it ran at the same time as some other programs about s...
NEGATIVE:	story of a man who has unnatural feelings for a pig . starts out with a opening scene that...
POSITIVE:	homelessness  or houselessness as george carlin stated  has been an issue for years but ne...
NEGATIVE:	airport    starts as a brand new luxury    plane is loaded up with valuable paintings  suc...
POSITIVE:	brilliant over  acting by lesley ann warren . best dramatic hobo lady i have ever seen  an...
NEGATIVE:	this film lacked something i couldn  t put my finger on at first charisma on the part of t...
POSITIVE:	this is easily the most underrated film inn the brooks cannon . sure  its flawed . it does...


In [7]:
positive_label_count = Counter()
negative_label_count = Counter()
total_count = Counter()

for i in range(len(data)):
    if(labels[i] == 'POSITIVE'):
        for word in data[i].split(' '):
            positive_label_count[word] += 1
            total_count[word] += 1
    else:
        for word in data[i].split(' '):
            negative_label_count[word] += 1
            total_count[word] += 1


In [8]:
print("Total number of vocabularies: ", len(total_count))

Total number of vocabularies:  74074


In [9]:
print(total_count['down'])

3728


In [10]:
print('There are ',len(negative_label_count),'words in the negetive reviews colloction')

There are  53635 words in the negetive reviews colloction


In [11]:
print('There are ',len(positive_label_count), 'words in the positive reviews collection')

There are  55214 words in the positive reviews collection


In [12]:
print('30 Most common words in positive: ')
print(list(positive_label_count.most_common())[:30])

30 Most common words in positive: 
[('', 550468), ('the', 173324), ('.', 159654), ('and', 89722), ('a', 83688), ('of', 76855), ('to', 66746), ('is', 57245), ('in', 50215), ('br', 49235), ('it', 48025), ('i', 40743), ('that', 35630), ('this', 35080), ('s', 33815), ('as', 26308), ('with', 23247), ('for', 22416), ('was', 21917), ('film', 20937), ('but', 20822), ('movie', 19074), ('his', 17227), ('on', 17008), ('you', 16681), ('he', 16282), ('are', 14807), ('not', 14272), ('t', 13720), ('one', 13655)]


In [13]:
print('30 Most common words in negative: ')
print(list(negative_label_count.most_common())[:30])

30 Most common words in negative: 
[('', 561462), ('.', 167538), ('the', 163389), ('a', 79321), ('and', 74385), ('of', 69009), ('to', 68974), ('br', 52637), ('is', 50083), ('it', 48327), ('i', 46880), ('in', 43753), ('this', 40920), ('that', 37615), ('s', 31546), ('was', 26291), ('movie', 24965), ('for', 21927), ('but', 21781), ('with', 20878), ('as', 20625), ('t', 20361), ('film', 19218), ('you', 17549), ('on', 17192), ('not', 16354), ('have', 15144), ('are', 14623), ('be', 14541), ('he', 13856)]


There are also so many common words in both collections of negetive and postive reviews, such as 'the' and 'to'. Words that are used more frequently in one collection over the other, can be more informative to our network.

In [14]:
print(list((negative_label_count - positive_label_count).most_common())[5:25])

[('this', 5840), ('bad', 5494), ('was', 4374), ('no', 3569), ('just', 3467), ('br', 3402), ('they', 3116), ('even', 2723), ('there', 2718), ('have', 2557), ('or', 2540), ('so', 2309), ('if', 2233), ('to', 2228), ('worst', 2228), ('like', 2200), ('be', 2125), ('not', 2082), ('that', 1985), ('don', 1868)]


In [15]:
print(list((positive_label_count - negative_label_count).most_common())[5:25])

[('as', 5683), ('his', 5080), ('a', 4367), ('great', 3779), ('very', 2541), ('her', 2527), ('he', 2426), ('with', 2369), ('s', 2269), ('best', 2225), ('well', 2151), ('love', 2149), ('also', 1942), ('life', 1770), ('film', 1719), ('has', 1582), ('story', 1572), ('who', 1495), ('by', 1448), ('she', 1335)]


It is obvious that words like 'bad' and 'worst' are among the most common words of negatvie labels, excluded in positive reviews. On the other hand, words like 'well' and 'great' are exculded from negetive reviews, but among most common words of positive reviews. 


In [16]:
pos_neg_ratios = Counter()
for word,cnt in positive_label_count.most_common():
    if (cnt > 100):
        pos_neg_ratios[word] += positive_label_count[word]/(negative_label_count[word]+1)

In [17]:
print("Pos-to-neg ratio for 'the' = {}".format(pos_neg_ratios["the"]))
print("Pos-to-neg ratio for 'amazing' = {}".format(pos_neg_ratios["amazing"]))
print("Pos-to-neg ratio for 'terrible' = {}".format(pos_neg_ratios["terrible"]))
print("Pos-to-neg ratio for 'awful' = {}".format(pos_neg_ratios["awful"]))
print("Pos-to-neg ratio for 'and' = {}".format(pos_neg_ratios["and"]))

Pos-to-neg ratio for 'the' = 1.0607993145235326
Pos-to-neg ratio for 'amazing' = 4.022813688212928
Pos-to-neg ratio for 'terrible' = 0.17744252873563218
Pos-to-neg ratio for 'awful' = 0.10783055198973042
Pos-to-neg ratio for 'and' = 1.2061678272793268


In [18]:
pos_neg_ratios_log = Counter()

for word,ratio in pos_neg_ratios.most_common():
    if ratio > 1:
        pos_neg_ratios_log[word] += np.log(pos_neg_ratios[word])
    else:
        pos_neg_ratios_log[word] += (-np.log(1/(pos_neg_ratios[word]+0.01)))

In [19]:
print("Pos-to-neg ratio for 'the' = {}".format(pos_neg_ratios_log["the"]))
print("Pos-to-neg ratio for 'amazing' = {}".format(pos_neg_ratios_log["amazing"]))
print("Pos-to-neg ratio for 'terrible' = {}".format(pos_neg_ratios_log["terrible"]))
print("Pos-to-neg ratio for 'awful' = {}".format(pos_neg_ratios_log["awful"]))
print("Pos-to-neg ratio for 'and' = {}".format(pos_neg_ratios_log["and"]))

Pos-to-neg ratio for 'the' = 0.05902269426102881
Pos-to-neg ratio for 'amazing' = 1.3919815802404802
Pos-to-neg ratio for 'terrible' = -1.6742829939664696
Pos-to-neg ratio for 'awful' = -2.1385076866397488
Pos-to-neg ratio for 'and' = 0.18744824888788403


Now words with positive and over 1.0 ratio are definitly carrying more postive sentiment and neutrials are the one with ratio more closely to 0.0. Negetive ratios are more accured in our negetive collections , therefore they are carrying more negetive sentiment.

In [20]:
pos_neg_ratios_log.most_common()

[('edie', 4.6913478822291435),
 ('paulie', 4.0775374439057197),
 ('felix', 3.1527360223636558),
 ('polanski', 2.8233610476132043),
 ('matthau', 2.8067217286092401),
 ('victoria', 2.6810215287142909),
 ('mildred', 2.6026896854443837),
 ('gandhi', 2.5389738710582761),
 ('flawless', 2.451005098112319),
 ('superbly', 2.2600254785752498),
 ('perfection', 2.1594842493533721),
 ('astaire', 2.1400661634962708),
 ('captures', 2.0386195471595809),
 ('wonderfully', 2.0218960560332353),
 ('powell', 1.9783454248084671),
 ('brosnan', 1.9547990964725592),
 ('lily', 1.9203768470501485),
 ('bakshi', 1.9029851043382795),
 ('lincoln', 1.9014583864844796),
 ('refreshing', 1.8551812956655511),
 ('breathtaking', 1.8481124057791867),
 ('bourne', 1.8478489358790986),
 ('delightful', 1.8002701588959635),
 ('flynn', 1.7996646487351682),
 ('andrews', 1.7764919970972666),
 ('beautifully', 1.7626953362841438),
 ('elvira', 1.7397031072720019),
 ('underrated', 1.7197859696029656),
 ('gripping', 1.7165360479904674),


In [21]:
list(reversed(pos_neg_ratios_log.most_common()[0:30]))

[('superb', 1.7091514458966952),
 ('gripping', 1.7165360479904674),
 ('underrated', 1.7197859696029656),
 ('elvira', 1.7397031072720019),
 ('beautifully', 1.7626953362841438),
 ('andrews', 1.7764919970972666),
 ('flynn', 1.7996646487351682),
 ('delightful', 1.8002701588959635),
 ('bourne', 1.8478489358790986),
 ('breathtaking', 1.8481124057791867),
 ('refreshing', 1.8551812956655511),
 ('lincoln', 1.9014583864844796),
 ('bakshi', 1.9029851043382795),
 ('lily', 1.9203768470501485),
 ('brosnan', 1.9547990964725592),
 ('powell', 1.9783454248084671),
 ('wonderfully', 2.0218960560332353),
 ('captures', 2.0386195471595809),
 ('astaire', 2.1400661634962708),
 ('perfection', 2.1594842493533721),
 ('superbly', 2.2600254785752498),
 ('flawless', 2.451005098112319),
 ('gandhi', 2.5389738710582761),
 ('mildred', 2.6026896854443837),
 ('victoria', 2.6810215287142909),
 ('matthau', 2.8067217286092401),
 ('polanski', 2.8233610476132043),
 ('felix', 3.1527360223636558),
 ('paulie', 4.0775374439057197)

## Building the Neural Net


In [22]:
vocabulary_set = set(total_count)
vocab_size = len(vocabulary_set)

In [23]:
print('We have total of ',vocab_size, 'unique words in our data set')

We have total of  74074 unique words in our data set


In [24]:
layer_0 = np.zeros(shape=(1,vocab_size))

In [25]:
layer_0.shape

(1, 74074)

In [26]:
word_to_count = {}
for i,words in enumerate(vocabulary_set):
    word_to_count[words] = i


In [27]:
word_to_count

{'': 0,
 'misserably': 1,
 'cheats': 2,
 'mangles': 3,
 'teenkill': 4,
 'spearheads': 5,
 'domaine': 6,
 'kirilian': 7,
 'joviality': 8,
 'kimberely': 9,
 'toreton': 10,
 'whiny': 11,
 'neutrally': 12,
 'clipped': 13,
 'sodom': 14,
 'throughline': 15,
 'ahah': 16,
 'blotter': 17,
 'embroider': 18,
 'bombardier': 19,
 'bauman': 20,
 'entire': 21,
 'queda': 22,
 'disengorges': 23,
 'transcript': 24,
 'saleem': 25,
 'adeline': 26,
 'colossus': 27,
 'dureyea': 28,
 'swooned': 29,
 'neutered': 30,
 'cheeee': 31,
 'maffia': 32,
 'leiberman': 33,
 'ordained': 34,
 'jacket': 35,
 'special': 36,
 'covers': 37,
 'gladiatorial': 38,
 'denero': 39,
 'nutcases': 40,
 'questing': 41,
 'tyros': 42,
 'gremlin': 43,
 'develop': 44,
 'ladislaw': 45,
 'politicos': 46,
 'fictitional': 47,
 'foiled': 48,
 'genji': 49,
 'geograpically': 50,
 'policial': 51,
 'homelife': 52,
 'laworder': 53,
 'philosophized': 54,
 'finalize': 55,
 'wsj': 56,
 'paycheque': 57,
 'fratricidal': 58,
 'lotof': 59,
 'sleepaway': 6

In [28]:
def update_input_layer(review):
    global layer_0
    # clear out previous state by resetting the layer to be all 0s
    layer_0 *= 0
    for word in (review.split(' ')):
        layer_0[0][word_to_count[word]] += 1

In [29]:
update_input_layer(data[0])
layer_0

array([[ 18.,   0.,   0., ...,   0.,   0.,   0.]])

In [30]:
def get_target_for_label(label):
    if label == 'POSITIVE':
        return 1
    else:
        return 0


In [31]:
labels[1]

'NEGATIVE'

In [32]:
get_target_for_label(labels[1])

0

### Neural Network

In [154]:
import time
import sys
import numpy as np

# Encapsulate our neural network in a class
class SentimentNetwork:
    def __init__(self, reviews, labels, hidden_nodes = 10, learning_rate = 0.1):
        """Create a SentimenNetwork with the given settings
        Args:
            reviews(list) - List of reviews used for training
            labels(list) - List of POSITIVE/NEGATIVE labels associated with the given reviews
            hidden_nodes(int) - Number of nodes to create in the hidden layer
            learning_rate(float) - Learning rate to use while training
        
        """
        # Assign a seed to our random number generator to ensure we get
        # reproducable results during development 
        np.random.seed(1)

        # process the reviews and their associated labels so that everything
        # is ready for training
        self.pre_process_data(reviews, labels)
        
        # Build the network to have the number of hidden nodes and the learning rate that
        # were passed into this initializer. Make the same number of input nodes as
        # there are vocabulary words and create a single output node.
        self.init_network(len(self.review_vocab),hidden_nodes, 1, learning_rate)

    def pre_process_data(self, reviews, labels):
        
        review_vocab = set()
        # TODO: populate review_vocab with all of the words in the given reviews
        #       Remember to split reviews into individual words 
        #       using "split(' ')" instead of "split()".
        for review in reviews:
            review_vocab.update(review.split(' '))
        # Convert the vocabulary set to a list so we can access words via indices
        self.review_vocab = list(review_vocab)
        
        label_vocab = set()
        # TODO: populate label_vocab with all of the words in the given labels.
        #       There is no need to split the labels because each one is a single word.
        for label in labels:
            label_vocab.update(label.upper())
        # Convert the label vocabulary set to a list so we can access labels via indices
        self.label_vocab = list(label_vocab)
        
        # Store the sizes of the review and label vocabularies.
        self.review_vocab_size = len(self.review_vocab)
        self.label_vocab_size = len(self.label_vocab)
        
        # Create a dictionary of words in the vocabulary mapped to index positions
        self.word2index = {}
        # TODO: populate self.word2index with indices for all the words in self.review_vocab
        #       like you saw earlier in the notebook
        for i,word in enumerate(self.review_vocab):
            self.word2index[word] = i
            
        
        # Create a dictionary of labels mapped to index positions
        self.label2index = {}
        # TODO: do the same thing you did for self.word2index and self.review_vocab, 
        #       but for self.label2index and self.label_vocab instead
        for i,label in enumerate(self.label_vocab):
            self.label2index[label] = i
        
    def init_network(self, input_nodes, hidden_nodes, output_nodes, learning_rate):
        # Store the number of nodes in input, hidden, and output layers.
        self.input_nodes = input_nodes
        self.hidden_nodes = hidden_nodes
        self.output_nodes = output_nodes

        # Store the learning rate
        self.learning_rate = learning_rate

        # Initialize weights
        
        # TODO: initialize self.weights_0_1 as a matrix of zeros. These are the weights between
        #       the input layer and the hidden layer.
        self.weights_0_1 = np.zeros((self.input_nodes,self.hidden_nodes))
        
        # TODO: initialize self.weights_1_2 as a matrix of random values. 
        #       These are the weights between the hidden layer and the output layer.
        self.weights_1_2 = np.random.normal(0.0,self.output_nodes**-0.5,
                                            (self.hidden_nodes, self.output_nodes))
        
        # TODO: Create the input layer, a two-dimensional matrix with shape 
        #       1 x input_nodes, with all values initialized to zero
        self.layer_0 = np.zeros((1,input_nodes))
#         print(type(self.weights_0_1))
        
    def update_input_layer(self,review):
        # TODO: You can copy most of the code you wrote for update_input_layer 
        #       earlier in this notebook. 
        #
        #       However, MAKE SURE YOU CHANGE ALL VARIABLES TO REFERENCE
        #       THE VERSIONS STORED IN THIS OBJECT, NOT THE GLOBAL OBJECTS.
        #       For example, replace "layer_0 *= 0" with "self.layer_0 *= 0"
        
        self.layer_0 *= 0
        for word in (review.split(' ')):
            if (word in self.word2index.keys()):
                self.layer_0[0][self.word2index[word]] += 1
                
    def get_target_for_label(self,label):
        # TODO: Copy the code you wrote for get_target_for_label 
        #       earlier in this notebook. 
        if (label == 'POSITIVE'):
            return 1
        else:
            return 0
        
    def sigmoid(self,x):
        # TODO: Return the result of calculating the sigmoid activation function
        #       shown in the lectures
        return (1 /(1 + np.exp(-x)))
    
    def sigmoid_output_2_derivative(self,output):
        # TODO: Return the derivative of the sigmoid activation function, 
        #       where "output" is the original output from the sigmoid fucntion 
        return output * (1 - output)

    def train(self, training_reviews, training_labels):
        
        # make sure out we have a matching number of reviews and labels
        assert(len(training_reviews) == len(training_labels))
        
        # Keep track of correct predictions to display accuracy during training 
        correct_so_far = 0
        
        # Remember when we started for printing time statistics
        start = time.time()

        # loop through all the given reviews and run a forward and backward pass,
        # updating weights for every item
        for i in range(len(training_reviews)):
            
            # TODO: Get the next review and its correct label
            review, label = training_reviews[i], training_labels[i]
            # TODO: Implement the forward pass through the network. 
            #       That means use the given review to update the input layer, 
            #       then calculate values for the hidden layer,
            #       and finally calculate the output layer.
            # 
            #       Do not use an activation function for the hidden layer,
            #       but use the sigmoid activation function for the output layer.
            self.update_input_layer(review)
            hidden_inputs = self.layer_0.dot(self.weights_0_1)
            hidden_outputs = np.dot(hidden_inputs, self.weights_1_2)
            final_output = self.sigmoid(hidden_outputs)
            # TODO: Implement the back propagation pass here. 
            #       That means calculate the error for the forward pass's prediction
            #       and update the weights in the network according to their
            #       contributions toward the error, as calculated via the
            #       gradient descent and back propagation algorithms you 
            #       learned in class.
            error = final_output - self.get_target_for_label(label)
            output_error_term = error * self.sigmoid_output_2_derivative(final_output)
            
            hidden_error = np.dot(output_error_term,self.weights_1_2.T)
            hidden_error_term = hidden_error 
            print((self.layer_0.T.shape), (hidden_error_term.shape))
            delta_weights_0_1 = self.learning_rate * np.dot(self.layer_0.T,hidden_error_term)
            delta_weights_1_2 = self.learning_rate * np.dot(output_error_term, hidden_inputs.T)
            
            self.weights_0_1 -= delta_weights_0_1 / len(training_reviews)
            self.weights_1_2 -= delta_weights_1_2 / len(training_reviews)
            # TODO: Keep track of correct predictions. To determine if the prediction was
            #       correct, check that the absolute value of the output error 
            #       is less than 0.5. If so, add one to the correct_so_far count.
            if final_output >= 0.5 and label == 'POSITIVE':
                correct_so_far += 1
            elif final_output < 0.5 and label == 'NEGETIVE':
                correct_so_far += 1
                
            # For debug purposes, print out our prediction accuracy and speed 
            # throughout the training process. 

            elapsed_time = float(time.time() - start)
            reviews_per_second = i / elapsed_time if elapsed_time > 0 else 0
            
            sys.stdout.write("\rProgress:" + str(100 * i/float(len(training_reviews)))[:4] \
                             + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] \
                             + " #Correct:" + str(correct_so_far) + " #Trained:" + str(i+1) \
                             + " Training Accuracy:" + str(correct_so_far * 100 / float(i+1))[:4] + "%")
            if(i % 2500 == 0):
                print("")
    
    def test(self, testing_reviews, testing_labels):
        """
        Attempts to predict the labels for the given testing_reviews,
        and uses the test_labels to calculate the accuracy of those predictions.
        """
        
        # keep track of how many correct predictions we make
        correct = 0

        # we'll time how many predictions per second we make
        start = time.time()

        # Loop through each of the given reviews and call run to predict
        # its label. 
        for i in range(len(testing_reviews)):
            pred = self.run(testing_reviews[i])
            if(pred == testing_labels[i]):
                correct += 1
            
            # For debug purposes, print out our prediction accuracy and speed 
            # throughout the prediction process. 

            elapsed_time = float(time.time() - start)
            reviews_per_second = i / elapsed_time if elapsed_time > 0 else 0
            
            sys.stdout.write("\rProgress:" + str(100 * i/float(len(testing_reviews)))[:4] \
                             + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] \
                             + " #Correct:" + str(correct) + " #Tested:" + str(i+1) \
                             + " Testing Accuracy:" + str(correct * 100 / float(i+1))[:4] + "%")
    
    def run(self, review):
        """
        Returns a POSITIVE or NEGATIVE prediction for the given review.
        """
        # TODO: Run a forward pass through the network, like you did in the
        #       "train" function. That means use the given review to 
        #       update the input layer, then calculate values for the hidden layer,
        #       and finally calculate the output layer.
        #
        #       Note: The review passed into this function for prediction 
        #             might come from anywhere, so you should convert it 
        #             to lower case prior to using it.
        self.update_input_layer(review.lower())
        hidden_inputs = np.dot(self.layer_0, self.weights_0_1)
        hidden_outputs = np.dot(hidden_inputs, self.weights_1_2)
        final_output = self.sigmoid(hidden_outputs)
        # TODO: The output layer should now contain a prediction. 
        #       Return `POSITIVE` for predictions greater-than-or-equal-to `0.5`, 
        #       and `NEGATIVE` otherwise.
        if final_output >= 0.5:
            return 'POSITIVE'
        else:
            return 'NEGETIVE'
        
        

In [155]:
mlp = SentimentNetwork(data[:-1000],labels[:-1000], learning_rate=0.1)

In [156]:
(mlp.weights_1_2).dtype
type(data[0])

str

In [157]:
# mlp.test(data[-1000:],labels[-1000:])

In [158]:
mlp = SentimentNetwork(data[:-1000],labels[:-1000], learning_rate=0.01)
mlp.train(data[:-1000],labels[:-1000])

(72810, 1) (1, 10)


ValueError: shapes (1,1) and (10,1) not aligned: 1 (dim 1) != 10 (dim 0)