In [2]:
# Emotion Detection using NN architecture

In [3]:
# Consist of Below Modules
#     1) Training a word vector generation model (such as Word2Vec)
#     2) Creating an ID's matrix for the training set
#     3) LSTM implementation
#     4) Training 
#     5) Testing

In [4]:
# Module 1 : Training a word Vector Generation Model

In [5]:
# GloVe, a word vector generation model. The matrix will contain 400,000 word vectors, each with a dimensionality of 50.

In [6]:
import numpy as np
wordsList = np.load('wordsList.npy')
print('Loaded the word list!')
wordsList = wordsList.tolist() #Originally loaded as numpy array
wordsList = [word.decode('UTF-8') for word in wordsList] #Encode words as UTF-8
SeqLength = 42
counter = 0
for p in wordsList:
    print(p)
    counter = counter+1
    if counter >= SeqLength:                
        break

print("---------------------------------------")
wordVectors = np.load('wordVectors.npy')
print(wordVectors[:10])
print ('Loaded the word vectors!')

Loaded the word list!
0
,
.
of
to
and
in
a
"
's
for
-
that
on
is
was
said
with
he
as
it
by
at
(
)
from
his
''
``
an
be
has
are
have
but
were
not
this
who
they
had
i
---------------------------------------
[[  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00

In [7]:
# dimensions of the vocabulary list and the embedding matrix.

In [8]:
print(len(wordsList))
print(wordVectors.shape)

400000
(400000, 50)


In [9]:
# search word list for a word like "movies", and then access its corresponding vector through the embedding matrix.

In [10]:
moviesIndex = wordsList.index('i')
print(moviesIndex)
wordVectors[moviesIndex]

41


array([  1.18910000e-01,   1.52549997e-01,  -8.20730031e-02,
        -7.41439998e-01,   7.59169996e-01,  -4.83280003e-01,
        -3.10090005e-01,   5.14760017e-01,  -9.87079978e-01,
         6.17570011e-04,  -1.50429994e-01,   8.37700009e-01,
        -1.07969999e+00,  -5.14599979e-01,   1.31879997e+00,
         6.20069981e-01,   1.37789994e-01,   4.71080005e-01,
        -7.28740022e-02,  -7.26750016e-01,  -7.41159976e-01,
         7.52629995e-01,   8.81799996e-01,   2.95610011e-01,
         1.35479999e+00,  -2.57010007e+00,  -1.35230005e+00,
         4.58799988e-01,   1.00680006e+00,  -1.18560004e+00,
         3.47370005e+00,   7.78980017e-01,  -7.29290009e-01,
         2.51020014e-01,  -2.61559993e-01,  -3.46839994e-01,
         5.58409989e-01,   7.50980020e-01,   4.98299986e-01,
        -2.68229991e-01,  -2.74430006e-03,  -1.82980001e-02,
        -2.80959994e-01,   5.53179979e-01,   3.77059989e-02,
         1.85550004e-01,  -1.50250003e-01,  -5.75119972e-01,
        -2.66710013e-01,

In [11]:
# taking an input sentence and then constructing the its vector representation.
# input sentence "I thought the movie was incredible and inspiring". 

In [12]:
maxSeqLength = 10 #Maximum length of sentence
numDimensions = 300 #Dimensions for each word vector
inputSentence = np.zeros((maxSeqLength), dtype='int32')
inputSentence[0] = wordsList.index("i")
inputSentence[1] = wordsList.index("thought")
inputSentence[2] = wordsList.index("the")
inputSentence[3] = wordsList.index("movie")
inputSentence[4] = wordsList.index("was")
inputSentence[5] = wordsList.index("incredible")
inputSentence[6] = wordsList.index("and")
inputSentence[7] = wordsList.index("inspiring")
#inputSentence[8] and inputSentence[9] are going to be 0
print(inputSentence.shape)
print(inputSentence) #Shows the row index for each word

(10,)
[    41    804 201534   1005     15   7446      5  13767      0      0]


In [13]:
# to get the word vectors, we can use Tensorflow's embedding lookup function.
# The 10 x 50 output contain the 50 dimensional word vectors for each of the 10 words in the sequence.

In [14]:
import tensorflow as tf
with tf.Session() as sess:
    print(tf.nn.embedding_lookup(wordVectors,inputSentence).eval())

[[  1.18910000e-01   1.52549997e-01  -8.20730031e-02  -7.41439998e-01
    7.59169996e-01  -4.83280003e-01  -3.10090005e-01   5.14760017e-01
   -9.87079978e-01   6.17570011e-04  -1.50429994e-01   8.37700009e-01
   -1.07969999e+00  -5.14599979e-01   1.31879997e+00   6.20069981e-01
    1.37789994e-01   4.71080005e-01  -7.28740022e-02  -7.26750016e-01
   -7.41159976e-01   7.52629995e-01   8.81799996e-01   2.95610011e-01
    1.35479999e+00  -2.57010007e+00  -1.35230005e+00   4.58799988e-01
    1.00680006e+00  -1.18560004e+00   3.47370005e+00   7.78980017e-01
   -7.29290009e-01   2.51020014e-01  -2.61559993e-01  -3.46839994e-01
    5.58409989e-01   7.50980020e-01   4.98299986e-01  -2.68229991e-01
   -2.74430006e-03  -1.82980001e-02  -2.80959994e-01   5.53179979e-01
    3.77059989e-02   1.85550004e-01  -1.50250003e-01  -5.75119972e-01
   -2.66710013e-01   9.21209991e-01]
 [  4.27619994e-01  -1.14689998e-01   1.05060004e-02  -5.46620011e-01
    8.90550017e-01   1.92629993e-01  -6.53739989e-01 

In [15]:
# Module 2 : Creating an ID's matrix for our training set

In [16]:
# Reading dataset files from directory
# finding total and average number of words in each review

In [18]:
from os import listdir
from os.path import isfile, join
positiveFiles = ['positiveReviews/' + f for f in listdir('positiveReviews/') if isfile(join('positiveReviews/', f))]
negativeFiles = ['negativeReviews/' + f for f in listdir('negativeReviews/') if isfile(join('negativeReviews/', f))]
numWords = []
for pf in positiveFiles:
    with open(pf, "r", encoding='utf-8') as f:
        line=f.readline()
        counter = len(line.split())
        numWords.append(counter)
print(positiveFiles)
print('Positive files finished')

for nf in negativeFiles:
    with open(nf, "r", encoding='utf-8') as f:
        line=f.readline()
        counter = len(line.split())
        numWords.append(counter)  
print('Negative files finished')

numFiles = len(numWords)
print('The total number of files is', numFiles)
print('The total number of words in the files is', sum(numWords))
print('The average number of words in the files is', sum(numWords)/len(numWords))

['positiveReviews/0_9.txt', 'positiveReviews/10000_8.txt', 'positiveReviews/10001_10.txt', 'positiveReviews/10002_7.txt', 'positiveReviews/10003_8.txt', 'positiveReviews/10004_8.txt', 'positiveReviews/10005_7.txt', 'positiveReviews/10006_7.txt', 'positiveReviews/10007_7.txt', 'positiveReviews/10008_7.txt', 'positiveReviews/10009_9.txt', 'positiveReviews/1000_8.txt', 'positiveReviews/10010_7.txt', 'positiveReviews/10011_9.txt', 'positiveReviews/10012_8.txt', 'positiveReviews/10013_7.txt', 'positiveReviews/10014_8.txt', 'positiveReviews/10015_8.txt', 'positiveReviews/10016_8.txt', 'positiveReviews/10017_9.txt', 'positiveReviews/10018_8.txt', 'positiveReviews/10019_8.txt', 'positiveReviews/1001_8.txt', 'positiveReviews/10020_8.txt', 'positiveReviews/10021_8.txt', 'positiveReviews/10022_7.txt', 'positiveReviews/10023_9.txt', 'positiveReviews/10024_9.txt', 'positiveReviews/10025_9.txt', 'positiveReviews/10026_7.txt', 'positiveReviews/10027_7.txt', 'positiveReviews/10028_10.txt', 'positiveRe

Negative files finished
The total number of files is 25000
The total number of words in the files is 5844680
The average number of words in the files is 233.7872


In [19]:
maxSeqLength = 250

In [20]:
# reviews in each file

In [21]:
fname = positiveFiles[3]
with open(fname) as f:
    for lines in f:
        print(lines)
        exit

This is easily the most underrated film inn the Brooks cannon. Sure, its flawed. It does not give a realistic view of homelessness (unlike, say, how Citizen Kane gave a realistic view of lounge singers, or Titanic gave a realistic view of Italians YOU IDIOTS). Many of the jokes fall flat. But still, this film is very lovable in a way many comedies are not, and to pull that off in a story about some of the most traditionally reviled members of society is truly impressive. Its not The Fisher King, but its not crap, either. My only complaint is that Brooks should have cast someone else in the lead (I love Mel as a Director and Writer, not so much as a lead).


In [22]:
# Data preprocessing

In [23]:
# Removes punctuation, parentheses, question marks, etc., and leaves only alphanumeric characters
import re
strip_special_chars = re.compile("[^A-Za-z0-9 ]+")

def cleanSentences(string):
    string = string.lower().replace("<br />", " ")
    return re.sub(strip_special_chars, "", string.lower())

In [24]:
firstFile = np.zeros((maxSeqLength), dtype='int32')
with open(fname) as f:
    indexCounter = 0
    line=f.readline()
    cleanedLine = cleanSentences(line)
    split = cleanedLine.split()
    for word in split:
        try:
            firstFile[indexCounter] = wordsList.index(word)
        except ValueError:
            firstFile[indexCounter] = 399999 #Vector for unknown words
        indexCounter = indexCounter + 1
firstFile

array([    37,     14,   2407, 201534,     96,  37314,    319,   7158,
       201534,   6469,   8828,   1085,     47,   9703,     20,    260,
           36,    455,      7,   7284,   1139,      3,  26494,   2633,
          203,    197,   3941,  12739,    646,      7,   7284,   1139,
            3,  11990,   7792,     46,  12608,    646,      7,   7284,
         1139,      3,   8593,     81,  36381,    109,      3, 201534,
         8735,    807,   2983,     34,    149,     37,    319,     14,
          191,  31906,      6,      7,    179,    109,  15402,     32,
           36,      5,      4,   2933,     12,    138,      6,      7,
          523,     59,     77,      3, 201534,     96,   4246,  30006,
          235,      3,    908,     14,   4702,   4571,     47,     36,
       201534,   6429,    691,     34,     47,     36,  35404,    900,
          192,     91,   4499,     14,     12,   6469,    189,     33,
         1784,   1318,   1726,      6, 201534,    410,     41,    835,
      

In [25]:
# Converting each file to id matrix 
# for each of 25,000 reviews and get a 25000 x 250 matrix.

In [26]:
# ids = np.zeros((numFiles, maxSeqLength), dtype='int32')
# fileCounter = 0
# for pf in positiveFiles:
#    with open(pf, "r") as f:
#        indexCounter = 0
#        line=f.readline()
#        cleanedLine = cleanSentences(line)
#        split = cleanedLine.split()
#        for word in split:
#            try:
#                ids[fileCounter][indexCounter] = wordsList.index(word)
#            except ValueError:
#                ids[fileCounter][indexCounter] = 399999 #Vector for unkown words
#            indexCounter = indexCounter + 1
#            if indexCounter >= maxSeqLength:
#                break
#        fileCounter = fileCounter + 1 

# for nf in negativeFiles:
#    with open(nf, "r") as f:
#        indexCounter = 0
#        line=f.readline()
#        cleanedLine = cleanSentences(line)
#        split = cleanedLine.split()
#        for word in split:
#            try:
#                ids[fileCounter][indexCounter] = wordsList.index(word)
#            except ValueError:
#                ids[fileCounter][indexCounter] = 399999 #Vector for unkown words
#            indexCounter = indexCounter + 1
#            if indexCounter >= maxSeqLength:
#                break
#        fileCounter = fileCounter + 1 
# #Pass into embedding function and see if it evaluates. 

# np.save('idsMatrix', ids)

In [27]:
ids = np.load('idsMatrix.npy')
ids[0]

array([174943,    152,     14,      7,   7362,   2841,     20,   1421,
           22, 201534,    215,     79,     19,     77,     68,   1009,
           59,    164,    214,    125,     19, 399999,    192,   1678,
           82,      6, 201534,   3174,   8104,    410,    285,      4,
          733,     12, 174943, 399999,  15303,     14,    181,   2386,
            4,   2532,     73,     14, 399999, 201534,  14170,      4,
         3981,   7980, 201534,  34401,    543,     38,     86,    253,
          248,    131,     44,  22495, 399999,  31166, 201534,  91887,
            3, 201534,   1115,    794,     64,   9794,    285,      3,
       201534,    888,     41,   1522,      5,     44,    543,     61,
           41,    822, 201534,   1942,      6,     42,      7,   1283,
         2648,    977,      4,   6292,    135, 201534,    164,     41,
         1040,   3151,     22,    152,      7,   2392, 399999, 399999,
       399999,    187,      4,  11739,     48,      3,    392,   2562,
      

In [28]:
#helper function

In [29]:
from random import randint

def getTrainBatch():
    labels = []
    arr = np.zeros([batchSize, maxSeqLength])
    for i in range(batchSize):
        if (i % 2 == 0): 
            num = randint(1,11499)
            labels.append([1,0])
        else:
            num = randint(13499,24999)
            labels.append([0,1])
        arr[i] = ids[num-1:num]
    return arr, labels

def getTestBatch():
    labels = []
    arr = np.zeros([batchSize, maxSeqLength])
    for i in range(batchSize):
        num = randint(11499,13499)
        if (num <= 12499):
            labels.append([1,0])
        else:
            labels.append([0,1])
        arr[i] = ids[num-1:num]
    return arr, labels

In [30]:
#intialization of variables 

In [31]:
batchSize = 24
lstmUnits = 64
numClasses =2
iterations = 100000

In [32]:
#creating placeholder for input and labels

In [33]:
import tensorflow as tf
tf.reset_default_graph()

labels = tf.placeholder(tf.float32, [batchSize, numClasses])
input_data = tf.placeholder(tf.int32, [batchSize, maxSeqLength])


In [34]:
#creating three dimensional data variable to hold embedded lookup matrix

In [35]:
data = tf.Variable(tf.zeros([batchSize, maxSeqLength, numDimensions]),dtype=tf.float32)
data = tf.nn.embedding_lookup(wordVectors,input_data)
data.shape

TensorShape([Dimension(24), Dimension(250), Dimension(50)])

In [36]:
#feeding both the LSTM cell and the 3-D tensor full of input data into a function called tf.nn.dynamic_rnn. 
#This function is in charge of unrolling the whole network and creating a pathway for the data to flow through the RNN graph.

In [37]:
lstmCell = tf.contrib.rnn.BasicLSTMCell(lstmUnits)
lstmCell = tf.contrib.rnn.DropoutWrapper(cell=lstmCell, output_keep_prob=0.75)
value, _ = tf.nn.dynamic_rnn(lstmCell, data, dtype=tf.float32)

In [38]:
weight = tf.Variable(tf.truncated_normal([lstmUnits, numClasses]))
bias = tf.Variable(tf.constant(0.1, shape=[numClasses]))
value = tf.transpose(value, [1, 0, 2])
last = tf.gather(value, int(value.get_shape()[0]) - 1)
prediction = (tf.matmul(last, weight) + bias)

In [39]:
#Next, we’ll define correct prediction and accuracy metrics to track how the network is doing. 
#The correct prediction formulation works by looking at the index of the maximum value of the 2 output values, 
#and then seeing whether it matches with the training labels.

In [40]:
correctPred = tf.equal(tf.argmax(prediction,1), tf.argmax(labels,1))
accuracy = tf.reduce_mean(tf.cast(correctPred, tf.float32))

In [41]:
#We’ll define a standard cross entropy loss with a softmax layer put on top of the final prediction values. 
#For the optimizer, we’ll use Adam and the default learning rate of .001.

In [42]:
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=labels))
optimizer = tf.train.AdamOptimizer().minimize(loss)

In [43]:
import datetime

tf.summary.scalar('Loss', loss)
tf.summary.scalar('Accuracy', accuracy)
merged = tf.summary.merge_all()
logdir = "tensorboard/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + "/"
writer = tf.summary.FileWriter(logdir, sess.graph)

In [44]:
#Training

In [45]:
# sess = tf.InteractiveSession()
# saver = tf.train.Saver()
# sess.run(tf.global_variables_initializer())

# for i in range(iterations):
#    #Next Batch of reviews
#    nextBatch, nextBatchLabels = getTrainBatch();
#    sess.run(optimizer, {input_data: nextBatch, labels: nextBatchLabels})
   
#    #Write summary to Tensorboard
#    if (i % 50 == 0):
#        summary = sess.run(merged, {input_data: nextBatch, labels: nextBatchLabels})
#        writer.add_summary(summary, i)

#    #Save the network every 10,000 training iterations
#    if (i % 10000 == 0 and i != 0):
#        save_path = saver.save(sess, "models/pretrained_lstm.ckpt", global_step=i)
#        print("saved to %s" % save_path)
# writer.close()

In [46]:
#Loading a pretrained model involves defining another Tensorflow session, creating a Saver object,
#and then using that object to call the restore function. This function takes into 2 arguments, 
#one for the current session, and one for the name of the saved model.

In [47]:
sess = tf.InteractiveSession()
saver = tf.train.Saver()
saver.restore(sess, tf.train.latest_checkpoint('models'))

INFO:tensorflow:Restoring parameters from models\pretrained_lstm.ckpt-90000


In [48]:
#Then we’ll load some movie reviews from our test set.

In [49]:
iterations = 10
for i in range(iterations):
    nextBatch, nextBatchLabels = getTestBatch();
    print("Accuracy for this batch:", (sess.run(accuracy, {input_data: nextBatch, labels: nextBatchLabels})) * 100)

Accuracy for this batch: 87.5
Accuracy for this batch: 79.1666686535
Accuracy for this batch: 70.8333313465
Accuracy for this batch: 75.0
Accuracy for this batch: 79.1666686535
Accuracy for this batch: 83.3333313465
Accuracy for this batch: 75.0
Accuracy for this batch: 87.5
Accuracy for this batch: 83.3333313465
Accuracy for this batch: 83.3333313465


In [50]:
#Testing a Pre-Trained Network

In [51]:
def getSentenceMatrix(sentence):
    arr = np.zeros([batchSize, maxSeqLength])
    sentenceMatrix = np.zeros([batchSize,maxSeqLength], dtype='int32')
    cleanedSentence = cleanSentences(sentence)
    split = cleanedSentence.split()
    for indexCounter,word in enumerate(split):
        try:
            sentenceMatrix[0,indexCounter] = wordsList.index(word)
        except ValueError:
            sentenceMatrix[0,indexCounter] = 399999 #Vector for unknown words
    return sentenceMatrix

In [52]:
inputText = "Srk films are getting shittier every year and his Hollywood counterparts leonardo DiCaprio tom cruise Jonny depp etc. are making such good films . Come on he could do better being the second richest actor in the world. Even Salman and Aamir are trynna do good scripted films . What is he even thinking"
inputMatrix = getSentenceMatrix(inputText)

In [1]:
predictedSentiment = sess.run(prediction, {input_data: inputMatrix})[0]
# predictedSentiment[0] represents output score for positive sentiment
# predictedSentiment[1] represents output score for negative sentiment

print(predictedSentiment[0])
print(predictedSentiment[1])

if (predictedSentiment[0] > predictedSentiment[1]):
    print("Positive Sentiment")
else:
    print("Negative Sentiment")

NameError: name 'sess' is not defined

In [67]:
secondInputText = "The movie was not excellent"
secondInputMatrix = getSentenceMatrix(secondInputText)

In [68]:
predictedSentiment = sess.run(prediction, {input_data: secondInputMatrix})[0]

print(predictedSentiment[0])
print(predictedSentiment[1])

if (predictedSentiment[0] > predictedSentiment[1]):
    print("Positive Sentiment")
else:
    print("Negative Sentiment")

2.06147
-3.83645
Positive Sentiment
