# Sentiment Analysis

This notebook documents some of my (imitative) experiences trying to implement and understand neural network approaches to sentiment analysis.

First i'm going to try implement a CNN using Keras and Tensorflow


In [61]:
import numpy as np
import sklearn
from keras.datasets import imdb
from matplotlib import pyplot

#keras utilities
from keras.preprocessing import sequence
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.embeddings import Embedding

# fix random seed for reproducibility
seed = 7
numpy.random.seed(seed)

# import and register tensorflow session #
import tensorflow as tf
sess = tf.Session()

# import keras and register with tensorflow session #
from keras import backend as K
K.set_session(sess)

In [62]:
(X_train, y_train), (X_test, y_test) = imdb.load_data()

# concat together test and train sets for full set of data/labels #
X = numpy.concatenate((X_train, X_test), axis=0)
y = numpy.concatenate((y_train, y_test), axis=0)

In [63]:
# summarize size
print("Training data: ")
print(X.shape)
print(y.shape)

Training data: 
(50000,)
(50000,)


In [64]:
imdb.load_data(num_words=5000)

((array([ list([1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 2, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 2, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 2, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 2, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 2, 19, 178, 32]),
         list([1, 194, 1153, 194, 2, 78, 228, 5, 6, 1463, 

In [65]:
X_train = sequence.pad_sequences(X_train, maxlen=500)
X_test = sequence.pad_sequences(X_test, maxlen=500)

In [66]:
# load the dataset but only keep the top n words, zero the rest
top_words = 5000
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words)

In [78]:
# Pad vectors < 500 words with zeros, else truncate
max_words = 500
X_train = sequence.pad_sequences(X_train, maxlen=max_words)
X_test = sequence.pad_sequences(X_test, maxlen=max_words)

print(X_train)

[[   0    0    0 ...,   19  178   32]
 [   0    0    0 ...,   16  145   95]
 [   0    0    0 ...,    7  129  113]
 ..., 
 [   0    0    0 ...,    4 3586    2]
 [   0    0    0 ...,   12    9   23]
 [   0    0    0 ...,  204  131    9]]


In [70]:
# create the model
model = Sequential()
model.add(Embedding(top_words, 32, input_length=max_words))
model.add(Flatten())
model.add(Dense(250, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 500, 32)           160000    
_________________________________________________________________
flatten_4 (Flatten)          (None, 16000)             0         
_________________________________________________________________
dense_3 (Dense)              (None, 250)               4000250   
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 251       
Total params: 4,160,501
Trainable params: 4,160,501
Non-trainable params: 0
_________________________________________________________________
None


In [71]:
# Fit the model
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=2, batch_size=128, verbose=2)
# Final evaluation of the model
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Train on 25000 samples, validate on 25000 samples
Epoch 1/2
41s - loss: 0.5726 - acc: 0.6542 - val_loss: 0.3268 - val_acc: 0.8620
Epoch 2/2
40s - loss: 0.2270 - acc: 0.9108 - val_loss: 0.2857 - val_acc: 0.8803
Accuracy: 88.03%


Interesting to see at about what frequency index unambiguously domain specific terms take over. I imagine the more varied datasets integrated into the model, the more generalised terms are pushed to the top of the frequency indexes

In [127]:
# Test string to measure sentiment #
test_str = 'Hmm, maybe it’s time to finally learn a framework, Roger thought. He had been dabbling in JavaScript for a few months. Why not see what all the framework fuss was about? After skimming the article — which included words like “intuitive” and “flexible” and “versatile” — Roger was sold.'

# We need to get the word index data to convert the freq indexes back to the assosciated word #
word_index = imdb.get_word_index()

# convert to frequency representation by word index #
test_encoded = [[word_index[w] for w in test_str if w in word_index]]

print(test_encoded)

[[1980, 1980, 1980, 3, 5132, 500, 960, 10, 827, 63164, 587, 827, 10, 1980, 960, 827, 1601, 1206, 10, 3360, 3, 2011, 2011, 5132, 2011, 960, 3, 1476, 3360, 3, 1206, 1476, 3, 1980, 960, 1989, 1601, 1476, 2292, 1601, 1328, 960, 1476, 827, 2020, 1601, 1203, 1328, 2020, 827, 960, 2020, 3, 1092, 500, 960, 960, 3360, 1092, 3, 500, 500, 2011, 10, 3360, 1328, 10, 3360, 3, 1961, 3, 1145, 1476, 10, 1654, 827, 1206, 1601, 1476, 3, 1206, 960, 1989, 1980, 1601, 3360, 827, 2020, 587, 2020, 5132, 3360, 1601, 827, 587, 960, 960, 1989, 2020, 3, 827, 3, 2011, 2011, 827, 2020, 960, 1206, 1476, 3, 1980, 960, 1989, 1601, 1476, 2292, 1206, 1203, 587, 587, 1989, 3, 587, 3, 500, 1601, 1203, 827, 1206, 827, 960, 1476, 587, 2292, 10, 1980, 1980, 10, 3360, 1328, 827, 2020, 960, 3, 1476, 827, 10, 1145, 2011, 960, 1989, 2020, 10, 1145, 2020, 10, 3360, 1145, 2011, 1203, 1092, 960, 1092, 1989, 1601, 1476, 1092, 587, 2011, 10, 2292, 960, 10, 3360, 827, 1203, 10, 827, 10, 1961, 960, 3, 3360, 1092, 1206, 2011, 960, 1742,

In [128]:
# Pad the sequence to make it an equal length array #
test_padded = sequence.pad_sequences(test_encoded, maxlen=max_words)

print(test_padded)

[[    0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0 

In [129]:
# Flatten the array #
vector = np.array([test_padded.flatten()])
print(vector)

# replace word encodings out of range with 0 #
vector[vector > 5000] = 0

#it was already flat #

[[    0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0 

In [130]:
print(model)

<keras.models.Sequential object at 0x7f0c1335e080>


In [131]:
# Predict the sentiment value of the vector-representation of our phrase #
model.predict(vector)

array([[ 0.97481537]], dtype=float32)

In [124]:
# Test string to measure sentiment #
test_str = 'I could not keep pace with all these literary folk as they glanced from subject to subject and entered into deep dispute, or made conversation sparkle with epigrams and happy witticisms.'

# We need to get the word index data to convert the freq indexes back to the assosciated word #
word_index = imdb.get_word_index()

# convert to frequency representation by word index #
test_encoded = [[word_index[w] for w in test_str if w in word_index]]

# Pad the sequence to make it an equal length array #
test_padded = sequence.pad_sequences(test_encoded, maxlen=max_words)

# Flatten the array #
vector = np.array([test_padded.flatten()])
print(vector)

# replace word encodings out of range with 0 #
vector[vector > 5000] = 0

#it was already flat #

[[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0   

In [125]:
# Predict the sentiment value of the vector-representation of our phrase #
model.predict(vector)

array([[ 0.97321588]], dtype=float32)

In [120]:
# Test string to measure sentiment #
test_str = 'bad angry mean rude'
# We need to get the word index data to convert the freq indexes back to the assosciated word #
word_index = imdb.get_word_index()

# convert to frequency representation by word index #
test_encoded = [[word_index[w] for w in test_str if w in word_index]]

# Pad the sequence to make it an equal length array #
test_padded = sequence.pad_sequences(test_encoded, maxlen=max_words)

# Flatten the array #
vector = np.array([test_padded.flatten()])
print(vector)

# replace word encodings out of range with 0 #
vector[vector > 5000] = 0

#it was already flat #

[[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0   

In [122]:
# Predict the sentiment value of the vector-representation of our phrase #
model.predict(vector)

array([[ 0.62631774]], dtype=float32)

the sentiment for custom excerpts seems to be really ambiguous. We should try rolling it on an actual dataset and comparing it to the output of google's sentiment engine