In [1]:
import numpy as np
import sys
from collections import Counter
from keras.models import Sequential
from keras.layers import Dense, Activation, Input

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
def print_review_and_label(i):
    """Print the label of sample ``i`` and the first 80 characters of its review."""
    label_text = labels[i]
    review_preview = reviews[i][:80]
    print("\t:\t".join((label_text, review_preview)) + "...")

# Load the raw review texts, one review per line.
# `with` guarantees the file is closed even if reading fails part-way.
with open('reviews.txt','r') as g: # What we know!
    # rstrip('\n') removes only the line terminator, so a final line that has
    # no trailing newline keeps its last character (x[:-1] would chop it off).
    reviews = np.array([line.rstrip('\n') for line in g])

# Load the sentiment labels, upper-cased; labels[i] is used together with
# reviews[i], so both files are expected to have matching line order.
with open('labels.txt','r') as g: # What we WANT to know!
    labels = np.array([line.rstrip('\n').upper() for line in g])

In [3]:
# Report how many reviews were loaded.
print('Total Reviews:  {}'.format(len(reviews)))

Total Reviews:  25000


In [4]:
# Show one randomly chosen review together with its sentiment label.
print('Example: ')
i = np.random.randint(len(reviews))
print('Review: ', reviews[i], 'Sentiment: ', labels[i], sep='\n')

Example: 
Review: 
whoever says pokemon is stupid can die . this movie is superlative . i even shead a tear when celebei died . i don  t cry much  this film is a touching animated thriller .  br    br   in this fourth installment of pokemon  ash and friends must stop the bad jerk from making celebei the ultimate evil weapon with his dark ball . in the time  sam and celebei travel through time and continuously are hunted by game hunters . i like the part with the double battle and sam has the apricorn pokeball  if you  ve played pokemon gold  silver  or crystal  you know what it is .   br    br   i also enjoyed having miramax in charge instead of warner brothers . putting the mini movie at the end was a great idea . the pokemon in this movie come to life more than ever .  
Sentiment: 
POSITIVE


In [5]:
# Print a header and then a few hand-picked samples as "label : review preview".
print("labels.txt \t : \t reviews.txt\n")
for sample_idx in (21934, 5297, 4998):
    print_review_and_label(sample_idx)

labels.txt 	 : 	 reviews.txt

POSITIVE	:	excellent episode movie ala pulp fiction .  days   suicides . it doesnt get more...
NEGATIVE	:	if you haven  t seen this  it  s terrible . it is pure trash . i saw this about ...
POSITIVE	:	this schiffer guy is a real genius  the movie is of excellent quality and both e...


In [6]:
# Token -> number of occurrences; starts empty and is filled by the counting loop over all reviews.
total_counts = Counter()

In [7]:
# Tally every space-separated token of every review into total_counts.
for review_text in reviews:
    total_counts.update(review_text.split(" "))

In [13]:
# The vocabulary is the set of distinct tokens seen while counting.
vocab = set(total_counts)
vocab_size = len(vocab)
print('Vocabulary size:  {}'.format(vocab_size))

Vocabulary size:  74074


In [14]:
# Map every vocabulary word to a unique integer index in [0, len(vocab)).
# Iterating a set of strings directly yields an order that can differ between
# interpreter runs (string hashing is randomized), which would silently change
# which index belongs to which word after a kernel restart. Sorting first makes
# the mapping deterministic.
word2index = {word: i for i, word in enumerate(sorted(vocab))}

In [15]:
# One row per review: X has one column per vocabulary word, y a single column.
X = np.zeros(shape=(len(reviews), vocab_size), dtype=np.float64)
y = np.zeros(shape=(len(reviews), 1), dtype=np.float64)

In [16]:
# Fill X with per-review word counts and y with the binary sentiment target.
for i, review in enumerate(reviews):
    for word in review.split(" "):
        X[i, word2index[word]] += 1
    # The target depends only on the review's label, not on its words, so it is
    # set once per review instead of being re-assigned for every token.
    y[i, 0] = 1 if labels[i] == 'POSITIVE' else 0


In [17]:
# Convert the targets to integers and spot-check the first two of them.
y = np.asarray(y, dtype=int)
print(y[0, 0], y[1, 0], sep='\n')

1
0


In [18]:
def shuffle(x,y):
    """Shuffle the rows of ``x`` and ``y`` in unison, in place.

    The same random permutation is applied along the first axis of both
    arrays, so row ``k`` of ``x`` and row ``k`` of ``y`` stay paired.

    Parameters
    ----------
    x, y : numpy.ndarray
        Arrays with the same length along axis 0. Both are modified in place.

    Returns
    -------
    tuple
        ``(x, y)`` - the same array objects that were passed in.

    Raises
    ------
    ValueError
        If ``x`` and ``y`` have a different number of rows.
    """
    if x.shape[0] != y.shape[0]:
        raise ValueError(
            "x and y must have the same number of rows, got %d and %d"
            % (x.shape[0], y.shape[0])
        )
    # np.random.shuffle permutes its argument in place and returns None, so its
    # return value cannot be used as an index array; draw the permutation
    # explicitly instead.
    perm = np.random.permutation(x.shape[0])
    # Fancy indexing builds the reordered copy first, then writes it back into
    # the original buffers so callers holding references see the shuffle too.
    x[:] = x[perm]
    y[:] = y[perm]
    return x,y
# Reorder the samples before the fixed index-based train/validation/test slices are taken.
X, y = shuffle(X,y)

In [23]:
# Two dense layers: a 10-unit ReLU hidden layer fed by the vocab_size-wide
# input, followed by a single sigmoid output unit.
init_kwargs = dict(kernel_initializer='TruncatedNormal', bias_initializer='zeros')
model = Sequential()
model.add(Dense(10, input_dim=vocab_size, activation='relu', **init_kwargs))
model.add(Dense(1, activation='sigmoid', **init_kwargs))
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 10)                740750    
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 11        
Total params: 740,761
Trainable params: 740,761
Non-trainable params: 0
_________________________________________________________________


In [24]:
# Binary cross-entropy loss with the Adam optimizer; report accuracy as a metric.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [25]:
# Fit on the first 15000 samples and validate on the following 5000.
hist = model.fit(
    x=X[:15000],
    y=y[:15000],
    batch_size=128,
    epochs=6,
    validation_data=(X[15000:20000], y[15000:20000]),
)

Train on 15000 samples, validate on 5000 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [26]:
# Evaluate on the training slice, the validation slice and the remaining samples.
score = model.evaluate(X[:15000], y[:15000])
score2 = model.evaluate(X[15000:20000], y[15000:20000])
score3 = model.evaluate(X[20000:],y[20000:])

# Index 1 of each result is the accuracy metric; print it as a percentage.
for split_name, split_score in (("Train", score), ("Validation", score2), ("Test", score3)):
    print(split_name + " Accuracy: " + str(split_score[1]*100) + "%")

Train Accuracy: 99.48666666666666%
Validation Accuracy: 95.02000000000001%
Test Accuracy: 96.2%


In [27]:
#Testing
def predict(rev):
    """Classify a single review string and print the verdict.

    The review is split on single spaces and turned into a 1 x vocab_size
    word-count vector using ``word2index``; the model output is rounded and
    the review is printed followed by ``': Positive'`` or ``': Negative'``.

    Words that are not present in ``word2index`` are ignored: they have no
    column in the feature vector, and looking them up directly would raise
    ``KeyError`` for any input containing an unseen word.

    Parameters
    ----------
    rev : str
        Review text, tokens separated by single spaces.
    """
    te = np.zeros([1,vocab_size])
    for word in rev.split(" "):
        idx = word2index.get(word)
        if idx is None:
            # Out-of-vocabulary token: nothing to count.
            continue
        te[0, idx] += 1
    # Round the model output to 0/1 and drop the singleton dimensions.
    pred = np.squeeze(np.around(model.predict(te)))
    if pred == 1: print(rev+': Positive')
    else: print(rev+': Negative')

# Try the classifier on a few short hand-written reviews.
for sample_review in (
    'brilliantly pathetic movie',
    'awesome film',
    'could have better screenplay however lacks in drama',
    'screenplay was good but story was written badly',
    'good story and screenplay overall a good movie',
):
    predict(sample_review)

brilliantly pathetic movie: Negative
awesome film: Positive
could have better screenplay however lacks in drama: Negative
screenplay was good but story was written badly: Negative
good story and screenplay overall a good movie: Positive
