---
### Load in and visualize the data

In [1]:
import numpy as np

# Load the raw corpus: the full review text and the full label text, each as one string
with open('data/reviews.txt', 'r') as reviews_file:
    reviews = reviews_file.read()

with open('data/labels.txt', 'r') as labels_file:
    labels = labels_file.read()

In [3]:
from string import punctuation

# Standardize the corpus: lowercase everything, then delete every punctuation character
reviews = reviews.lower()
all_text = reviews.translate(str.maketrans('', '', punctuation))

# One entry per newline-separated chunk; this per-review view is tokenized later
reviews_split = all_text.split('\n')

# Re-join with spaces so the whole corpus can be split into a flat list of words
all_text = ' '.join(reviews_split)
words = all_text.split()

In [5]:
# feel free to use this import 
from collections import Counter

## Build a dictionary that maps words to integers.
## Ids start at 1 and follow descending word frequency, so no word is assigned id 0.
counts = Counter(words)
vocab = sorted(counts, key=lambda word: counts[word], reverse=True)
vocab_to_int = dict(zip(vocab, range(1, len(vocab) + 1)))

## Tokenize each review in reviews_split with the dict;
## the tokenized reviews are stored in reviews_ints
reviews_ints = [
    [vocab_to_int[word] for word in review.split()]
    for review in reviews_split
]

In [7]:
# Label conversion: 'positive' becomes 1, every other entry becomes 0
labels_split = labels.split('\n')
encoded_labels = np.array([int(label == 'positive') for label in labels_split])

In [8]:
# Outlier statistics: review_lens maps a token count to how many reviews have that length
review_lens = Counter(map(len, reviews_ints))

zero_length_count = review_lens[0]
print("Zero-length reviews: {}".format(zero_length_count))

# iterating a Counter yields its keys, so max() returns the largest observed length
longest_review = max(review_lens)
print("Maximum review length: {}".format(longest_review))

Zero-length reviews: 1
Maximum review length: 2514


In [9]:
print('Number of reviews before removing outliers: ', len(reviews_ints))

## Drop every review with no tokens, together with its label, so both stay aligned.

# positions of the reviews that contain at least one token
non_zero_idx = [idx for idx, tokens in enumerate(reviews_ints) if tokens]

# keep only those positions in the labels and in the reviews
encoded_labels = np.array([encoded_labels[idx] for idx in non_zero_idx])
reviews_ints = [reviews_ints[idx] for idx in non_zero_idx]

print('Number of reviews after removing outliers: ', len(reviews_ints))

Number of reviews before removing outliers:  25001
Number of reviews after removing outliers:  25000


In [10]:
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [15]:
# Build the feature matrix: one row per review, seq_length columns per row.

seq_length = 200

# padding='pre' with value=0.0 puts the filler zeros in front of the review tokens
features = pad_sequences(
    reviews_ints,
    maxlen=seq_length,
    padding='pre',
    value=0.0,
)

## test statements - do not change - ##
assert len(features)==len(reviews_ints), "Your features should have as many rows as reviews."
assert len(features[0])==seq_length, "Each feature row should contain seq_length values."

# uncomment to inspect the first 10 values of the first 30 rows
#print(features[:30,:10])


In [55]:
## split data into training, validation, and test data (features and labels, x and y)

from sklearn.model_selection import train_test_split

# fraction of the samples kept for training; the remainder becomes the test set
split_frac = 0.8

## Split features and labels (x and y) into training and test sets.
## No separate validation arrays are created in this cell.
train_x, test_x, train_y, test_y = train_test_split(
    features,
    encoded_labels,
    test_size=1 - split_frac,
    random_state=1,
)

## print out the shapes of the resulting feature arrays
for split_features in (train_x, test_x):
    print(split_features.shape)

(20000, 200)
(5000, 200)


In [56]:
# display the dimensions of the training feature array
train_x.shape

(20000, 200)

In [57]:
from keras.layers import LSTM, Input, Dense, Dropout, Embedding
from keras.models import Sequential
from keras.layers import Activation
from keras.optimizers import Adam
from keras.losses import binary_crossentropy

# Model architecture: Embedding -> Dropout -> LSTM -> single sigmoid output unit.
embed_dim = 128   # output dimension passed to the Embedding layer
lstm_out = 196    # number of units passed to the LSTM layer
# Word ids run from 1 to len(vocab_to_int) and 0 is used as the padding value,
# so the embedding needs len(vocab_to_int) + 1 rows.
max_features = len(vocab_to_int)+1
model = Sequential()
model.add(Embedding(max_features, embed_dim, input_length = train_x.shape[1]))
model.add(Dropout(0.4))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss = 'binary_crossentropy', optimizer='adam',metrics = ['accuracy'])
# summary() prints the table itself and returns None; wrapping it in print()
# emitted a stray "None" line after the table.
model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, 200, 128)          9481344   
_________________________________________________________________
dropout_9 (Dropout)          (None, 200, 128)          0         
_________________________________________________________________
lstm_9 (LSTM)                (None, 196)               254800    
_________________________________________________________________
dense_9 (Dense)              (None, 1)                 197       
Total params: 9,736,341
Trainable params: 9,736,341
Non-trainable params: 0
_________________________________________________________________
None


In [60]:
# Train for 4 epochs with batches of 32, passing validation_split=0.2 for validation data
model.fit(
    train_x,
    train_y,
    batch_size=32,
    epochs=4,
    validation_split=0.2,
)

Train on 16000 samples, validate on 4000 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x1a5db30978>

In [128]:
# Evaluate on the test split; index 1 of the result is reported as the accuracy
scores = model.evaluate(test_x, test_y, verbose=0)
test_accuracy = scores[1]
print('Test accuracy:', test_accuracy)

Test accuracy: 0.8564


In [143]:
# sample review text that is tokenized and passed to the model below
test_str = 'this movie is one of the best I have ever seen'
from string import punctuation

def tokenize_review(test_review, vocab=None):
    """Convert a raw review string into a one-element batch of word ids.

    The text is lowercased and stripped of punctuation, then split on
    whitespace. Words that are not in the vocabulary are skipped rather than
    raising ``KeyError``, so arbitrary unseen text can be tokenized.

    Args:
        test_review: The review text to tokenize.
        vocab: Optional mapping from word to integer id. When omitted, the
            notebook-level ``vocab_to_int`` dictionary is used.

    Returns:
        A list holding a single list of integer ids, i.e. ``[[id, id, ...]]``.
        The inner list is empty when no word of the review is in the vocabulary.
    """
    if vocab is None:
        vocab = vocab_to_int

    # lowercase, then get rid of punctuation
    cleaned = ''.join(c for c in test_review.lower() if c not in punctuation)

    # split on whitespace and keep only words the vocabulary knows
    known_ids = [vocab[word] for word in cleaned.split() if word in vocab]

    # wrap in an outer list so the result is a batch containing one sequence
    return [known_ids]


In [144]:
test_ints = tokenize_review(test_str)
# pad the review to seq_length, the same row width used for the training features
test_ints = pad_sequences(test_ints, maxlen=seq_length, dtype='int32', value=0)

In [145]:
# take the model output for the first (only) row of the batch and threshold it at 0.5
first_prediction = model.predict(test_ints)[0]
sentiment = first_prediction > 0.5

In [146]:
# report the predicted class for the sample review
print("Positive" if sentiment else "Negative")

Positive
