In [1]:
from __future__ import division
from __future__ import unicode_literals
from __future__ import print_function

import numpy as np

# dataset url: 'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'

In [9]:
# NOTE(review): `sc` is not created in any visible cell; presumably it is the
# SparkContext injected by the PySpark kernel/shell — confirm before relying on
# Restart & Run All.
sc.master

u'local[*]'

# Tokenizer

In [4]:
import re

# A token is either a run of ASCII letters or one punctuation mark from !?.:,()
TOKEN_REGEX = re.compile(r'[A-Za-z]+|[!?.:,()]')


def tokenize(text):
    """Return the lowercase tokens found in ``text``.

    Tokens are the successive matches of ``TOKEN_REGEX``: runs of ASCII
    letters and the single punctuation characters ``! ? . : , ( )``.
    Anything else (digits, whitespace, other symbols) is skipped.
    """
    return [match.group(0).lower() for match in TOKEN_REGEX.finditer(text)]

In [5]:
tokenize("I love you!")

[u'i', u'love', u'you', u'!']

# Read data

In [7]:
# One RDD per split (train/test) and polarity (pos/neg); each glob selects every
# file in the corresponding directory.
# NOTE(review): later cells turn every RDD element into one labelled review, which
# presumes each review file holds a single line of text — confirm against the dataset.
train_pos_rdd = sc.textFile('aclimdb/train/pos/*')
train_neg_rdd = sc.textFile('aclimdb/train/neg/*')
test_pos_rdd = sc.textFile('aclimdb/test/pos/*')
test_neg_rdd = sc.textFile('aclimdb/test/neg/*')

In [8]:
from pyspark.sql import Row

# Tokenize every raw review and attach its label: 1 = positive, 0 = negative.
train_pos_df = train_pos_rdd.map(
    lambda line: Row(text=tokenize(line), sentiment=1)
).toDF()
train_neg_df = train_neg_rdd.map(
    lambda line: Row(text=tokenize(line), sentiment=0)
).toDF()

# Stack both polarities into a single labelled training frame.
train_df = train_pos_df.union(train_neg_df)

# Vocabulary

In [71]:
top_words_n = 5000

# Word count over every training token, keeping the (word, count) pairs with the
# highest counts. Only top_words_n - 1 pairs are taken; the remaining vocabulary
# slot is used by the '<UNK>' entry added when the vocabulary list is built.
top_words = (
    train_df.rdd
    .flatMap(lambda row: row.text)
    .map(lambda word: (word, 1))
    .reduceByKey(lambda left, right: left + right)
    .takeOrdered(top_words_n - 1, key=lambda pair: -pair[1])
)

In [72]:
# The unknown-word placeholder comes first, followed by the most frequent words.
vocabs = ['<UNK>']
vocabs += [word for word, _count in top_words[:top_words_n - 1]]

In [73]:
# Vocabulary ids start at 1, so '<UNK>' (first in `vocabs`) gets id 1.
# NOTE(review): id 0 is presumably kept free for sequence padding — confirm.
start_index = 1

# Forward (word -> id) and reverse (id -> word) lookup tables.
word_indices = dict((word, idx) for idx, word in enumerate(vocabs, start_index))
index_words = dict(enumerate(vocabs, start_index))

In [74]:
len(index_words)

5000

In [75]:
# Sort before slicing: a dict's items cannot be sliced on Python 3, and dict
# ordering is not guaranteed, so this reliably shows the five lowest ids.
sorted(index_words.items())[:5]

[(1, u'<UNK>'), (2, u'the'), (3, u'.'), (4, u','), (5, u'and')]

In [76]:
from pyspark.sql.functions import udf, col
from pyspark.sql.types import *

# Broadcast the vocabulary; the UDF reads it through `b_word_indices.value`.
b_word_indices = sc.broadcast(word_indices)

def encode_words(words):
    """Map a list of tokens to their vocabulary ids.

    Tokens missing from the vocabulary are encoded as 1, the id assigned to
    '<UNK>'. A null input (the `text` column is nullable) yields None instead
    of raising inside the UDF.
    """
    if words is None:
        return None
    lookup = b_word_indices.value  # fetch the dict once, not once per token
    return [lookup.get(w, 1) for w in words]

encode_words_udf = udf(encode_words, ArrayType(IntegerType()))

In [77]:
# Add an `indices` column holding each review's tokens mapped to vocabulary ids.
encoded_train_df = train_df.withColumn('indices', encode_words_udf(col('text')))

In [78]:
encoded_train_df.show(5)

+---------+--------------------+--------------------+
|sentiment|                text|             indices|
+---------+--------------------+--------------------+
|        1|[bromwell, high, ...|[1, 316, 9, 6, 10...|
|        1|[homelessness, (,...|[1, 25, 48, 1, 18...|
|        1|[brilliant, over,...|[527, 127, 121, 4...|
|        1|[this, is, easily...|[14, 9, 701, 2, 9...|
|        1|[this, is, not, t...|[14, 9, 29, 2, 78...|
+---------+--------------------+--------------------+
only showing top 5 rows



In [79]:
encoded_train_df.printSchema()

root
 |-- sentiment: long (nullable = true)
 |-- text: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- indices: array (nullable = true)
 |    |-- element: integer (containsNull = true)



In [80]:
# Bring the encoded reviews to the driver in one pass. The previous
# reduce-with-list-concatenation re-copied the growing lists at every step
# (quadratic in the number of reviews); collect() returns the same rows once.
train_rows = encoded_train_df.select('indices', 'sentiment').collect()
train_reviews = {
    'features': [row.indices for row in train_rows],
    'labels': [row.sentiment for row in train_rows],
}

In [97]:
from keras.preprocessing import sequence

# Bring every review to exactly 500 ids (the input length the model expects)
# and turn the labels into a NumPy array so both can be fancy-indexed below.
train_reviews['features'] = sequence.pad_sequences(train_reviews['features'], maxlen=500)
train_reviews['labels'] = np.array(train_reviews['labels'])

# Shuffle features and labels with one shared permutation. Its size comes from
# the data rather than a hard-coded 25000, so a different number of reviews
# neither drops samples nor indexes out of range.
indices = np.random.permutation(len(train_reviews['labels']))
train_reviews['features'] = train_reviews['features'][indices]
train_reviews['labels'] = train_reviews['labels'][indices]

# Train with Keras Model 

In [100]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence

model = Sequential()
# 5001 embedding rows: vocabulary ids 1..5000 plus id 0; 500 matches the padded length.
model.add(Embedding(5001, 32, input_length=500))
model.add(LSTM(output_dim=100, return_sequences=False))
model.add(Dense(1, activation='sigmoid')) # positive or negative -> 1 or 0
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in print()
# only appended a stray "None" to the output.
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_4 (Embedding)          (None, 500, 32)       160032      embedding_input_4[0][0]          
____________________________________________________________________________________________________
lstm_4 (LSTM)                    (None, 100)           53200       embedding_4[0][0]                
____________________________________________________________________________________________________
dense_4 (Dense)                  (None, 1)             101         lstm_4[0][0]                     
Total params: 213,333
Trainable params: 213,333
Non-trainable params: 0
____________________________________________________________________________________________________
None


In [101]:
# Train for 3 epochs on the shuffled training set in mini-batches of 64.
# The returned History object is echoed as the cell output.
model.fit(train_reviews['features'] , train_reviews['labels'], nb_epoch=3, batch_size=64)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x123c0fd10>

# Testing data 

In [115]:
# Same preprocessing as the training split: tokenize, label, encode, pad.
test_pos_df = test_pos_rdd.map(lambda l: Row(text=tokenize(l), sentiment=1)).toDF()
test_neg_df = test_neg_rdd.map(lambda l: Row(text=tokenize(l), sentiment=0)).toDF()
test_df = test_pos_df.union(test_neg_df)

encoded_test_df = test_df.withColumn('indices', encode_words_udf(col('text')))

# Collect once instead of reducing with list concatenation, which re-copied the
# growing lists at every step (quadratic in the number of reviews).
test_rows = encoded_test_df.select('indices', 'sentiment').collect()
test_reviews = {
    'features': [row.indices for row in test_rows],
    'labels': [row.sentiment for row in test_rows],
}

test_reviews['features'] = sequence.pad_sequences(test_reviews['features'], maxlen=500)
test_reviews['labels'] = np.array(test_reviews['labels'])

# Permutation size is derived from the data rather than hard-coded to 25000.
indices = np.random.permutation(len(test_reviews['labels']))
test_reviews['features'] = test_reviews['features'][indices]
test_reviews['labels'] = test_reviews['labels'][indices]

In [116]:
# Evaluate on the held-out test reviews. The model was compiled with
# metrics=['accuracy'], so scores[0] is the loss and scores[1] the accuracy.
scores = model.evaluate(test_reviews['features'], test_reviews['labels'], verbose=1)
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 86.43%


# Train further, one batch at a time

In [161]:
def review_batches(reviews, batch_size):
    """Endlessly yield shuffled ``(features, labels)`` mini-batches.

    Args:
        reviews: dict with NumPy arrays under 'features' and 'labels', aligned
            on their first axis.
        batch_size: number of reviews per yielded batch.

    Yields:
        Tuples ``(features_batch, labels_batch)`` of exactly ``batch_size``
        rows each. The data is reshuffled at the start of every pass and any
        remainder smaller than ``batch_size`` is dropped for that pass.

    Raises:
        ValueError: on first iteration, if ``batch_size`` is not positive or
            exceeds the number of reviews (no full batch could be produced
            and the loop would otherwise spin forever).
    """
    # Derive the size from the data; the name used to be an undefined global.
    review_size = len(reviews['labels'])
    n_batch = review_size // batch_size if batch_size > 0 else 0
    if n_batch < 1:
        raise ValueError(
            'batch_size must be between 1 and the number of reviews (%d), got %r'
            % (review_size, batch_size))

    while True:
        order = np.random.permutation(review_size)
        for start in range(0, n_batch * batch_size, batch_size):
            # Index per batch so features and labels stay aligned without
            # materialising a shuffled copy of the full arrays.
            batch = order[start:start + batch_size]
            yield reviews['features'][batch], reviews['labels'][batch]

# NOTE(review): the batches are drawn from `test_reviews`, and a later cell passes
# one of them to model.train_on_batch — i.e. the model is further trained on test
# data. Confirm this is intended as an API demo only; otherwise feed train_reviews.
next_reviews = review_batches(test_reviews, 32)

In [162]:
X_batch, y_batch = next(next_reviews)
X_batch, y_batch

(array([[  0,   0,   0, ...,  15,  57,   3],
        [  0,   0,   0, ...,  79,   1,   3],
        [ 50,  96,   4, ...,  15, 198,   3],
        ..., 
        [  0,   0,   0, ..., 179,  64,   3],
        [  0,   0,   0, ...,   7, 681,   3],
        [  0,   0,   0, ..., 345,   1,   3]], dtype=int32),
 array([1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0,
        0, 1, 1, 0, 1, 0, 1, 1, 0]))

In [159]:
# One gradient update on the single batch drawn above; prints that batch's
# loss and accuracy.
loss, acc = model.train_on_batch(X_batch, y_batch)
print(loss, acc)

0.540145 0.78125


In [160]:
# Scores the same X_batch/y_batch the model was just updated on, so these
# numbers are not a held-out measurement.
loss, acc = model.test_on_batch(X_batch, y_batch)
print(loss, acc)

0.522179 0.8125


# Predict text

In [150]:
def predict_sentiment(sentence):
    """Classify a raw sentence with the trained model.

    The sentence is tokenized, encoded with ``word_indices`` (unknown words
    map to id 1), padded to 500 ids and scored by ``model``.

    Returns:
        A ``(score, label)`` tuple where ``score`` is the model's sigmoid
        output and ``label`` is 'positive' when the score exceeds 0.5,
        otherwise 'negative'.
    """
    token_ids = [word_indices.get(token, 1) for token in tokenize(sentence)]
    # Shape the ids as a single-row batch before padding.
    single_row = np.array(token_ids).reshape(1, -1)
    padded = sequence.pad_sequences(single_row, maxlen=500)
    score = model.predict_on_batch(padded)[0, 0]
    if score > 0.5:
        return (score, 'positive')
    return (score, 'negative')

In [151]:
predict_sentiment('I love it! I really like this movie.')

(0.76548088, u'positive')

In [152]:
predict_sentiment('It is a bad movie! I do not like it!')

(0.47480059, u'negative')