In [1]:
import pandas as pd
import numpy as np

import os

from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, GRU
from keras.layers.embeddings import Embedding
from keras.initializers import Constant

import string
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

import gensim

Using TensorFlow backend.


In [2]:
# Read the review dataset from the local 'imdb_data' CSV file,
# then display ten randomly chosen rows as a quick sanity check.
df = pd.read_csv(
    'imdb_data',
    encoding='utf-8',
)
df.sample(n=10)

Unnamed: 0,review,sentiment
26910,I found this episode to be one of funniest I'v...,1
1529,"This movie has it all. Great actors, good dial...",1
7020,I'm Egyptian. I have a green card. I have been...,1
4738,THE CELL fascinated me at first glance. I was ...,1
35164,"Crush provides a combination of drama, humor a...",1
12690,"Like a lot of the comments above me, also I th...",0
24490,Wow -- this movie was really bad! You talk abo...,0
6722,"Nazarin is some kind of saint,he wants to live...",1
27333,It is not every film's job to stimulate you su...,1
7765,"I absolutely adored this movie. For me, the be...",1


In [3]:
#creating train and test datasets
X_train = df.loc[:34999, 'review'].values
y_train = df.loc[:34999, 'sentiment'].values
X_test = df.loc[15000:, 'review'].values
y_test = df.loc[15000:, 'sentiment'].values

DATA PREPARATION

In [None]:
#word embeddings with Keras Tokenizer
tokenizer = Tokenizer()

total_reviews = X_train + X_test
tokenizer.fit_on_texts(total_reviews)

#pad sequences
review_lengths = [len(s.split()) for s in total_reviews]
#pad_length = int(np.mean(review_lengths))
pad_length = 100

#define vocabulary size
vocab_size = len(tokenizer.word_index)+1

X_train_tokens = tokenizer.texts_to_sequences(X_train)
X_test_tokens = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_tokens, maxlen=pad_length, padding='post')
X_test_pad = pad_sequences(X_test_tokens, maxlen=pad_length, padding='post')


#embedding layer
embedding_dim =100 #dimension of vector
Embedding(vocab_size, embedding_dim, input_length=pad_length)

<keras.layers.embeddings.Embedding at 0x1fa5d2921d0>

BUILDING MODEL

In [None]:
# Build the classifier one layer at a time:
#   1. an Embedding layer over the tokenizer vocabulary,
#   2. a single GRU layer with 32 units and dropout on inputs and recurrent state,
#   3. a one-unit sigmoid output for the binary sentiment label.
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_length=pad_length))
model.add(GRU(units=32, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

# Binary cross-entropy loss with the Adam optimizer; accuracy is reported as a metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 100, 100)          12613700  
_________________________________________________________________
gru_1 (GRU)                  (None, 32)                12768     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 33        
Total params: 12,626,501
Trainable params: 12,626,501
Non-trainable params: 0
_________________________________________________________________


TRAINING MODEL

In [None]:
# Fit the model for 10 epochs with batches of 128 reviews, using the padded
# test arrays as validation data. verbose=2 selects the compact log format.
model.fit(
    X_train_pad,
    y_train,
    batch_size=128,
    epochs=10,
    verbose=2,
    validation_data=(X_test_pad, y_test),
)

Instructions for updating:
Use tf.cast instead.
Train on 35000 samples, validate on 35000 samples
Epoch 1/10
 - 93s - loss: 0.5232 - acc: 0.7411 - val_loss: 0.3980 - val_acc: 0.8347
Epoch 2/10
 - 89s - loss: 0.3379 - acc: 0.8628 - val_loss: 0.3634 - val_acc: 0.8519
Epoch 3/10
 - 92s - loss: 0.2442 - acc: 0.9070 - val_loss: 0.3441 - val_acc: 0.8643
Epoch 4/10
 - 92s - loss: 0.1850 - acc: 0.9326 - val_loss: 0.3554 - val_acc: 0.8713
Epoch 5/10


In [None]:
#testing on some examples
sample_1 = 'This movie is fantastic! I really like it, because it is so exciting'
sample_2 = 'Very nice movie.'
sample_3 = 'What a bad movie that was. I would expect something much better.'
sample_4 = 'This movie really sucks. No action and plot is flat like table.'
sample_5 = 'Very interesting cinema. Great play of the actors.'

test_samples = [sample_1, sample_2, sample_3, sample_4, sample_5]
test_samples_tokens = tokenizer.texts_to_sequences(test_samples)
# The training data was padded with padding='post', so the samples have to be
# padded on the same side. Leaving `padding` at its default would place the
# zeros in front of these short sentences, a layout the model never saw
# during training.
test_samples_tokens_pad = pad_sequences(test_samples_tokens, maxlen=pad_length, padding='post')

#predict
# One sigmoid output per sample (the model ends in Dense(1, activation='sigmoid')).
model.predict(x=test_samples_tokens_pad)

Train word2vec Embedding

In [None]:
#The first step is to prepare the text corpus for learning the embedding by creating word tokens, removing punctuation,
#removing stop words etc. The word2vec algorithm processes documents sentence by sentence.

# A dedicated name is used for the regexp tokenizer so that `tokenizer` keeps
# referring to the Keras Tokenizer used by the earlier cells.
word_tokenizer = RegexpTokenizer(r'\w+')

# Build the stop-word set once; it does not change between reviews.
stop_words = set(stopwords.words('english'))

review_lines = list()
lines = df['review'].values.tolist()

for line in lines:
    #split into word tokens and convert to lower case
    tokens = [w.lower() for w in word_tokenizer.tokenize(line)]
    #remove tokens that are not alphabetic and filter out stop words
    # Note the call parentheses on isalpha(): the bare attribute `t.isalpha`
    # is a method object, which is always truthy and therefore filters nothing.
    tokens = [t for t in tokens if t.isalpha() and t not in stop_words]
    review_lines.append(tokens)

len(review_lines)

In [None]:
# Learn word vectors of length `embedding_dim` from the tokenised reviews,
# using a context window of 5 words and 4 worker threads.
# NOTE: this rebinds `model`, which previously held the Keras classifier.
model = gensim.models.Word2Vec(
    sentences=review_lines,
    size=embedding_dim,
    window=5,
    min_count=1,
    workers=4,
)

# Report how many distinct words the trained model knows.
words = list(model.wv.vocab)
print('Vocabulary size is: {}'.format(len(words)))

Test Word2Vec Model

In [None]:
# Sanity check: list the words whose vectors are closest to 'bad'.
model.wv.most_similar(positive='bad')

In [None]:
# Analogy query: words close to 'woman' and 'king' but far from 'man'.
model.wv.most_similar_cosmul(
    positive=['woman', 'king'],
    negative=['man'],
)

In [None]:
# Ask the model which of the four words fits least with the others.
odd_one_out = model.wv.doesnt_match('dog cat pig actor'.split())
print(odd_one_out)

In [None]:
# Write the learned word vectors to 'w2v_imdb_model' in the plain-text
# (binary=False) word2vec format so they can be reloaded later.
model.wv.save_word2vec_format(
    fname='w2v_imdb_model',
    binary=False,
)


Use Pre-trained Embedding


In [None]:
#extract the word embedding from file
# Maps each word to its embedding vector (a float32 NumPy array).
embeddings_index = {}
# `with` guarantees the file is closed even if a line fails to parse.
with open(os.path.join('', 'w2v_imdb_model'), encoding='utf-8') as f:
    for line_no, line in enumerate(f):
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        if line_no == 0 and len(values) == 2:
            # The text word2vec format starts with a "<vocab size> <vector size>"
            # header line; it is not a word vector and must not be stored.
            continue
        word = values[0]
        # Convert the textual coefficients to numbers rather than keeping strings.
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

Convert the review text into padded integer sequences

In [None]:
#vectorize the text samples into a 2D integer tensor
# A fresh Keras Tokenizer is fitted on the cleaned token lists (review_lines),
# so its word indices refer to the same vocabulary the word2vec model was trained on.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(review_lines)
sequences = tokenizer.texts_to_sequences(review_lines)

#pad sequences
word_index = tokenizer.word_index
print('Found {} unique tokens.'.format(len(word_index)))
# Every sequence is forced to `pad_length` entries (pad_sequences default padding side).
review_pad = pad_sequences(sequences, maxlen=pad_length)
sentiment = df['sentiment'].values
# Labels read "tensor", matching the shape printouts in the split cell.
print('Shape of review tensor: ', review_pad.shape)
print('Shape of sentiment tensor: ', sentiment.shape)



In [None]:
#Now we will map embeddings from the loaded word2vec model for each word to the tokenizer.word_index vocabulary
#and create a matrix with  word vectors.
# Row i of the matrix holds the vector of the word whose tokenizer index is i.
# Row 0 stays all zeros because the word indices start at 1.
num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words, embedding_dim))

for word, i in word_index.items():
    # Valid rows are 0 .. num_words - 1, so an index equal to num_words is
    # already out of range (this matters if num_words is ever capped).
    if i >= num_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        #words not found in embedding index will be all zeros
        continue
    embedding_matrix[i] = embedding_vector
print(num_words)

In [None]:
# The network is the same as the earlier GRU classifier except for the first
# layer: its weights are initialised from `embedding_matrix` and frozen
# (trainable=False), because the word vectors have already been learned.
embedding_layer = Embedding(
    input_dim=num_words,
    output_dim=embedding_dim,
    embeddings_initializer=Constant(embedding_matrix),
    input_length=pad_length,
    trainable=False,
)

model = Sequential()
model.add(embedding_layer)
model.add(GRU(units=32, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

# Binary cross-entropy loss with the Adam optimizer; accuracy is reported as a metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.summary()


Training sentiment model

In [None]:
#split the data into training set and validation set
validation_split_ratio = 0.3

# Shuffle with a fixed seed so the same split is produced on every run.
split_seed = 42
indices = np.random.RandomState(split_seed).permutation(review_pad.shape[0])
# Apply the same permutation to reviews and labels so the pairs stay aligned.
review_pad = review_pad[indices]
sentiment = sentiment[indices]

num_validation_samples = int(validation_split_ratio*review_pad.shape[0])
# Split on an explicit boundary index. Slicing with [:-n] / [-n:] breaks when
# n == 0: the training slice becomes empty and the validation slice becomes
# the whole array.
num_train_samples = review_pad.shape[0] - num_validation_samples

X_train_pad = review_pad[:num_train_samples]
y_train = sentiment[:num_train_samples]
X_test_pad = review_pad[num_train_samples:]
y_test = sentiment[num_train_samples:]

print('Shape of X_train_pad tensor: ', X_train_pad.shape)
print('Shape of y_train tensor: ', y_train.shape)
print('Shape of X_test_pad tensor: ', X_test_pad.shape)
print('Shape of y_test tensor: ', y_test.shape)

In [None]:
# Fit the model for 10 epochs with batches of 128 reviews, passing the
# held-out arrays as validation data. verbose=2 selects the compact log format.
model.fit(
    X_train_pad,
    y_train,
    batch_size=128,
    epochs=10,
    verbose=2,
    validation_data=(X_test_pad, y_test),
)