# Import Movie Review Data

Set the seed

In [41]:
import numpy as np

In [42]:
# Seed NumPy's global random number generator so NumPy-driven randomness repeats between runs.
# NOTE(review): this call does not seed TensorFlow/Keras, so weight initialisation and
# training may still differ from run to run -- confirm whether a framework seed is wanted.
np.random.seed(42)

Import the dataset as a pandas DataFrame

In [43]:
import pandas as pd

Data can be downloaded from Kaggle at the following URL

- https://www.kaggle.com/c/word2vec-nlp-tutorial/data

In [44]:
# Read the labelled movie reviews (tab-separated, header in the first row).
# quoting=3 is csv.QUOTE_NONE: quote characters inside the review text are kept as
# ordinary characters instead of being interpreted as field delimiters.
df = pd.read_csv(
    'labeledTrainData.tsv.zip',
    header=0,
    delimiter="\t",
    quoting=3,
)

Split Data into Training and Test Data

In [45]:
from sklearn.model_selection import train_test_split

In [46]:
# Hold out 20% of the rows for evaluation; features are the raw review text and the
# target is the sentiment column. The fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['review'], df['sentiment'], test_size=0.2, random_state=42
)

# Build the Tokenizer

In [47]:
from tensorflow.python.keras.preprocessing.text import Tokenizer

In [48]:
# Vocabulary size cap: passed to the Tokenizer as num_words and used to size the embedding matrix.
top_words = 10000

In [49]:
# Tokenizer restricted to the most frequent words when converting text to id sequences.
t = Tokenizer(num_words=top_words)  # num_words -> vocabulary size

In [50]:
# Build the word index from the training reviews only, so the test reviews do not
# influence the vocabulary.
t.fit_on_texts(X_train.tolist())

# Prepare Training and Test Data

Get the word index for each word in the review

In [51]:
# Replace each training review with its list of word ids.
# NOTE: this overwrites X_train (a pandas Series) with a plain Python list, so the cell
# cannot be re-run on its own -- re-run the train/test split cell first.
X_train = t.texts_to_sequences(X_train.tolist())

In [52]:
# Encode the test reviews with the same tokenizer that was fitted on the training text.
# NOTE: overwrites X_test with a plain Python list; re-run the split cell before re-running this one.
X_test = t.texts_to_sequences(X_test.tolist())

How many words in each review?

# Pad Sequences - Important

In [53]:
from tensorflow.python.keras.preprocessing import sequence

In [54]:
# Fixed number of tokens per review after padding; also used as the Embedding input_length.
max_review_length = 300

In [55]:
# Bring every training sequence to exactly max_review_length ids, appending zeros
# at the end of shorter reviews (padding='post').
X_train = sequence.pad_sequences(
    X_train,
    maxlen=max_review_length,
    padding='post',
)

In [56]:
# Apply the same fixed length and end-of-sequence zero padding to the test sequences.
X_test = sequence.pad_sequences(
    X_test,
    maxlen=max_review_length,
    padding='post',
)

# Build Embedding Matrix from Pre-Trained Word2Vec

Load pre-trained Gensim Embeddings

In [57]:
import gensim

In [58]:
# Load a previously saved gensim Word2Vec model from the working directory.
# NOTE(review): gensim model files are pickle-based -- only load files from a trusted source.
word2vec = gensim.models.Word2Vec.load('word2vec-movie-50')

Embedding Size

In [59]:
# Dimensionality of the pre-trained word vectors.
# `wv.vector_size` is used instead of `wv.syn0.shape[1]`: `syn0` is deprecated in
# gensim 3.x (it emits a DeprecationWarning) and no longer exists in gensim 4.
embedding_vector_length = word2vec.wv.vector_size

  """Entry point for launching an IPython kernel.


In [60]:
# Display the embedding dimensionality as a sanity check.
embedding_vector_length

50

Build matrix for current data

In [61]:
# One row per token id from 0 to top_words inclusive; rows that are never filled
# (including id 0, the padding value) remain all-zero vectors.
embedding_matrix = np.zeros((top_words + 1, embedding_vector_length))

In [62]:
# Display (rows, columns) to confirm the matrix is (top_words + 1, embedding_vector_length).
embedding_matrix.shape

(10001, 50)

In [63]:
# Copy the pre-trained vector of every tokenizer word that Word2Vec knows into the
# row matching the word's tokenizer id. Words are visited in ascending id order so
# the loop can stop as soon as the id exceeds the vocabulary cap.
# NOTE(review): Keras documents that Tokenizer(num_words=N) keeps only the N-1 most
# frequent words, so the row at index `top_words` is probably never looked up -- confirm.
for word, i in sorted(t.word_index.items(), key=lambda x: x[1]):
    if i > top_words:
        break
    # `word in word2vec.wv` works on gensim 3.x and 4.x alike; the former
    # `word2vec.wv.vocab` attribute was removed in gensim 4.
    if word in word2vec.wv:
        embedding_matrix[i] = word2vec.wv[word]

In [64]:
# Display the padded review length that the model's input layer will expect.
max_review_length

300

# Build the Graph

In [65]:
from tensorflow.python.keras.models import Sequential

In [66]:
from tensorflow.python.keras.layers import Dropout, Dense, Embedding, Flatten, LSTM

In [67]:
# Start an empty linear stack of layers; layers are appended in the cells below.
model = Sequential()

Add Embedding layer
 - Embedding Layer Input shape = [Batch_Size, Review Length]

In [68]:
# Frozen embedding layer initialised from the pre-trained matrix: each of the
# max_review_length token ids in a review is mapped to its embedding vector, and
# trainable=False keeps those vectors fixed during training.
# NOTE(review): mask_zero is not set, so the padding zeros appended to short reviews
# are fed to the following layers like any other token -- confirm this is intended.
model.add(
    Embedding(
        top_words + 1,
        embedding_vector_length,
        input_length=max_review_length,
        weights=[embedding_matrix],
        trainable=False,
    )
)

In [69]:
# Print the layers added so far with their output shapes and parameter counts.
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 300, 50)           500050    
Total params: 500,050
Trainable params: 0
Non-trainable params: 500,050
_________________________________________________________________


Embedding Layer Output - 
[Batch_Size , Review Length , Embedding_Size]

Add Layer with 100 LSTM Memory Units

In [70]:
# Recurrent layer with 100 LSTM units reading the embedded review sequence.
model.add(LSTM(100))

In [71]:
# Single sigmoid output unit: one value between 0 and 1 per review for the binary sentiment label.
model.add(Dense(1,activation='sigmoid'))

In [72]:
# Configure training: Adam optimiser, binary cross-entropy loss to match the single
# sigmoid output, and accuracy reported as the monitoring metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Execute the graph

In [None]:
# Train for 10 epochs in mini-batches of 128 reviews, printing progress each epoch.
# NOTE(review): the held-out test split is also used as validation data here, so the
# reported validation metrics are not an independent estimate if they guide tuning.
model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=128,
    validation_data=(X_test, y_test),
    verbose=1,
)

Train on 20000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10