In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('movie_data.csv')

In [3]:
df.head()

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0
3,hi for all the people who have seen this wonde...,1
4,"I recently bought the DVD, forgetting just how...",0


In [4]:
sentiments = df['sentiment']
reviews = df['review']

In [5]:
total_len = len(sentiments)

In [6]:
train_size = int(total_len*0.8)
test_size = total_len-train_size

print('Train size=',train_size)
print('Test size=',test_size)

Train size= 399
Test size= 100


In [7]:
train_x = df.loc[:train_size, 'review'].values
train_y = df.loc[:train_size, 'sentiment'].values

test_x = df.loc[train_size:total_len, 'review'].values
test_y = df.loc[train_size:total_len, 'sentiment'].values


In [8]:
print(type(sentiments))

<class 'pandas.core.series.Series'>


In [9]:
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences


In [10]:
reviews = df['review'].values

tokenizer = Tokenizer()
tokenizer.fit_on_texts(reviews)

In [11]:
max_length = max([ len(s.split()) for s in reviews])

In [12]:
print(max_length)

1148


In [13]:
vocab_size = len(tokenizer.word_index)+1
X_train_tokens = tokenizer.texts_to_sequences(train_x)
X_test_tokens = tokenizer.texts_to_sequences(test_x)


X_train_pad = pad_sequences(X_train_tokens, maxlen=max_length, padding='post')
X_test_pad = pad_sequences(X_test_tokens, maxlen=max_length, padding='post')


In [14]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, GRU, Embedding

Using TensorFlow backend.


In [15]:
EMBEDDING_DIM = 100
model = Sequential()
model.add(Embedding(vocab_size, EMBEDDING_DIM, input_length=max_length))
model.add(GRU(units=32, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [16]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1148, 100)         1351500   
_________________________________________________________________
gru_1 (GRU)                  (None, 32)                12768     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 33        
Total params: 1,364,301
Trainable params: 1,364,301
Non-trainable params: 0
_________________________________________________________________


In [17]:
print('Train')
model.fit(X_train_pad, train_y, epochs = 25, batch_size=128, validation_data=(X_test_pad, test_y), verbose=2)

Train
Train on 400 samples, validate on 100 samples
Epoch 1/25
 - 64s - loss: 0.6952 - acc: 0.5075 - val_loss: 0.6911 - val_acc: 0.5400
Epoch 2/25
 - 6s - loss: 0.6932 - acc: 0.5125 - val_loss: 0.6926 - val_acc: 0.5400
Epoch 3/25
 - 5s - loss: 0.6932 - acc: 0.4925 - val_loss: 0.6930 - val_acc: 0.5400
Epoch 4/25
 - 5s - loss: 0.6927 - acc: 0.5375 - val_loss: 0.6922 - val_acc: 0.5400
Epoch 5/25
 - 5s - loss: 0.6933 - acc: 0.4950 - val_loss: 0.6914 - val_acc: 0.5400
Epoch 6/25
 - 5s - loss: 0.6932 - acc: 0.5100 - val_loss: 0.6908 - val_acc: 0.5400
Epoch 7/25
 - 5s - loss: 0.6934 - acc: 0.5100 - val_loss: 0.6907 - val_acc: 0.5400
Epoch 8/25
 - 5s - loss: 0.6925 - acc: 0.5100 - val_loss: 0.6908 - val_acc: 0.5400
Epoch 9/25
 - 5s - loss: 0.6936 - acc: 0.5100 - val_loss: 0.6911 - val_acc: 0.5400
Epoch 10/25
 - 5s - loss: 0.6937 - acc: 0.5100 - val_loss: 0.6916 - val_acc: 0.5400
Epoch 11/25
 - 5s - loss: 0.6933 - acc: 0.5100 - val_loss: 0.6922 - val_acc: 0.5400
Epoch 12/25
 - 5s - loss: 0.6932

<keras.callbacks.History at 0x7fcba19d7f28>

In [19]:
import string 
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [20]:
review_lines = list()
lines = df['review'].tolist()

In [21]:
for line in lines: 
    tokens = word_tokenize(line)
    tokens = [w.lower() for w in tokens]
    table = str.maketrans('','',string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    words = [word for word in stripped  if word.isalpha()]
    stop_words = set(stopwords.words('english'))
    words = [w for w in words if not w in stop_words]
    review_lines.append(words)
    
    
print(len(review_lines))

499


In [22]:
EMBEDDING_DIM = 100

In [23]:
import gensim 

model = gensim.models.Word2Vec(sentences=review_lines, size=EMBEDDING_DIM, window=5, workers=1, min_count=1)
words = list(model.wv.vocab)

print("Vocab size = ", len(words))

unable to import 'smart_open.gcs', disabling that module


Vocab size =  13203


In [24]:
model.wv.most_similar("good")

[('movie', 0.9999512434005737),
 ('film', 0.9999445080757141),
 ('nt', 0.9999390840530396),
 ('great', 0.9999366402626038),
 ('well', 0.999934196472168),
 ('like', 0.9999324679374695),
 ('see', 0.9999309778213501),
 ('also', 0.9999294281005859),
 ('people', 0.9999277591705322),
 ('even', 0.9999247789382935)]

In [25]:
# save model 
filename = 'imdb_embedding_word2vec.txt'
model.wv.save_word2vec_format(filename, binary=False)

In [26]:
import os 

embedding_index = {}
f = open(os.path.join('', 'imdb_embedding_word2vec.txt'), encoding='utf-8')

for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:])
    embedding_index[word] = coefs
    
f.close()

In [27]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [29]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(review_lines)
sequences = tokenizer.texts_to_sequences(review_lines)


word_index = tokenizer.word_index
print("Number of unique tokens", len(word_index))

review_pad = pad_sequences(sequences, maxlen=max_length)
sentiment = df['sentiment'].values

print("Shape of review tensor", review_pad.shape)
print("Shape of sentiment tensor", sentiment.shape)


Number of unique tokens 13203
Shape of review tensor (499, 1148)
Shape of sentiment tensor (499,)


In [30]:
num_words = len(word_index)+1
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))

for word , i in word_index.items():
    if(i>num_words):
        continue 
    embedding_vector = embedding_index.get(word)
    if(embedding_vector is not None):
        embedding_matrix[i] = embedding_vector

In [31]:
print(num_words)

13204


In [36]:
from keras.initializers import Constant

model = Sequential()
embedding_layer = Embedding(num_words, EMBEDDING_DIM, 
                            embeddings_initializer=Constant(embedding_matrix), 
                            input_length = max_length, 
                            trainable = False)

model.add(embedding_layer)
model.add(GRU(units=32, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 1148, 100)         1320400   
_________________________________________________________________
gru_3 (GRU)                  (None, 32)                12768     
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 33        
Total params: 1,333,201
Trainable params: 12,801
Non-trainable params: 1,320,400
_________________________________________________________________


In [40]:
VALIDATION_SPLIT = 0.2

indices = np.arange(review_pad.shape[0])
np.random.shuffle(indices)
review_pad = review_pad[indices]
sentiment = sentiment[indices]
num_validation_samples = int(VALIDATION_SPLIT*review_pad.shape[0])


In [42]:
X_train_pad = review_pad[:-num_validation_samples]
y_train = sentiment[:-num_validation_samples]

X_test_pad = review_pad[-num_validation_samples:]
y_test = sentiment[-num_validation_samples:]

In [43]:
print("Train....")
model.fit(X_train_pad, y_train, batch_size=128, epochs=25, validation_data=(X_test_pad, y_test), verbose=2)

Train....
Train on 400 samples, validate on 99 samples
Epoch 1/25
 - 7s - loss: 0.6933 - acc: 0.5050 - val_loss: 0.6874 - val_acc: 0.5758
Epoch 2/25
 - 4s - loss: 0.6965 - acc: 0.4825 - val_loss: 0.6897 - val_acc: 0.5758
Epoch 3/25
 - 4s - loss: 0.6942 - acc: 0.5000 - val_loss: 0.6996 - val_acc: 0.4444
Epoch 4/25
 - 4s - loss: 0.6962 - acc: 0.5025 - val_loss: 0.7027 - val_acc: 0.4444
Epoch 5/25
 - 4s - loss: 0.6933 - acc: 0.5100 - val_loss: 0.6952 - val_acc: 0.4848
Epoch 6/25
 - 4s - loss: 0.6940 - acc: 0.5075 - val_loss: 0.6894 - val_acc: 0.5758
Epoch 7/25
 - 4s - loss: 0.6947 - acc: 0.5000 - val_loss: 0.6901 - val_acc: 0.5657
Epoch 8/25
 - 4s - loss: 0.6920 - acc: 0.5025 - val_loss: 0.6922 - val_acc: 0.5051
Epoch 9/25
 - 4s - loss: 0.6913 - acc: 0.5200 - val_loss: 0.6918 - val_acc: 0.5152
Epoch 10/25
 - 5s - loss: 0.6945 - acc: 0.5000 - val_loss: 0.6926 - val_acc: 0.5051
Epoch 11/25
 - 5s - loss: 0.6905 - acc: 0.5100 - val_loss: 0.6889 - val_acc: 0.5960
Epoch 12/25
 - 4s - loss: 0.69

<keras.callbacks.History at 0x7fcb7387b9b0>