In [1]:
# Dataset from: https://grouplens.org/datasets/movielens/
# Tutorial Link: https://keras.io/examples/structured_data/

In [None]:
import numpy as np
import os
import tensorflow as tf
import keras.layers as layers
import pandas as pd

Using TensorFlow backend.


In [None]:
import re

In [None]:
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

In [None]:
from keras.models import Sequential
from keras.layers.embeddings import Embedding

In [None]:
df = pd.DataFrame()
df = pd.read_csv("movie-ratings.csv",encoding='utf-8')
df.head()
#df["review"] = df["review"].map(re.sub("[^a-zA-Z0-9 ]"," "))

Unnamed: 0,review,sentiment
0,"Not all films made in 1931 are this creaky, an...",1.0
1,"BIG FAT LIAR, in my opinion, is an absolutely ...",1.0
2,Hitchcock made at least 11 films about the ord...,1.0
3,This film sold for one-dollar at Wal-Mart on a...,0.0
4,No Holds Barred is that movie that when you we...,1.0


In [None]:
X_train = df.loc[:24999,'review'].values
y_train = df.loc[:24999,'sentiment'].values
X_test = df.loc[25000:,'review'].values
y_test = df.loc[25000:,'sentiment'].values

In [None]:
X_train[0]

'Not all films made in 1931 are this creaky, and the fact that this was "Best Picture" must have given even greater impetus to the development of television.<br /><br />Typical of all Ferber novels, it isn\'t possible to bring the entire story to the screen, to say nothing of developing character. Dix -- so stolid in the first third of the movie -- does an about face, but no one knows why and it makes no sense. And what is there about Dunne that makes makes her so stoical? Edna May Oliver\'s scenes are priceless, as usual.<br /><br />This film has a role to play in the history of cinema, but it is long and boring.'

In [None]:
tokenizer_obj = Tokenizer()
total_reviews = X_train+X_test
tokenizer_obj.fit_on_texts(total_reviews)

#pad_sequences
max_len = max([len(s.split()) for s in total_reviews])

#vocab_size
vocab_size = len(tokenizer_obj.word_index)+1

X_train_tokens = tokenizer_obj.texts_to_sequences(X_train)
X_test_tokens = tokenizer_obj.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_tokens,maxlen=max_len,padding="post")
X_test_pad = pad_sequences(X_test_tokens,maxlen=max_len,padding="post")

In [None]:
emd_dim = 100

model = Sequential()
model.add(Embedding(vocab_size,emd_dim,input_length=max_len))
model.add(layers.GRU(units=32,dropout=0.2,recurrent_dropout=0.2))
model.add(layers.Dense(1,activation="sigmoid"))

model.compile(loss="binary_crossentropy",optimizer="adam",metrics=['accuracy'])

In [None]:
model.summary()

In [None]:
model.fit(X_train_pad,y_train,batch_size=128,epochs=25,validation_data=(X_test_pad,y_test),verbose=1)

In [None]:
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [None]:
review_lines = list()
lines = df["review"].values.tolist()

table = str.maketrans('','',string.punctuation)
stop_words = set(stopwords.words("english"))
for line in lines:
    tokens = word_tokenize(line.lower())
    stripped = [w.translate(table) for w in tokens]
    words = [word for word in stripped if word.isalpha() and word not in stop_words]
    review_lines.append(words)
len(review_lines)

50000

In [None]:
import gensim

In [None]:
emd_dim = 100

In [None]:
model = gensim.models.Word2Vec(sentences = review_lines,
                              size=emd_dim,
                              window=5,
                              workers=4,
                              min_count=1)
words = list(model.wv.vocab)
len(words)

134095

In [None]:
filename = ""
model.wv.save_word2vec_format(filename,binary=False)

In [None]:
import os

emb_index = {}
f = open("",encoding="utf-8")
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.array(values[1:])
    emb_index[word] = coefs
f.close()

In [None]:
tkn = Tokenizer()
tkn.fit_on_texts(review_lines)
seq = tkn.texts_to_sequences(review_lines)

word_index = tkn.word_index
review_pad = pad_sequences(seq,maxlen=max_len)
sentiment = df["sentiment"].values

In [None]:
review_pad[0],sentiment[0]

(array([  0,   0,   0, ..., 309,  97, 226], dtype=int32), 1.0)

In [None]:
num_words = len(word_index)+1
emd_matrix = np.zeros((num_words,emd_dim))

for word, i  in word_index.items():
    if i > num_words:
        continue
    emd_vector = emb_index.get(word)
    if emd_vector is not None:
        emd_matrix[i] = emd_vector

In [None]:
num_words

134096

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, GRU
from keras.layers.embeddings import Embedding
from keras.initializers import Constant

# define model

model = Sequential()
emd_layer = Embedding(num_words,
                     emd_dim,
                     embeddings_initializer=Constant(emd_matrix),
                     input_length=max_len,
                     trainable=False)
model.add(emd_layer)
model.add(GRU(units=32,dropout=0.2,recurrent_dropout=0.2))
model.add(Dense(1,activation="sigmoid"))

model.compile(loss = "binary_crossentropy",optimizer="adam",metrics=["accuracy"])

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [None]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 2898, 100)         13409600  
_________________________________________________________________
gru_1 (GRU)                  (None, 32)                12768     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 33        
Total params: 13,422,401
Trainable params: 12,801
Non-trainable params: 13,409,600
_________________________________________________________________


In [None]:
val_split = 0.2 
indices = np.arange(review_pad.shape[0])
np.random.shuffle(indices)
review_pad = review_pad[indices]
sentiment = sentiment[indices]
num_val_samples = int(val_split*review_pad.shape[0])

X_train_pad = review_pad[:-num_val_samples]
y_train = sentiment[:-num_val_samples]

X_test_pad = review_pad[-num_val_samples:]
y_test  = sentiment[-num_val_samples:]
y_test.shape

(10000,)

In [None]:
model.fit(X_train_pad,y_train,batch_size=128, epochs=10,verbose=1,
         validation_data=(X_test_pad,y_test))

Instructions for updating:
Use tf.cast instead.
Train on 40000 samples, validate on 10000 samples
Epoch 1/10