In [None]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

In [None]:
# Read the review table from disk into a DataFrame.
# NOTE(review): no encoding is passed, so pandas decodes the file as UTF-8 —
# confirm that matches how the CSV was written.
csv_path = "data/imdb_master.csv"
imdb_master = pd.read_csv(csv_path)

In [None]:
# NLTK's English stop-word list, held in a set so the per-token membership
# test in the filtering step is a hash lookup.
english_stop_words = stopwords.words('english')
stop_words = set(english_stop_words)

In [None]:
# Normalise the review text: lower-case, trim surrounding whitespace, then
# delete digits and punctuation.
#
# The pattern is a regular-expression character class, so it is written as a
# raw string (no invalid `\]` escape) and `regex=True` is passed explicitly.
# Without the flag, pandas versions whose default is `regex=False` look for the
# whole pattern as literal text and silently remove nothing.
# The hyphen and brackets are escaped so the class lists exactly the same
# characters the original expression matched.
PUNCT_DIGIT_PATTERN = r"""[0-9!"#$%&'()*+,\-./:;<=>?@\[\]^_`´{|}~]"""

imdb_master['review'] = (
    imdb_master['review']
    .str.lower()
    .str.strip()
    .str.replace(PUNCT_DIGIT_PATTERN, "", regex=True)
)

In [None]:
# Plain Python list of the cleaned review strings, in DataFrame row order
# (the original note on this line says roughly 100k reviews).
review = list(imdb_master['review'])

In [None]:
# Whitespace-tokenise every review:
# [['sentence', 'one'], ['sentence', 'two']]
tok_review = []
for text in review:
    tok_review.append(text.split())

In [None]:
# Drop stop words from each tokenised review, keeping one (possibly empty)
# token list per review so positions still line up with `review`.
filter_tok_review = [
    [token for token in tokens if token not in stop_words]
    for tokens in tok_review
]

In [None]:
len(filter_tok_review)  # number of tokenised reviews after stop-word removal (one list per review)

In [None]:
# Train word2vec embeddings on the stop-word-filtered token lists.
# NOTE(review): `size` and `iter` are the gensim 3.x keyword names; gensim 4
# renamed them to `vector_size` and `epochs` — confirm the installed version.
w2v_params = dict(
    size=100,      # dimensionality of each word vector
    window=4,      # context window passed to Word2Vec
    min_count=1,   # keep every token, even those seen once
    workers=4,     # training threads
    iter=10,       # passes over the corpus
)
model_w2v = Word2Vec(filter_tok_review, **w2v_params)

In [None]:
# Write the trained vectors to disk in the plain-text word2vec format so the
# learning section can read them back line by line.
filename = 'imdb_master_embedding_word2vec_100k_100.txt'
word_vectors = model_w2v.wv
word_vectors.save_word2vec_format(filename, binary=False)

In [None]:
########################################################################################
########################## Learning Start ##############################################
########################################################################################

In [None]:
def label2int(x):
    """Map a sentiment label string to its integer class.

    Returns 0 for 'neg', 1 for 'pos', and None for any other value.
    """
    if x == 'neg':
        return 0
    elif x == 'pos':
        return 1
    return None


# Element-wise version of label2int. `otypes=[int]` makes it work on empty
# input and raise a TypeError as soon as an unmapped label (None) shows up,
# instead of carrying it into training.
vfunc = np.vectorize(label2int, otypes=[int])

# Work on the first 50,000 rows only.
# NOTE(review): presumably these are the rows labelled 'neg'/'pos' — confirm
# against the CSV; any other label will raise in `vfunc` below.
# `.copy()` detaches the slice from imdb_master so the column assignment does
# not write through a view (no SettingWithCopyWarning).
imdb = imdb_master[0:50000].copy()

# The original cell defined `vfunc` but never applied it, leaving string
# labels in place while the following cells compare against 0/1 and feed the
# labels to a binary cross-entropy loss.
imdb['label'] = vfunc(imdb['label'].values)

In [None]:
# Show which label values are present and how many rows carry each one.
# The bare `imdb.label.unique()` was not the last expression of the cell, so
# its result was never displayed; it is printed explicitly now.
# `value_counts()` reports per-label row counts whatever the label dtype is,
# whereas filtering on `== 0` / `== 1` prints empty counts if the labels are
# still strings.
print(imdb.label.unique())
print(imdb.label.value_counts())

In [None]:
embedding_index = dict()
f = open(filename)
for count,line in enumerate(f):
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:])
    embedding_index[word] = coefs
f.close()

In [None]:
# Build the word -> integer-id vocabulary from the filtered token lists and
# encode every review as a list of ids with that same vocabulary.
corpus_tokens = filter_tok_review
tokenizer_obj = Tokenizer()
tokenizer_obj.fit_on_texts(corpus_tokens)
sequences = tokenizer_obj.texts_to_sequences(corpus_tokens)

In [None]:
# Padding length = longest encoded review.
# This is measured on `sequences` (the stop-word-filtered, id-encoded reviews
# that actually get padded). The raw `review` strings still contain stop
# words, so measuring those over-pads every sample with extra timesteps the
# recurrent layer has to process.
# `default=0` keeps the cell from raising on an empty corpus.
max_length = max((len(seq) for seq in sequences), default=0)

In [None]:
# Display the padding length computed in the previous cell.
max_length

In [None]:
# Vocabulary lookup (word -> integer id) produced by the fitted tokenizer.
word_index = tokenizer_obj.word_index

# Labels for the rows kept in `imdb` (the first 50,000 rows of imdb_master).
sentiment = imdb.label.values

# `sequences` was built from every row of imdb_master, but labels exist only
# for the leading slice held in `imdb`. Pad just that many sequences so the
# feature and label arrays have the same number of rows; otherwise the later
# shuffle and `model.fit` see mismatched lengths.
review_pad = pad_sequences(sequences[:len(sentiment)], maxlen=max_length)

In [None]:
#print(word_index)
#print(len(review_pad))
#print(len(review_pad[0]))
#print(sentiment)

In [None]:
# Report vocabulary size and the shapes of the feature / label arrays.
unique_token_count = len(word_index)
print('found {} unique tokens'.format(unique_token_count))
for caption, tensor in (
    ('shape of review tensor:', review_pad),
    ('shape of sentiment tensor:', sentiment),
):
    print(caption, tensor.shape)

In [None]:
# One matrix row per tokenizer id; the +1 accounts for row 0, which no word in
# `word_index` maps to (ids start at 1) and which therefore stays all-zero.
num_words = len(word_index) + 1
print(num_words)

# Take the vector width from the trained model instead of hard-coding it.
# The original used 300 while the vectors are trained with size=100, so the
# row assignment below failed with a shape mismatch.
embedding_dim = model_w2v.wv.vector_size
embedding_matrix = np.zeros((num_words, embedding_dim))

# Copy each known word's vector into the row addressed by its tokenizer id.
# Words without a vector keep their zero row.
# (The former `if i > num_words: continue` guard was removed: ids never exceed
# len(word_index), so it could not trigger.)
for word, i in word_index.items():
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        # Convert explicitly in case the lookup holds the coefficients as text.
        embedding_matrix[i] = np.asarray(embedding_vector, dtype='float32')

In [None]:
# Spot-check: print the embedding row stored for tokenizer id 1.
first_token_vector = embedding_matrix[1]
print(first_token_vector)

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, GRU
from keras.layers.embeddings import Embedding
from keras.initializers import Constant

In [None]:
# Sentiment classifier: frozen pre-trained embeddings -> GRU -> sigmoid unit.

# Read the embedding width from the matrix itself rather than hard-coding 300,
# so the layer always agrees with the weights it is initialised from.
embedding_dim = embedding_matrix.shape[1]

model = Sequential()
embedding_layer = Embedding(num_words, embedding_dim,
                            embeddings_initializer = Constant(embedding_matrix),
                            input_length = max_length,
                            trainable = False   # keep the word2vec weights fixed during training
                           )
model.add(embedding_layer)
model.add(GRU(units=32, dropout=0.2, recurrent_dropout=0.2))
# Single sigmoid output, paired with the binary cross-entropy loss at compile time.
model.add(Dense(1, activation='sigmoid'))

In [None]:
# Print the layer-by-layer summary of the model built above.
model.summary()

In [None]:
# Configure training: Adam optimiser, binary cross-entropy loss (matches the
# single sigmoid output), accuracy reported as the metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Shuffle the samples and hold out the last VALIDATION_SPLIT fraction.
VALIDATION_SPLIT = 0.2
SPLIT_SEED = 42  # fixed seed so the split is identical on every run

n_samples = review_pad.shape[0]
assert n_samples == sentiment.shape[0], \
    "review_pad and sentiment must have the same number of rows"

# Seeded permutation instead of the unseeded global np.random.shuffle.
indices = np.random.RandomState(SPLIT_SEED).permutation(n_samples)

# Shuffle into new names rather than overwriting review_pad / sentiment, so
# re-running this cell reproduces the same split instead of reshuffling
# already shuffled arrays.
review_shuffled = review_pad[indices]
sentiment_shuffled = sentiment[indices]

num_validation_samples = int(VALIDATION_SPLIT*n_samples)

# Slice by an explicit split index: `[:-n]` / `[-n:]` misbehave when n == 0
# (empty training set, everything in validation).
split_at = n_samples - num_validation_samples
X_train_pad = review_shuffled[:split_at]
Y_train = sentiment_shuffled[:split_at]
X_test_pad = review_shuffled[split_at:]
Y_test = sentiment_shuffled[split_at:]

In [None]:
# Train for 100 epochs in batches of 128, evaluating on the held-out split
# after each epoch.
validation_pair = (X_test_pad, Y_test)
model.fit(
    X_train_pad,
    Y_train,
    batch_size=128,
    epochs=100,
    validation_data=validation_pair,
    verbose=1,
)

In [None]:
# Two one-word probe inputs, encoded with the fitted tokenizer and padded to
# the same length the model was built with.
test_sample = ['bad', 'good']  #sample for testing
test_sample_token = tokenizer_obj.texts_to_sequences(test_sample)
test_sample_token_pad = pad_sequences(
    test_sample_token,
    maxlen=max_length,
)

In [None]:
# Run the model on the padded probe samples; the output layer is a sigmoid, so
# each prediction is a value between 0 and 1.
model.predict(x=test_sample_token_pad)