In [1]:
import tensorflow as tf
import tensorflow_datasets as tfds

In [2]:
import numpy as np

In [3]:
imdb, info = tfds.load("imdb_reviews", as_supervised=True, with_info=True)

In [4]:
train_data, test_data = imdb["train"], imdb["test"]

In [5]:
training_sentences, training_labels = [], []
test_sentences, test_labels = [], []

for s, l in train_data:
    training_sentences.append(str(s.numpy()))
    training_labels.append(l.numpy())
    
for s, l in test_data:
    test_sentences.append(str(s.numpy()))
    test_labels.append(l.numpy())

In [6]:
training_labels[0].shape

()

In [7]:
training_labels[0]

0

In [8]:
training_sentences[0]

'b"This was an absolutely terrible movie. Don\'t be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie\'s ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor\'s like Christopher Walken\'s good name. I could barely sit through it."'

In [9]:
training_labels_final = np.array(training_labels)
test_labels_final = np.array(test_labels)
training_labels_final.shape, test_labels_final.shape

((25000,), (25000,))

In [10]:
vocab_size = 10000
embedding_dim = 16
trunc_type = "post"
oov_tok = "<OOV>"
max_length = 120

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [11]:
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index

sequences = tokenizer.texts_to_sequences(training_sentences)
padded = pad_sequences(sequences, truncating=trunc_type, maxlen=max_length)

testing_sequences = tokenizer.texts_to_sequences(test_sentences)
testing_pad = pad_sequences(testing_sequences, maxlen=max_length)

In [13]:
reverse_word_index = dict([(v, k) for (k, v) in word_index.items()])
def decode_review(text):
    return ' '.join([reverse_word_index.get(i, "?") for i in text])
print(decode_review(padded[1]))
print(training_sentences[1])

? ? ? ? ? ? ? b'i have been known to fall asleep during films but this is usually due to a combination of things including really tired being warm and comfortable on the <OOV> and having just eaten a lot however on this occasion i fell asleep because the film was rubbish the plot development was constant constantly slow and boring things seemed to happen but with no explanation of what was causing them or why i admit i may have missed part of the film but i watched the majority of it and everything just seemed to happen of its own <OOV> without any real concern for anything else i cant recommend this film at all '
b'I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on the sette and having just eaten a lot. However on this occasion I fell asleep because the film was rubbish. The plot development was constant. Constantly slow and boring. Things seemed to happen, but with no explanation of 

In [16]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(6, activation=tf.nn.relu),
    tf.keras.layers.Dense(1, activation=tf.nn.sigmoid)
])

model.compile(loss='binary_crossentropy', optimizer="adam", metrics=["acc"])

In [17]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 120, 16)           160000    
                                                                 
 flatten_1 (Flatten)         (None, 1920)              0         
                                                                 
 dense (Dense)               (None, 6)                 11526     
                                                                 
 dense_1 (Dense)             (None, 1)                 7         
                                                                 
Total params: 171,533
Trainable params: 171,533
Non-trainable params: 0
_________________________________________________________________


In [18]:
model.fit(padded, training_labels_final, epochs=10, validation_data=(testing_pad, test_labels_final))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1fb2b0b5a90>

In [19]:
e = model.layers[0]
weights = e.get_weights()[0]
print(weights.shape)

(10000, 16)


In [21]:
import io
out_v = io.open('vecs.tsv', 'w', encoding="utf-8")
out_m = io.open('meta.tsv', 'w', encoding="utf-8")
for word_num in range(1, vocab_size):
    word = reverse_word_index.get(word_num)
    embeddings = weights[word_num]
    out_m.write(word+"\n")
    out_v.write("\t".join([str(x) for x in embeddings]) + "\n")
out_m.close()
out_v.close()