<a href="https://colab.research.google.com/github/aliakbarbadri/nlp-tf/blob/master/week2/week2-examples.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import tensorflow as tf
from tensorflow import keras
import tensorflow_datasets as tfds
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import io
from google.colab import files

# Lesson 1

In [0]:
imdb, info = tfds.load("imdb_reviews", with_info=True, as_supervised=True)

In [0]:
train_data, test_data = imdb['train'], imdb['test']


In [0]:
train_sents = []
train_labels = []
test_sents = []
test_labels = []

for s,l in train_data:
  train_sents.append(str(s.numpy()))
  train_labels.append(l.numpy())

for s,l in test_data:
  test_sents.append(str(s.numpy()))
  test_labels.append(l.numpy())

train_labels = np.array(train_labels)
test_labels = np.array(test_labels)

In [0]:
vocab_size = 10000
max_len = 120
trunc_type = "post"
oov_token = "<OOV>"
embedding_dim = 16

tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_token)
tokenizer.fit_on_texts(train_sents)
train_word_index = tokenizer.word_index
train_seqs = tokenizer.texts_to_sequences(train_sents)
train_padded = pad_sequences(train_seqs,maxlen=max_len, truncating=trunc_type)

test_seqs = tokenizer.texts_to_sequences(test_sents)
test_padded = pad_sequences(test_seqs,maxlen=max_len)

In [6]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_len),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(6, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 120, 16)           160000    
_________________________________________________________________
flatten (Flatten)            (None, 1920)              0         
_________________________________________________________________
dense (Dense)                (None, 6)                 11526     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 7         
Total params: 171,533
Trainable params: 171,533
Non-trainable params: 0
_________________________________________________________________


In [7]:
model.fit(train_padded, train_labels, epochs=10, validation_data=(test_padded, test_labels))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f1692b01080>

In [8]:
weights = model.layers[0].get_weights()[0]
print(weights.shape) # shape: (vocab_size, embedding_dim)

(10000, 16)


In [9]:
reverse_word_index = dict([(value, key) for (key, value) in train_word_index.items()])

def decode_review(text):
    return ' '.join([reverse_word_index.get(i, '?') for i in text])
print(train_padded[1])
print(decode_review(train_padded[1]))
print(train_sents[1])

[   0    0    0    0    0    0    0  256   28   78  585    6  815 2383
  317  109   19   12    7  643  696    6    4 2249    5  183  599   68
 1483  114 2289    3 4005   22    2    1    3  263   43 4754    4  173
  190   22   12 4126   11 1604 2383   87    2   20   14 1945    2  115
  950   14 1838 1367  563    3  365  183  477    6  602   19   17   61
 1845    5   51   14 4090   98   42  138   11  983   11  200   28 1059
  171    5    2   20   19   11  298    2 2182    5   10    3  285   43
  477    6  602    5   94  203    1  206  102  148 4450   16  228  336
   11 2510  392   12   20   32   31   47]
? ? ? ? ? ? ? b'i have been known to fall asleep during films but this is usually due to a combination of things including really tired being warm and comfortable on the <OOV> and having just eaten a lot however on this occasion i fell asleep because the film was rubbish the plot development was constant constantly slow and boring things seemed to happen but with no explanation of what w

In [0]:
vectors_file = io.open('vectors.tsv', 'w', encoding='utf-8')
words_file = io.open('words.tsv', 'w', encoding='utf-8')

for word_num in range(1, vocab_size):
  word = reverse_word_index[word_num]
  embeddings = weights[word_num]
  words_file.write(word + "\n")
  vectors_file.write('\t'.join([str(x) for x in embeddings]) + "\n")
vectors_file.close()
words_file.close()

In [0]:
files.download("vectors.tsv")
files.download("words.tsv")