In [24]:
import io
import os
import shutil
import string
import re
import tensorflow as tf

# Word Embeddings

Word embeddings give us a way to use an efficient, dense representation in which similar words have a similar encoding. Importantly, you do not have to specify this encoding by hand. An embedding is a dense vector of floating point values (the length of the vector is a parameter you specify). 

The values of an embedding are not specified manually; they are trainable parameters (weights learned by the model during training, in the same way a model learns weights for a dense layer). It is common to see word embeddings that are 8-dimensional (for small datasets), up to 1024-dimensional when working with large datasets. A higher-dimensional embedding can capture fine-grained relationships between words, but takes more data to learn.

### The IMDB Dataset

We'll use the IMDB dataset to train a sentiment classifier (it labels each review as positive or negative) and, in the process, learn the
embeddings from scratch.

In [10]:
# Getting the data

url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

# Download and extract under ./datasets so the reviews land in ./datasets/aclImdb,
# which is the location the following cells read from ("datasets/aclImdb/train").
# With cache_subdir='' the archive was extracted to ./aclImdb instead, and a fresh
# Restart & Run All could not find the data.
dataset = tf.keras.utils.get_file("aclImdb_v1.tar.gz", url, untar=True, cache_dir='.', cache_subdir='datasets')

# The extracted corpus sits next to the downloaded archive, in an 'aclImdb' folder.
dataset_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')
os.listdir(dataset_dir)

['imdbEr.txt', 'test', 'imdb.vocab', 'README', 'train']

In [15]:
# Removing not needed data (unsupervised)

# The unlabeled "unsup" folder is not needed for the two-class sentiment task.
remove_dir = os.path.join("datasets/aclImdb/train", "unsup")

# Guard the delete so the cell is idempotent: on a re-run the folder is already
# gone and an unconditional rmtree would raise FileNotFoundError.
if os.path.isdir(remove_dir):
  shutil.rmtree(remove_dir)

In [16]:
# Build the training / validation splits from the labeled training reviews

batch_size = 1024
seed = 123

# Both calls use the same seed and validation_split, differing only in `subset`.
train_ds = tf.keras.utils.text_dataset_from_directory(
  "datasets/aclImdb/train",
  subset="training",
  validation_split=0.2,
  seed=seed,
  batch_size=batch_size,
)

valid_ds = tf.keras.utils.text_dataset_from_directory(
  "datasets/aclImdb/train",
  subset="validation",
  validation_split=0.2,
  seed=seed,
  batch_size=batch_size,
)

Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Found 25000 files belonging to 2 classes.
Using 5000 files for validation.


In [18]:
# Peek at the first five (label, review) pairs of one training batch

for text_batch, label_batch in train_ds.take(1):
  # Convert the text tensor once instead of on every iteration.
  reviews = text_batch.numpy()
  for i in range(5):
    print(label_batch[i].numpy(), reviews[i])

0 b"Wow. Some movies just leave me speechless. This was undeniably one of those movies. When I left the theatre, not a single word came to my mouth. All I had was an incredible urge to slam my head against the theatre wall to help me forget about the last hour and a half. Unfortunately, it didn't work. Honestly, this movie has nothing to recommend. The humor was at the first grade level, at best, the acting was overly silly, and the plot was astronomically far-fetched. I hearby pledge never to see an other movie starring Chris Kattan or any other cast-member of SNL."
1 b'If any show in the last ten years deserves a 10, it is this rare gem. It allows us to escape back to a time when things were simpler and more fun. Filled with heart and laughs, this show keeps you laughing through the three decades of difference. The furniture was ugly, the clothes were colorful, and the even the drugs were tolerable. The hair was feathered, the music was accompanied by roller-skates, and in the words 

In [19]:
# Adding I/O related optimizations

# cache() stores the loaded batches after the first pass over the data and
# prefetch() prepares upcoming batches while the current one is being consumed.
train_ds = train_ds.cache().prefetch(buffer_size=tf.data.AUTOTUNE)

# Rebind `valid_ds` itself: that is the name passed as validation_data to
# model.fit, so binding the optimized pipeline only to `val_ds` left it unused.
valid_ds = valid_ds.cache().prefetch(buffer_size=tf.data.AUTOTUNE)

# Kept as an alias in case other cells refer to the old name.
val_ds = valid_ds

In [23]:
# Exploring the embedding layer

# A Keras Embedding layer is a lookup table from integer word ids to dense vectors.
# Here a vocabulary of 1,000 ids is embedded into 5 dimensions (per word).
embedding_layer = tf.keras.layers.Embedding(input_dim=1000, output_dim=5)

# The layer starts off with random weights, so looking up three word ids simply
# returns the three corresponding (random) 5-dimensional vectors.
word_ids = tf.constant([1,2,3])
result = embedding_layer(word_ids)
print(result.numpy())

print("-" * 100)

# In practice the layer receives batches: one row of word ids per sentence.
# The output then has shape (batch_size, seq_length, embedding_dim).
batch_ids = tf.constant([[1,2,3], [4,5,6]])
result = embedding_layer(batch_ids)
# 2 sentences, each with 3 words, each word mapped to 5 dimensions
print(result.shape)



[[-0.04298798  0.03699524  0.00390378  0.0481695  -0.01569118]
 [-0.03501638  0.00071711  0.03650563 -0.01208202  0.04265844]
 [-0.02323675 -0.04148346  0.00130652 -0.03028941  0.03238395]]
----------------------------------------------------------------------------------------------------
(2, 3, 5)


In [26]:
# Text Preprocessing

def custom_standardization(input_data):
  """Normalize raw review text before tokenization.

  Lowercases the input, replaces HTML break tags ('<br />') with a single
  space and then removes every character in `string.punctuation`.

  Args:
    input_data: the text to clean, as accepted by `tf.strings.lower`
      (presumably a string tensor supplied by the TextVectorization layer).

  Returns:
    The cleaned text, as returned by `tf.strings.regex_replace`.
  """
  # Character class matching any single punctuation character.
  punctuation_pattern = '[%s]' % re.escape(string.punctuation)

  text = tf.strings.lower(input_data)
  text = tf.strings.regex_replace(text, '<br />', ' ')
  return tf.strings.regex_replace(text, punctuation_pattern, '')

vocab_size = 1000
sequence_length = 100

# Standardize, split and map strings to integer ids. Only the 1000 most frequent
# tokens are kept and every sequence is padded or cropped to exactly 100 ids.
vectorize_layer = tf.keras.layers.TextVectorization(
  max_tokens=vocab_size,
  standardize=custom_standardization,
  output_sequence_length=sequence_length,
  output_mode='int',
)

# Build the vocabulary from the review text alone (labels dropped). This only adapts
# the layer; the actual vectorization happens inside the network.
text_ds = train_ds.map(lambda text, label: text)
vectorize_layer.adapt(text_ds)

In [27]:
# Building the model

embed_size = 16

# Pipeline: raw strings -> word ids -> 16-d embeddings -> GRU -> sigmoid output
model = tf.keras.Sequential()
model.add(vectorize_layer)
model.add(tf.keras.layers.Embedding(vocab_size, embed_size, mask_zero=True))
model.add(tf.keras.layers.GRU(32))
model.add(tf.keras.layers.Dense(1, activation="sigmoid"))

model.compile(loss="binary_crossentropy", optimizer="nadam", metrics=["accuracy"])

# Callbacks: TensorBoard logging for visualizations, plus a checkpoint that is
# only written when validation accuracy improves.
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="logs")
model_ckpt = tf.keras.callbacks.ModelCheckpoint("embedding_imdb", monitor="val_accuracy", save_best_only=True)

# Train
history = model.fit(
  train_ds,
  validation_data=valid_ds,
  epochs=10,
  callbacks=[model_ckpt, tensorboard_callback],
)

Epoch 1/10


INFO:tensorflow:Assets written to: embedding_imdb/assets


Epoch 2/10


INFO:tensorflow:Assets written to: embedding_imdb/assets


Epoch 3/10


INFO:tensorflow:Assets written to: embedding_imdb/assets


Epoch 4/10


INFO:tensorflow:Assets written to: embedding_imdb/assets


Epoch 5/10


INFO:tensorflow:Assets written to: embedding_imdb/assets


Epoch 6/10


INFO:tensorflow:Assets written to: embedding_imdb/assets


Epoch 7/10
Epoch 8/10


INFO:tensorflow:Assets written to: embedding_imdb/assets


Epoch 9/10
Epoch 10/10


In [30]:
# Working with the trained embeddings

# The embeddings are the weights of the embedding layer. The weight matrix is of shape
# (vocab_size, embedding_dimension), which makes sense as each word in our vocab has an
# embedded vector.
#
# Find the layer by type rather than by its auto-generated name: a name such as
# 'embedding_4' depends on how many Embedding layers were created in the current
# kernel session, so it does not survive Restart & Run All.
embedding = next(
  layer for layer in model.layers if isinstance(layer, tf.keras.layers.Embedding)
)
weights = embedding.get_weights()[0]
vocab = vectorize_layer.get_vocabulary()

# Saving the embeddings to disk so that we can upload them to Embedding Projector.
# The context manager closes both files even if a write fails.
with io.open("vectors.tsv", "w", encoding="utf-8") as out_v, \
     io.open("metadata.tsv", "w", encoding="utf-8") as out_m:
  for index, word in enumerate(vocab):
    # skipping padding
    if index == 0:
      continue

    # one line per word: the dense vector in vectors.tsv, the word itself in metadata.tsv
    vec = weights[index]
    out_v.write("\t".join([str(x) for x in vec]) + "\n")
    out_m.write(word + "\n")

# Now go to Embedding Projector and upload the files to visualize the embeddings
