In [18]:
import io
import re
import string
import tqdm

import numpy as np

import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.datasets import imdb

  

In [2]:
# Load the TensorBoard notebook extension into this kernel.
%load_ext tensorboard

In [3]:
# Fixed seed, passed as `seed=` to the skip-gram and candidate-sampling calls
# further down so their shuffling/sampling is repeatable.
SEED = 42
# Short alias for tf.data.AUTOTUNE.
AUTOTUNE = tf.data.AUTOTUNE


In [4]:
# Toy sentence used to walk through the skip-gram pipeline by hand.
example = "May the winds of the world help you"

# Lower-case, then split on whitespace (str.split already returns a list).
tokens = example.lower().split()
print(len(tokens))

8


In [5]:
# Build a token -> integer-id vocabulary. Id 0 is reserved for the padding
# token, so real tokens are numbered from 1 in order of first appearance.
vocab = {"<pad>": 0}
index = 1  # next unused id

for token in tokens:
    if token not in vocab:
        vocab[token] = index
        index += 1

# Computed once, after the loop, so it is defined even when `tokens` is empty.
vocab_size = len(vocab)
print(vocab)

{'<pad>': 0, 'may': 1, 'the': 2, 'winds': 3, 'of': 4, 'world': 5, 'help': 6, 'you': 7}


In [6]:
# Reverse lookup (integer id -> token), used to turn ids back into words.
inverse_vocab = dict(zip(vocab.values(), vocab.keys()))
print(inverse_vocab)

{0: '<pad>', 1: 'may', 2: 'the', 3: 'winds', 4: 'of', 5: 'world', 6: 'help', 7: 'you'}


In [7]:
# Encode the sentence as the sequence of its vocabulary ids.
vectorized_tokens = [vocab[token] for token in tokens]
print(vectorized_tokens)

[1, 2, 3, 4, 2, 5, 6, 7]


In [11]:
# Window size handed to skipgrams() when pairing each target with its neighbours.
window_size = 2

# negative_samples=0 -> only true (target, context) pairs are generated;
# the labels returned as the second value are therefore discarded.
positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
    sequence=vectorized_tokens,
    vocabulary_size=vocab_size,
    window_size=window_size,
    negative_samples=0,
    seed=SEED,
)
print(len(positive_skip_grams))

26


In [12]:
# Show the first five pairs both as ids and as the words they stand for.
for target, context in positive_skip_grams[:5]:
    print(f"({target}, {context}): ({inverse_vocab[target]}, {inverse_vocab[context]})")


(2, 6): (the, help)
(4, 5): (of, world)
(4, 2): (of, the)
(7, 6): (you, help)
(5, 6): (world, help)


In [13]:
# Get target and context words for one positive skip-gram.
target_word, context_word = positive_skip_grams[0]

# Set the number of negative samples per positive context.
num_ns = 4

# The single context id is wrapped as an int64 tensor and reshaped to (1, 1)
# before being passed as `true_classes`.
context_class = tf.reshape(tf.constant(context_word, dtype="int64"), (1, 1))
# NOTE(review): nothing here filters the target/context ids out of the sampled
# candidates (the recorded output includes id 2, which is the target word) --
# confirm that this is acceptable for the downstream training data.
negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
    true_classes=context_class,  # class that should be sampled as 'positive'
    num_true=1,  # each positive skip-gram has 1 positive context class
    num_sampled=num_ns,  # number of negative context words to sample
    unique=True,  # all the negative samples should be unique
    range_max=vocab_size,  # sampled ids fall in the half-open range [0, vocab_size)
    seed=SEED,  # seed for reproducibility
    name="negative_sampling"  # name of this operation
)
print(negative_sampling_candidates)
# Map the sampled ids back to words for a human-readable check.
print([inverse_vocab[index.numpy()] for index in negative_sampling_candidates])


tf.Tensor([2 1 4 3], shape=(4,), dtype=int64)
['the', 'may', 'of', 'winds']


In [14]:
# context_class was reshaped to (1, 1); drop axis 1 so it becomes 1-D and can
# be joined with the 1-D tensor of negative candidates.
squeezed_context_class = tf.squeeze(context_class, axis=1)

# Positive context id first, followed by the sampled negatives.
context = tf.concat(
    [squeezed_context_class, negative_sampling_candidates],
    axis=0,
)

# Matching labels: a single 1 for the positive entry, then `num_ns` zeros.
label = tf.constant([1] + [0] * num_ns, dtype=tf.int64)
target = target_word


In [15]:
# Human-readable view of one training example: the target id and word, the
# context ids (positive first, then the negatives) with their words, and the labels.
print(f"target_index    : {target}")
print(f"target_word     : {inverse_vocab[target_word]}")
print(f"context_indices : {context}")
print(f"context_words   : {[inverse_vocab[c.numpy()] for c in context]}")
print(f"label           : {label}")


target_index    : 2
target_word     : the
context_indices : [6 2 1 4 3]
context_words   : ['help', 'the', 'may', 'of', 'winds']
label           : [1 0 0 0 0]


In [16]:
# Same example printed raw, so the tensor shapes and dtypes are visible.
print("target  :", target)
print("context :", context)
print("label   :", label)


target  : 2
context : tf.Tensor([6 2 1 4 3], shape=(5,), dtype=int64)
label   : tf.Tensor([1 0 0 0 0], shape=(5,), dtype=int64)


In [17]:
# Build a 10-entry sampling table with Keras' make_sampling_table and print it
# to inspect the values.
sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(size=10)
print(sampling_table)


[0.00315225 0.00315225 0.00547597 0.00741556 0.00912817 0.01068435
 0.01212381 0.01347162 0.01474487 0.0159558 ]


## Let's work with the IMDb Dataset for now.

First, let's load the dataset and explore it.

In [26]:
# `imdb` is already imported in the notebook's first (imports) cell, so it is
# not re-imported here.
# Load the IMDb reviews as integer word-id sequences, passing num_words=10000
# to cap the vocabulary.
(train_data, y_train), (test_data, y_test) = imdb.load_data(num_words=10000)

# Filter out reserved tokens for word2vec-style training.
# Per the Keras load_data defaults (start_char=1, oov_char=2, index_from=3),
# ids 0-3 are padding / start-of-sequence / out-of-vocabulary / unused rather
# than real words.
RESERVED = {0, 1, 2, 3}
filtered = [
    [token_id for token_id in seq if token_id not in RESERVED]
    for seq in train_data
]


In [27]:
# `tf` and `np` come from the notebook's first (imports) cell.
# NOTE: this rebinds `vocab_size` / `window_size`, which the toy example in the
# earlier cells also uses; re-running those cells after this one will pick up
# these values instead.
vocab_size = 10000  # same value as num_words passed to imdb.load_data
window_size = 2
num_reviews = 20000  # number of filtered reviews to draw skip-gram pairs from

# Collect positive (target, context) pairs review by review.
# negative_samples=0 -> no negative pairs, so the returned labels are ignored.
pairs = []
for seq in filtered[:num_reviews]:
    sg, _ = tf.keras.preprocessing.sequence.skipgrams(
        seq, vocabulary_size=vocab_size,
        window_size=window_size, negative_samples=0,
        seed=SEED,
    )
    pairs.extend(sg)

# Convert once to an (N, 2) int32 array and slice the columns, instead of
# walking the full Python list twice; reshape keeps the empty case at (0, 2).
pair_array = np.asarray(pairs, dtype=np.int32).reshape(-1, 2)
targets = tf.constant(pair_array[:, 0], dtype=tf.int32)
contexts = tf.constant(pair_array[:, 1], dtype=tf.int32)
