In [1]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
%matplotlib inline
!pip install adjustText
import zipfile
import re
import numpy as np
import pandas as pd
import os
import random
import tensorflow as tf
import matplotlib.pyplot as plt
from six.moves.urllib.request import urlretrieve
from sklearn.manifold import TSNE
from adjustText import adjust_text

Collecting adjustText
  Downloading adjustText-1.2.0-py3-none-any.whl.metadata (3.0 kB)
Downloading adjustText-1.2.0-py3-none-any.whl (12 kB)
Installing collected packages: adjustText
Successfully installed adjustText-1.2.0


In [2]:
def download_data(url, data_dir):

  os.makedirs(data_dir, exist_ok=True)

  filepath = os.path.join(data_dir, 'bbc-fulltext.zip')

  if not os.path.exists(filepath):
    print('Downloading file ...')
    filename, _ = urlretrieve(url, filepath)
  else:
    print('File already exists')

  extract_path = os.path.join(data_dir, 'bbc')
  if not os.path.exists(extract_path):
    with zipfile.ZipFile(
        os.path.join(data_dir, 'bbc-fulltext.zip'),
        'r'
    ) as zipf:
      zipf.extractall(data_dir)
  else:
    print('bbc-fulltext.zip has already been extracted')

In [3]:
url = 'http://mlg.ucd.ie/files/datasets/bbc-fulltext.zip'
download_data(url, 'data')

Downloading file ...


In [4]:
def read_data(data_dir):

    # This will contain the full list of stories
    news_stories = []

    print("Reading files")

    i = 0 # Just used for printing progress
    for root, dirs, files in os.walk(data_dir):

        for fi, f in enumerate(files):

            # We don't read the readme file
            if 'README' in f:
                continue

            # Printing progress
            i += 1
            print("."*i, f, end='\r')

            # Open the file
            with open(os.path.join(root, f), encoding='latin-1') as f:

                story = []
                # Read all the lines
                for row in f:

                    story.append(row.strip())

                # Create a single string with all the rows in the doc
                story = ' '.join(story)
                # Add that to the list
                news_stories.append(story)

        print('', end='\r')

    print(f"\nDetected {len(news_stories)} stories")
    return news_stories


news_stories = read_data(os.path.join('data', 'bbc'))

# Printing some stats and sample data
print(f"{sum([len(story.split(' ')) for story in news_stories])} words found in the total news set")
print('Example words (start): ',news_stories[0][:50])
print('Example words (end): ',news_stories[-1][-50:])

Reading files
..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

In [5]:
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(
    num_words=None,
    filters = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    lower = True,
    split = ' '
)
tokenizer.fit_on_texts(news_stories)
print("Data fitted on the tokenizer")

Data fitted on the tokenizer


In [6]:
n_vocab = len(tokenizer.word_index.items()) + 1
print(f"Vocabulary size: {n_vocab}")

print("\nWords at the top")
print('\t', dict(list(tokenizer.word_index.items())[:10]))
print("\nWords at the bottom")
print('\t', dict(list(tokenizer.word_index.items())[-10:]))

Vocabulary size: 32360

Words at the top
	 {'the': 1, 'to': 2, 'of': 3, 'and': 4, 'a': 5, 'in': 6, 'for': 7, 'is': 8, 'that': 9, 'on': 10}

Words at the bottom
	 {'dz': 32350, 'bernd': 32351, 'weidensteiner': 32352, 'sinn': 32353, 'obstruct': 32354, 'avery': 32355, 'shenfeld': 32356, 'cibc': 32357, 'sohn': 32358, 'geopolitics': 32359}


In [7]:
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(
    num_words = 15000,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    lower=True, split=' ', oov_token='',
)
tokenizer.fit_on_texts(news_stories)
print("Data fitted on the tokenizer")

Data fitted on the tokenizer


In [8]:
print(f"Original: {news_stories[0][:100]}")
print(f"Sequence IDs: {tokenizer.texts_to_sequences([news_stories[0][:100]])[0]}")

Original: Labour in constituency race row  Labour's choice of a white candidate for one of the UK's most multi
Sequence IDs: [126, 7, 2795, 630, 1016, 787, 661, 4, 6, 1094, 2432, 8, 50, 4, 2, 945, 113, 2391]


**Converting all articles to word ID sequences**

In [9]:
news_sequences = tokenizer.texts_to_sequences(news_stories)

**Generating skip-grams from the corpus**

In [10]:
sample_word_ids = news_sequences[0][:5]
sample_phrase = ' '.join([tokenizer.index_word[wid] for wid in sample_word_ids])
print(f"Sample phrase: {sample_phrase}")
print(f"Sample word IDs: {sample_word_ids}\n")

window_size = 1 # How many words to consider left and right.

inputs, labels = tf.keras.preprocessing.sequence.skipgrams(
    sample_word_ids,
    vocabulary_size=n_vocab,
    window_size=window_size, negative_samples=1.0, shuffle=False,
    categorical=False, sampling_table=None, seed=None
)


print("Sample skip-grams")

for inp, lbl in zip(inputs, labels):
    print(f"\tInput: {inp} ({[tokenizer.index_word[wi] for wi in inp]}) / Label: {lbl}")

Sample phrase: labour in constituency race row
Sample word IDs: [126, 7, 2795, 630, 1016]

Sample skip-grams
	Input: [126, 7] (['labour', 'in']) / Label: 1
	Input: [7, 126] (['in', 'labour']) / Label: 1
	Input: [7, 2795] (['in', 'constituency']) / Label: 1
	Input: [2795, 7] (['constituency', 'in']) / Label: 1
	Input: [2795, 630] (['constituency', 'race']) / Label: 1
	Input: [630, 2795] (['race', 'constituency']) / Label: 1
	Input: [630, 1016] (['race', 'row']) / Label: 1
	Input: [1016, 630] (['row', 'race']) / Label: 1
	Input: [126, 14684] (['labour', 'alabama']) / Label: 0
	Input: [630, 6407] (['race', 'totalling']) / Label: 0
	Input: [1016, 12663] (['row', 'compress']) / Label: 0
	Input: [7, 16213] (['in', 'â£947m']) / Label: 0
	Input: [2795, 21465] (['constituency', "mackenzie's"]) / Label: 0
	Input: [2795, 12498] (['constituency', 'fog']) / Label: 0
	Input: [7, 27145] (['in', 'pavlikowsky']) / Label: 0
	Input: [630, 23255] (['race', 'centimetre']) / Label: 0


**Generating negative candidates**

In [11]:
inputs, labels = tf.keras.preprocessing.sequence.skipgrams(
    sample_word_ids,
    vocabulary_size=len(tokenizer.word_index.items())+1,
    window_size=window_size, negative_samples=0, shuffle=False,
)

inputs, labels = np.array(inputs), np.array(labels)

negative_sampling_candidates, true_expected_count, sampled_expected_count = tf.random.log_uniform_candidate_sampler(
    # A true context word that appears in the context of the target
    true_classes=inputs[:1,1:], # [b, 1] sized tensor
    num_true=1, # number of true words per example
    num_sampled=10,
    unique=True,
    range_max=n_vocab,
    name="negative_sampling"
)

print(f"Positive sample: {inputs[:1,1:]}")
print(f"Negative samples: {negative_sampling_candidates}")
print(f"true_expected_count: {true_expected_count}")
print(f"sampled_expected_count: {sampled_expected_count}")

Positive sample: [[7]]
Negative samples: [   39  3099     3   336 20913     7    83   717  8621   109]
true_expected_count: [[0.11341967]]
sampled_expected_count: [2.37778574e-02 3.10580333e-04 2.14877039e-01 2.85319984e-03
 4.60424235e-05 1.13419674e-01 1.13960411e-02 1.34022883e-03
 1.11679241e-04 8.71457718e-03]


**Using tf.nn.sampled_softmax_loss()**

In [12]:
sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(n_vocab, sampling_factor=1e-05)
print(sampling_table)

[0.00315225 0.00315225 0.00547597 ... 1.         1.         1.        ]


**Generating data (positive + negative candidates)**

In [13]:
def skip_gram_data_generator(sequences, window_size, batch_size, negative_samples, vocab_size, seed=None):

    rand_sequence_ids = np.arange(len(sequences))
    np.random.shuffle(rand_sequence_ids)


    for si in rand_sequence_ids:

        positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
            sequences[si],
            vocabulary_size=vocab_size,
            window_size=window_size,
            negative_samples=0.0,
            shuffle=False,
            sampling_table=sampling_table,
            seed=seed
        )

        targets, contexts, labels = [], [], []

        for target_word, context_word in positive_skip_grams:
            context_class = tf.expand_dims(tf.constant([context_word], dtype="int64"), 1)

            negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
              true_classes=context_class,
              num_true=1,
              num_sampled=negative_samples,
              unique=True,
              range_max=vocab_size,
              name="negative_sampling")

            # Build context and label vectors (for one target word)
            context = tf.concat(
                [tf.constant([context_word], dtype='int64'), negative_sampling_candidates],
                axis=0
            )

            label = tf.constant([1] + [0]*negative_samples, dtype="int64")

            # Append each element from the training example to global lists.
            targets.extend([target_word]*(negative_samples+1))
            contexts.append(context)
            labels.append(label)

        contexts, targets, labels = np.concatenate(contexts), np.array(targets), np.concatenate(labels)

        assert contexts.shape[0] == targets.shape[0]
        assert contexts.shape[0] == labels.shape[0]

        # If seed is not provided generate a random one
        if not seed:
            seed = random.randint(0, 10e6)

        np.random.seed(seed)
        np.random.shuffle(contexts)
        np.random.seed(seed)
        np.random.shuffle(targets)
        np.random.seed(seed)
        np.random.shuffle(labels)


        for eg_id_start in range(0, contexts.shape[0], batch_size):
            yield (
                targets[eg_id_start: min(eg_id_start+batch_size, targets.shape[0])],
                contexts[eg_id_start: min(eg_id_start+batch_size, contexts.shape[0])]
            ), labels[eg_id_start: min(eg_id_start+batch_size, labels.shape[0])]


news_skip_gram_gen = skip_gram_data_generator(
    news_sequences, 4, 10, 5, n_vocab
)

for btc, bl in news_skip_gram_gen:

    print(btc)
    print(bl)

    break

(array([ 1407,  3820,  8291,  2936,  3354, 14185,  8145,  5752,    26,
        5767]), array([    2,  3592, 24048, 16115,  3820,  1769,   431,     6,    28,
         191]))
[0 0 0 0 1 0 1 0 0 0]


**Skip -Gram Algorithm**

In [14]:
#Defining Hyperparameters

batch_size = 4096 # Data points in a single batch

embedding_size = 128 # Dimension of the embedding vector.

window_size=1 # We use a window size of n on either side of target word
negative_samples = 4 # Number of negative samples generated per example

epochs = 5 # Number of epochs to train for

# We pick a random validation set to sample nearest neighbors
valid_size = 16 # Random set of words to evaluate similarity on.
# We sample valid datapoints randomly from a large window without always being deterministic
valid_window = 250

# When selecting valid examples, we select some of the most frequent words as well as
# some moderately rare words as well
np.random.seed(54321)
random.seed(54321)

valid_term_ids = np.array(random.sample(range(valid_window), valid_size))
valid_term_ids = np.append(
    valid_term_ids, random.sample(range(1000, 1000+valid_window), valid_size),
    axis=0
)

In [15]:
#Defining the Model

import tensorflow.keras.backend as K

K.clear_session()

# Inputs - skipgrams() function outputs target, context in that order
# we will use the same order
input_1 = tf.keras.layers.Input(shape=(), name='target')
input_2 = tf.keras.layers.Input(shape=(), name='context')

# Two embeddings layers are used one for the context and one for the target
context_embedding_layer = tf.keras.layers.Embedding(
    input_dim=n_vocab, output_dim=embedding_size, name='context_embedding'
)
target_embedding_layer = tf.keras.layers.Embedding(
    input_dim=n_vocab, output_dim=embedding_size, name='target_embedding'
)

# Lookup outputs of the embedding layers
target_out = target_embedding_layer(input_1)
context_out = context_embedding_layer(input_2)

# Computing the dot product between the two
out = tf.keras.layers.Dot(axes=-1)([context_out, target_out])

# Defining the model
skip_gram_model = tf.keras.models.Model(inputs=[input_1, input_2], outputs=out, name='skip_gram_model')

# Compiling the model
skip_gram_model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    optimizer='adam'
)

skip_gram_model.summary()

In [16]:
#Calculating Word Similarities

class ValidationCallback(tf.keras.callbacks.Callback):

    def __init__(self, valid_term_ids, model_with_embeddings, tokenizer):

        self.valid_term_ids = valid_term_ids
        self.model_with_embeddings = model_with_embeddings
        self.tokenizer = tokenizer

        super().__init__()

    def on_epoch_end(self, epoch, logs=None):
        """ Validation logic """

        # We will use context embeddings to get the most similar words
        # Other strategies include: using target embeddings, mean embeddings after avaraging context/target
        embedding_weights = self.model_with_embeddings.get_layer("context_embedding").get_weights()[0]
        normalized_embeddings = embedding_weights / np.sqrt(np.sum(embedding_weights**2, axis=1, keepdims=True))

        # Get the embeddings corresponding to valid_term_ids
        valid_embeddings = normalized_embeddings[self.valid_term_ids, :]

        # Compute the similarity between valid_term_ids and all the embeddings
        # V x d (d x D) => V x D
        top_k = 5 # Top k items will be displayed
        similarity = np.dot(valid_embeddings, normalized_embeddings.T)

        # Invert similarity matrix to negative
        # Ignore the first one because that would be the same word as the probe word
        similarity_top_k = np.argsort(-similarity, axis=1)[:, 1: top_k+1]

        # Print the output
        for i, term_id in enumerate(valid_term_ids):

            similar_word_str = ', '.join([self.tokenizer.index_word[j] for j in similarity_top_k[i, :] if j >= 1])
            print(f"{self.tokenizer.index_word[term_id]}: {similar_word_str}")

        print('\n')

In [17]:
# Running the Skip-Gram Algorithm

skipgram_validation_callback = ValidationCallback(valid_term_ids, skip_gram_model, tokenizer)

for ei in range(epochs):

    print(f"Epoch: {ei+1}/{epochs} started")

    news_skip_gram_gen = skip_gram_data_generator(
        news_sequences, window_size, batch_size, negative_samples, n_vocab
    )

    skip_gram_model.fit(
        news_skip_gram_gen, epochs=1,
        callbacks=skipgram_validation_callback,
    )

Epoch: 1/5 started
   2233/Unknown [1m333s[0m 148ms/step - loss: 0.6367

  self.gen.throw(typ, value, traceback)


election: attorney, anticipated, chong, tell, vido
months: weeks, years, complaints, vote', affected
with: 121, shadowing, statistical, beverley, against
you: they, we, sure, what, do
were: then, lost, only, mean, nothing
win: likely, issue, serve, lot, trying
those: doing, trying, lot, affected, mainstream
music: way, trying, lot, concerned
also: still, they, nothing, able, already
international: came, religious, flooded, programmes, miscellaneous
best: end, responsibility, surprised, forming
down: able, can't, starting, now, machine
too: taken, put, escape, fines, lot
some: trying, case, side's, account
through: me, boosted, possible, laptop, accusations
mr: tony, said, gordon, charles, michael
fast: understand, remake, affected, barring, take
road: jameson, forsyth, proud, lifted, divided
bush: replay, background, weeks', tally, moguls
significant: according, kind, trying, spider, failure
reached: trying, prepare, status, keen, heading
serious: driven, others, see, jinshajiang, used

In [18]:
def save_embeddings(model, tokenizer, vocab_size, save_dir):

    os.makedirs(save_dir, exist_ok=True)

    _, words_sorted = zip(*sorted(list(tokenizer.index_word.items()), key=lambda x: x[0])[:vocab_size-1])

    words_sorted = [None] + list(words_sorted)

    pd.DataFrame(
        model.get_layer("context_embedding").get_weights()[0],
        index = words_sorted
    ).to_pickle(os.path.join(save_dir, "context_embedding.pkl"))

    pd.DataFrame(
        model.get_layer("target_embedding").get_weights()[0],
        index = words_sorted
    ).to_pickle(os.path.join(save_dir, "target_embedding.pkl"))


save_embeddings(skip_gram_model, tokenizer, n_vocab, save_dir='skipgram_embeddings')