In [1]:
import random
from word_embeddings import Word2Vec
import numpy as np
import tensorflow as tf
from tqdm import tqdm
from typing import List
from collections import deque

To-Do:
- Fix Skipgram
- Fix index issues where word indices fall outside vocab_size
- Build test method/function for use when imported
- Test word2vec embeddings with wiki dataset
- Fix fit method to match fit_from_tokens
- Clean the wiki dataset by removing extra sections at the end (e.g., "See also", "References")

In [2]:
# Download (or reuse the Keras-cached copy of) the Shakespeare corpus.
path_to_file = tf.keras.utils.get_file(
    'shakespeare.txt',
    'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt',
)

# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default encoding, then split on any whitespace (spaces and newlines alike),
# which yields the same tokens as splitting line by line.
with open(path_to_file, encoding='utf-8') as f:
    words = f.read().split()

print(f'Number of words: {len(words)}')

# Train a CBOW model on the raw (uncleaned) tokens and run its evaluation.
new_model = Word2Vec(vocab_size=12000,
                     batch_size=8,
                     num_skips=4,
                     skip_window=2,
                     architecture='cbow')
new_model.fit(words)
new_model.eval()

Number of words: 202651

Tokenizing words...

Training model...


Progress:   0%|           | Elapsed: 00:01 | ETA: 07:59 | 20.80it/s  

Average loss at step 0: 6.7142415046691895


Progress:  20%|██         | Elapsed: 00:09 | ETA: 00:36 | 218.89it/s

Average loss at step 2000: 0.0020651656668633223


Progress:  40%|████       | Elapsed: 00:18 | ETA: 00:24 | 246.07it/s

Average loss at step 4000: 0.00010465641389600933


Progress:  60%|██████     | Elapsed: 00:26 | ETA: 00:18 | 216.10it/s

Average loss at step 6000: 6.053332253941335e-05


Progress:  80%|████████   | Elapsed: 00:34 | ETA: 00:09 | 208.95it/s

Average loss at step 8000: 4.3287454900564626e-05


Progress: 100%|██████████ | Elapsed: 00:43 | ETA: 00:00 | 227.96it/s

Average loss at step 10000: 3.232683957321569e-05

Training has completed successfully.

Similar indices for 'king': [   34  7989   442 10166  5892]

Embedding for 'king': [-0.03580251 -0.02700209  0.06671405  0.01987702 -0.0750451   0.03752913
  0.09966585  0.08348835 -0.08564416 -0.02801651  0.07077959 -0.01931835
 -0.11053766  0.03350592  0.07308815  0.07094621  0.14404781  0.00347055
 -0.01321465  0.11222426 -0.01781661 -0.11343024 -0.09006338 -0.1465569
  0.08967006 -0.13756578  0.07960131 -0.02542244 -0.12340839  0.0739371
 -0.00245022 -0.0705544  -0.14847781 -0.02540014 -0.0341523  -0.11598612
  0.14166783 -0.11128858  0.00839817 -0.03771213  0.06894025  0.06559727
  0.09567337  0.08064143 -0.11779281  0.0493288   0.12730995  0.1357397
  0.1379387  -0.14997934 -0.00221588  0.0552395   0.03114419  0.02987166
 -0.00979318 -0.0303425  -0.08213712  0.04316232 -0.11107747 -0.01184592
  0.09652275 -0.10239444  0.03358982 -0.07328041  0.09003548  0.13210207
  0.06599475 -0.1328156  -0.




In [3]:
# Collapse the tokens currently held in `words` into their distinct values.
unique_words = {*words}
print(f"Unique words in shakespeare: {len(unique_words)}")

Unique words in shakespeare: 25670


In [4]:
import string
from collections import Counter

vocab_size = 50000

## Corpus pre-processing: lowercase every token, delete ASCII punctuation,
## and keep only tokens that are purely alphabetic afterwards
translator = str.maketrans('', '', string.punctuation)
words = [
    cleaned
    for cleaned in (raw.lower().translate(translator) for raw in words)
    if cleaned.isalpha()
]

## Vocabulary counts: a leading <unk> placeholder (count -1) for rare words,
## followed by the (vocab_size - 1) most frequent words
count = [['<unk>', -1], *Counter(words).most_common(vocab_size - 1)]

print(f"Counted words in shakespeare: {len(count)}")

Counted words in shakespeare: 12848


In [None]:
def skipgram(data: List[int], batch_size: int, num_skips: int, skip_window: int, data_index: int = 0):
    """
    Generate a batch of data for the skip-gram model.

    A window of ``2 * skip_window + 1`` consecutive entries slides over ``data``
    (wrapping around at the end). For each window position the centre word is
    emitted ``num_skips`` times as the input, each time paired with a different
    randomly chosen context word from the same window as the label.

    Parameters:
    data:        List of word indices.
    batch_size:  Number of (input, label) pairs in the batch. Must be smaller
                 than len(data) and a multiple of num_skips.
    num_skips:   How many times to reuse an input to generate a label.
                 Must not exceed 2 * skip_window.
    skip_window: How many words to consider left and right.
    data_index:  Index to start with in the data list. Default is 0.
                 Values outside the list are wrapped modulo len(data).

    Returns:
    Tuple[np.ndarray, np.ndarray]: Batch of input words with shape
    (batch_size,) and the corresponding labels with shape (batch_size, 1),
    both int32.

    Raises:
    AssertionError: If the size constraints listed above are violated.

    Note:
    The advanced read position is not returned. A caller that wants successive
    batches must pass its own ``data_index``; one call consumes
    ``2 * skip_window + 1 + batch_size // num_skips`` entries of ``data``.
    """
    assert batch_size < len(data), "batch_size must be smaller than len(data)"
    assert num_skips <= 2 * skip_window, "num_skips cannot exceed 2 * skip_window"
    # Without this check the last (batch_size % num_skips) rows would never be
    # written and would expose uninitialised memory.
    assert batch_size % num_skips == 0, "batch_size must be a multiple of num_skips"

    batch = np.empty(batch_size, dtype=np.int32)
    labels = np.empty((batch_size, 1), dtype=np.int32)
    window_size = 2 * skip_window + 1

    # Every window slot except the centre is a candidate context position.
    context_positions = [pos for pos in range(window_size) if pos != skip_window]

    # Wrap the start position the same way the sliding window wraps below.
    data_index %= len(data)

    # Fill the sliding window with the first window_size entries.
    buffer = deque(maxlen=window_size)
    for _ in range(window_size):
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)

    # Emit num_skips (centre, context) pairs per window position.
    for i in range(batch_size // num_skips):
        # Distinct context slots in random order; never the centre itself.
        chosen = random.sample(context_positions, num_skips)
        for j, target in enumerate(chosen):
            row = i * num_skips + j
            batch[row] = buffer[skip_window]
            labels[row, 0] = buffer[target]

        # Slide the window one step; the deque drops the oldest entry.
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)

    return batch, labels

In [None]:
## Author status note: CBOW is functioning / Skipgram is not functioning -- Code below can be made into a debug function
## NOTE(review): `prepareData` and `cbow` are not defined or imported in the visible cells;
## confirm they are available on a fresh kernel before relying on this cell.

data, count, dictionary, reverse_dictionary = prepareData(words)

# Batch-generation settings shared by the CBOW and skip-gram samplers below
batch_size = 8
num_skips = 4
skip_window = 2

# Never index past the end of a batch when printing examples
n_examples = min(5, batch_size)

# Print metrics
print(f"Corpus length: {len(data)}")
print(f"Encoded corpus sample: {data[:20]}")
print(f"Decoded corpus sample: {[reverse_dictionary[i] for i in data[:20]]}")

print(f"\nVocabulary size: {len(count)}")
print(f"Index Map size: {len(dictionary)}")
print(f"\nMost common words:\n{sorted(count, key=lambda x: x[1], reverse=True)[:5]}")
print("Index Map Examples:\n", {k: reverse_dictionary[k] for k in list(reverse_dictionary)[:5]})

# Print the first few examples of CBOW batches
print("\nExamples of CBOW batches:")
cbow_batch, cbow_labels = cbow(data, batch_size, num_skips, skip_window)
print(f"Batch shape: {cbow_batch.shape}")
print(f"Labels shape: {cbow_labels.shape}")
for i in range(n_examples):
    # Each CBOW row is iterated as a collection of context word indices
    context_words = [reverse_dictionary[idx] for idx in cbow_batch[i]]
    target_word = reverse_dictionary[cbow_labels[i, 0]]
    print(f"Context words: {context_words}, Target word: {target_word}")

# Print the first few examples of Skip-gram batches
print("\nExamples of Skip-gram batches:")
skipgram_batch, skipgram_labels = skipgram(data, batch_size, num_skips, skip_window)
print(f"Batch shape: {skipgram_batch.shape}")
print(f"Labels shape: {skipgram_labels.shape}")
for i in range(n_examples):
    # Row i pairs ONE input word with ONE label; decoding the whole batch here
    # (as before) printed every input word on every line and hid the pairing.
    input_word = reverse_dictionary[skipgram_batch[i]]
    output_word = reverse_dictionary[skipgram_labels[i, 0]]
    print(f"Input word: {input_word}, Output word: {output_word}")

In [None]:
# Forward slashes are accepted by Python on Windows as well as POSIX systems,
# whereas the previous backslash separator only resolved on Windows.
pickle_file_path = 'Resources/corpus.pkl'
batch_size = 128
num_skips = 4
skip_window = 2

# NOTE(review): elsewhere in this notebook `fit` is called with a list of words;
# here it receives a pickle path -- confirm `Word2Vec.fit` supports both inputs.
word2vec = Word2Vec(architecture='cbow', batch_size=batch_size, num_skips=num_skips, skip_window=skip_window)
model = word2vec.fit(pickle_file_path)

In [None]:
# NOTE(review): test_data, test_count, test_dictionary and test_reverse_dictionary
# are not defined in the visible cells -- confirm they exist on a fresh kernel.
test_model = Word2Vec(architecture='cbow', batch_size=1000, num_skips=4, skip_window=2, vocab_size=500000, n_steps=1)
test_model.fit_from_tokens(test_data, test_count, test_dictionary, test_reverse_dictionary)
test_model.eval()

# Start from "missing" so a failed lookup cannot leave a name undefined
# (or stale from an earlier run) when the analogy is computed below.
embedding_king = embedding_man = embedding_woman = None

# Find the embedding for a word
try:
    embedding_king = test_model.get_embedding('king')
    # Previously interpolated the undefined name `embedding`, which raised a
    # NameError that the ValueError handler did not catch.
    print(f"Embedding for 'king': {embedding_king}")
except ValueError as e:
    print(e)

# Find similar words
try:
    similar_words = test_model.similar_by_word('king')
    print(f"Words similar to 'king': {similar_words}")
except ValueError as e:
    print(e)

# Find the embeddings for the remaining analogy words
try:
    embedding_man = test_model.get_embedding('man')
    embedding_woman = test_model.get_embedding('woman')
except ValueError as e:
    print(e)

if embedding_king is None or embedding_man is None or embedding_woman is None:
    # At least one lookup failed above (its error was already printed).
    print("Skipping analogy: an embedding for 'king', 'man' or 'woman' is unavailable.")
else:
    # Perform the vector arithmetic: king - man + woman
    result_vector = embedding_king - embedding_man + embedding_woman

    # Find the word closest to the resulting vector
    try:
        closest_word = test_model.similar_by_vector(result_vector, topn=1)[0][0]
        print(f"The word closest to result is: {closest_word}")
    except ValueError as e:
        print(e)