In [1]:
%pip install -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install setuptools

Note: you may need to restart the kernel to use updated packages.


Lesson derived from https://www.tensorflow.org/text/guide/word_embeddings

# Word Embeddings

Word embeddings are ways of representing words as numerical vectors. These vectors can account for a word's context, allowing us to calculate its similarity to other words in its corpus: words that appear in the same context might be synonyms, or they might simply be used in similar ways.

Consider the examples "run" and "walk": a good word embeddings model would have these words very close in vector-space; similarly, "mother" and "father" or "Boston" and "Massachusetts".

## One-hot encodings

One naïve way to capture this information is to mark each word's position in the vocabulary with a 1 and every other position with a 0. This is known as **one-hot encoding**, and it typically doesn't get us very far: each word's vector has to be the length of the vocabulary, even though it is almost entirely filled with 0s.

This is what is known as a "sparse" representation.

## Unique numbers

As a second attempt, you might assign each word a unique number. But this approach, while "denser" than a one-hot encoding, fails to capture any contextual information about a given word.

## Enter word embeddings

Word embeddings are dense vector representations of the words in a corpus, learned by training a model. Embeddings for smaller corpora might have as few as 8 dimensions, while embeddings for larger corpora can have up to 1024 dimensions.

We're going to attempt to train an embeddings model on the Greek texts in `./data`.

In [3]:
import tensorflow as tf

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.layers import TextVectorization

2024-11-13 17:13:13.546617: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-11-13 17:13:19.699831: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-11-13 17:13:22.595172: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1731518007.893889    2163 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1731518008.913552    2163 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-13 17:13:38.655999: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU ins

In [4]:
import random

batch_size = 1024
# A fixed seed keeps the train/validation split identical across kernel
# restarts, so the results recorded below can be reproduced. Both calls must
# share the same seed so the two subsets do not overlap.
seed = 42
train_ds = tf.keras.utils.text_dataset_from_directory(
    "data", batch_size=batch_size, validation_split=0.2, subset="training", seed=seed
)
val_ds = tf.keras.utils.text_dataset_from_directory(
    "data", batch_size=batch_size, validation_split=0.2, subset="validation", seed=seed
)

Found 31 files belonging to 3 classes.
Using 25 files for training.


2024-11-13 17:14:06.679567: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


Found 31 files belonging to 3 classes.
Using 6 files for validation.


In [5]:
AUTOTUNE = tf.data.AUTOTUNE

# Cache each split after its first pass and let tf.data choose the prefetch
# buffer size.
train_ds, val_ds = (
    split.cache().prefetch(buffer_size=AUTOTUNE) for split in (train_ds, val_ds)
)

In [6]:
# Demo layer: a lookup table with 1000 rows (one per token id) of 5 values each.
embedding_layer = Embedding(input_dim=1000, output_dim=5)

In [7]:
# Look up the vectors for three token ids; the layer has not been trained yet.
sample_ids = tf.constant([1, 2, 3])
result = embedding_layer(sample_ids)
result.numpy()

array([[-0.02184906,  0.01496544,  0.04675484, -0.02853587, -0.02673086],
       [-0.01613537, -0.01207625, -0.03244214,  0.00352241, -0.04177449],
       [ 0.03279482, -0.01017033,  0.00553141, -0.04403266,  0.04552792]],
      dtype=float32)

In [8]:
# A batch of two sequences with three token ids each: the output keeps the
# input shape and appends one axis of size 5 for the embedding values.
batched_ids = tf.constant([[0, 1, 2], [3, 4, 5]])
result = embedding_layer(batched_ids)
result.shape

TensorShape([2, 3, 5])

In [9]:
import re
import string

def custom_standardization(input_data):
  """Normalize raw text before it is tokenized.

  Lowercases the input, replaces literal ``<br />`` tags with a space and
  removes every character listed in ``string.punctuation``.

  Args:
    input_data: String tensor of raw text, as passed in by the
      TextVectorization layer that uses this function as its standardizer.

  Returns:
    String tensor holding the cleaned text.
  """
  # encoding='utf-8' makes the case folding Unicode-aware. The default only
  # lowercases ASCII letters, so non-ASCII capitals would be left untouched
  # and the same word could end up as two distinct vocabulary entries.
  lowercase = tf.strings.lower(input_data, encoding='utf-8')
  stripped_html = tf.strings.regex_replace(lowercase, '<br />', ' ')
  # Character class matching any single ASCII punctuation mark.
  punctuation_pattern = '[%s]' % re.escape(string.punctuation)
  return tf.strings.regex_replace(stripped_html, punctuation_pattern, '')


# Upper bound on the vocabulary size, and the fixed number of tokens every
# vectorized sample is brought to.
vocab_size = 10000
sequence_length = 100

# The vectorization layer standardizes text (through custom_standardization),
# splits it into tokens and maps each token to an integer id. Samples differ
# in length, so output_sequence_length pads or truncates each one to exactly
# sequence_length ids.
vectorize_layer = TextVectorization(
    max_tokens=vocab_size,
    standardize=custom_standardization,
    output_mode='int',
    output_sequence_length=sequence_length,
)

# Drop the labels and build the vocabulary from the training text only.
text_ds = train_ds.map(lambda text, _label: text)
vectorize_layer.adapt(text_ds)

2024-11-13 17:14:25.441425: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [10]:
embedding_dim=16

model = Sequential([
  vectorize_layer,
  Embedding(vocab_size, embedding_dim, name="embedding"),
  GlobalAveragePooling1D(),
  Dense(16, activation='relu'),
  Dense(1)
])

In [11]:
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="logs")

In [12]:
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])

In [13]:
# Train for 15 epochs, evaluating on the validation split after each epoch
# and writing logs through the TensorBoard callback.
model.fit(x=train_ds, validation_data=val_ds, epochs=15, callbacks=[tensorboard_callback])

Epoch 1/15
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step - accuracy: 0.2400 - loss: 0.6917 - val_accuracy: 0.1667 - val_loss: 0.6865
Epoch 2/15
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 127ms/step - accuracy: 0.2400 - loss: 0.6871 - val_accuracy: 0.1667 - val_loss: 0.6803
Epoch 3/15
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 101ms/step - accuracy: 0.2400 - loss: 0.6820 - val_accuracy: 0.1667 - val_loss: 0.6737
Epoch 4/15
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 92ms/step - accuracy: 0.2400 - loss: 0.6766 - val_accuracy: 0.1667 - val_loss: 0.6669
Epoch 5/15
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 134ms/step - accuracy: 0.2400 - loss: 0.6711 - val_accuracy: 0.1667 - val_loss: 0.6599
Epoch 6/15
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 163ms/step - accuracy: 0.2400 - loss: 0.6655 - val_accuracy: 0.1667 - val_loss: 0.6528
Epoch 7/15
[1m1/1[0m [32m━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x7d8249e5f470>

In [14]:
# The first weight array of the "embedding" layer is the embedding matrix;
# the vocabulary comes from the vectorization layer.
embedding = model.get_layer(name='embedding')
weights = embedding.get_weights()[0]
vocab = vectorize_layer.get_vocabulary()

In [15]:
import io

# Export the embeddings as two line-aligned TSV files: vectors.tsv holds one
# tab-separated vector per line, metadata.tsv the matching word per line.
# The with-block closes both files even if a write raises part-way through.
with io.open('vectors.tsv', 'w', encoding='utf-8') as out_v, \
     io.open('metadata.tsv', 'w', encoding='utf-8') as out_m:
  for index, word in enumerate(vocab):
    if index == 0:
      continue  # skip 0, it's padding.
    vec = weights[index]
    out_v.write('\t'.join(str(x) for x in vec) + "\n")
    out_m.write(word + "\n")

['',
 '[UNK]',
 np.str_('the'),
 np.str_('to'),
 np.str_('of'),
 np.str_('and'),
 np.str_('i'),
 np.str_('you'),
 np.str_('a'),
 np.str_('in'),
 np.str_('my'),
 np.str_('for'),
 np.str_('is'),
 np.str_('that'),
 np.str_('me'),
 np.str_('not'),
 np.str_('this'),
 np.str_('with'),
 np.str_('it'),
 np.str_('your'),
 np.str_('he'),
 np.str_('will'),
 np.str_('but'),
 np.str_('his'),
 np.str_('from'),
 np.str_('have'),
 np.str_('by'),
 np.str_('what'),
 np.str_('as'),
 np.str_('be'),
 np.str_('no'),
 np.str_('on'),
 np.str_('are'),
 np.str_('who'),
 np.str_('do'),
 np.str_('her'),
 np.str_('was'),
 np.str_('all'),
 np.str_('if'),
 np.str_('so'),
 np.str_('thou'),
 np.str_('him'),
 np.str_('at'),
 np.str_('now'),
 np.str_('thy'),
 np.str_('their'),
 np.str_('or'),
 np.str_('when'),
 np.str_('one'),
 np.str_('am'),
 np.str_('we'),
 np.str_('they'),
 np.str_('man'),
 np.str_('then'),
 np.str_('has'),
 np.str_('our'),
 np.str_('she'),
 np.str_('come'),
 np.str_('would'),
 np.str_('may'),
 np.st