# BERT Step by Step: Word Embeddings

Word embeddings are a fundamental technique in natural language processing that represents words as dense vectors in a continuous vector space, capturing semantic relationships between words.

Unlike traditional one-hot encoding, where each word is represented by a sparse binary vector, word embeddings encode semantic meaning and context, allowing algorithms to better understand the relationships between words in a text. Embeddings are learned from large corpora of text data, using techniques such as Word2Vec or GloVe or, as in BERT, jointly with the rest of the model during pre-training.

In [None]:
import torch
import matplotlib.pyplot as plt
from transformers import AutoConfig, AutoTokenizer
from transformers import BertForPreTraining

import utils
%config InlineBackend.figure_format = 'svg'

In [None]:
# Every artifact below is loaded from the same Hugging Face checkpoint name.
model_checkpoint = 'bert-base-uncased'

config = AutoConfig.from_pretrained(model_checkpoint)
model = BertForPreTraining.from_pretrained(model_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
# Encode a sample sentence into token ids, returned as a PyTorch tensor.
sample_text = "let's tokenize something?"
encoding = tokenizer.encode(sample_text, return_tensors="pt")
encoding

In [None]:
# Map the ids back to their token strings to see how the sentence was split.
flat_ids = encoding.flatten()
tokens = tokenizer.convert_ids_to_tokens(flat_ids)
tokens

In [None]:
# Display the full pre-training model (its repr is shown as the cell output).
model

In [None]:
# Display only the embeddings sub-module of the BERT encoder.
model.bert.embeddings

In [None]:
config.vocab_size  # number of rows in the word-embedding table, one per token id

In [None]:
config.hidden_size  # dimension of each embedding vector

In [None]:
model.bert.embeddings.word_embeddings   # lookup table of shape (vocab_size, hidden_size)

In [None]:
# Feed the token ids through the word-embedding layer only.
word_embedding_layer = model.bert.embeddings.word_embeddings
seq_embedding = word_embedding_layer(encoding)

# (batch_size, seq_len) ids -> (batch_size, seq_len, hidden_size) vectors
seq_embedding.shape

Let's do a small experiment with the similarity between two embeddings, measured by cosine similarity:
$$
\text{CosineSimilarity}(\mathbf{a}, \mathbf{b}) = \frac{\mathbf{a} \cdot \mathbf{b}}{\|\mathbf{a}\| \|\mathbf{b}\|}
$$

In [None]:
# Fetch the word-embedding vectors of two related words; each lookup gets a
# one-element id list, so each result has shape (1, hidden_size).
embedding_layer = model.bert.embeddings.word_embeddings

queen_token = tokenizer.convert_tokens_to_ids(['queen'])
king_token = tokenizer.convert_tokens_to_ids(['king'])

queen_embedding = embedding_layer(torch.tensor(queen_token))
king_embedding = embedding_layer(torch.tensor(king_token))

# Cosine similarity along dim 1 (the hidden dimension). Written out by hand:
#   queen_embedding @ king_embedding.T / torch.norm(queen_embedding) / torch.norm(king_embedding)
cos = torch.nn.CosineSimilarity(dim=1, eps=1e-8)

cos(queen_embedding, king_embedding)

In [None]:
words = ["king", "queen", "man", "woman", "dog", "cat", "apple", "pear"]

# Look up all words with a single batched call instead of one call per word.
# NOTE: convert_tokens_to_ids does not tokenize; each entry is treated as one
# vocabulary token, and an out-of-vocabulary string maps to the [UNK] id, so
# every word here is expected to be a single token of the vocabulary.
token_ids = tokenizer.convert_tokens_to_ids(words)
embedding_matrix = model.bert.embeddings.word_embeddings(torch.tensor(token_ids))  # (len(words), hidden_size)

# Kept as a list of 1-D vectors, one per word, in the same order as `words`.
embeddings = list(embedding_matrix.unbind(0))

# Pairwise cosine similarity in one broadcasted call:
# (n, 1, hidden) against (1, n, hidden) -> (n, n), reducing over the hidden dimension.
similarity_matrix = torch.nn.functional.cosine_similarity(
    embedding_matrix.unsqueeze(1), embedding_matrix.unsqueeze(0), dim=-1
)

# The embeddings carry gradient information, so detach before converting to NumPy.
utils.plot_matrix(similarity_matrix.detach().numpy(), words, show_values=True)