# Word Embeddings

Word embeddings are a representation of the ***semantics*** of a word, efficiently encoding semantic information that might be relevant to the task at hand.

In [1]:
import torch
from torch import nn
from torch import optim
import torch.nn.functional as F

torch.manual_seed(64)

<torch._C.Generator at 0x7f00f8012dd0>

In [2]:
word_to_idx = {"hello": 0, "world": 1}
embeds = nn.Embedding(2, 5) # 2 words in vocab, 5 dimensional embeddings

print(embeds) # index i stores in the i’th row of the embeddings

Embedding(2, 5)


In [3]:
lookup_tensor = torch.tensor([word_to_idx["hello"]], dtype=torch.long)
hello_embed = embeds(lookup_tensor)
print("hello_embed: ", hello_embed)

lookup_tensor = torch.tensor([word_to_idx["world"]], dtype=torch.long)
world_embed = embeds(lookup_tensor)
print("world_embed: ", world_embed)

hello_embed:  tensor([[-0.8159,  0.6431,  0.1268, -0.8538,  0.7698]],
       grad_fn=<EmbeddingBackward>)
world_embed:  tensor([[-0.1140, -0.4436, -1.9793,  1.1040,  0.1535]],
       grad_fn=<EmbeddingBackward>)


## An Example: N-Gram Language Modeling

In [4]:
CONTEXT_SIZE = 2
EMBEDDING_DIM = 10

In [5]:
test_sentence = """
When forty winters shall besiege thy brow,
And dig deep trenches in thy beauty's field,
Thy youth's proud livery so gazed on now,
Will be a totter'd weed of small worth held:
Then being asked, where all thy beauty lies,
Where all the treasure of thy lusty days;
To say, within thine own deep sunken eyes,
Were an all-eating shame, and thriftless praise.
How much more praise deserv'd thy beauty's use,
If thou couldst answer 'This fair child of mine
Shall sum my count, and make my old excuse,'
Proving his beauty by succession thine!
This were to be new made when thou art old,
And see thy blood warm when thou feel'st it cold.
""".split()

print(test_sentence)

['When', 'forty', 'winters', 'shall', 'besiege', 'thy', 'brow,', 'And', 'dig', 'deep', 'trenches', 'in', 'thy', "beauty's", 'field,', 'Thy', "youth's", 'proud', 'livery', 'so', 'gazed', 'on', 'now,', 'Will', 'be', 'a', "totter'd", 'weed', 'of', 'small', 'worth', 'held:', 'Then', 'being', 'asked,', 'where', 'all', 'thy', 'beauty', 'lies,', 'Where', 'all', 'the', 'treasure', 'of', 'thy', 'lusty', 'days;', 'To', 'say,', 'within', 'thine', 'own', 'deep', 'sunken', 'eyes,', 'Were', 'an', 'all-eating', 'shame,', 'and', 'thriftless', 'praise.', 'How', 'much', 'more', 'praise', "deserv'd", 'thy', "beauty's", 'use,', 'If', 'thou', 'couldst', 'answer', "'This", 'fair', 'child', 'of', 'mine', 'Shall', 'sum', 'my', 'count,', 'and', 'make', 'my', 'old', "excuse,'", 'Proving', 'his', 'beauty', 'by', 'succession', 'thine!', 'This', 'were', 'to', 'be', 'new', 'made', 'when', 'thou', 'art', 'old,', 'And', 'see', 'thy', 'blood', 'warm', 'when', 'thou', "feel'st", 'it', 'cold.']


In [6]:
from collections import defaultdict

freq = defaultdict(int)
for w in test_sentence:
  freq[w] += 1
print(freq)

defaultdict(<class 'int'>, {'see': 1, 'This': 1, 'If': 1, 'small': 1, 'Then': 1, 'besiege': 1, 'praise': 1, 'gazed': 1, 'so': 1, "totter'd": 1, 'art': 1, 'made': 1, 'new': 1, 'say,': 1, "deserv'd": 1, 'praise.': 1, 'all-eating': 1, 'thine': 1, 'Where': 1, 'thy': 6, 'weed': 1, "feel'st": 1, 'blood': 1, 'treasure': 1, 'it': 1, 'proud': 1, 'more': 1, 'days;': 1, 'old,': 1, 'fair': 1, 'cold.': 1, 'shame,': 1, 'to': 1, 'when': 2, 'be': 2, 'held:': 1, 'dig': 1, "'This": 1, 'shall': 1, 'a': 1, 'within': 1, 'count,': 1, 'Thy': 1, "beauty's": 2, 'the': 1, 'thriftless': 1, 'own': 1, 'mine': 1, 'And': 2, 'Shall': 1, 'warm': 1, 'use,': 1, 'trenches': 1, 'brow,': 1, 'Proving': 1, 'my': 2, 'an': 1, 'where': 1, 'couldst': 1, 'make': 1, 'answer': 1, "youth's": 1, 'When': 1, 'sum': 1, 'eyes,': 1, 'beauty': 2, 'forty': 1, 'succession': 1, 'How': 1, 'field,': 1, 'Will': 1, 'worth': 1, 'were': 1, "excuse,'": 1, 'lies,': 1, 'his': 1, 'deep': 2, 'asked,': 1, 'lusty': 1, 'Were': 1, 'in': 1, 'livery': 1, 'win

In [7]:
# trigram: ([ word_i-2, word_i-1 ], target word)
trigrams = [
    ([test_sentence[i], test_sentence[i + 1]], test_sentence[i + 2])
    for i in range(len(test_sentence) - 2)
]

# print the first 3
print(trigrams[:3])

[(['When', 'forty'], 'winters'), (['forty', 'winters'], 'shall'), (['winters', 'shall'], 'besiege')]


In [8]:
# remove duplicate and build index
vocab = set(test_sentence)
word_to_ix = {word: i for i, word in enumerate(vocab)}

## Define the model

In [9]:
class NGramLanguageModeler(nn.Module):
    
    def __init__(self, vocab_size, embedding_dim, context_size):
        super(NGramLanguageModeler, self).__init__()   
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(context_size * embedding_dim, 128)
        self.linear2 = nn.Linear(128, vocab_size)

    def forward(self, inputs):
        embeds = self.embeddings(inputs).view((1, -1))
        out = F.relu(self.linear1(embeds))
        out = self.linear2(out)
        return out

In [10]:
loss_function = nn.CrossEntropyLoss()
model = NGramLanguageModeler(len(vocab), EMBEDDING_DIM, CONTEXT_SIZE)
optimizer = optim.SGD(model.parameters(), lr=0.001)

In [11]:
# just for fun
winter_lookup = torch.tensor([word_to_ix["winters"]], dtype=torch.long)
warm_lookup = torch.tensor([word_to_ix["warm"]], dtype=torch.long)
thy_lookup = torch.tensor([word_to_ix["thy"]], dtype=torch.long)

winter = model.embeddings(winter_lookup)
warm = model.embeddings(warm_lookup)
thy = model.embeddings(thy_lookup)

print("winter: ", winter)
print(F.cosine_similarity(winter, warm))
print(F.cosine_similarity(winter, thy))

winter:  tensor([[ 0.1541,  1.4181,  0.9295,  1.0273, -0.1369, -1.2427,  0.0297,  0.6228,
         -0.3465,  0.2267]], grad_fn=<EmbeddingBackward>)
tensor([-0.1794], grad_fn=<DivBackward0>)
tensor([0.1908], grad_fn=<DivBackward0>)


In [12]:
for epoch in range(1, 301):
    total_loss = 0
    for context, target in trigrams:
        context_idxs = torch.tensor([word_to_ix[w] for w in context], dtype=torch.long)
        model.zero_grad()

        out = model(context_idxs)
        label = torch.tensor([word_to_ix[target]], dtype=torch.long)
        loss = loss_function(out, label)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    
    if epoch%10 == 0:
        total_loss = total_loss/len(trigrams)
        print("Epoch: {:3d}, total_loss: {:3.4f}".format(epoch, total_loss))

Epoch:  10, total_loss: 4.4161
Epoch:  20, total_loss: 4.2153
Epoch:  30, total_loss: 4.0180
Epoch:  40, total_loss: 3.8211
Epoch:  50, total_loss: 3.6217
Epoch:  60, total_loss: 3.4167
Epoch:  70, total_loss: 3.2040
Epoch:  80, total_loss: 2.9834
Epoch:  90, total_loss: 2.7561
Epoch: 100, total_loss: 2.5246
Epoch: 110, total_loss: 2.2922
Epoch: 120, total_loss: 2.0627
Epoch: 130, total_loss: 1.8403
Epoch: 140, total_loss: 1.6282
Epoch: 150, total_loss: 1.4293
Epoch: 160, total_loss: 1.2465
Epoch: 170, total_loss: 1.0816
Epoch: 180, total_loss: 0.9355
Epoch: 190, total_loss: 0.8086
Epoch: 200, total_loss: 0.6998
Epoch: 210, total_loss: 0.6077
Epoch: 220, total_loss: 0.5304
Epoch: 230, total_loss: 0.4657
Epoch: 240, total_loss: 0.4119
Epoch: 250, total_loss: 0.3670
Epoch: 260, total_loss: 0.3294
Epoch: 270, total_loss: 0.2978
Epoch: 280, total_loss: 0.2711
Epoch: 290, total_loss: 0.2484
Epoch: 300, total_loss: 0.2289


In [13]:
# just for fun
winter_lookup = torch.tensor([word_to_ix["winters"]], dtype=torch.long)
warm_lookup = torch.tensor([word_to_ix["warm"]], dtype=torch.long)
thy_lookup = torch.tensor([word_to_ix["thy"]], dtype=torch.long)

winter = model.embeddings(winter_lookup)
warm = model.embeddings(warm_lookup)
thy = model.embeddings(thy_lookup)

print("winter: ", winter)
print(F.cosine_similarity(winter, warm))
print(F.cosine_similarity(winter, thy))

winter:  tensor([[ 0.1268,  1.4652,  0.9505,  1.0286, -0.1670, -1.2583,  0.0223,  0.6076,
         -0.3706,  0.2573]], grad_fn=<EmbeddingBackward>)
tensor([-0.1797], grad_fn=<DivBackward0>)
tensor([0.1887], grad_fn=<DivBackward0>)


## Continuous Bag-of-Words

In [14]:
CONTEXT_SIZE = 2  # 2 words to the left, 2 to the right
raw_text = """
We are about to study the idea of a computational process.
Computational processes are abstract beings that inhabit computers.
As they evolve, processes manipulate other abstract things called data.
The evolution of a process is directed by a pattern of rules
called a program. People create programs to direct processes. In effect,
we conjure the spirits of the computer with our spells.
""".split()

# By deriving a set from `raw_text`, we deduplicate the array
vocab = set(raw_text)
vocab_size = len(vocab)

In [15]:
word_to_ix = {word: i for i, word in enumerate(vocab)}
data = []
for i in range(2, len(raw_text) - 2):
    context = [raw_text[i - 2], raw_text[i - 1],
               raw_text[i + 1], raw_text[i + 2]]
    target = raw_text[i]
    data.append((context, target))
print(data[:5])

[(['We', 'are', 'to', 'study'], 'about'), (['are', 'about', 'study', 'the'], 'to'), (['about', 'to', 'the', 'idea'], 'study'), (['to', 'study', 'idea', 'of'], 'the'), (['study', 'the', 'of', 'a'], 'idea')]


In [16]:
class CBOW(nn.Module):

    
    def __init__(self, vocab_size, embedding_dim):
        super(CBOW, self).__init__()   
        self.embedding_dim = embedding_dim
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(embedding_dim, vocab_size)

    def forward(self, inputs):
        embeds = self.embeddings(inputs)
        embeds = torch.sum(embeds, dim=0).view((1, -1))
        out = self.linear1(embeds)
        return out

In [17]:
loss_function = nn.CrossEntropyLoss()
model = CBOW(len(vocab), EMBEDDING_DIM)
optimizer = optim.SGD(model.parameters(), lr=0.001)

In [18]:
def make_context_vector(context, word_to_ix):
    idxs = [word_to_ix[w] for w in context]
    return torch.tensor(idxs, dtype=torch.long)


make_context_vector(data[0][0], word_to_ix)  # example

tensor([42, 48,  8, 13])

In [19]:
for epoch in range(1, 301):
    total_loss = 0
    for context, target in data:
        context_idxs = make_context_vector(context, word_to_ix)
        model.zero_grad()
        
        out = model(context_idxs)
        label = torch.tensor([word_to_ix[target]], dtype=torch.long)
        loss = loss_function(out, label)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    
    if epoch%10 == 0:
        total_loss = total_loss/len(trigrams)
        print("Epoch: {:3d}, total_loss: {:3.4f}".format(epoch, total_loss))

Epoch:  10, total_loss: 2.1359
Epoch:  20, total_loss: 1.9306
Epoch:  30, total_loss: 1.7595
Epoch:  40, total_loss: 1.6148
Epoch:  50, total_loss: 1.4905
Epoch:  60, total_loss: 1.3824
Epoch:  70, total_loss: 1.2876
Epoch:  80, total_loss: 1.2036
Epoch:  90, total_loss: 1.1286
Epoch: 100, total_loss: 1.0611
Epoch: 110, total_loss: 1.0000
Epoch: 120, total_loss: 0.9443
Epoch: 130, total_loss: 0.8934
Epoch: 140, total_loss: 0.8466
Epoch: 150, total_loss: 0.8034
Epoch: 160, total_loss: 0.7634
Epoch: 170, total_loss: 0.7262
Epoch: 180, total_loss: 0.6915
Epoch: 190, total_loss: 0.6591
Epoch: 200, total_loss: 0.6287
Epoch: 210, total_loss: 0.6001
Epoch: 220, total_loss: 0.5733
Epoch: 230, total_loss: 0.5480
Epoch: 240, total_loss: 0.5241
Epoch: 250, total_loss: 0.5016
Epoch: 260, total_loss: 0.4803
Epoch: 270, total_loss: 0.4601
Epoch: 280, total_loss: 0.4411
Epoch: 290, total_loss: 0.4230
Epoch: 300, total_loss: 0.4059
