In [1]:
import re
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

# Tokenization

In [2]:
text = '''Machine learning is the study of computer algorithms that \
improve automatically through experience. It is seen as a \
subset of artificial intelligence. Machine learning algorithms \
build a mathematical model based on sample data, known as \
training data, in order to make predictions or decisions without \
being explicitly programmed to do so. Machine learning algorithms \
are used in a wide variety of applications, such as email filtering \
and computer vision, where it is difficult or infeasible to develop \
conventional algorithms to perform the needed tasks.'''

def tokenize(text):
    pattern = re.compile(r'[A-Za-z]+[\w^\']*|[\w^\']*[A-Za-z]+[\w^\']*')
    return pattern.findall(text.lower())

tokens = tokenize(text)

In [3]:
def mapping(tokens):
    word_to_id = {}
    id_to_word = {}
    
    for i, token in enumerate(set(tokens)):
        word_to_id[token] = i
        id_to_word[i] = token
    
    return word_to_id, id_to_word

mapping(tokens)

({'training': 0,
  'are': 1,
  'wide': 2,
  'that': 3,
  'perform': 4,
  'on': 5,
  'in': 6,
  'a': 7,
  'so': 8,
  'without': 9,
  'predictions': 10,
  'sample': 11,
  'intelligence': 12,
  'to': 13,
  'or': 14,
  'explicitly': 15,
  'tasks': 16,
  'and': 17,
  'of': 18,
  'applications': 19,
  'being': 20,
  'mathematical': 21,
  'data': 22,
  'order': 23,
  'improve': 24,
  'subset': 25,
  'such': 26,
  'infeasible': 27,
  'machine': 28,
  'programmed': 29,
  'filtering': 30,
  'variety': 31,
  'needed': 32,
  'email': 33,
  'conventional': 34,
  'do': 35,
  'decisions': 36,
  'build': 37,
  'seen': 38,
  'model': 39,
  'based': 40,
  'is': 41,
  'make': 42,
  'automatically': 43,
  'as': 44,
  'learning': 45,
  'study': 46,
  'difficult': 47,
  'vision': 48,
  'used': 49,
  'develop': 50,
  'it': 51,
  'where': 52,
  'algorithms': 53,
  'the': 54,
  'artificial': 55,
  'computer': 56,
  'experience': 57,
  'through': 58,
  'known': 59},
 {0: 'training',
  1: 'are',
  2: 'wide',
  3

# Module

In [4]:
class WordEmbeddingModel(nn.Module):
    def __init__(self, vocab_size: int, embedding_size: int):
        super(WordEmbeddingModel, self).__init__()
        
        self.linear1 = nn.Linear(vocab_size, embedding_size)
        self.linear2 = nn.Linear(embedding_size, vocab_size)
        self.softmax = nn.Softmax()
        
    def forward(self, x):
        x = self.linear1(x) # The embedding is the output of linear1
        x = self.linear2(x) 
        x = self.softmax(x)
        return x

# Training

In [5]:
def generate_training_data(tokens, word_to_id, window):
    X = []
    y = []
    n_tokens = len(tokens)
    
    for i in range(n_tokens):
        idx = torch.concat((torch.Tensor(range(max(0, i - window), i)), torch.Tensor(range(i, min(n_tokens, i + window + 1)))))
        for j in idx:
            if i == j:
                continue
                
            print(torch.Tensor(word_to_id[tokens[i]]))
            X.append(F.one_hot(torch.Tensor([word_to_id[tokens[i]]]), len(word_to_id)))
            y.append(F.one_hot(torch.Tensor([word_to_id[tokens[j]]]), len(word_to_id)))

In [6]:
word_to_id, id_to_word = mapping(tokens)
m = WordEmbeddingModel(len(word_to_id), 10)

In [7]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(m.parameters(), lr=0.05)

In [8]:
X, y = generate_training_data(tokens, word_to_id, 2)

for i in range(0, 5000):
    pred_y = m(torch.tensor(X, dtype=torch.float))
    loss = criterion(pred_y, torch.tensor(y, dtype=torch.float))
    optimizer.zero_grad()
    loss.backward()
    nn.utils.clip_grad_norm_(m.parameters(), 5.0)
    optimizer.step()
    print(loss)

tensor([1.4678e-06, 1.4111e-42, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00])


RuntimeError: one_hot is only applicable to index tensor.

# Evaluation

In [9]:
def get_embedding(model, word):
    try:
        idx = word_to_id[word]
        one_hot = F.one_hot(torch.Tensor([idx]).to(torch.int64), len(word_to_id))
        return model.linear1(one_hot.to(torch.float))
    except KeyError:
        print("`word` not in corpus")

In [10]:
machine_embedding = get_embedding(m, "machine")

In [34]:
min_distance = 10000
closest_token = ""
for t in word_to_id.keys():
    with torch.no_grad():
        embedding = get_embedding(m, t)
        similarity = torch.dot(embedding.squeeze(), machine_embedding.squeeze()) / (torch.norm(machine_embedding, dim=1) * torch.norm(embedding, dim=1))
        distance = torch.dist(embedding, machine_embedding).item()
        print(t, " - ", similarity.item())

training  -  -0.4389028251171112
are  -  0.31396082043647766
wide  -  0.37324926257133484
that  -  0.35422223806381226
perform  -  0.17575876414775848
on  -  -0.08497027307748795
in  -  -0.12576067447662354
a  -  -0.07067960500717163
so  -  0.1573219746351242
without  -  0.11637531965970993
predictions  -  0.04777004197239876
sample  -  0.3522264063358307
intelligence  -  0.2627631425857544
to  -  -0.14298105239868164
or  -  0.07358753681182861
explicitly  -  -0.1310654878616333
tasks  -  0.3535851538181305
and  -  0.26100844144821167
of  -  0.06291211396455765
applications  -  0.1983295977115631
being  -  0.10599274933338165
mathematical  -  0.6067609786987305
data  -  -0.14063821732997894
order  -  0.27168992161750793
improve  -  -0.25440409779548645
subset  -  -0.20182788372039795
such  -  -0.13383056223392487
infeasible  -  -0.46990326046943665
machine  -  1.0
programmed  -  -0.1552230417728424
filtering  -  0.058286793529987335
variety  -  0.47354981303215027
needed  -  0.21792672