In [3]:
import re
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

# Tokenization

In [4]:
text = '''Machine learning is the study of computer algorithms that \
improve automatically through experience. It is seen as a \
subset of artificial intelligence. Machine learning algorithms \
build a mathematical model based on sample data, known as \
training data, in order to make predictions or decisions without \
being explicitly programmed to do so. Machine learning algorithms \
are used in a wide variety of applications, such as email filtering \
and computer vision, where it is difficult or infeasible to develop \
conventional algorithms to perform the needed tasks.'''

def tokenize(text):
    pattern = re.compile(r'[A-Za-z]+[\w^\']*|[\w^\']*[A-Za-z]+[\w^\']*')
    return pattern.findall(text.lower())

tokens = tokenize(text)

In [9]:
def mapping(tokens):
    word_to_id = {}
    id_to_word = {}
    
    for i, token in enumerate(set(tokens)):
        word_to_id[token] = i
        id_to_word[i] = token
    
    return word_to_id, id_to_word

mapping(tokens)

({'to': 0,
  'algorithms': 1,
  'artificial': 2,
  'training': 3,
  'vision': 4,
  'it': 5,
  'learning': 6,
  'data': 7,
  'difficult': 8,
  'wide': 9,
  'in': 10,
  'predictions': 11,
  'sample': 12,
  'explicitly': 13,
  'tasks': 14,
  'the': 15,
  'being': 16,
  'make': 17,
  'email': 18,
  'where': 19,
  'infeasible': 20,
  'known': 21,
  'without': 22,
  'seen': 23,
  'perform': 24,
  'improve': 25,
  'do': 26,
  'a': 27,
  'are': 28,
  'through': 29,
  'used': 30,
  'needed': 31,
  'such': 32,
  'applications': 33,
  'programmed': 34,
  'of': 35,
  'model': 36,
  'filtering': 37,
  'build': 38,
  'computer': 39,
  'is': 40,
  'on': 41,
  'variety': 42,
  'study': 43,
  'experience': 44,
  'that': 45,
  'order': 46,
  'so': 47,
  'and': 48,
  'subset': 49,
  'or': 50,
  'automatically': 51,
  'intelligence': 52,
  'based': 53,
  'decisions': 54,
  'conventional': 55,
  'mathematical': 56,
  'machine': 57,
  'develop': 58,
  'as': 59},
 {0: 'to',
  1: 'algorithms',
  2: 'artificia

# Module

In [39]:
class WordEmbeddingModel(nn.Module):
    def __init__(self, vocab_size: int, embedding_size: int):
        super(WordEmbeddingModel, self).__init__()
        
        self.linear1 = nn.Linear(vocab_size, embedding_size)
        self.linear2 = nn.Linear(embedding_size, vocab_size)
        self.softmax = nn.Softmax()
        
    def forward(self, x):
        x = self.linear1(x) # The embedding is the output of linear1
        x = self.linear2(x) 
        x = self.softmax(x)
        return x

# Training

In [33]:
def generate_training_data(tokens, word_to_id, window):
    X = []
    y = []
    n_tokens = len(tokens)
    
    for i in range(n_tokens):
        idx = torch.concat((torch.Tensor(range(max(0, i - window), i)), torch.Tensor(range(i, min(n_tokens, i + window + 1)))))
        for j in idx:
            if i == j:
                continue
                
            print(torch.Tensor(word_to_id[tokens[i]]))
            X.append(F.one_hot(torch.Tensor([word_to_id[tokens[i]]]), len(word_to_id)))
            y.append(F.one_hot(torch.Tensor([word_to_id[tokens[j]]]), len(word_to_id)))

In [44]:
word_to_id, id_to_word = mapping(tokens)
m = WordEmbeddingModel(len(word_to_id), 10)

In [41]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(m.parameters(), lr=0.05)

In [45]:
X, y = generate_training_data(tokens, word_to_id, 2)

for i in range(0, 5000):
    pred_y = m(torch.tensor(X, dtype=torch.float))
    loss = criterion(pred_y, torch.tensor(y, dtype=torch.float))
    optimizer.zero_grad()
    loss.backward()
    nn.utils.clip_grad_norm_(m.parameters(), 5.0)
    optimizer.step()
    print(loss)

tensor([1.2894e+38, 1.2219e-42, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00])


RuntimeError: one_hot is only applicable to index tensor.

# Evaluation

In [59]:
def get_embedding(model, word):
    try:
        idx = word_to_id[word]
        one_hot = F.one_hot(torch.Tensor([idx]).to(torch.int64), len(word_to_id))
        return model.linear1(one_hot.to(torch.float))
    except KeyError:
        print("`word` not in corpus")
        
get_embedding(m, "machine")

tensor([[ 0.0521,  0.0279, -0.0518,  0.0839,  0.0105, -0.0985,  0.0078,  0.0740,
          0.0148, -0.0250]], grad_fn=<AddmmBackward0>)