<a href="https://colab.research.google.com/github/brandonowens24/Word_Embeddings/blob/main/Word_Embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from datasets import load_dataset
from tqdm import tqdm

# Load the "20220301.simple" configuration of the Hugging Face "wikipedia" dataset.
dataset = load_dataset("wikipedia", "20220301.simple")
# Slice the rows first and select the column second: this reads only the
# first five articles instead of materialising the entire "text" column
# just to keep five entries. The result is the same list of five strings.
documents = dataset['train'][:5]['text']


In [None]:
import torch
import torch.nn as nn
class my_LM(torch.nn.Module):
    """Feed-forward neural language model over a fixed-size window of token ids.

    Pipeline: embedding lookup -> concatenation of the window's embeddings ->
    linear -> sigmoid -> linear -> softmax over the vocabulary.

    Args:
        vocab_size: Number of distinct token ids; also the output width.
        emb_dim: Width of each token embedding.
        hidden_size: Width of the hidden layer.
        context_size: Number of token ids in each input window.
        embs: Optional 2-D float tensor of pretrained embeddings. When given
            it is used as the embedding table via
            ``nn.Embedding.from_pretrained`` (which freezes the weights by
            default). Expected shape is (vocab_size, emb_dim); this is not
            validated here.
    """

    def __init__(self, vocab_size, emb_dim, hidden_size, context_size=2, embs=None):
        super().__init__()
        # Compare against None explicitly: truth-testing a multi-element
        # tensor raises "Boolean value of Tensor ... is ambiguous".
        if embs is not None:
            self.embedding_layer = nn.Embedding.from_pretrained(embs)
        else:
            self.embedding_layer = nn.Embedding(vocab_size, emb_dim)
        self.linear1 = nn.Linear(emb_dim * context_size, hidden_size)
        self.sigmoid = nn.Sigmoid()
        self.linear2 = nn.Linear(hidden_size, vocab_size)
        # Normalise over the last (vocabulary) axis so that both a single
        # window and a batch of windows yield per-example distributions;
        # dim=0 would normalise across the batch for 2-D input.
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x):
        """Return a probability distribution over the vocabulary.

        Args:
            x: Integer tensor of token ids whose last dimension holds
                ``context_size`` entries, e.g. shape (context_size,) or
                (batch, context_size).

        Returns:
            Tensor of shape (vocab_size,) or (batch, vocab_size) whose last
            axis sums to 1.
        """
        # Merge the last two axes (window position, embedding) so the
        # embeddings of one window are concatenated into a single vector.
        x = torch.flatten(self.embedding_layer(x), start_dim=-2)
        x = self.linear1(x)
        x = self.sigmoid(x)
        x = self.linear2(x)
        x = self.softmax(x)
        return x

In [None]:
import nltk
def normalization(document, context_size=2):
    """Tokenise a document into lowercase words with per-sentence padding.

    The text is split into sentences with nltk, each sentence is split into
    word tokens and lowercased, and every sentence is wrapped in
    ``context_size`` copies of '<s>' before it and '</s>' after it.

    Args:
        document: Raw text of one document.
        context_size: Number of padding markers added on each side of every
            sentence.

    Returns:
        A flat list of tokens for the whole document, padding included.
    """
    padded_tokens = []
    for raw_sentence in tqdm(nltk.sent_tokenize(document)):
        lowered = [tok.lower() for tok in nltk.word_tokenize(raw_sentence) if tok]
        padded_tokens.extend(['<s>'] * context_size)
        padded_tokens.extend(lowered)
        padded_tokens.extend(['</s>'] * context_size)
    return padded_tokens



In [None]:
from bidict import bidict
def create_context_vectors(documents, context_size=2):
    """Build (context, center) training pairs and a token <-> id vocabulary.

    Each document is tokenised and padded by ``normalization``. A window of
    ``2 * context_size + 1`` tokens is then slid over the document's token
    list; the middle token is the target and the tokens on either side form
    the context.

    Args:
        documents: Iterable of raw document strings.
        context_size: Number of tokens taken on each side of the center
            token.

    Returns:
        A tuple ``(X, Y, token2id, vocab_size)`` where
        X is a list of lists, each holding the ``2 * context_size`` ids of
        the left context followed by the right context;
        Y is the list of center-token ids aligned with X;
        token2id is a bidict mapping each token to its id, assigned in order
        of first appearance;
        vocab_size is ``len(token2id)``.
    """
    token2id = bidict()
    X = []
    Y = []
    span = 2 * context_size
    for text in tqdm(documents):
        tokens = normalization(text, context_size)
        # A full window starting at i needs tokens[i + span] to exist, so
        # there are len(tokens) - span valid starts. The previous bound
        # (context_size + 2) only matched this for context_size == 2; other
        # sizes either dropped the last window or produced truncated right
        # contexts, because out-of-range slices shrink silently.
        for i in range(len(tokens) - span):
            center_idx = i + context_size
            outside_tokens = tokens[i:center_idx] + tokens[center_idx + 1:i + span + 1]
            centered_token = tokens[center_idx]
            for token in outside_tokens + [centered_token]:
                if token not in token2id:
                    # Ids are dense, so the next free id is the current size.
                    token2id[token] = len(token2id)
            X.append([token2id[t] for t in outside_tokens])
            Y.append(token2id[centered_token])
    vocab_size = len(token2id)
    return X, Y, token2id, vocab_size



In [None]:
# Number of tokens taken on each side of the center token.
context_size = 2
# X: context token ids per example; Y: center token id per example;
# token2id: token <-> id mapping; vocab_size: number of distinct tokens.
X,Y,token2id,vocab_size = create_context_vectors(documents, context_size)