In [3]:
import numpy as np
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
import torch
import re
import pandas as pd
import matplotlib.pyplot as plt
from transformers import AdamW
from transformers import AutoTokenizer, LongT5Model
import torch.nn.functional as F
import torch.nn as nn
import re
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

https://arxiv.org/abs/1310.4546
create character based embeddings using skip-gram algorithms
https://www.geeksforgeeks.org/implement-your-own-word2vecskip-gram-model-in-python/


https://arxiv.org/abs/1301.3781

In [10]:
dataset = [
    "I took a walk in the park yesterday and enjoyed the sunny weather.",
    "She made a delicious chocolate cake for her sister's birthday.",
    "The train arrived on time, and I was able to get to work without any delays.",
    "My friends and I are planning a weekend trip to the beach next month.",
    "He found a new job in the city and is excited to start next week.",
    "The movie we watched last night was really entertaining and full of action.",
    "I bought a new book that I'm looking forward to reading over the weekend.",
    "The restaurant we went to for dinner had the best pasta I've ever tasted.",
    "She spent the afternoon organizing her closet and donating old clothes.",
    "We decided to stay in and watch a movie because it was raining outside."
]

window_size = 2

characters = []
char_pairs = []

for sentence in dataset:
    sentence = sentence.lower()
    sentence_len = len(sentence)

    for i, char in enumerate(sentence):
        if char not in characters:
            characters.append(char)

        for j in range(-window_size, window_size + 1):
            if j == 0:
                continue
            context_position = i + j
            if context_position >= 0 and context_position < sentence_len:
                context_char = sentence[context_position]
                char_pairs.append((char, context_char))

characters = sorted(characters)
char_to_index = {char: index for index, char in enumerate(characters)}
index_to_char = {index: char for index, char in enumerate(characters)}

In [11]:
class SkipGram(torch.nn.Module):
  def __init__(self, vocab_size, embedding_dims=100):
    super(SkipGram, self).__init__()
    self.vocab_size = vocab_size
    self.embedding_dims = embedding_dims
    self.embedding_layer = nn.Embedding(self.vocab_size, self.embedding_dims)
    self.output_layer = nn.Linear(self.embedding_dims, self.vocab_size)

  def forward(self, input_word):
    x = self.embedding_layer(input_word)
    x = self.output_layer(x)
    return x

In [15]:
import torch
from torch.utils.data import Dataset, DataLoader

class SkipGramDataset(Dataset):
    def __init__(self, char_pairs, vocab_size, char_to_index):
        self.char_pairs = char_pairs
        self.vocab_size = vocab_size
        self.char_to_index = char_to_index

    def __len__(self):
        return len(self.char_pairs)

    def __getitem__(self, idx):
        input_word, target_word = self.char_pairs[idx]
        input_word = self.char_to_index[input_word]
        target_word = self.char_to_index[target_word]
        return torch.tensor(input_word, dtype=torch.long), torch.tensor(target_word, dtype=torch.long)

vocab_size = 29

dataset = SkipGramDataset(char_pairs, vocab_size, char_to_index)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

In [16]:
import torch.nn.functional as F
import torch.optim as optim

model = SkipGram(vocab_size=29, embedding_dims=100)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)
num_epochs = 5


for epoch in range(num_epochs):
    for input_word, target_word in dataloader:
        optimizer.zero_grad()
        output = model(input_word)
        loss = criterion(output, target_word)
        loss.backward()
        optimizer.step()
    print(f'Epoch {epoch+1}, Loss: {loss.item()}')

Epoch 1, Loss: 3.149599552154541
Epoch 2, Loss: 3.0718226432800293
Epoch 3, Loss: 3.427504301071167
Epoch 4, Loss: 4.05959415435791
Epoch 5, Loss: 2.4437053203582764


In [18]:
model.embedding_layer.weight.data

tensor([[ 0.3451, -0.3230,  0.1157,  ..., -0.9892,  1.4008,  1.7541],
        [ 0.5750,  0.8476, -1.1858,  ...,  0.7309, -0.0509, -1.2395],
        [ 0.0851,  1.1289,  1.5185,  ..., -0.8972,  1.3207, -0.2341],
        ...,
        [ 1.7381,  1.7953,  0.6204,  ..., -0.6026, -1.7992,  1.1016],
        [ 1.1075, -1.3388,  0.9358,  ..., -0.4265,  0.7825, -0.3104],
        [ 0.3963,  0.5456,  1.4679,  ...,  0.9905, -1.3526,  0.8779]])