In [5]:
from tokenize import tokenize
from torch import nn
import torch
import math

# Two toy sentences used by the naive whitespace-tokenisation example.
sentences = [
    "Hello my name is Bachir",
    "How are you ?",
]

#lut = Embeddings(vocab_size, d_model) # look-up table (lut)


# Embedding

In [6]:
# Naive tokenisation: cut every sentence on single space characters.
tokenized_sequences = [sentence.split(' ') for sentence in sentences]
tokenized_sequences

[['Hello', 'my', 'name', 'is', 'Bachir'], ['How', 'are', 'you', '?']]

In [7]:
class Embedding(nn.Module):
    """Learnable look-up table that turns token indices into dense vectors.

    Args:
        vocab_size: Number of rows in the table (one per token index).
        dimension: Length of each embedding vector.
    """

    def __init__(self, vocab_size, dimension):
        super().__init__()
        self.word_embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=dimension,
        )

    def forward(self, x):
        """Return the embedding vector of every index contained in ``x``.

        The result has the shape of ``x`` with one extra trailing axis of
        size ``dimension``.
        """
        embedded = self.word_embedding(x)
        return embedded

## Manual preprocess

In [10]:
import nltk
from collections import Counter
#nltk.download('punkt')

from nltk.tokenize import word_tokenize
# --- 1. Tokenise ------------------------------------------------------------
# Lower-case each sentence, then let NLTK split it into word and punctuation
# tokens.
text_list = ["Hello, how are you doing today again?", "I have to go to work", " the cat is so cute"]
tokenized_test_list = [word_tokenize(sentence.lower()) for sentence in text_list]
# Flatten the per-sentence token lists into one stream of tokens.
all_tokens = [token for tokens in tokenized_test_list for token in tokens]

# --- 2. Build the vocabulary ------------------------------------------------
# Counter records how often each token occurs and keeps its keys in
# first-seen order, so every distinct token simply receives the next free
# integer index (0, 1, 2, ...). Indices are NOT ordered by frequency here.
word_counts = Counter(all_tokens)
word_to_idx = dict(zip(word_counts, range(len(word_counts))))

# --- 3. Convert text to indices ---------------------------------------------
# Replace every token by its vocabulary index, sentence by sentence.
text_indices = [[word_to_idx[token] for token in tokens] for tokens in tokenized_test_list]

# --- 4. Embed ---------------------------------------------------------------
# One embedding row per vocabulary entry, 50 features per token.
vocab_size = len(word_to_idx)
embedding_dim = 50
embedding_layer = Embedding(vocab_size, embedding_dim)

# Sentences have different lengths, so each one is embedded separately.
embeddings = [embedding_layer(torch.tensor(indices)) for indices in text_indices]


In [11]:
# Show the (number of tokens, embedding dimension) shape of every sentence.
for embed in embeddings:
    print(embed.shape)

torch.Size([9, 50])
torch.Size([6, 50])
torch.Size([5, 50])


## Automated preprocess

In [26]:
from transformers import AutoTokenizer

# Load the pretrained tokenizer registered under the name "bert-base-cased".
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

In [27]:
# Sentences of different lengths, used to demonstrate batched tokenisation.
batch_sentences = [
    "But what about second breakfast?",
    "Don't think he knows about second breakfast, Pip.",
    "What about elevensies?",
    "In a cold and gray Chicago morning, a poor little baby child is born in the ghetto",
    "I'll be there for you and the rain starts to pour",
    "I like big butts and I cannot lie",
]

# Tokenise the whole batch at once; padding is enabled so the ids can be
# returned as a single rectangular PyTorch tensor.
encoded_input = tokenizer(
    batch_sentences,
    padding=True,
    return_tensors="pt",
)

# Shape of the id tensor: (number of sentences, padded sequence length).
encoded_input["input_ids"].size()


torch.Size([6, 21])

In [30]:
# Display the size of the tokenizer's vocabulary.
tokenizer.vocab_size

28996

In [28]:
# Turn the token ids of the fourth sentence (row index 3) back into a string.
tokenizer.decode(encoded_input["input_ids"][3])

'[CLS] In a cold and gray Chicago morning, a poor little baby child is born in the ghetto [SEP]'

# Positional embedding

In [12]:
class PositionalEmbedding(nn.Module):
    """Add fixed sinusoidal positional encodings to a batch of embeddings.

    The encoding table follows "Attention Is All You Need"::

        PE[pos, 2k]     = sin(pos / base ** (2k / dimension))
        PE[pos, 2k + 1] = cos(pos / base ** (2k / dimension))

    Both columns of a (sin, cos) pair share the same frequency, and the
    exponent is derived from the pair index ``k`` rather than from the raw
    column index, so it always stays in ``[0, 1)``.

    The table is stored as a buffer named ``positional_encoding``: it moves
    with the module across devices and is saved in the ``state_dict``, but it
    is not a trainable parameter.

    Args:
        dimension: Size of the embedding vectors (number of table columns).
            Odd values are supported; the last column is then a sine column
            without a matching cosine.
        max_seq_length: Number of positions (table rows) to precompute.
        base: Base of the geometric progression of wavelengths. Defaults to
            10000, the value used in the original Transformer paper.
    """

    def __init__(self, dimension, max_seq_length=2000, base=10000.0):
        super(PositionalEmbedding, self).__init__()

        # Build the table in float64 for accuracy and cast once at the end.
        positions = torch.arange(max_seq_length, dtype=torch.float64).unsqueeze(1)
        # Even column indices 0, 2, 4, ... are exactly the "2k" of the formula.
        even_columns = torch.arange(0, dimension, 2, dtype=torch.float64)
        wavelengths = float(base) ** (even_columns / dimension)
        # angles[pos, k] = pos / base ** (2k / dimension), via broadcasting.
        angles = positions / wavelengths

        positional_encoding = torch.zeros(max_seq_length, dimension, dtype=torch.float64)
        positional_encoding[:, 0::2] = torch.sin(angles)
        # With an odd dimension there is one cosine column fewer than sines.
        positional_encoding[:, 1::2] = torch.cos(angles[:, : dimension // 2])
        positional_encoding = positional_encoding.to(torch.get_default_dtype())

        self.register_buffer('positional_encoding', positional_encoding)

    def forward(self, x):
        """Return ``x`` with the positional encoding of each position added.

        Args:
            x: Tensor whose axis 1 is the sequence axis and whose last axis
                has size ``dimension`` (expected ``(batch, seq_len, dimension)``).

        Returns:
            A tensor with the same shape as ``x``.

        Raises:
            ValueError: If the sequence is longer than ``max_seq_length``.
        """
        seq_length = x.size(1)
        max_seq_length = self.positional_encoding.size(0)
        if seq_length > max_seq_length:
            raise ValueError(
                f"Sequence length {seq_length} exceeds the maximum supported "
                f"length {max_seq_length}"
            )
        # The (seq_len, dimension) slice broadcasts over the batch axis.
        return x + self.positional_encoding[:seq_length, :]