## Imports

In [1]:
import pickle

In [2]:
import numpy as np

In [3]:
import nltk
# Fetches the complete NLTK data collection ('all').
# NOTE(review): the code visible in this notebook only calls nltk.word_tokenize, which
# presumably needs just the Punkt tokenizer data - confirm before narrowing the download.
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /usr/share/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to /usr/share/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /usr/share/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /usr/share/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /usr/share/nltk_data...
[nltk_data]    |   Package basque_grammars is already up-to-date!
[nltk_data]    | Downloading package bcp47 to /usr/share/nltk_data...
[nltk_data]    | Downloading pa

True

In [4]:
import logging

# Write every record (DEBUG and above) to logs.log, prefixed with a timestamp.
# force=True replaces any handlers already attached to the root logger, so
# re-running this cell re-applies the configuration.
logging.basicConfig(
    filename="logs.log",
    format="%(asctime)s - %(message)s",
    level=logging.DEBUG,
    force=True,
)

In [5]:
# Example sentence reused by the demo cells of this notebook.
sen1 = "This is an apple"

### Get the vector for a sentence

> 1. Tokenize the sentence with the BPE merge rules.
> 2. For each token, look up its index in the vocabulary.

In [6]:
# Read the merge rules and the vocabulary from the Kaggle dataset directory.
# NOTE: pickle.load can execute arbitrary code while unpickling, so these files
# must come from a trusted source.
BPE_DATA_DIR = "/kaggle/input/tinystories-custom-bpe"

with open(f"{BPE_DATA_DIR}/merge_rules.pkl","rb") as f:
    merge_rules = pickle.load(f)

with open(f"{BPE_DATA_DIR}/vocab.pkl","rb") as f:
    vocab = pickle.load(f)

In [7]:
# Sanity check: number of merge rules and number of vocabulary entries loaded.
len(merge_rules),len(vocab)

(9786, 9840)

In [8]:
def tokenize(text,merge_rules):
    '''Split text into words and apply the BPE merge rules to every word.

    The text is lower-cased and split with nltk.word_tokenize. Each distinct
    word starts as the list of its characters. The merge rules are then applied
    one at a time, in the iteration order of merge_rules: for each rule every
    word is scanned left to right and each adjacent token pair equal to the
    rule is replaced by the concatenation of the two tokens.

    Args:
        text: Sentence to tokenize.
        merge_rules: Mapping keyed by (left, right) token pairs. Only the keys
            and their order are used.

    Returns:
        dict mapping each distinct word of the lower-cased text to its list of
        tokens after all merges.
    '''
    logging.info(f"Starting tokenization for the text -> {text}")
    text = text.lower() # our vocabulary is uncased.
    words = nltk.word_tokenize(text)

    # word -> list of tokens, initially one token per character
    word_dict = {word: list(word) for word in words}

    logging.info("Starting to iterate through each merge rule")

    for merge_rule in merge_rules:
        rule_applied = False
        for word,tokens in word_dict.items():
            idx = 0
            while idx < len(tokens)-1:
                if (tokens[idx],tokens[idx+1]) == merge_rule:
                    logging.info(f"merging for word {word}")
                    # Merge in place and do NOT advance idx: the next pair to test is
                    # the freshly merged token together with the token that follows it.
                    tokens[idx:idx+2] = [tokens[idx]+tokens[idx+1]]
                    rule_applied = True
                else:
                    # no merge here, slide the window one token to the right
                    idx += 1
        # Only log the state after a rule that changed something; logging the full
        # dict for every one of the thousands of rules floods the log file and
        # dominates the run time.
        if rule_applied:
            logging.info(f"After merge {word_dict}")
    return word_dict
        

In [9]:
# Spot-check one entry: the value stored for the pair ('t', 'h').
merge_rules[('t', 'h')]

'th'

In [10]:
# Run the tokenizer on the example sentence and display the word -> tokens mapping.
tokenize(sen1,merge_rules)

{'this': ['this'], 'is': ['is'], 'an': ['an'], 'apple': ['apple']}

In [11]:
def sen_to_tensor(sen,vocab,merge_rules):
    '''Convert a sentence into an array of vocabulary indices, one per token.

    The sentence is lower-cased and tokenized with tokenize(). The per-word
    tokens are concatenated in sentence order (a repeated word contributes its
    tokens every time it occurs) and each token is replaced by its position in
    the vocabulary.

    Args:
        sen: Sentence to encode.
        vocab: Iterable of tokens; a token's index is the position at which it
            first appears when the iterable is traversed.
            NOTE(review): if this is a set, that order is not guaranteed to be
            the same between interpreter runs - confirm the pickled type.
        merge_rules: Merge rules forwarded to tokenize().

    Returns:
        1-D numpy integer array holding one vocabulary index per token.

    Raises:
        ValueError: if a token produced by the merge rules is not in vocab.
    '''
    sen = sen.lower()
    token_dict = tokenize(sen,merge_rules)
    words = nltk.word_tokenize(sen)

    list_of_tokens = []
    for word in words:
        list_of_tokens.extend(token_dict[word])

    # Build the token -> index table once instead of scanning the whole vocabulary
    # with list.index() for every token. setdefault keeps the first position of a
    # duplicated entry, which is the position list.index() would return.
    index_of = {}
    for position,entry in enumerate(vocab):
        index_of.setdefault(entry,position)

    missing = [token for token in list_of_tokens if token not in index_of]
    if missing:
        # Same exception type list.index() raises, but naming the offending tokens.
        raise ValueError(f"tokens not in vocab: {missing}")

    return np.array([index_of[token] for token in list_of_tokens],dtype=int)
        
    

In [12]:
# Encode the example sentence as vocabulary indices and display the array.
sen_to_tensor(sen1,vocab,merge_rules)

array([3141,   34, 9520,   53])

In [13]:
def pos_embed_sen(sen_tensor,n=100,d=4):
    '''Build the sinusoidal positional-encoding matrix for a sentence tensor.

    Only the length of sen_tensor is used. For position k and pair index i in
    range(int(d/2)), column 2*i holds sin(k / n**(2*i/d)) and column 2*i+1 holds
    cos(k / n**(2*i/d)). When d is odd the last column is left at zero.

    Args:
        sen_tensor: Sequence whose length gives the number of positions.
        n: Base of the per-column denominator.
        d: Number of columns (embedding dimension) of the result.

    Returns:
        numpy array of shape (len(sen_tensor), d).
    '''
    num_positions = len(sen_tensor)
    num_pairs = int(d/2)
    embedding = np.zeros((num_positions,d))

    # one denominator per (sin, cos) column pair
    denominators = np.power(n,(2*np.arange(num_pairs))/d)
    # angles[k, i] = k / denominators[i], built by broadcasting a column of positions
    # against a row of denominators
    angles = np.arange(num_positions)[:,None]/denominators[None,:]

    embedding[:,0:2*num_pairs:2] = np.sin(angles) #even embed dims
    embedding[:,1:2*num_pairs:2] = np.cos(angles) #odd embed dims
    return embedding
        
    
    

In [14]:
# Encode the example sentence, then display its positional-encoding matrix
# computed with the defaults n=100, d=4.
tensor= sen_to_tensor(sen1,vocab,merge_rules)
pos_embed_sen(tensor)

array([[ 0.        ,  1.        ,  0.        ,  1.        ],
       [ 0.84147098,  0.54030231,  0.09983342,  0.99500417],
       [ 0.90929743, -0.41614684,  0.19866933,  0.98006658],
       [ 0.14112001, -0.9899925 ,  0.29552021,  0.95533649]])