## Imports

In [13]:
import pickle

In [14]:
import nltk
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /usr/share/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to /usr/share/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /usr/share/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /usr/share/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /usr/share/nltk_data...
[nltk_data]    |   Package basque_grammars is already up-to-date!
[nltk_data]    | Downloading package bcp47 to /usr/share/nltk_data...
[nltk_data]    |   Pack

True

In [15]:
import logging
logging.basicConfig(filename="logs.log",format="%(asctime)s - %(message)s",level=logging.DEBUG,force=True)

In [16]:
sen1 = "This is an apple"

### Get the vector for a sentence

> Tokenize 
> For each token get the index from the vocabulary

In [17]:
# Read the vocab and merge rules

with open("/kaggle/input/basic-bpe/merge_rules.pkl","rb") as f:
    merge_rules = pickle.load(f)
    
with open("/kaggle/input/basic-bpe/vocab.pkl","rb") as f:
    vocab = pickle.load(f)

In [18]:
len(merge_rules),len(vocab)

(9786, 9840)

In [19]:
def tokenize(text,merge_rules):
    '''Return tokens from the text'''
    logging.info(f"Starting tokenization for the text -> {text}")
    text = text.lower() # our vocabulary is uncased.
    words = nltk.word_tokenize(text)
    
    ##preparing dict where word is the key and the tokens is its value
    word_dict = dict()
    for word in words:
        word_dict[word] = []
        for char in word:
            word_dict[word].append(char)
            
    logging.info("Starting to iterate through each merge rule")
    
    for merge_rule in merge_rules.keys():
        #go through each merge rule and tokenize each word from left to right
        for word in word_dict.keys():
            tokens = word_dict[word]
            idx=0
            while(idx<len(tokens)-1):
                pair = (tokens[idx],tokens[idx+1])
                if pair==merge_rule:
                    logging.info(f"merging for word {word}")
                    #merge the token of the word as per the merge rule
                    #if merged we dont increase the index of the tokens because the next pair should include the new pair and the next char
                    tokens = tokens[:idx]+[tokens[idx]+tokens[idx+1]]+tokens[idx+2:]
                    word_dict[word] = tokens
                else:
                    #if not merged then we slide the window over
                    idx = idx+1
        logging.info(f"After merge {word_dict}")
    return word_dict    
        

In [20]:
merge_rules[('t', 'h')]

'th'

In [None]:
tokenize(sen1,merge_rules)

2023-10-08 17:03:36,298 - Starting tokenization for the text -> This is an apple
2023-10-08 17:03:36,300 - Starting to iterate through each merge rule
2023-10-08 17:03:36,301 - After merge {'this': ['t', 'h', 'i', 's'], 'is': ['i', 's'], 'an': ['a', 'n'], 'apple': ['a', 'p', 'p', 'l', 'e']}
2023-10-08 17:03:36,302 - After merge {'this': ['t', 'h', 'i', 's'], 'is': ['i', 's'], 'an': ['a', 'n'], 'apple': ['a', 'p', 'p', 'l', 'e']}
2023-10-08 17:03:36,304 - merging for word an
2023-10-08 17:03:36,305 - After merge {'this': ['t', 'h', 'i', 's'], 'is': ['i', 's'], 'an': ['an'], 'apple': ['a', 'p', 'p', 'l', 'e']}
2023-10-08 17:03:36,306 - After merge {'this': ['t', 'h', 'i', 's'], 'is': ['i', 's'], 'an': ['an'], 'apple': ['a', 'p', 'p', 'l', 'e']}
2023-10-08 17:03:36,307 - After merge {'this': ['t', 'h', 'i', 's'], 'is': ['i', 's'], 'an': ['an'], 'apple': ['a', 'p', 'p', 'l', 'e']}
2023-10-08 17:03:36,308 - After merge {'this': ['t', 'h', 'i', 's'], 'is': ['i', 's'], 'an': ['an'], 'apple': 