In [8]:
def get_stats(vocab):
    """Count how often each adjacent symbol pair occurs across the vocabulary.

    Args:
        vocab: Mapping from a word written as '_'-separated symbols
            (e.g. ``"s_a_i"``) to that word's frequency.

    Returns:
        A ``defaultdict(int)`` keyed by ``(left, right)`` symbol tuples. Each
        value is the summed frequency of the words containing that adjacent
        pair, counted once per occurrence.
    """
    pair_counts = defaultdict(int)
    for entry, count in vocab.items():
        pieces = entry.split('_')
        # Zipping the list against itself shifted by one walks every
        # adjacent (left, right) pair in order.
        for left, right in zip(pieces, pieces[1:]):
            pair_counts[left, right] += count
    return pair_counts

def merge_vocab(pair, v_in):
    """Merge every adjacent occurrence of ``pair`` into a single symbol.

    Words are stored as '_'-separated symbols (e.g. ``"s_a_i"``). Merging the
    pair ``("a", "i")`` turns that word into ``"s_ai"``.

    The merge is done symbol-by-symbol rather than with ``str.replace`` so
    that a pair only matches whole symbols: merging ``("a", "b")`` must not
    touch ``"xa_b"``, where the left symbol is ``"xa"``.

    Args:
        pair: ``(left, right)`` tuple of symbols to fuse.
        v_in: Mapping from '_'-separated word to frequency. Not modified.

    Returns:
        A new dict with the pair merged in every word. If two input words
        become identical after merging, their frequencies are summed.
    """
    left, right = pair
    # The merged symbol is the plain concatenation; joining with '_' would
    # reproduce the original text and make the merge a no-op.
    fused = left + right
    v_out = {}
    for word, freq in v_in.items():
        symbols = word.split('_')
        merged = []
        i = 0
        while i < len(symbols):
            is_pair = (
                i + 1 < len(symbols)
                and symbols[i] == left
                and symbols[i + 1] == right
            )
            if is_pair:
                merged.append(fused)
                i += 2  # consume both halves of the pair
            else:
                merged.append(symbols[i])
                i += 1
        w_out = '_'.join(merged)
        # Accumulate rather than overwrite so colliding keys keep their counts.
        v_out[w_out] = v_out.get(w_out, 0) + freq
    return v_out

def byte_pair_encoding(vocab, num_merges):
    """Run up to ``num_merges`` BPE merge steps over ``vocab``.

    Each step counts adjacent symbol pairs with ``get_stats``, picks the pair
    with the highest count, and merges it everywhere with ``merge_vocab``.
    The loop stops early once no adjacent pairs are left.

    Args:
        vocab: Mapping from '_'-separated word to frequency.
        num_merges: Maximum number of merge steps to perform.

    Returns:
        The vocabulary produced by the last merge step, or ``vocab`` itself
        if no step ran.
    """
    merged_vocab = vocab
    for _step in range(num_merges):
        pair_counts = get_stats(merged_vocab)
        if pair_counts:
            # On ties, max() keeps the first pair encountered.
            top_pair = max(pair_counts, key=lambda candidate: pair_counts[candidate])
            merged_vocab = merge_vocab(top_pair, merged_vocab)
        else:
            break
    return merged_vocab

def tokenize_with_bpe(text, vocab):
    """Greedily split ``text`` into the longest known BPE symbols.

    The vocabulary's keys are whole words written as '_'-separated symbols
    (e.g. ``"sai_r_a_m"``), so the units that can match the raw text are the
    individual symbols, not the keys themselves. Matching against the keys
    directly would never succeed for any word made of more than one symbol.

    Args:
        text: Raw text; it is split on whitespace into words.
        vocab: Iterable of '_'-separated words (typically the dict returned
            by ``byte_pair_encoding``). A flat collection of subwords
            without '_' works too.

    Returns:
        All tokens from all words, joined by '_'. When no known symbol is a
        prefix of what is left of a word, the whole remainder is emitted as
        one token.
    """
    # Empty symbols (from keys such as "s__r_i" or "") are dropped: an empty
    # match would consume nothing and the loop below would never terminate.
    symbols = {sym for entry in vocab for sym in entry.split('_') if sym}

    tokens = []
    for word in text.split():
        rest = word
        while rest:
            # Longest symbol that prefixes the remainder; fall back to the
            # remainder itself when nothing matches.
            piece = max(
                (sym for sym in symbols if rest.startswith(sym)),
                key=len,
                default=rest,
            )
            tokens.append(piece)
            rest = rest[len(piece):]
    return '_'.join(tokens)

In [10]:
# NOTE: this entire cell is one triple-quoted string literal, so none of the
# code inside it executes. It is a disabled demo: build a toy vocabulary,
# run 5 BPE merges, then tokenize "sairam". Remove the surrounding quotes
# to run it.
'''vocab = {
    "s_a_i": 5,
    "r_a_m": 2,
    "j_a_i": 6,
    "o_m": 3,
    "s__r_i": 1,
    }

# Perform 5 BPE merges
num_merges = 5
vocab = byte_pair_encoding(vocab, num_merges)

# Test tokenization
text = "sairam"
tokenized_text = tokenize_with_bpe(text, vocab)
print(tokenized_text)'''

sairam


---

# Byte Pair Encoding using Sathya Sai Speaks Volume 1 Chapters 1-5

This notebook experiments with Byte-Pair Encoding (BPE) on the text of *Sathya Sai Speaks*, Volume 1, Chapters 1–5.

## Loading the data

In [100]:
# Paths of the five chapter files, which follow the pattern
# ./sss/sss_01_NN.txt with NN running from 01 to 05.
filename = ['./sss/sss_01_{:02d}.txt'.format(chapter) for chapter in range(1, 6)]

In [101]:
# Read each file listed in `filename` into its own list of stripped lines.
# `line` ends up as a list of lists: one inner list per file, in file order.
line = []
for path in filename:
    # Explicit encoding: the text contains non-ASCII punctuation (e.g. ’) and
    # the platform default encoding is not UTF-8 everywhere.
    # NOTE(review): assumes the source files are UTF-8 encoded — confirm.
    with open(path, encoding='utf-8') as file:
        line.append([raw_line.strip() for raw_line in file])

In [102]:
# Flatten the per-file lists held in `line` into one list of text lines,
# preserving file order and line order within each file.
lines = [text for file_lines in line for text in file_lines]

## Creating the Vocabulary Dictionary using regex tokenizer

In [103]:
from nltk.tokenize import RegexpTokenizer
from tqdm import tqdm
# Tokenizer that splits on runs of whitespace: with gaps=True the pattern
# describes the separators between tokens rather than the tokens themselves.
# The pattern is a raw string so that the backslash-s sequence is not treated
# as an (invalid) string escape, which triggers a warning on recent Python.
tk = RegexpTokenizer(r'\s+', gaps = True)

In [107]:
# Count how many times each whitespace-delimited token occurs across all
# lines. Keys are inserted in first-seen order.
vocab_dict = {}

for text_line in lines:
    for token in tk.tokenize(text_line):
        vocab_dict[token] = vocab_dict.get(token, 0) + 1

In [108]:
# Dump the full token -> count mapping as currently held in `vocab_dict`.
print(vocab_dict)

{'When': 18, 'I': 44, 'was': 29, 'at': 23, 'Uravakonda': 1, 'studying': 1, 'in': 130, 'the': 667, 'high': 1, 'school,': 1, 'you': 178, 'know': 12, 'came': 6, 'away': 7, 'one': 36, 'day': 10, 'and': 310, 'threw': 2, 'off': 5, 'my': 4, 'books': 1, 'declared': 1, 'that': 148, 'have': 37, 'My': 8, 'work': 3, 'waiting': 1, 'for': 70, 'Me.': 4, 'The': 79, 'Telugu': 2, 'scholar': 2, 'described': 1, 'incident': 1, 'of': 273, 'evening': 1, 'to': 238, 'all': 47, 'his': 33, 'speech.': 1, 'Well,': 2, 'when': 40, 'out': 9, 'publicly': 1, 'as': 34, 'Sai': 4, 'Baba,': 1, 'first': 10, 'song': 2, 'taught': 1, 'gathering': 3, 'garden': 1, 'which': 27, 'went': 4, 'from': 31, 'pandit’s': 1, 'house': 4, 'was:': 1, 'Manasa': 2, 'bhajare': 2, 'guru': 7, 'charanam': 1, 'Dustara': 1, 'bhava': 1, 'sagara': 1, 'taranam': 1, 'called': 6, 'on': 38, 'those': 4, 'suffering': 4, 'endless': 1, 'round': 2, 'birth': 4, 'death': 1, 'worship': 4, 'Feet': 1, 'Guru': 2, '(spiritual': 2, 'preceptor),': 1, 'who': 35, 'announc

In [109]:
# Re-order the vocabulary from most to least frequent token. The sort is
# stable, so tokens with equal counts keep their existing relative order.
tokens_by_count = sorted(vocab_dict.items(), key=lambda entry: entry[1], reverse=True)
vocab_dict = dict(tokens_by_count)

In [110]:
# Dump the full token -> count mapping again, in its current key order.
print(vocab_dict)

{'the': 667, 'and': 310, 'of': 273, 'to': 238, 'is': 216, 'you': 178, 'that': 148, 'a': 131, 'in': 130, 'it': 101, 'not': 81, 'The': 79, 'will': 75, 'are': 72, 'for': 70, 'this': 59, 'or': 52, 'your': 51, 'with': 51, 'be': 49, 'all': 47, 'I': 44, 'when': 40, 'on': 38, 'do': 38, 'have': 37, 'one': 36, 'who': 35, 'as': 34, 'by': 34, 'his': 33, 'but': 33, 'He': 32, 'from': 31, 'has': 31, 'can': 31, 'was': 29, 'which': 27, 'get': 26, 'Lord': 26, 'only': 26, 'You': 25, 'he': 25, 'at': 23, 'like': 23, 'so': 23, 'no': 23, 'they': 22, 'must': 21, 'In': 19, 'even': 19, 'spiritual': 19, 'its': 19, 'When': 18, 'had': 18, 'It': 18, 'His': 18, 'into': 18, 'some': 16, 'if': 16, 'come': 15, 'an': 15, 'Of': 15, 'But': 15, 'more': 15, 'what': 15, 'Name': 15, 'That': 14, 'Form': 14, 'very': 13, 'them': 13, 'there': 13, 'So': 13, 'know': 12, 'their': 12, 'order': 12, 'faith': 12, 'name': 12, 'This': 11, 'may': 11, 'give': 11, 'asked': 11, 'through': 11, 'There': 11, 'take': 11, 'day': 10, 'first': 10, 'u

---