In [None]:
import collections

In [1]:
# Training corpus: four short English sentences.
corpus = [
    "The sun set behind the mountains, casting a golden glow across the landscape.",
    "She walked through the quiet forest, her footsteps muffled by the soft blanket of fallen leaves.",
    "The aroma of freshly brewed coffee filled the air as he sat down at the kitchen table.",
    "In the distance, the sound of a river flowing peacefully echoed through the valley.",
]

In [2]:
# Display the corpus to check its contents.
corpus

['The sun set behind the mountains, casting a golden glow across the landscape.',
 'She walked through the quiet forest, her footsteps muffled by the soft blanket of fallen leaves.',
 'The aroma of freshly brewed coffee filled the air as he sat down at the kitchen table.',
 'In the distance, the sound of a river flowing peacefully echoed through the valley.']

In [4]:
# Seed the vocabulary with every distinct character that occurs in the corpus.
unique_chars = {char for sentence in corpus for char in sentence}

# Sort so the vocabulary order is deterministic.
vocab = sorted(unique_chars)

# Append the special token that marks the end of a word.
end_of_word = '<|endofword|>'
vocab.append(end_of_word)

In [6]:
# Report the starting vocabulary and how many symbols it contains.
print("vocabulary:", vocab)
print("length of vocabulary:", len(vocab))

vocabulary: [' ', ',', '.', 'I', 'S', 'T', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'y', '<|endofword|>']
length of vocabulary: 30


In [17]:
# Pre-tokenize the corpus: split each sentence into words on single spaces,
# then split each word into its characters followed by the end-of-word token.
# word_split maps each such character tuple to how often the word occurs.

word_split = {}
for sentence in corpus:
  words = sentence.split(' ')
  print("words:------------", words)
  for word in words:
    if not word:
      # split(' ') yields empty strings for consecutive spaces; skip them
      continue
    char_list = [*word, end_of_word]
    print("char_list:---------", char_list)
    word_tuple = tuple(char_list)

    # tally the frequency of this initial word split
    word_split[word_tuple] = word_split.get(word_tuple, 0) + 1


print("word_split:------------", word_split)


words:------------ ['The', 'sun', 'set', 'behind', 'the', 'mountains,', 'casting', 'a', 'golden', 'glow', 'across', 'the', 'landscape.']
char_list:--------- ['T', 'h', 'e', '<|endofword|>']
char_list:--------- ['s', 'u', 'n', '<|endofword|>']
char_list:--------- ['s', 'e', 't', '<|endofword|>']
char_list:--------- ['b', 'e', 'h', 'i', 'n', 'd', '<|endofword|>']
char_list:--------- ['t', 'h', 'e', '<|endofword|>']
char_list:--------- ['m', 'o', 'u', 'n', 't', 'a', 'i', 'n', 's', ',', '<|endofword|>']
char_list:--------- ['c', 'a', 's', 't', 'i', 'n', 'g', '<|endofword|>']
char_list:--------- ['a', '<|endofword|>']
char_list:--------- ['g', 'o', 'l', 'd', 'e', 'n', '<|endofword|>']
char_list:--------- ['g', 'l', 'o', 'w', '<|endofword|>']
char_list:--------- ['a', 'c', 'r', 'o', 's', 's', '<|endofword|>']
char_list:--------- ['t', 'h', 'e', '<|endofword|>']
char_list:--------- ['l', 'a', 'n', 'd', 's', 'c', 'a', 'p', 'e', '.', '<|endofword|>']
words:------------ ['She', 'walked', 'throug

### Helper Function: `get_pair_stats`

This function takes the current word splits (a dictionary whose keys are tuples of symbols/characters forming a word and whose values are the word frequencies) and calculates the frequency of each adjacent pair of symbols across the entire corpus.

**Input Example (`splits`):**

```
{('T', 'h', 'i', 's', '<|endofword|>'): 2, ('i', 's', '<|endofword|>'): 2, ...}
```

**Output Example (`pair_counts`):**

```
{('i', 's'): 4, ('s', '<|endofword|>'): 4, ('T', 'h'): 2, ...}
```


In [19]:
def get_pair_stats(splits):
  """Count how often each adjacent symbol pair occurs across all word splits.

  Args:
    splits: dict mapping a tuple of symbols (the current split of one word)
      to the frequency of that word.

  Returns:
    A ``collections.defaultdict(int)`` mapping ``(left, right)`` symbol pairs
    to their total count, where each occurrence is weighted by the frequency
    of the word it appears in.
  """
  pair_counts = collections.defaultdict(int)
  for word_tuple, freq in splits.items():
    symbols = list(word_tuple)
    # pair every symbol with its right-hand neighbour
    for left, right in zip(symbols, symbols[1:]):
      pair_counts[left, right] += freq
  return pair_counts


### Helper Function: `merge_pairs`


This function takes a specific pair (`pair_to_merge`) that we want to combine and the current `splits`. It iterates through all the word representations in `splits`, replaces occurrences of the `pair_to_merge` with a new single token (concatenation of the pair), and returns the updated `splits`.

**Input Example:**

- `pair_to_merge`: `('i', 's')`
- `splits`: `{('T', 'h', 'i', 's', '<|endofword|>'): 2, ('i', 's', '<|endofword|>'): 2, ...}`

**Output Example (`new_splits`):**

`{('T', 'h', 'is', '<|endofword|>'): 2, ('is', '<|endofword|>'): 2, ...}` (assuming 'is' is the merged token)

In [21]:
def merge_pairs(pair_to_merge, splits):
  """Replace every occurrence of a symbol pair with its concatenation.

  Each word split is scanned left to right; wherever ``first`` is immediately
  followed by ``second`` the two symbols are replaced by the single token
  ``first + second``. Matches do not overlap: after a merge the scan resumes
  past both merged symbols.

  Args:
    pair_to_merge: 2-tuple ``(first, second)`` of symbols to merge.
    splits: dict mapping a tuple of symbols (one word's current split) to
      that word's frequency.

  Returns:
    A new dict with the same structure as ``splits`` in which the pair has
    been merged. If several input keys collapse to the same merged tuple,
    their frequencies are summed rather than the last one overwriting the
    others. ``splits`` itself is not modified.
  """
  first, second = pair_to_merge
  merged_token = first + second
  new_splits = {}
  for word_tuple, freq in splits.items():
    symbols = list(word_tuple)
    new_symbols = []
    i = 0
    while i < len(symbols):
      # the bounds check keeps symbols[i+1] valid on the last symbol
      if i < len(symbols) - 1 and symbols[i] == first and symbols[i+1] == second:
        new_symbols.append(merged_token)
        i += 2  # skip both halves of the merged pair
      else:
        new_symbols.append(symbols[i])
        i += 1

    # accumulate so that colliding keys do not lose frequency mass
    new_key = tuple(new_symbols)
    new_splits[new_key] = new_splits.get(new_key, 0) + freq
  return new_splits