# Simple word-level encoder-decoder from scratch

Credit to Amir Nazeri & \
Udacity Gen AI nanodegree


In [1]:
from __future__ import annotations

# COMPLETE: Feel free to add other imports as needed
import string
import re
from collections import defaultdict

# Tokenization Steps

In this exercise, you'll code your own tokenizer from scratching using base
Python!

You might normally start with a pretrained tokenizer, but this exercise will
help you get to know see some of the tokenization steps better.

## Define Sample Text

Let's first define some sample text you will use to test your tokenization
steps.

In [2]:
sample_text = '''Mr. Louis continued to say, "Penguins are important,
but we mustn't forget the nuumber 1 priority: the READER!"
'''

print(sample_text)

Mr. Louis continued to say, "Penguins are important, 
but we mustn't forget the nuumber 1 priority: the READER!"



## Normalization

This step is where you'll normalize your text by converting to lowercase,
removing accented characters, etc.

For example, the text:
```
Did Uncle Max like the jalapeño dip?
```
might be normalized to:
```
did uncle max like the jalapeno dip
```

In [10]:
def normalize_text(text: str) -> str:
    # Normalize incoming text; can be multiple actions
    # Only keep ASCII letters, numbers, punctuation, and whitespace characters
    acceptable_characters = (
        string.ascii_letters
        + string.digits
        + string.punctuation
        + string.whitespace
    )
    normalized_text = ''.join(
        filter(lambda letter: letter in acceptable_characters, text)
    )
    # Make text lower-case
    normalized_text = normalized_text.lower()
    return normalized_text

In [11]:
# Test out your normalization
normalize_text(sample_text)

'mr. louis continued to say, "penguins are important, \nbut we mustn\'t forget the nuumber 1 priority: the reader!"\n'

## Pretokenization

This step will take in the normalized text and pretokenize the text into a list
of smaller pieces.

For example, the text:
```
Did Uncle Max like the jalapeño dip?
```
might be normalized & then pretokenized to:
```
[
    'did',
    'uncle',
    'max',
    'like',
    'the',
    'jalapeno',
    'dip?',
]
```

In [12]:
def pretokenize_text(text: str) -> list[str]:
    # pretokenize normalized text
    # Split based on spaces
    smaller_pieces = text.split()
    return smaller_pieces

In [13]:
# Test out your pretokenization step (after normalizing the text)
normalized_text = normalize_text(sample_text)
pretokenize_text(normalized_text)

['mr.',
 'louis',
 'continued',
 'to',
 'say,',
 '"penguins',
 'are',
 'important,',
 'but',
 'we',
 "mustn't",
 'forget',
 'the',
 'nuumber',
 '1',
 'priority:',
 'the',
 'reader!"']

## Tokenization

This step will take in the list of pretokenized pieces (after the text has
been normalized) into the tokens that will be used.

For example, the text:
```
Did Uncle Max like the jalapeño dip?
```
might be normalized, pretokenized, and then tokenized to:
```
[
    'did',
    'uncle',
    'max',
    'like',
    'the',
    'jalapeno',
    'dip'
    '?',
]
```

In [14]:
# Combine normalization and pretokenization steps before breaking things further
def tokenize_text(text: str) -> list[str]:
    # Apply created steps
    normalized_text: str = normalize_text(text)
    pretokenized_text: list[str] = pretokenize_text(normalized_text)
    # COMPLETE: Go through pretokenized text to create a list of tokens
    tokens = []
    # Small 'pieces' to make full tokens
    for word in pretokenized_text:
        tokens.extend(
            re.findall(
                f'[\w]+|[{string.punctuation}]', # Split word at punctuations
                word,
            )
        )
    return tokens

In [15]:
# Test out your tokenization (that uses normalizing & pretokenizing functions)
tokenize_text(sample_text)

['mr',
 '.',
 'louis',
 'continued',
 'to',
 'say',
 ',',
 '"',
 'penguins',
 'are',
 'important',
 ',',
 'but',
 'we',
 'mustn',
 "'",
 't',
 'forget',
 'the',
 'nuumber',
 '1',
 'priority',
 ':',
 'the',
 'reader',
 '!',
 '"']

## Postprocessing

This final step will take in the list of tokens from the original text and add
any special tokens to the text.

For example, the text:
```
Did Uncle Max like the jalapeño dip?
```
might be normalized, pretokenized, and then tokenized to:
```
[
    '[BOS]',
    'did',
    'uncle',
    'max',
    'like',
    'the',
    'jalapeno',
    'dip'
    '?',
    '[EOS]',
]
```

In [23]:
# Useful for some tasks
def postprocess_tokens(tokens: list[str]) -> list[str]:
    # COMPLETE: Add beginning and end of sequence tokens to your tokenized text
    # Can use a format like '[BOS]' & '[EOS]'
    bos_token = '[BOS]'
    eos_token = '[EOS]'
    updated_tokens = (
        [bos_token]
        + tokens
        + [eos_token]
    )
    return updated_tokens

In [24]:
# Test full pipeline (normalizing, pretokenizing, tokenizing, & postprocessing)
tokens = tokenize_text(sample_text)
tokens = postprocess_tokens(tokens)

print(tokens)

['[BOS]', 'mr', '.', 'louis', 'continued', 'to', 'say', ',', '"', 'penguins', 'are', 'important', ',', 'but', 'we', 'mustn', "'", 't', 'forget', 'the', 'nuumber', '1', 'priority', ':', 'the', 'reader', '!', '"', '[EOS]']


# Encoding & Decoding

## Encoding Text to Token IDs

Create an encoder (`encode()`) that will encode the token strings to integer IDs
by defining how to map each token to a unique ID.

> HINT:
>
> An easy method is to assign an arbitrary integer to each unique token from
> the corpus by iterating through the unique tokens.

In [25]:
# Sample corpus (normally this would be much bigger)
sample_corpus = (
    '''Mr. Louis continued to say, "Penguins are important, \nbut we mustn't forget the nuumber 1 priority: the READER!"''',
    '''BRUTUS:\nHe's a lamb indeed, that baes like a bear.''',
    '''Both by myself and many other friends:\mBut he, his own affections' counsellor,\nIs to himself--I will not say how true--\nBut to himself so secret and so close,'''
)

## Methods for Updating Dictionaries

Here are three methods to update a dictionary, specifically `token2id`, to include new entries from another dictionary.

### 1. **In-Place Union Update (`|=`)**
- **Syntax:** `token2id |= new_entries`
- **Description:** Updates `token2id` in place by merging it with `new_entries` using the `|=` operator.
- **Introduced in:** Python 3.9

### 2. **Using `update()` Method**
- **Syntax:** `token2id.update(new_entries)`
- **Description:** Uses the `update()` method to add or update key-value pairs from `new_entries` into `token2id`.
- **Common Use Case:** Directly modifies the original dictionary.

### 3. **Reassignment with Unpacking (`{**token2id, **new_entries}`)**
- **Syntax:** `token2id = {**token2id, **new_entries}`
- **Description:** Merges `token2id` and `new_entries` by unpacking them into a new dictionary and reassigning the result back to `token2id`.
- **Common Use Case:** Creates a new merged dictionary, leaving the original unchanged until reassignment.

All three methods effectively merge dictionaries, but the choice depends on the Python version and whether you prefer modifying the dictionary in place or creating a new one.

In [27]:
# Create an encoder to transform token strings to IDs using the sample
# corpus as the basis of your encoding

# Your code here (might be outside of the encode() function scope)

# Retrieve unique tokens (from the pipeline defined above) in a set
unique_tokens = set()
for text in sample_corpus:
    tokens_from_text = tokenize_text(text)
    tokens_from_text = postprocess_tokens(tokens_from_text)
    unique_tokens.update(tokens_from_text)

# Create mapping (dictionary) for unique tokens using arbitrary & unique IDs
token2id = defaultdict(lambda : 0) # Allow for unknown tokens to map to 0
token2id |= {
    token: idx
    for idx, token in enumerate(unique_tokens, 1) # Skip 0 (represents unknown)
}

# A mapping for IDs to convert back to token
id2token = defaultdict(lambda : '[UNK]') # Allow for unknown token ('[UNK]')
id2token |= {
    idx: token
    for token, idx in token2id.items()
}


def encode(tokens: list[str]) -> list[int]:
    # Complete this function to encode tokens to integer IDs
    encoded_tokens = [token2id[token] for token in tokens]
    return encoded_tokens


### Test `encode()`

In [28]:
# Use sample text for testing
sample_text = sample_corpus[0]
# Create tokens (to be fed to encode())
tokens = tokenize_text(sample_text)
tokens = postprocess_tokens(tokens)
print(f'Tokens:\n{tokens}\n')

Tokens:
['[BOS]', 'mr', '.', 'louis', 'continued', 'to', 'say', ',', '"', 'penguins', 'are', 'important', ',', 'but', 'we', 'mustn', "'", 't', 'forget', 'the', 'nuumber', '1', 'priority', ':', 'the', 'reader', '!', '"', '[EOS]']



In [29]:
# Test encode()
encoded_tokens = encode(tokens)
print(f'Encoded Tokens:\n{encoded_tokens}\n')

Encoded Tokens:
[36, 7, 24, 30, 53, 33, 6, 31, 17, 21, 49, 3, 31, 50, 44, 43, 56, 37, 20, 22, 29, 34, 55, 32, 22, 51, 4, 17, 28]



## Decoding Token IDs to Text

Based on your enocder you created (`encode()`), create a decoder (`decode()`) to
take a list of token IDs and map them to their associated token.

In [30]:
# COMPLETE: Create an encoder to transform IDs (from encode()) to token strings

def decode(ids: list[int]) -> list[str]:
    # Complete this function to decode integer IDs to token strings
    token_strings = [id2token[idx] for idx in ids]
    return token_strings

### Test `decode()`

In [31]:
# Use sample text for testing
sample_text = sample_corpus[0]
# Create tokens
tokens = tokenize_text(sample_text)
tokens = postprocess_tokens(tokens)
print(f'Tokens:\n{tokens}\n')

# Create token IDs (to be fed to decode())
encoded_tokens = encode(tokens)
print(f'Encoded Tokens:\n{encoded_tokens}\n')

Tokens:
['[BOS]', 'mr', '.', 'louis', 'continued', 'to', 'say', ',', '"', 'penguins', 'are', 'important', ',', 'but', 'we', 'mustn', "'", 't', 'forget', 'the', 'nuumber', '1', 'priority', ':', 'the', 'reader', '!', '"', '[EOS]']

Encoded Tokens:
[36, 7, 24, 30, 53, 33, 6, 31, 17, 21, 49, 3, 31, 50, 44, 43, 56, 37, 20, 22, 29, 34, 55, 32, 22, 51, 4, 17, 28]



In [32]:
# Test out decode()
decoded_tokens = decode(encoded_tokens)
print(f'Decoded Tokens:\n{decoded_tokens}\n')

Decoded Tokens:
['[BOS]', 'mr', '.', 'louis', 'continued', 'to', 'say', ',', '"', 'penguins', 'are', 'important', ',', 'but', 'we', 'mustn', "'", 't', 'forget', 'the', 'nuumber', '1', 'priority', ':', 'the', 'reader', '!', '"', '[EOS]']

