In [127]:
from __future__ import annotations

import string
import re
from collections import defaultdict
# TODO: Feel free to add other imports as needed

# Tokenization Steps

In this exercise, you'll code your own tokenizer from scratching using base
Python!

You might normally start with a pretrained tokenizer, but this exercise will
help you get to know see some of the tokenization steps better.

## Define Sample Text

Let's first define some sample text you will use to test your tokenization
steps.

In [128]:
sample_text = '''Mr. Louis continued to say, "Penguins are important, 
but we mustn't forget the nuumber 1 priority: the READER!"
'''

print(sample_text)

Mr. Louis continued to say, "Penguins are important, 
but we mustn't forget the nuumber 1 priority: the READER!"



## Normalization

This step is where you'll normalize your text by converting to lowercase,
removing accented characters, etc.

For example, the text:
```
Did Uncle Max like the jalapeño dip?
```
might be normalized to:
```
did uncle max like the jalapeno dip
```

In [129]:
def check_char( letter : str):
    acceptable_chars  = ( string.ascii_letters +
                          string.punctuation +
                          string.whitespace +
                          string.digits)
    if(letter in acceptable_chars):
        return True
    else:
        return False
    
def normalize_text(text: str) -> str:
    # TODO: Normalize incoming text; can be multiple actions

    #remove accented characters
    acceptable_characters = (
        string.ascii_letters
        + string.digits
        + string.punctuation
        + string.whitespace
    )
    normalized_text1 = [x for x in text if x in acceptable_characters]
    normalized_text1 = ''.join(normalized_text1)
    
    #solution from udacity
    normalized_text = ''.join(
    filter(lambda letter: letter in acceptable_characters, text)
    )
#    normalized_text = ''.join(
#    filter(check_char, text)
#    )

    #first lower all letters
    normalized_text : str = normalized_text.lower()
    normalized_text1 : str = normalized_text1.lower()
    
    return normalized_text

In [130]:
# Test out your normalization
normalize_text(sample_text)
#normalize_text('Did Uncle Max like the jalapeño dip?')

'mr. louis continued to say, "penguins are important, \nbut we mustn\'t forget the nuumber 1 priority: the reader!"\n'

## Pretokenization

This step will take in the normalized text and pretokenize the text into a list
of smaller pieces.

For example, the text:
```
Did Uncle Max like the jalapeño dip?
```
might be normalized & then pretokenized to:
```
[
    'did',
    'uncle',
    'max',
    'like',
    'the',
    'jalapeno',
    'dip?',
]
```

In [131]:
def pretokenize_text(text: str) -> list[str]:
    # TODO: Pretokenize normalized text
    smaller_pieces = list()
    smaller_pieces = text.split()
    return smaller_pieces

In [132]:
# Test out your pretokenization step (after normalizing the text)
normalized_text = normalize_text(sample_text)
pretokenize_text(normalized_text)

['mr.',
 'louis',
 'continued',
 'to',
 'say,',
 '"penguins',
 'are',
 'important,',
 'but',
 'we',
 "mustn't",
 'forget',
 'the',
 'nuumber',
 '1',
 'priority:',
 'the',
 'reader!"']

## Tokenization

This step will take in the list of pretokenized pieces (after the text has 
been normalized) into the tokens that will be used.

For example, the text:
```
Did Uncle Max like the jalapeño dip?
```
might be normalized, pretokenized, and then tokenized to:
```
[
    'did',
    'uncle',
    'max',
    'like',
    'the',
    'jalapeno',
    'dip'
    '?',
]
```

In [133]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [134]:
# Combine normalization and pretokenization steps before breaking things further
def tokenize_text(text: str) -> list[str]:
    # Apply normalization & pretokenization steps 
    normalized_text: str = normalize_text(text)
    pretokenized_text: list[str] = pretokenize_text(normalized_text)
    # TODO: Go through pretokenized text to create a list of tokens
    tokens = list()
    word : str
    for word in pretokenized_text:
        tokens.extend(
        re.findall(f'[\w]+|[{string.punctuation}]', word)
        )
        
    return tokens

In [135]:
# Test out your tokenization (that uses normalizing & pretokenizing functions)
tokenize_text(sample_text)

['mr',
 '.',
 'louis',
 'continued',
 'to',
 'say',
 ',',
 '"',
 'penguins',
 'are',
 'important',
 ',',
 'but',
 'we',
 'mustn',
 "'",
 't',
 'forget',
 'the',
 'nuumber',
 '1',
 'priority',
 ':',
 'the',
 'reader',
 '!',
 '"']

## Postprocessing

This final step will take in the list of tokens from the original text and add
any special tokens to the text.

For example, the text:
```
Did Uncle Max like the jalapeño dip?
```
might be normalized, pretokenized, and then tokenized to:
```
[
    '[BOS]',
    'did',
    'uncle',
    'max',
    'like',
    'the',
    'jalapeno',
    'dip'
    '?',
    '[EOS]',
]
```

In [136]:
def postprocess_tokens(tokens: list[str]) -> list[str]:
    # TODO: Add beginning and end of sequence tokens to your tokenized text
    # Can use a format like '[BOS]' & '[EOS]'
    updated_tokens = list()
    updated_tokens.append('[BOS]')
    updated_tokens.extend(tokens)
    updated_tokens.append('[EOS]')
    #udacity version
    updated_tokens = ( ['[BOS]'] + tokens + ['EOS'])
    return updated_tokens

In [137]:
# Test full pipeline (normalizing, pretokenizing, tokenizing, & postprocessing)
tokens = tokenize_text(sample_text)
tokens = postprocess_tokens(tokens)

print(tokens)

['[BOS]', 'mr', '.', 'louis', 'continued', 'to', 'say', ',', '"', 'penguins', 'are', 'important', ',', 'but', 'we', 'mustn', "'", 't', 'forget', 'the', 'nuumber', '1', 'priority', ':', 'the', 'reader', '!', '"', 'EOS']


# Encoding & Decoding

## Encoding Text to Token IDs

Create an encoder (`encode()`) that will encode the token strings to integer IDs
by defining how to map each token to a unique ID.

> HINT: 
> 
> An easy method is to assign an arbitrary integer to each unique token from 
> the corpus by iterating through the unique tokens.

In [138]:
# Sample corpus (normally this would be much bigger)
sample_corpus = (
    '''Mr. Louis continued to say, "Penguins are important, \nbut we mustn't forget the nuumber 1 priority: the READER!"''',
    '''BRUTUS:\nHe's a lamb indeed, that baes like a bear.''',
    '''Both by myself and many other friends:\mBut he, his own affections' counsellor,\nIs to himself--I will not say how true--\nBut to himself so secret and so close,'''
)

In [139]:
# TODO: Create an encoder to transform token strings to IDs using the sample
# corpus as the basis of your encoding

unique_tokens : set = set()
# TODO: Your code here (might be outside of the encode() function scope)
for text in sample_corpus:
    tokens_from_text = tokenize_text(text)
    tokens_from_text = postprocess_tokens(tokens_from_text)
    unique_tokens.update(tokens_from_text)
# Create mapping (dictionary) for unique tokens using arbitrary & unique IDs
token2id = defaultdict(lambda : 0) # Allow for unknown tokens to map to 0
token2id |= {
    token: idx
    for idx, token in enumerate(unique_tokens, 1) # Skip 0 (represents unknown)
}

# A mapping for IDs to convert back to token
id2token = defaultdict(lambda : '[UNK]') # Allow for unknown token ('[UNK]')
id2token |= {
    idx: token
    for token, idx in token2id.items()
}


def encode(tokens: list[str]) -> list[int]:
    # COMPLETE: Complete this function to encode tokens to integer IDs
    encoded_tokens = [token2id[token] for token in tokens]
    return encoded_tokens

In [140]:
print(unique_tokens)

{'but', '-', 'important', 'that', 'mr', 'a', 'indeed', 's', 'mbut', 'by', 't', 'brutus', 'the', ',', 'secret', 'he', 'affections', 'is', 'so', 'reader', 'many', 'friends', 'own', 'mustn', 'priority', 'bear', '"', '!', 'continued', 'baes', 'both', 'himself', 'other', 'are', 'lamb', 'counsellor', 'not', '[BOS]', 'nuumber', 'EOS', '1', '.', 'say', 'like', 'i', 'will', 'penguins', 'close', 'his', 'we', 'myself', 'how', 'louis', ':', 'and', 'true', "'", 'forget', 'to'}


### Test `encode()`

In [141]:
# Use sample text for testing
sample_text = sample_corpus[0]
# Create tokens (to be fed to encode())
tokens = tokenize_text(sample_text)
tokens = postprocess_tokens(tokens)
print(f'Tokens:\n{tokens}\n')

Tokens:
['[BOS]', 'mr', '.', 'louis', 'continued', 'to', 'say', ',', '"', 'penguins', 'are', 'important', ',', 'but', 'we', 'mustn', "'", 't', 'forget', 'the', 'nuumber', '1', 'priority', ':', 'the', 'reader', '!', '"', 'EOS']



In [155]:
my_sample_text : str = "Eins Zwei Drei"
my_tokens = tokenize_text(my_sample_text)
my_tokens = postprocess_tokens(my_tokens)
print(f'Tokens:\n{my_tokens}\n')
my_decoded_tokens = encode(my_tokens)
print(f'Tokens:\n{my_decoded_tokens}\n')

Tokens:
['[BOS]', 'eins', 'zwei', 'drei', 'EOS']

Tokens:
[38, 0, 0, 0, 40]



In [142]:
# Test encode()
encoded_tokens = encode(tokens)
print(f'Encoded Tokens:\n{encoded_tokens}\n')

Encoded Tokens:
[38, 5, 42, 53, 29, 59, 43, 14, 27, 47, 34, 3, 14, 1, 50, 24, 57, 11, 58, 13, 39, 41, 25, 54, 13, 20, 28, 27, 40]



## Decoding Token IDs to Text

Based on your enocder you created (`encode()`), create a decoder (`decode()`) to
take a list of token IDs and map them to their associated token.

In [149]:
# TODO: Create an encoder to transform IDs (from encode()) to token strings

def decode(ids: list[int]) -> list[str]:
    # TODO: Complete this function to decode integer IDs to token strings
    token_strings = list()
    token_strings = [ id2token[x] for x in ids]
    return token_strings

### Test `decode()`

In [150]:
# Use sample text for testing
sample_text = sample_corpus[0]
# Create tokens
tokens = tokenize_text(sample_text)
tokens = postprocess_tokens(tokens)
print(f'Tokens:\n{tokens}\n')

# Create token IDs (to be fed to decode())
encoded_tokens = encode(tokens)
print(f'Encoded Tokens:\n{encoded_tokens}\n')

Tokens:
['[BOS]', 'mr', '.', 'louis', 'continued', 'to', 'say', ',', '"', 'penguins', 'are', 'important', ',', 'but', 'we', 'mustn', "'", 't', 'forget', 'the', 'nuumber', '1', 'priority', ':', 'the', 'reader', '!', '"', 'EOS']

Encoded Tokens:
[38, 5, 42, 53, 29, 59, 43, 14, 27, 47, 34, 3, 14, 1, 50, 24, 57, 11, 58, 13, 39, 41, 25, 54, 13, 20, 28, 27, 40]



In [151]:
# Test out decode()
decoded_tokens = decode(encoded_tokens)
print(f'Decoded Tokens:\n{decoded_tokens}\n')

Decoded Tokens:
['[BOS]', 'mr', '.', 'louis', 'continued', 'to', 'say', ',', '"', 'penguins', 'are', 'important', ',', 'but', 'we', 'mustn', "'", 't', 'forget', 'the', 'nuumber', '1', 'priority', ':', 'the', 'reader', '!', '"', 'EOS']

