In [4]:
import numpy as np
import pandas as pd
import nltk
import spacy
from transformers import AutoTokenizer
import matplotlib.pyplot as plt
import seaborn as sns
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
from IPython.display import display, HTML
import re
import networkx as nx
import graphviz

# Download required NLTK data.
# 'punkt_tab' is the resource nltk.word_tokenize looks up on recent NLTK
# releases; it is fetched here so the tokenizer comparison further down
# works on a fresh Restart & Run All. 'punkt' is kept for older releases.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger')

# Load spaCy model, downloading it only when it is not installed yet.
# spacy.cli.download installs with the kernel's own interpreter, whereas a
# bare `!python3` may resolve to a different Python environment.
try:
    nlp = spacy.load('en_core_web_sm')
except OSError:
    from spacy.cli import download as download_spacy_model

    download_spacy_model('en_core_web_sm')
    # NOTE(review): on some setups a kernel restart is needed before a
    # freshly installed model becomes importable - confirm in your environment.
    nlp = spacy.load('en_core_web_sm')

# Set style for better visualizations
plt.style.use('seaborn-v0_8-dark')
sns.set_palette('husl')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/samarmohanty/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/samarmohanty/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [5]:
def word_tokenize_custom(text):
    """Split ``text`` into lowercase word tokens with punctuation removed.

    The text is lowercased and split on whitespace; every character that is
    neither a word character nor whitespace is then deleted from each chunk,
    and chunks that end up empty are dropped.

    Returns:
        list[str]: the cleaned tokens, in their original order.
    """
    tokens = []
    for chunk in text.lower().split():
        # Deleting (not replacing) punctuation means "it's" becomes "its"
        cleaned = re.sub(r'[^\w\s]', '', chunk)
        if cleaned:
            tokens.append(cleaned)
    return tokens

def sentence_tokenize_custom(text):
    """Split ``text`` into sentences on runs of '.', '!' or '?'.

    Each fragment is stripped of surrounding whitespace and empty fragments
    are discarded. The terminating punctuation itself is not kept.

    Returns:
        list[str]: the non-empty sentence fragments, in order.
    """
    fragments = (piece.strip() for piece in re.split(r'[.!?]+', text))
    return [fragment for fragment in fragments if fragment]

# Text area that feeds the custom tokenizers
text_input = widgets.Textarea(
    value='Enter your text here...',
    placeholder='Type something',
    description='Text:',
    style=dict(description_width='initial'),
    layout=dict(width='80%', height='100px'),
)


def update_tokenization(text):
    """Print the custom word tokens and sentence tokens for ``text``."""
    sections = (
        ("Word Tokens:", word_tokenize_custom),
        ("\nSentence Tokens:", sentence_tokenize_custom),
    )
    for heading, tokenize in sections:
        print(heading)
        print(tokenize(text))


# Bind the text area to the callback
interact(update_tokenization, text=text_input)

interactive(children=(Textarea(value='Enter your text here...', description='Text:', layout=Layout(height='100…

<function __main__.update_tokenization(text)>

## Understanding Regular Expressions (Regex) in Tokenization

Regular expressions are powerful patterns used to match and manipulate text. Let's break down the regex patterns we use in tokenization with interactive examples.

### Common Regex Patterns in Tokenization

1. `\w+` - Matches one or more word characters (letters, numbers, underscore)
2. `[.!?]+` - Matches one or more sentence-ending punctuation marks
3. `[^\w\s]` - Matches any character that is NOT a word character or whitespace
4. `\s+` - Matches one or more whitespace characters

Try these patterns in the interactive regex tester in the next cell!

In [6]:
def explain_regex(pattern, text):
    """Interactive regex explanation tool

    Prints the pattern, one line per match (matched text and its start-end
    positions), and finally ``text`` with every match wrapped in ANSI green
    escape codes.

    Args:
        pattern: Regular expression source string.
        text: String to search for matches.

    Returns:
        None. All results are printed.

    An invalid pattern is reported with a short message instead of raising,
    because the interactive widget calls this while the pattern is still
    being edited.
    """
    green, reset = '\033[92m', '\033[0m'

    try:
        compiled = re.compile(pattern)
    except re.error as err:
        print(f"Invalid regex pattern: {err}")
        return

    print(f"Pattern: {pattern}\n")
    print("Matches found:")

    # Build the highlighted text from slices of the untouched original, so no
    # running offset for the inserted escape codes has to be maintained.
    pieces = []
    cursor = 0
    for match in compiled.finditer(text):
        start, end = match.span()
        print(f"\nMatch: '{match.group()}' at positions {start}-{end}")

        pieces.append(text[cursor:start])
        pieces.append(green + match.group() + reset)
        cursor = end
    pieces.append(text[cursor:])

    print("\nText with matches highlighted:")
    print(''.join(pieces))

# Widgets for the regex tester: one line for the pattern, a text area for the sample text
pattern_input = widgets.Text(
    value=r'\w+',
    description='Pattern:',
    style=dict(description_width='initial'),
    layout=dict(width='80%'),
)

text_input = widgets.Textarea(
    value='Enter text to test regex pattern...',
    placeholder='Type something',
    description='Text:',
    style=dict(description_width='initial'),
    layout=dict(width='80%', height='100px'),
)


def update_regex_explanation(pattern, text):
    """Widget callback: forward the current pattern and text to explain_regex."""
    explain_regex(pattern=pattern, text=text)


# Bind both widgets to the callback
interact(update_regex_explanation, pattern=pattern_input, text=text_input)

interactive(children=(Text(value='\\w+', description='Pattern:', layout=Layout(width='80%'), style=TextStyle(d…

<function __main__.update_regex_explanation(pattern, text)>

## Advanced Tokenization with NLP Libraries

Now let's explore tokenization using popular NLP libraries. We'll compare different tokenization methods and see how they handle various types of text.

In [9]:
# Initialize tokenizers
# NLTK: the word_tokenize function itself, called later as nltk_tokenizer(text)
nltk_tokenizer = nltk.tokenize.word_tokenize
# spaCy: only the tokenizer of the `nlp` object loaded in the setup cell
spacy_tokenizer = nlp.tokenizer
# BERT: tokenizer looked up by checkpoint name
# NOTE(review): presumably fetched from the Hugging Face Hub on first use and
# cached afterwards - confirm behavior when running offline
bert_tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

def compare_tokenizers(text):
    """Print the tokens produced for ``text`` by NLTK, spaCy and BERT.

    The original text is echoed first, followed by one headed section per
    tokenizer with a blank line between sections.
    """
    print(f"Original text: {text}\n")

    # Each entry is evaluated lazily so its heading is printed before the
    # tokenizer runs.
    sections = (
        ("NLTK Tokenization:", lambda: nltk_tokenizer(text)),
        ("spaCy Tokenization:", lambda: [token.text for token in spacy_tokenizer(text)]),
        ("BERT Tokenization:", lambda: bert_tokenizer.tokenize(text)),
    )
    for position, (heading, run_tokenizer) in enumerate(sections):
        if position:
            print()
        print(heading)
        print(run_tokenizer())

# Text area that feeds the tokenizer comparison
text_input = widgets.Textarea(
    value='Enter text to compare different tokenizers...',
    placeholder='Type something',
    description='Text:',
    style=dict(description_width='initial'),
    layout=dict(width='80%', height='100px'),
)

# Bind the text area to the comparison function
interact(compare_tokenizers, text=text_input)

interactive(children=(Textarea(value='Enter text to compare different tokenizers...', description='Text:', lay…

<function __main__.compare_tokenizers(text)>

In [8]:
# NOTE(review): this cell was executed as In [8], i.e. before the tokenizer
# comparison cell (In [9]) that precedes it in the notebook. On a fresh
# Restart & Run All it runs too late for that cell's nltk word_tokenize call,
# which presumably needs 'punkt_tab' on recent NLTK releases - confirm and
# move this download ahead of the comparison cell.
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/samarmohanty/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

## Interactive Visualizations

Let's create visual representations of the tokenization process. We'll show word frequency distributions and the tokenization flow.

In [10]:
def visualize_tokenization(text):
    """Create interactive visualization of tokenization process

    Draws two figures:
      1. A horizontal bar chart of the (up to) 20 most frequent tokens that
         word_tokenize_custom produces for ``text``.
      2. A static flow diagram of the tokenization stages.

    Args:
        text: Raw text to tokenize and chart.

    Returns:
        None. Figures are shown with plt.show().

    When ``text`` yields no tokens (empty or punctuation-only input) the bar
    chart is skipped with a message instead of plotting empty data.
    """
    # Create word frequency plot
    words = word_tokenize_custom(text)
    if words:
        top_words = pd.Series(words).value_counts().head(20)

        fig, ax = plt.subplots(figsize=(12, 6))
        sns.barplot(x=top_words.values, y=top_words.index, ax=ax)
        ax.set_title('Top 20 Most Frequent Words')
        ax.set_xlabel('Frequency')
        ax.set_ylabel('Words')
        plt.show()
    else:
        print("No word tokens to plot - enter some text first.")

    # Create tokenization flow diagram
    G = nx.DiGraph()
    G.add_edges_from([
        ('Raw Text', 'Preprocessed'),
        ('Preprocessed', 'Tokenized'),
        ('Tokenized', 'Final Output')
    ])

    fig, ax = plt.subplots(figsize=(10, 6))
    # A fixed seed keeps the node positions identical on every redraw; the
    # default spring layout is randomly initialised.
    positions = nx.spring_layout(G, seed=42)
    nx.draw(G, pos=positions, ax=ax, with_labels=True, node_color='lightblue',
            node_size=2000, arrowsize=20)
    ax.set_title('Tokenization Process Flow')
    plt.show()

# Text area that feeds the visualization
text_input = widgets.Textarea(
    value='Enter text for visualization...',
    placeholder='Type something',
    description='Text:',
    style=dict(description_width='initial'),
    layout=dict(width='80%', height='100px'),
)

# Bind the text area to the plotting function
interact(visualize_tokenization, text=text_input)

interactive(children=(Textarea(value='Enter text for visualization...', description='Text:', layout=Layout(hei…

<function __main__.visualize_tokenization(text)>

## Challenges and Edge Cases

Let's explore some common challenges in tokenization and how to handle them. We'll look at contractions, special characters, and other edge cases.

In [11]:
def explore_challenges(text):
    """Demonstrate tokenization challenges and solutions

    Prints three views of ``text``:
      * the original text,
      * the text with common English contractions expanded,
      * the original text with every character that is neither a word
        character nor whitespace replaced by a space (whitespace collapsed).

    Args:
        text: Input string to demonstrate on.

    Returns:
        None. All results are printed.
    """
    # Irregular contractions whose stem changes. They are expanded before the
    # generic "n't" rule, which would otherwise produce "ca not" / "wo not".
    irregular_contractions = {
        "can't": "cannot",
        "won't": "will not",
        "shan't": "shall not",
    }
    irregular_pattern = re.compile(
        r"\b(?:" + "|".join(re.escape(word) for word in irregular_contractions) + r")\b",
        flags=re.IGNORECASE,
    )

    # Handle contractions
    # NOTE: "'s" is always expanded to " is", so possessives ("John's") are
    # expanded as well - the suffix alone cannot tell the two apart.
    contractions = {
        "n't": " not",
        "'m": " am",
        "'s": " is",
        "'re": " are",
        "'ll": " will",
        "'ve": " have"
    }

    def expand_irregular(match):
        # Keep a leading capital so "Can't" becomes "Cannot"
        word = match.group(0)
        expansion = irregular_contractions[word.lower()]
        return expansion.capitalize() if word[0].isupper() else expansion

    def expand_contractions(text):
        text = irregular_pattern.sub(expand_irregular, text)
        for contraction, expansion in contractions.items():
            text = text.replace(contraction, expansion)
        return text

    # Handle special characters
    def handle_special_chars(text):
        # Replace special characters with spaces
        text = re.sub(r'[^\w\s]', ' ', text)
        # Replace multiple spaces with single space
        text = re.sub(r'\s+', ' ', text)
        return text

    print("Original text:")
    print(text)
    print("\nAfter handling contractions:")
    print(expand_contractions(text))
    print("\nAfter handling special characters:")
    print(handle_special_chars(text))

# Text area pre-filled with a sentence containing contractions and punctuation
text_input = widgets.Textarea(
    value="I don't like this! It's too expensive...",
    placeholder='Type something with contractions and special characters',
    description='Text:',
    style=dict(description_width='initial'),
    layout=dict(width='80%', height='100px'),
)

# Bind the text area to the demonstration function
interact(explore_challenges, text=text_input)

interactive(children=(Textarea(value="I don't like this! It's too expensive...", description='Text:', layout=L…

<function __main__.explore_challenges(text)>

## Conclusion and Further Resources

### Key Takeaways

1. Tokenization is a crucial first step in NLP pipelines
2. Different tokenization methods serve different purposes
3. Understanding edge cases is important for robust tokenization
4. Regular expressions are powerful tools for text manipulation

### Further Reading

- [NLTK Documentation](https://www.nltk.org/)
- [spaCy Documentation](https://spacy.io/)
- [Hugging Face Transformers](https://huggingface.co/docs/transformers/)
- [Natural Language Processing with Python](https://www.nltk.org/book/)
- [Regex Tutorial](https://www.regular-expressions.info/tutorial.html)

### Practice Exercises

1. Try tokenizing different types of financial documents
2. Experiment with different tokenization parameters
3. Build your own custom tokenizer for specific use cases
4. Practice writing regex patterns for different text patterns