In [10]:
import re

def custom_tokenizer(text):
    """Tokenize tweet-like text using an ordered list of regex rewrites.

    Every rule is applied to the whole string, in order, with ``re.sub``;
    the rewritten string is then split on whitespace.

    Args:
        text: The raw string to tokenize.

    Returns:
        A list of token strings. URLs and simple e-mail addresses are
        replaced by the placeholder tokens ``"URL"`` and ``"EMAIL"``.
        Empty or whitespace-only input yields an empty list.
    """
    # Order matters: URLs and e-mails are collapsed to placeholders first so
    # the later digit/letter, punctuation and '@' rules cannot break them up.
    rules = [
        (r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", "URL"),  # URLs
        (r"\b\w+@\w+\.\w+\b", "EMAIL"),  # Emails (simple word@word.word form only)
        (r"(\d+)([a-zA-Z]+)", r"\1 \2"),  # Split numbers and letters
        (r"([a-zA-Z]+)(\d+)", r"\1 \2"),  # Split letters and numbers
        (r"([.,!?;:])", r" \1 "),  # Split punctuation
        (r"\b'\b", " ' "),  # Isolate in-word apostrophes (e.g., don't -> don ' t)
        (r"\s+", " "),  # Collapse whitespace runs into a single space
        (r"@(\w+)", r"\1"),  # Remove '@' but keep the username
        (r"#(\w+)", r"\1"),  # Remove '#' but keep the hashtag text
    ]

    # Apply each rule sequentially
    for pattern, repl in rules:
        text = re.sub(pattern, repl, text)

    # split() with no separator drops leading/trailing whitespace and never
    # yields empty strings, so "" or "   " gives [] rather than [''].
    return text.split()

# Demo: tokenize one sample tweet and print the input next to the result.
sample_text = "@VirginAmerica good to be home #texas #moodlighting http://t.co/N3BVZTY3zI"
tokens = custom_tokenizer(sample_text)

# Emit the labelled lines in order; print() joins label and value with a space.
for _label, _shown in (("Original text:", sample_text), ("Tokenized text:", tokens)):
    print(_label, _shown)


Original text: @VirginAmerica good to be home #texas #moodlighting http://t.co/N3BVZTY3zI
Tokenized text: ['VirginAmerica', 'good', 'to', 'be', 'home', 'texas', 'moodlighting', 'URL']
