In [2]:
import re

token_patterns = [
    # 1. Abbreviations
    r'(?:[A-Z]\.){2,}',

    # 2. Email IDs
    r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}',

    # 3. IP Addresses
    r'\b(?:\d{1,3}\.){3}\d{1,3}\b',

    # 4. Hyphenated Words
    r'\b\w+(?:-\w+)+\b',

    # 5. Short Forms
    r'\b[A-Z][a-z]+\.',

    # 6. Numbers
    r'\$?\d+(?:/\d+/\d+)?|\d{1,2}/\d{1,2}/\d{2,4}|\d+\.\d+|\d{4}',

    # 7. Phone Numbers
    r'(?:\+?\d{1,3}[-.\s]?)?(?:\(?\d{2,4}\)?[-.\s]?)?\d{3,4}[-.\s]?\d{4}',

    # 8. Regular words
    r"[A-Za-z']+",

    # 9. Punctuation
    r'[.,!?;:]'
]

token_regex = re.compile('|'.join(f'({p})' for p in token_patterns))

def custom_tokenizer(text):
    tokens = [t for t in re.findall(token_regex, text)]
    tokens = [next(filter(None, tup)) for tup in tokens]
    return tokens

In [8]:
text = """
The team arrived at the research site.
Weather conditions were partly cloudy with a temperature of 72.5Â°F.
Three participants were present: Dr. Allen, Prof. White, and Ms. Johnson.
One laptop was connected to the local network (IP 172.16.254.1) for data logging.
During the session, a call was made to +44 20 7946 0958 to confirm equipment delivery.
The cost of additional supplies was estimated at $350.00.
"""
tokens = custom_tokenizer(text)
print(tokens)

['The', 'team', 'arrived', 'at', 'the', 'research', 'site', '.', 'Weather', 'conditions', 'were', 'partly', 'cloudy', 'with', 'a', 'temperature', 'of', '72', '.', '5', 'F', '.', 'Three', 'participants', 'were', 'present', ':', 'Dr.', 'Allen', ',', 'Prof.', 'White', ',', 'and', 'Ms.', 'Johnson.', 'One', 'laptop', 'was', 'connected', 'to', 'the', 'local', 'network', 'IP', '172.16.254.1', 'for', 'data', 'logging', '.', 'During', 'the', 'session', ',', 'a', 'call', 'was', 'made', 'to', '+44 20 7946 0958', 'to', 'confirm', 'equipment', 'delivery', '.', 'The', 'cost', 'of', 'additional', 'supplies', 'was', 'estimated', 'at', '$350', '.', '00', '.']
