In [1]:
# Handling special types of words/texts using spacy and nltk
import spacy

# Instantiate the English pipeline (en_core_web_sm) that the cells below use for tokenization
nlp = spacy.load("en_core_web_sm")

In [2]:
# Example sentences shared by the spaCy and NLTK cells. They contain
# contractions, abbreviations, a hyphenated word, an e-mail address, a URL,
# a number with a unit and a currency amount.
examples = [
    "\" We're moving to L.A.!\"",
    "We're here to help! Send snail-mail, email support@cdac.in or visit us at http://www.cdac.in",
    "A 5km NYC cab ride costs $10.30",
    "Let's viit St. Louis in the U.S. next year. ",
]

# Run each sentence through the spaCy pipeline and report its tokens
# together with how many there are.
for i, sentence in enumerate(examples, start=1):
    doc = nlp(sentence)
    tokens = [tok.text for tok in doc]
    # A single print call with a newline separator emits the same three lines.
    print(
        f"Sentence {i}: {sentence}",
        f"Tokens ({len(tokens)}): {tokens}",
        "-" * 60,
        sep="\n",
    )

Sentence 1: " We're moving to L.A.!"
Tokens (8): ['"', 'We', "'re", 'moving', 'to', 'L.A.', '!', '"']
------------------------------------------------------------
Sentence 2: We're here to help! Send snail-mail, email support@cdac.in or visit us at http://www.cdac.in
Tokens (18): ['We', "'re", 'here', 'to', 'help', '!', 'Send', 'snail', '-', 'mail', ',', 'email', 'support@cdac.in', 'or', 'visit', 'us', 'at', 'http://www.cdac.in']
------------------------------------------------------------
Sentence 3: A 5km NYC cab ride costs $10.30
Tokens (9): ['A', '5', 'km', 'NYC', 'cab', 'ride', 'costs', '$', '10.30']
------------------------------------------------------------
Sentence 4: Let's viit St. Louis in the U.S. next year. 
Tokens (11): ['Let', "'s", 'viit', 'St.', 'Louis', 'in', 'the', 'U.S.', 'next', 'year', '.']
------------------------------------------------------------


In [4]:
# How would NLTK do this?
import nltk
from nltk.tokenize import word_tokenize

# word_tokenize needs the Punkt tokenizer data, which is downloaded separately
# from the NLTK package. Probe once and fetch the data only if tokenization
# fails, so this cell also runs on a fresh environment. Newer NLTK releases
# look for the 'punkt_tab' resource, older ones for 'punkt', so both are
# requested in that case.
try:
    word_tokenize("probe")
except LookupError:
    for resource in ("punkt", "punkt_tab"):
        nltk.download(resource, quiet=True)

# Tokenize the same example sentences with NLTK and report the tokens
# together with how many there are.
for text in examples:
    print(f'Text : {text}')
    tokens = word_tokenize(text)
    print(tokens)
    print(f"Number of tokens : {len(tokens)}\n")

Text : " We're moving to L.A.!"
['``', 'We', "'re", 'moving', 'to', 'L.A.', '!', "''"]
Number of tokens : 8

Text : We're here to help! Send snail-mail, email support@cdac.in or visit us at http://www.cdac.in
['We', "'re", 'here', 'to', 'help', '!', 'Send', 'snail-mail', ',', 'email', 'support', '@', 'cdac.in', 'or', 'visit', 'us', 'at', 'http', ':', '//www.cdac.in']
Number of tokens : 20

Text : A 5km NYC cab ride costs $10.30
['A', '5km', 'NYC', 'cab', 'ride', 'costs', '$', '10.30']
Number of tokens : 8

Text : Let's viit St. Louis in the U.S. next year. 
['Let', "'s", 'viit', 'St.', 'Louis', 'in', 'the', 'U.S.', 'next', 'year', '.']
Number of tokens : 11



#### Output :
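##### spaCy performs better than NLTK on these examples
- spaCy keeps the e-mail address (`support@cdac.in`) and the URL (`http://www.cdac.in`) as single tokens, whereas NLTK's `word_tokenize` breaks them apart (`support`, `@`, `cdac.in` and `http`, `:`, `//www.cdac.in`).
- spaCy separates the number from its unit (`5km` → `5`, `km`) and splits `snail-mail` on the hyphen; NLTK keeps both as single tokens.
- Both tokenizers handle the contractions (`We're`, `Let's`) and the abbreviations (`L.A.`, `St.`, `U.S.`) the same way; NLTK additionally rewrites the double quotes as `` and ''.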