In [1]:
# Daniel Bandala @ sep 2022
from nltk import download, NLTKWordTokenizer
from nltk.tokenize import word_tokenize,sent_tokenize,TweetTokenizer
from nltk.tokenize.treebank import TreebankWordDetokenizer

# Tokenization
Natural Language Processing (NLP) enables machine learning algorithms to organize and understand human language. It allows machines not only to gather text and speech but also to identify the core meaning they should respond to. Human language is complex and constantly evolving, which makes natural language processing a considerable challenge. Tokenization is one of the many pieces of the puzzle in how NLP works. In this context, tokenization is a simple process that takes raw text and converts it into a sequence of useful units. Although the term is also well known for its use in cybersecurity and in the creation of NFTs, tokenization is an important part of the NLP process: it is used to split paragraphs and sentences into smaller units that can be more easily assigned meaning. In general, tokenization is a way of separating a piece of text into smaller units called tokens. Tokens can be words, characters, or subwords, so tokenization can be broadly classified into three types: word, character, and subword (character n-gram) tokenization.

In [2]:
# Download the Punkt sentence-tokenizer data that word_tokenize and
# sent_tokenize load at call time.
# Older NLTK releases read the pickled 'punkt' package; newer releases (3.9+)
# read 'punkt_tab' instead and raise LookupError when it is missing, so both
# are fetched. The list (not a generator) makes sure every download is
# attempted; the cell evaluates to True only if all of them succeeded.
all([download(resource) for resource in ('punkt', 'punkt_tab')])

[nltk_data] Downloading package punkt to /home/bandala/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# sample text made of two short sentences
text = "God is Great! I won a lottery."
# word tokenization: words and punctuation marks become separate tokens
word_tokens = word_tokenize(text)
print(word_tokens)

['God', 'is', 'Great', '!', 'I', 'won', 'a', 'lottery', '.']


In [4]:
# sentence tokenization: split `text` into one string per sentence
sentences = sent_tokenize(text)
print(sentences)

['God is Great!', 'I won a lottery.']


In [5]:
# currency amounts: '$' is split off, while '50,000' and '366.88' stay whole
s1 = "On a $50,000 mortgage of 30 years at 8 percent, the monthly payment would be $366.88."
s1_tokens = word_tokenize(s1)
print(s1_tokens)

['On', 'a', '$', '50,000', 'mortgage', 'of', '30', 'years', 'at', '8', 'percent', ',', 'the', 'monthly', 'payment', 'would', 'be', '$', '366.88', '.']


In [6]:
# quoted speech: the double quotes come out as `` (opening) and '' (closing) tokens
s2 = "\"We beat some pretty good teams to get here,\" Slocum said."
s2_tokens = word_tokenize(s2)
print(s2_tokens)

['``', 'We', 'beat', 'some', 'pretty', 'good', 'teams', 'to', 'get', 'here', ',', "''", 'Slocum', 'said', '.']


### Gathering the spans of the tokenized strings

In [7]:
# Multi-line sample text, written one source line per text line.
s = (
    'Good muffins cost $3.88\n'
    'in New (York).  Please (buy) me\n'
    'two of them.\n'
    '(Thanks).'
)
# Expected (start, end) character offsets of every token in `s`,
# grouped by the text line they belong to.
expected = [
    # Good muffins cost $3.88
    (0, 4), (5, 12), (13, 17), (18, 19), (19, 23),
    # in New (York).
    (24, 26), (27, 30), (31, 32), (32, 36), (36, 37), (37, 38),
    # Please (buy) me
    (40, 46), (47, 48), (48, 51), (51, 52), (53, 55),
    # two of them.
    (56, 59), (60, 62), (63, 68),
    # (Thanks).
    (69, 70), (70, 76), (76, 77), (77, 78),
]

In [8]:
# the tokenizer's character spans must equal the hand-written offsets
spans = list(NLTKWordTokenizer().span_tokenize(s))
spans == expected

True

In [9]:
# Slicing the text `s` with each (start, end) span must give back the token
# strings. The reference list gets its own name so that `expected` keeps
# holding the span offsets and the span-comparison cell stays re-runnable.
expected_tokens = ['Good', 'muffins', 'cost', '$', '3.88', 'in',
                   'New', '(', 'York', ')', '.', 'Please', '(', 'buy', ')',
                   'me', 'two', 'of', 'them.', '(', 'Thanks', ')', '.']
span_tokens = [s[start:end] for start, end in NLTKWordTokenizer().span_tokenize(s)]
span_tokens == expected_tokens

True

### Testing improvement made to the TreebankWordTokenizer

In [10]:
# Guillemets (\xab and \xbb) must come out as separate opening/closing tokens.
# The reference list has a cell-specific name so it does not overwrite the
# `expected` span offsets that the span-tokenization cells compare against.
sx1 = '\xabNow that I can do.\xbb'
expected_sx1 = ['\xab', 'Now', 'that', 'I', 'can', 'do', '.', '\xbb']
word_tokenize(sx1) == expected_sx1

True

In [11]:
# Unicode curly quotes (U+201C / U+201D) must be split off as their own tokens.
# The reference list has a cell-specific name so it does not overwrite the
# `expected` span offsets that the span-tokenization cells compare against.
sx2 = 'The unicode 201C and 201D \u201cLEFT(RIGHT) DOUBLE QUOTATION MARK\u201d is also OPEN_PUNCT and CLOSE_PUNCT.'
expected_sx2 = ['The', 'unicode', '201C', 'and', '201D', '\u201c', 'LEFT', '(', 'RIGHT', ')', 'DOUBLE', 'QUOTATION', 'MARK', '\u201d', 'is', 'also', 'OPEN_PUNCT', 'and', 'CLOSE_PUNCT', '.']
word_tokenize(sx2) == expected_sx2

True

In [12]:
# Detokenizer instance, created with no arguments; the following cells call
# detokenizer.detokenize(...) on the output of word_tokenize.
detokenizer = TreebankWordDetokenizer()

In [13]:
# Round trip: detokenizing the word tokens should reproduce the sentence.
# A dedicated name is used instead of `s` so the multi-line text that the
# span-tokenization cells read is not overwritten.
mortgage_sentence = "On a $50,000 mortgage of 30 years at 8 percent, the monthly payment would be $366.88."
detokenizer.detokenize(word_tokenize(mortgage_sentence))

'On a $50,000 mortgage of 30 years at 8 percent, the monthly payment would be $366.88.'

In [14]:
# Round trip on quoted speech: the quote tokens should be turned back into
# plain double quotes. A dedicated name is used instead of `s` so the
# multi-line text that the span-tokenization cells read is not overwritten.
quote_sentence = "\"We beat some pretty good teams to get here,\" Slocum said."
detokenizer.detokenize(word_tokenize(quote_sentence))

'"We beat some pretty good teams to get here," Slocum said.'

In [15]:
# Round trip on a sentence with contractions, hyphenated words, quotes and
# parentheses. A dedicated name is used instead of `s` so the multi-line text
# that the span-tokenization cells read is not overwritten.
contraction_sentence = "Well, we couldn't have this predictable, cliche-ridden, \"Touched by an Angel\" (a show creator John Masius worked on) wanna-be if she didn't."
detokenizer.detokenize(word_tokenize(contraction_sentence))

'Well, we couldn\'t have this predictable, cliche-ridden, "Touched by an Angel" (a show creator John Masius worked on) wanna-be if she didn\'t.'

### TweetTokenizer
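TweetTokenizer is a tokenizer designed specifically for micro-blogging text such as tweets: as the examples below show, it keeps hashtags, @mentions, and emoticons together as single tokens.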



In [16]:
# Tweet tokenizer created with no arguments (library defaults); reused by the
# tweet examples in the following cells.
tknzr = TweetTokenizer()

In [18]:
# hashtag, emoticons (:-) :-P <3) and arrows (-> <--) each stay a single token
s0 = "This is a cooool #dummysmiley: :-) :-P <3 and some arrows < > -> <--"
s0_tokens = tknzr.tokenize(s0)
print(s0_tokens)

['This', 'is', 'a', 'cooool', '#dummysmiley', ':', ':-)', ':-P', '<3', 'and', 'some', 'arrows', '<', '>', '->', '<--']


In [19]:
# @mentions stay whole; the repeated '!!' is split into two '!' tokens
s1 = "@Joyster2012 @CathStaincliffe Good for you, girl!! Best wishes :-)"
s1_tokens = tknzr.tokenize(s1)
print(s1_tokens)

['@Joyster2012', '@CathStaincliffe', 'Good', 'for', 'you', ',', 'girl', '!', '!', 'Best', 'wishes', ':-)']


In [20]:
# hashtags, the trailing @handle and the ':)' emoticon are each one token
s2 = "3Points for #DreamTeam Gooo BAILEY! :) #PBB737Gold @PBBabscbn"
s2_tokens = tknzr.tokenize(s2)
print(s2_tokens)

['3Points', 'for', '#DreamTeam', 'Gooo', 'BAILEY', '!', ':)', '#PBB737Gold', '@PBBabscbn']
