# Tokenizer

### Exploring the Brown Corpus




In [1]:
# Fetch the Brown corpus data, then list the names of the categories it contains.
import nltk 
nltk.download('brown')
from nltk.corpus import brown
brown.categories()

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\EMERITUS\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!


['adventure',
 'belles_lettres',
 'editorial',
 'fiction',
 'government',
 'hobbies',
 'humor',
 'learned',
 'lore',
 'mystery',
 'news',
 'religion',
 'reviews',
 'romance',
 'science_fiction']

# Exploring Tokenization

In [2]:
# Baseline: str.split() with no arguments breaks the sentence on whitespace,
# which is sufficient when every word is separated by plain spaces.
sentence = "The capital of Australia is Canberra"
sentence.split()

['The', 'capital', 'of', 'Australia', 'is', 'Canberra']

In [3]:
# Whitespace splitting does not separate the possessive: "Australia's" comes back as one token.
sentence = "Australia's capital is Canberra"
sentence.split()

["Australia's", 'capital', 'is', 'Canberra']

In [4]:
# The contraction "we'll" likewise stays a single token.
sentence = "Sydney is where we'll go"
sentence.split()

['Sydney', 'is', 'where', "we'll", 'go']

In [5]:
# The filler word "umm" is returned like any other token; split() does no filtering.
sentence = "Most of the times umm I travel"
sentence.split()

['Most', 'of', 'the', 'times', 'umm', 'I', 'travel']

In [6]:
# The two-word place name "Gold Coast" is broken into separate tokens ('Gold', 'Coast').
sentence = "Let's travel to Gold Coast from Brisbane"
sentence.split()

["Let's", 'travel', 'to', 'Gold', 'Coast', 'from', 'Brisbane']

In [7]:
# Punctuation stays glued to the neighbouring word ('place!!!', '#Awesome.'),
# while the space-delimited ':-P' and '<3' come through intact.
sentence = "Melbourne is a cool place!!! :-P <3 #Awesome."
sentence.split()

['Melbourne', 'is', 'a', 'cool', 'place!!!', ':-P', '<3', '#Awesome.']

# Regexp Tokenizer

In [8]:
from nltk.tokenize import RegexpTokenizer
s = "A Rolex watch costs in the range of $3000.0 - $8000.0 in USA.\n\n I want a book as well."
# The pattern has three alternatives:
#   \w+        a run of word characters
#   \$[\d\.]+  a dollar sign followed by digits/dots, so a price stays one token
#   \S+        any other run of non-whitespace (e.g. '-' or '.')
# It is written as a raw string because '\w', '\$', '\d' and '\S' are not valid
# Python string escapes; in a plain literal they trigger a DeprecationWarning
# (SyntaxWarning on Python 3.12+). The regex itself is unchanged.
tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')
tokenizer.tokenize(s)

['A',
 'Rolex',
 'watch',
 'costs',
 'in',
 'the',
 'range',
 'of',
 '$3000.0',
 '-',
 '$8000.0',
 'in',
 'USA',
 '.',
 'I',
 'want',
 'a',
 'book',
 'as',
 'well',
 '.']

![image.png](attachment:image.png)

# Blankline Tokenizer

In [9]:
from nltk.tokenize import BlanklineTokenizer
# This input contains no blank line, so the tokenizer returns the whole string as one element.
s = "A Rolex watch costs in the range of $3000.0 - $8000.0 in USA. I want a book as well."
tokenizer1 = BlanklineTokenizer()
tokenizer1.tokenize(s)

['A Rolex watch costs in the range of $3000.0 - $8000.0 in USA. I want a book as well.']

In [10]:
# Same text with "\n\n" inserted after the first sentence: the result now has two elements.
s = "A Rolex watch costs in the range of $3000.0 - $8000.0 in USA. \n\n I want a book as well."
tokenizer2 = BlanklineTokenizer()
tokenizer2.tokenize(s)

['A Rolex watch costs in the range of $3000.0 - $8000.0 in USA.',
 'I want a book as well.']

# WordPunct Tokenizer

In [11]:
from nltk.tokenize import WordPunctTokenizer
s = "A Rolex watch costs in the range of $3000.0 - $8000.0 in USA.\n\n I want a book as well."
# Punctuation is separated from alphanumeric runs, so "$3000.0" comes back
# as four tokens: '$', '3000', '.', '0'.
tokenizer = WordPunctTokenizer()
tokenizer.tokenize(s)

['A',
 'Rolex',
 'watch',
 'costs',
 'in',
 'the',
 'range',
 'of',
 '$',
 '3000',
 '.',
 '0',
 '-',
 '$',
 '8000',
 '.',
 '0',
 'in',
 'USA',
 '.',
 'I',
 'want',
 'a',
 'book',
 'as',
 'well',
 '.']

# TreebankWord Tokenizer

In [12]:
from nltk.tokenize import TreebankWordTokenizer
s = "I'm going to buy a Rolex watch which doesn't cost more than $3000.0.\n\n I want a book as well."
# Contractions are split into pieces ("I'm" -> 'I', "'m"; "doesn't" -> 'does', "n't")
# and '$' is separated from the amount. Note that the period ending the first
# sentence stays attached to the number ('3000.0.'); only the final '.' is split off.
tokenizer = TreebankWordTokenizer()
tokenizer.tokenize(s)

['I',
 "'m",
 'going',
 'to',
 'buy',
 'a',
 'Rolex',
 'watch',
 'which',
 'does',
 "n't",
 'cost',
 'more',
 'than',
 '$',
 '3000.0.',
 'I',
 'want',
 'a',
 'book',
 'as',
 'well',
 '.']

# Tweet Tokenizer

In [13]:
from nltk.tokenize import TweetTokenizer
s = "@sreerama I'm going to buy a Rolexxxxxxxx watch!!! :-D #so happy #rolex <3"
# With default settings the handle, the hashtags, "I'm", ':-D' and '<3' are each
# kept as a single token, while '!!!' becomes three separate '!' tokens.
tokenizer = TweetTokenizer()
tokenizer.tokenize(s)

['@sreerama',
 "I'm",
 'going',
 'to',
 'buy',
 'a',
 'Rolexxxxxxxx',
 'watch',
 '!',
 '!',
 '!',
 ':-D',
 '#so',
 'happy',
 '#rolex',
 '<3']

In [14]:
from nltk.tokenize import TweetTokenizer
s = "@sreerama I'm going to buy a Rolexxxxxxxx watch!!! :-D #so happy #rolex <3"
# strip_handles=True drops '@sreerama' from the result;
# reduce_len=True shortens the elongated 'Rolexxxxxxxx' to 'Rolexxx'.
tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)
tokenizer.tokenize(s)

["I'm",
 'going',
 'to',
 'buy',
 'a',
 'Rolexxx',
 'watch',
 '!',
 '!',
 '!',
 ':-D',
 '#so',
 'happy',
 '#rolex',
 '<3']

In [15]:
from nltk.tokenize import WordPunctTokenizer
s = "@sreerama I'm going to buy a Rolexxxxxxxx watch!!! :-D #so happy #rolex <3"
# For comparison with TweetTokenizer: on the same tweet WordPunctTokenizer breaks
# apart the handle ('@', 'sreerama'), the contraction, ':-D', the hashtags and '<3'.
tokenizer = WordPunctTokenizer()
tokenizer.tokenize(s)

['@',
 'sreerama',
 'I',
 "'",
 'm',
 'going',
 'to',
 'buy',
 'a',
 'Rolexxxxxxxx',
 'watch',
 '!!!',
 ':-',
 'D',
 '#',
 'so',
 'happy',
 '#',
 'rolex',
 '<',
 '3']

# Sentence Tokenizer

In [16]:
import nltk

# Fetch the 'punkt' resource before sentence-splitting (presumably what
# sent_tokenize relies on -- confirm against the installed NLTK version).
nltk.download('punkt')

text = "Backgammon is one of the oldest known board games. Its history can be traced back nearly 5,000 years to archeological discoveries in the Middle East. It is a two player game where each player has fifteen checkers which move between twenty-four points according to the roll of two dice."
sentences = nltk.sent_tokenize(text)

# Show one sentence per line, each followed by an empty line.
for sentence in sentences:
    print(sentence, end="\n\n")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\EMERITUS\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


Backgammon is one of the oldest known board games.

Its history can be traced back nearly 5,000 years to archeological discoveries in the Middle East.

It is a two player game where each player has fifteen checkers which move between twenty-four points according to the roll of two dice.



In [17]:
# Break every sentence into word-level tokens and print each token list
# followed by an empty line.
for sentence in sentences:
    words = nltk.word_tokenize(sentence)
    print(words, end="\n\n")

['Backgammon', 'is', 'one', 'of', 'the', 'oldest', 'known', 'board', 'games', '.']

['Its', 'history', 'can', 'be', 'traced', 'back', 'nearly', '5,000', 'years', 'to', 'archeological', 'discoveries', 'in', 'the', 'Middle', 'East', '.']

['It', 'is', 'a', 'two', 'player', 'game', 'where', 'each', 'player', 'has', 'fifteen', 'checkers', 'which', 'move', 'between', 'twenty-four', 'points', 'according', 'to', 'the', 'roll', 'of', 'two', 'dice', '.']



# Example 1

In [18]:
import nltk

nltk.download('punkt')

# The Punkt tokenizer divides a text into a list of sentences. It uses an
# unsupervised algorithm to build a model of abbreviations, collocations,
# and words that start sentences.
from nltk.tokenize import (
    sent_tokenize,
    word_tokenize,
    WordPunctTokenizer,
)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\EMERITUS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [19]:
# Define input text
input_text = "Do you know how tokenization works? It's actually quite interesting! Let's analyze a couple of sentences and figure it out."


In [20]:
# Sentence tokenizer: the sample text comes back as a list of three sentences,
# split after the '?', the '!' and the final '.'.
print("\nSentence tokenizer:")
print(sent_tokenize(input_text))


Sentence tokenizer:
['Do you know how tokenization works?', "It's actually quite interesting!", "Let's analyze a couple of sentences and figure it out."]


In [21]:
# WordPunct tokenizer: every punctuation mark becomes its own token, so
# "It's" and "Let's" are each split into three tokens (word, "'", 's').
print("\nWord punct tokenizer:")
print(WordPunctTokenizer().tokenize(input_text))


Word punct tokenizer:
['Do', 'you', 'know', 'how', 'tokenization', 'works', '?', 'It', "'", 's', 'actually', 'quite', 'interesting', '!', 'Let', "'", 's', 'analyze', 'a', 'couple', 'of', 'sentences', 'and', 'figure', 'it', 'out', '.']
