## Tokenization of text into sentences

http://www.nltk.org/api/nltk.tokenize.html?highlight=regexp

In [13]:
from nltk import sent_tokenize

# Split a paragraph into sentences with sent_tokenize.
# The call returns a list with one string per sentence; each sentence keeps
# its terminating punctuation.
text = "Welcome readers. I hope you find it interesting. Please do reply."
sent_tokenize(text)

['Welcome readers.', 'I hope you find it interesting.', 'Please do reply.']

In [15]:
# To tokenize a large number of sentences, load the English Punkt sentence
# tokenizer once and reuse its tokenize() method.
# NOTE(review): recent NLTK releases appear to distribute Punkt data as
# 'punkt_tab' rather than pickles -- confirm this resource path still loads
# on the installed version.
import nltk
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
# The last sentence has no closing period and is still returned as a sentence.
text = "Hello everyone. Hope all are fine and doing well. Hope you find the book interesting"
tokenizer.tokenize(text)

['Hello everyone.',
 'Hope all are fine and doing well.',
 'Hope you find the book interesting']

## Tokenization of text in other languages

In [17]:
import nltk
# Sentence tokenization for another language: load the German Punkt model
# instead of the English one and call tokenize() the same way.
german_tokenizer = nltk.data.load('tokenizers/punkt/german.pickle')
german_tokenizer.tokenize('Hallo! Guten Tag! Wie geht es dir?')

['Hallo!', 'Guten Tag!', 'Wie geht es dir?']

## Tokenization of sentences into words

In [19]:
from nltk import word_tokenize

# Split a sentence into word and punctuation tokens. Commas become tokens of
# their own, while the abbreviation "Nov." keeps its period.
word_tokenize('Pierre Vinkin, 59 years old, will join as a nonexecutive director on Nov. 29')

['Pierre',
 'Vinkin',
 ',',
 '59',
 'years',
 'old',
 ',',
 'will',
 'join',
 'as',
 'a',
 'nonexecutive',
 'director',
 'on',
 'Nov.',
 '29']

## Tokenization using TreebankWordTokenizer

In [28]:
from nltk.tokenize import TreebankWordTokenizer
# Instantiate the tokenizer class directly; note this rebinds `tokenizer`.
tokenizer = TreebankWordTokenizer()
# Punctuation such as "!" and the final "." comes back as separate tokens.
tokenizer.tokenize('Have a nice day! I hope you find the book interesting.')

['Have',
 'a',
 'nice',
 'day',
 '!',
 'I',
 'hope',
 'you',
 'find',
 'the',
 'book',
 'interesting',
 '.']

In [31]:
# word_tokenize separates contractions: "Don't" is split into 'Do' and "n't".
# Relies on `import nltk` having been run in an earlier cell.
nltk.word_tokenize("Don't hesitate to ask questions")

['Do', "n't", 'hesitate', 'to', 'ask', 'questions']

In [34]:
# WordPunctTokenizer splits on punctuation: every punctuation run becomes its
# own token, so "Don't" is broken into 'Don', "'", and 't'.
from nltk.tokenize import WordPunctTokenizer
tokenizer = WordPunctTokenizer()
tokenizer.tokenize("Don't hesitate to ask questions")

['Don', "'", 't', 'hesitate', 'to', 'ask', 'questions']

## Tokenization using Regular Expressions

In [39]:
from nltk.tokenize import RegexpTokenizer
# Keep only runs of word characters; punctuation (the apostrophe) is dropped.
# The pattern is a raw string so that "\w" reaches the regex engine as-is
# instead of being treated as an (invalid) string escape sequence.
tokenizer = RegexpTokenizer(r'\w+')
tokenizer.tokenize("Don't hesitate to ask questions")

['Don', 't', 'hesitate', 'to', 'ask', 'questions']

In [41]:
# Instead of instantiating a class, an alternative is the regexp_tokenize function.
from nltk.tokenize import regexp_tokenize
# Pattern alternatives, tried in order: a run of word characters, a dollar
# amount such as "$12.50", or any other run of non-whitespace characters.
# Written as a raw string so the backslashes are passed to the regex engine
# unchanged rather than parsed as (invalid) string escapes.
regexp_tokenize("Don't hesitate to ask questions", r'\w+|\$[\d\.]+|\S+')

['Don', "'t", 'hesitate', 'to', 'ask', 'questions']

In [43]:
# Tokenize on whitespace: with gaps=True the pattern describes the separators
# between tokens rather than the tokens themselves.
# Relies on RegexpTokenizer having been imported in an earlier cell.
# Raw string keeps "\s" from being read as an (invalid) string escape.
tokenizer = RegexpTokenizer(r'\s+', gaps=True)
tokenizer.tokenize("Don't hesitate to ask questions")

["Don't", 'hesitate', 'to', 'ask', 'questions']

In [45]:
# Select words that start with a capital letter.
# The pattern needs at least one word character after the capital, so a
# one-letter word such as "X" is not selected.
# Relies on RegexpTokenizer having been imported in an earlier cell.
# Raw string keeps "\w" from being read as an (invalid) string escape.
tokenizer = RegexpTokenizer(r'[A-Z]\w+')
tokenizer.tokenize("She secured 90.56% in class X. She is a meritorious student")

['She', 'She']

In [49]:
# Use a tokenizer with a predefined regular expression: BlanklineTokenizer
# treats blank lines as the delimiter. The sample text contains single line
# breaks but no blank line, so it comes back as one token.
from nltk.tokenize import BlanklineTokenizer
tokenizer = BlanklineTokenizer()
tokenizer.tokenize("""She secured 90.56% in class X. 
She is a meritorious student.
""")

['She secured 90.56% in class X. \nShe is a meritorious student.\n']

In [52]:
from nltk.tokenize import WhitespaceTokenizer
# Split on whitespace only; punctuation stays attached to its word
# (e.g. '90.56%', 'X.', 'student.').
tokenizer = WhitespaceTokenizer()
tokenizer.tokenize("She secured 90.56% in class X. She is a meritorious student.")

['She',
 'secured',
 '90.56%',
 'in',
 'class',
 'X.',
 'She',
 'is',
 'a',
 'meritorious',
 'student.']