# In this chapter, Bya will cover the following recipes:
1. Tokenizing text into sentences
2. Tokenizing sentences into words
3. Tokenizing sentences using regular expressions 
4. Training a sentence tokenizer
5. Filtering stopwords in a tokenized sentence
6. Looking up Synsets for a word in WordNet
7. Looking up lemmas and synonyms in WordNet 
8. Calculating WordNet Synset similarity
9. Discovering word collocations

# 1. Tokenizing text into sentences

In [1]:
# Sample paragraph made of three sentences.
para = "Hello World. It's good to see you. Thanks for buying this book."

# Import NLTK's sentence tokenizer function.
from nltk.tokenize import sent_tokenize

# Split the paragraph into a list of sentence strings.
sent_tokenize(para)

['Hello World.', "It's good to see you.", 'Thanks for buying this book.']

###  If tokenizing a lot of sentences

In [2]:
# When tokenizing many sentences, it is more efficient to load the
# PunktSentenceTokenizer once and call its tokenize() method instead.

import nltk.data

# Use a resource path relative to the NLTK data directories rather than a
# machine-specific absolute path, so the cell runs wherever the punkt data
# is installed.
tokenizer = nltk.data.load('tokenizers/punkt/PY3/english.pickle')
tokenizer.tokenize(para)

['Hello World.', "It's good to see you.", 'Thanks for buying this book.']

### Tokenizing sentences in other languages

In [4]:
# Load the Spanish Punkt tokenizer pickle (nltk.data was imported in an
# earlier cell) and split a Spanish text into sentences.
spanish_tokenizer = nltk.data.load('tokenizers/punkt/PY3/spanish.pickle')
spanish_tokenizer.tokenize('Hola amigo. Estoy bien.')

['Hola amigo.', 'Estoy bien.']

# 2. Tokenizing sentences into words

### Separating words using spaces and punctuation.

In [5]:
# word_tokenize splits a sentence into word tokens; the trailing period
# comes out as its own token.
from nltk.tokenize import word_tokenize

word_tokenize('Hello World.')

['Hello', 'World', '.']

In [6]:
# TreebankWordTokenizer produces the same tokens as word_tokenize for this input.
from nltk.tokenize import TreebankWordTokenizer

tokenizer = TreebankWordTokenizer()
print(tokenizer.tokenize('Hello World.'))

['Hello', 'World', '.']


### Separating contractions

In [12]:
# The Treebank tokenizer splits contractions into two tokens:
# "can't" becomes 'ca' and "n't".
from nltk.tokenize import TreebankWordTokenizer

tokenizer = TreebankWordTokenizer()
print(tokenizer.tokenize("can't"))

['ca', "n't"]


### WordPunctTokenizer

In [18]:
# WordPunctTokenizer splits all punctuation into separate tokens, so the
# apostrophe inside "Can't" becomes a token of its own.
from nltk.tokenize import WordPunctTokenizer

tokenizer = WordPunctTokenizer()
tokenizer.tokenize("Can't is a contraction.")

['Can', "'", 't', 'is', 'a', 'contraction', '.']

### WhitespaceTokenizer

In [23]:
# WhitespaceTokenizer splits on whitespace only (runs of spaces, tabs and
# newlines); punctuation stays attached to the neighbouring word.
from nltk.tokenize import WhitespaceTokenizer

tokenizer = WhitespaceTokenizer()
tokenizer.tokenize("Can't is a  contraction. \t hehe \n ho")

["Can't", 'is', 'a', 'contraction.', 'hehe', 'ho']

# 3. Tokenizing sentences using regular expressions

### RegexpTokenizer

In [26]:
# Simple usage of RegexpTokenizer: the pattern describes what a token looks
# like (runs of word characters and apostrophes), so "Can't" stays whole and
# the trailing period is dropped.
from nltk.tokenize import RegexpTokenizer

# Raw string: "\w" is not a valid Python string escape, so a plain literal
# triggers an invalid-escape warning on recent Python versions.
tokenizer = RegexpTokenizer(r"[\w']+")
tokenizer.tokenize("Can't is a contraction.")

["Can't", 'is', 'a', 'contraction']

In [25]:
# Same tokenization as RegexpTokenizer(r"[\w']+"), using the helper function
# instead of instantiating the class.
from nltk.tokenize import regexp_tokenize

# Raw string avoids the invalid "\w" escape warning; the pattern is unchanged.
regexp_tokenize("Can't is a contraction.", r"[\w']+")

["Can't", 'is', 'a', 'contraction']

### Simple whitespace tokenizer

In [None]:
# Whitespace tokenizer built from RegexpTokenizer: with gaps=True the pattern
# is meant to match the separators between tokens rather than the tokens
# themselves (see the NLTK RegexpTokenizer documentation).
from nltk.tokenize import RegexpTokenizer

# Raw string: "\s" is not a valid Python string escape, so a plain literal
# triggers an invalid-escape warning on recent Python versions.
tokenizer = RegexpTokenizer(r'\s+', gaps=True)
tokenizer.tokenize("Can't is a contraction.")