# In this chapter, Bya will cover the following recipes:
1. Tokenizing text into sentences
2. Tokenizing sentences into words
3. Tokenizing sentences using regular expressions 
4. Training a sentence tokenizer
5. Filtering stopwords in a tokenized sentence
6. Looking up Synsets for a word in WordNet
7. Looking up lemmas and synonyms in WordNet 
8. Calculating WordNet Synset similarity
9. Discovering word collocations

# 1. Tokenizing text into sentences

In [2]:
# paragraph
para = "Hello World. It's good to see you. This is the Mr.Bya's seminar code."

# import sentence tokenizer
from nltk.tokenize import sent_tokenize

# tokenize paragraph
sent_tokenize(para)

['Hello World.', "It's good to see you.", "This is the Mr.Bya's seminar code."]

###  If tokenizing a lot of sentences

In [2]:
#  it's more efficient to load the PunktSentenceTokenizer class once, 
# and call its tokenize() method instead:

import nltk.data

tokenizer = nltk.data.load('/Users/Bya/nltk_data/tokenizers/punkt/PY3/english.pickle')
tokenizer.tokenize(para)

['Hello World.', "It's good to see you.", 'Thanks for buying this book.']

### Tokenizing sentences in other languages

In [4]:
spanish_tokenizer = nltk.data.load('tokenizers/punkt/PY3/spanish.pickle')
spanish_tokenizer.tokenize('Hola amigo. Estoy bien.')

['Hola amigo.', 'Estoy bien.']

# 2. Tokenizing sentences into words

### Separating words using spaces and punctuation.

In [3]:
from nltk.tokenize import word_tokenize

word_tokenize("Hello World can't this.")

['Hello', 'World', 'ca', "n't", 'this', '.']

### TreebankWordTokenizer

In [20]:
# import word tokenizer
from nltk.tokenize import TreebankWordTokenizer

tokenizer = TreebankWordTokenizer()
tokenizer.tokenize("Hello World. Mr.Bya can't do \t this.")

['Hello', 'World.', 'Mr.Bya', 'ca', "n't", 'do', 'this', '.']

### Separating contractions

In [12]:
from nltk.tokenize import TreebankWordTokenizer

tokenizer = TreebankWordTokenizer()
print(tokenizer.tokenize("can't"))

['ca', "n't"]


### WordPunctTokenizer

In [10]:
# all punctuation into separate tokens
from nltk.tokenize import WordPunctTokenizer

tokenizer = WordPunctTokenizer()
tokenizer.tokenize("Hello World. \n Bya can't do \t this.")

['Hello', 'World', '.', 'Bya', 'can', "'", 't', 'do', 'this', '.']

### WhitespaceTokenizer

In [9]:
# all punctuation into separate tokens
from nltk.tokenize import WhitespaceTokenizer

tokenizer = WhitespaceTokenizer()
tokenizer.tokenize("Hello World. \n Bya can't do \t this.")

['Hello', 'World.', 'Bya', "can't", 'do', 'this.']

# 3. Tokenizing sentences using regular expressions

### RegexTokenizer

In [13]:
from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer("[\w']+")
tokenizer.tokenize("Hello World. \n Bya can't do \t this.")

['Hello', 'World', 'Bya', "can't", 'do', 'this']

In [26]:
# simple usage of RegexpTokenizer
from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer("[\w']+")
tokenizer.tokenize("Can't is a contraction.")

["Can't", 'is', 'a', 'contraction']

In [25]:
# same as above
from nltk.tokenize import regexp_tokenize

regexp_tokenize("Can't is a contraction.", "[\w']+")

["Can't", 'is', 'a', 'contraction']

### Simple whitespace tokenizer

In [12]:
from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer('\s+', gaps=True)
tokenizer.tokenize("Hello World. \n Bya can't do \t this.")

['Hello', 'World.', 'Bya', "can't", 'do', 'this.']

In [29]:
tokenizer = RegexpTokenizer("a", gaps=True)
tokenizer.tokenize("Can't is a contraction.")

['C', "n't is ", ' contr', 'ction.']

### ￼￼Training a sentence tokenizer

In [30]:
from nltk.tokenize import PunktSentenceTokenizer
from nltk.corpus import webtext
text = webtext.raw('overheard.txt')
sent_tokenizer = PunktSentenceTokenizer(text)

In [33]:
sents1 = sent_tokenizer.tokenize(text)

from nltk.tokenize import sent_tokenize
sents2 = sent_tokenize(text)

In [34]:
sents1[0]

'White guy: So, do you have any plans for this evening?'

In [35]:
sents2[0]

'White guy: So, do you have any plans for this evening?'

In [36]:
sents1[678]

'Girl: But you already have a Big Mac...'

In [37]:
sents2[678]

'Girl: But you already have a Big Mac...\nHobo: Oh, this is all theatrical.'

### More

In [39]:
with open('/Users/Bya/nltk_data/corpora/webtext/overheard.txt', encoding='ISO-8859-2') as f:
    text = f.read()
sent_tokenizer = PunktSentenceTokenizer(text)
sents = sent_tokenizer.tokenize(text)

In [40]:
sents[0]

'White guy: So, do you have any plans for this evening?'

In [41]:
sents[678]

'Girl: But you already have a Big Mac...'

# 4. Filtering stopwords in a tokenized sentence

### Filtering Stopwords

In [16]:
# import english stopwords
from nltk.corpus import stopwords

# defining stopwords
english_stops = set(stopwords.words('english'))

# filtering stopwords
words = ["Can't", 'is', 'a', 'contraction']
[word for word in words if word not in english_stops]

["Can't", 'contraction']

### Add word to Stopwords

In [17]:
# import english stopwords
from nltk.corpus import stopwords

# defining stopwords
english_stops = set(stopwords.words('english'))

# add word to stopwords
english_stops_added = english_stops | {'contraction'}

words = ["Can't", 'is', 'a', 'contraction']
[word for word in words if word not in english_stops_added]

["Can't"]

# 5. Looking up Synsets for a word in WordNet

In [133]:
# importing Wordnet
from nltk.corpus import wordnet

# look up the Synset for 'cookbook'
syn = wordnet.synsets('cookbook')[0]

print("Name: %s \n" % syn.name())
print("Definition: %s \n" % syn.definition())
print("Examples: %s \n" % syn.examples())

Name: cookbook.n.01 

Definition: a book of recipes and cooking directions 

Examples: [] 



### Working with hypernyms

In [121]:
syn = wordnet.synsets('cookbook')[0]

print("Hypernyms of %s: \n\n \t %s" % (syn, syn.hypernyms()))

Hypernyms of Synset('cookbook.n.01'): 

 	 [Synset('reference_book.n.01')]


In [126]:
print("Hyponyms of %s:" % (syn.hypernyms()[0]))
syn.hypernyms()[0].hyponyms()

Hyponyms of Synset('reference_book.n.01'):


[Synset('annual.n.02'),
 Synset('atlas.n.02'),
 Synset('cookbook.n.01'),
 Synset('directory.n.01'),
 Synset('encyclopedia.n.01'),
 Synset('handbook.n.01'),
 Synset('instruction_book.n.01'),
 Synset('source_book.n.01'),
 Synset('wordbook.n.01')]

In [127]:
# root of 'cookbook'
syn.root_hypernyms()

[Synset('entity.n.01')]

In [134]:
# entire path of 'cookbook'
# hypernym_paths() method returns a list of lists
syn.hypernym_paths()

[[Synset('entity.n.01'),
  Synset('physical_entity.n.01'),
  Synset('object.n.01'),
  Synset('whole.n.02'),
  Synset('artifact.n.01'),
  Synset('creation.n.02'),
  Synset('product.n.02'),
  Synset('work.n.02'),
  Synset('publication.n.01'),
  Synset('book.n.01'),
  Synset('reference_book.n.01'),
  Synset('cookbook.n.01')]]

### Part Of Speech (POS)

In WordNet 4 types of POS
* n - Noun
* a - Adjective
* r - Adverb
* v - Verb

In [136]:
syn = wordnet.synsets('cookbook')[0]
syn.pos()

'n'

In [138]:
print(len(wordnet.synsets('great')))
print(len(wordnet.synsets('great', pos='n')))
print(len(wordnet.synsets('great', pos='a')))

7
1
6


# 6. Looking up Lemmas and Synonyms in WordNet

In [139]:
from nltk.corpus import wordnet

syn = wordnet.synsets('cookbook')[0]

[lemma.name() for lemma in syn.lemmas()]

['cookbook', 'cookery_book']

### All possible synonyms

In [141]:
synonyms = []
for syn in wordnet.synsets('book'):
    for lemma in syn.lemmas():
        synonyms.append(lemma.name())

print(len(synonyms))
print(len(set(synonyms)))

38
25


### Antonyms

In [143]:
gn2 = wordnet.synset('good.n.02')
print(gn2.definition())

evil = gn2.lemmas()[0].antonyms()[0]
print(evil.name)

print(evil.synset().definition())

ga1 = wordnet.synset('good.a.01')
print(ga1.definition())

bad = ga1.lemmas()[0].antonyms()[0]
print(bad.name())

print(bad.synset().definition())

moral excellence or admirableness
<bound method Lemma.name of Lemma('evil.n.03.evil')>
the quality of being morally wrong in principle or practice
having desirable or positive qualities especially those suitable for a thing specified
bad
having undesirable or negative qualities


# 7. Calculating WordNet Synset similarity

In [144]:
from nltk.corpus import wordnet

cb = wordnet.synset('cookbook.n.01')
ib = wordnet.synset('instruction_book.n.01')

cb.wup_similarity(ib)

0.9166666666666666

In [146]:
# Comparing Verbs
cook = wordnet.synset('cook.v.01')
bake = wordnet.synset('bake.v.02')
cook.wup_similarity(bake)

0.6666666666666666

# 8. Discovering Word Collocations

### BigramCollocationFinder

In [152]:
from nltk.corpus import webtext
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

words = [w.lower() for w in webtext.words('grail.txt')]
bcf = BigramCollocationFinder.from_words(words)
bcf.nbest(BigramAssocMeasures.likelihood_ratio, 4)

[("'", 's'), ('arthur', ':'), ('#', '1'), ("'", 't')]

In [158]:
from nltk.corpus import stopwords
stopset = set(stopwords.words('english'))
filter_stops = lambda w: len(w) < 3 or w in stopset
bcf.apply_word_filter(filter_stops)
bcf.nbest(BigramAssocMeasures.likelihood_ratio, 10)

[('black', 'knight'),
 ('clop', 'clop'),
 ('head', 'knight'),
 ('mumble', 'mumble'),
 ('squeak', 'squeak'),
 ('saw', 'saw'),
 ('holy', 'grail'),
 ('run', 'away'),
 ('french', 'guard'),
 ('cartoon', 'character')]

### TrigramCollocationFinder 

In [160]:
from nltk.collocations import TrigramCollocationFinder
from nltk.metrics import TrigramAssocMeasures
words = [w.lower() for w in webtext.words('singles.txt')]
tcf = TrigramCollocationFinder.from_words(words)
tcf.apply_word_filter(filter_stops)
tcf.apply_freq_filter(3)
tcf.nbest(TrigramAssocMeasures.likelihood_ratio, 4)

[('long', 'term', 'relationship')]