# Shallow Parsing - Chunking

In [1]:
# Do imports
import nltk

In [2]:
# Example sentence used for the first chunking walkthrough
data = "I prefer a morning flight."


In [3]:
# Prepare data: split the sentence into word tokens, then POS-tag each token
tokens = nltk.word_tokenize(data)
tag = nltk.pos_tag(tokens)  # list of (word, tag) pairs, as the printed output shows
print(tag)

[('I', 'PRP'), ('prefer', 'VBP'), ('a', 'DT'), ('morning', 'NN'), ('flight', 'NN'), ('.', '.')]


In [4]:
# Chunk grammar for noun phrases (NP)
## Reference: https://www.nltk.org/book/ch07.html
# Pattern: an optional determiner (DT), then any number of adjectives (JJ),
# then exactly one noun (NN)
grammar = "NP: {<DT>?<JJ>*<NN>}"
cp = nltk.RegexpParser(grammar)

In [5]:
# Parse based on regex: run the chunk parser over the POS-tagged sentence.
# Printing the result shows a bracketed tree with the matched NP chunks.
result = cp.parse(tag)
print(result)

(S I/PRP prefer/VBP (NP a/DT morning/NN) (NP flight/NN) ./.)


In [6]:
# Another grammar: treat two adjacent nouns as a single NP chunk
## Reference: https://www.nltk.org/book/ch07.html
# The text after '#' inside the grammar string is a rule description in NLTK's
# chunk-grammar syntax, not part of the tag pattern.
grammar = "NP: {<NN><NN>}  # Chunk two consecutive nouns"
cp = nltk.RegexpParser(grammar)

In [7]:
# Parse based on regex: re-chunk the same tagged sentence with the parser
# built in the previous cell, so the two grammars can be compared.
result = cp.parse(tag)
print(result)

(S I/PRP prefer/VBP a/DT (NP morning/NN flight/NN) ./.)


## Another example

In [8]:

# Second sentence: the word "permit" occurs twice, so it is a useful test of
# how the tagger and the chunker treat each occurrence.
data = "The authority did not permit giving of fishing permit."
tokens = nltk.word_tokenize(data)
print(tokens)


['The', 'authority', 'did', 'not', 'permit', 'giving', 'of', 'fishing', 'permit', '.']


In [9]:
# POS-tag the tokens; in the output the first "permit" is tagged VB and the
# second NN, so only the second can match an NP pattern that ends in <NN>.
tag = nltk.pos_tag(tokens)
print(tag)


[('The', 'DT'), ('authority', 'NN'), ('did', 'VBD'), ('not', 'RB'), ('permit', 'VB'), ('giving', 'VBG'), ('of', 'IN'), ('fishing', 'VBG'), ('permit', 'NN'), ('.', '.')]


In [10]:
# Rebuild the NP chunker: optional determiner (DT), any number of adjectives (JJ),
# then a single noun (NN)
grammar = "NP: {<DT>?<JJ>*<NN>}"
cp = nltk.RegexpParser(grammar)

# Chunk the tagged sentence and show the resulting tree as text
result = cp.parse(tag)
print(result)


(S
  (NP The/DT authority/NN)
  did/VBD
  not/RB
  permit/VB
  giving/VBG
  of/IN
  fishing/VBG
  (NP permit/NN)
  ./.)


In [11]:
# Draw the chunk tree graphically so the noun-phrase chunks can be seen.
# Tree.draw() opens a separate Tk window, which needs tkinter and a display.
# On a headless kernel it raises instead of drawing, so fall back to a text
# rendering of the same tree to keep "Restart & Run All" working.
try:
    result.draw()
except Exception as err:  # e.g. ImportError (no tkinter) or TclError (no display)
    print("Graphical tree view unavailable (%r); showing a text tree instead." % (err,))
    result.pretty_print()

In [12]:
# Do for a running example
# Re-run the first sentence through tokenize -> POS-tag -> chunk. The parser `cp`
# is not rebuilt here: it is whichever RegexpParser the most recent grammar cell
# above created, so that cell must have been executed first.
data = "I prefer a morning flight."
tokens = nltk.word_tokenize(data)
tag = nltk.pos_tag(tokens)
result = cp.parse(tag)
print(result)

(S I/PRP prefer/VBP (NP a/DT morning/NN) (NP flight/NN) ./.)


In [13]:
# Show the chunk tree for the running example graphically.
# Tree.draw() needs tkinter and a display; if either is missing the call raises,
# so print a text rendering of the tree instead of stopping the notebook.
try:
    result.draw()
except Exception as err:  # e.g. ImportError (no tkinter) or TclError (no display)
    print("Graphical tree view unavailable (%r); showing a text tree instead." % (err,))
    result.pretty_print()