In [2]:
# Notebook setup: IPython display configuration plus the NLTK imports used by the cells below.
from IPython.core.interactiveshell import InteractiveShell
# "all" asks IPython to echo every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"
import nltk
from nltk import word_tokenize  # NOTE(review): not referenced in the cells visible here

In [3]:
# Groucho Marx's "I shot an elephant in my pajamas": a toy grammar in which the
# prepositional phrase can attach either to the verb phrase or to the noun phrase.
groucho_grammar = nltk.CFG.fromstring("""
S -> NP VP
PP -> P NP
NP -> Det N | Det N PP | 'I'
VP -> V NP | VP PP
Det -> 'an' | 'my'
N -> 'elephant' | 'pajamas'
V -> 'shot'
P -> 'in'
""")
parser = nltk.ChartParser(groucho_grammar)
sent = "I shot an elephant in my pajamas".split()
# One tree is printed per reading the grammar allows; this sentence has two.
for tree in parser.parse(sent):
    print(tree)

(S
  (NP I)
  (VP
    (VP (V shot) (NP (Det an) (N elephant)))
    (PP (P in) (NP (Det my) (N pajamas)))))
(S
  (NP I)
  (VP
    (V shot)
    (NP (Det an) (N elephant) (PP (P in) (NP (Det my) (N pajamas))))))


In [5]:
# A small hand-written context-free grammar: nltk.CFG lets us define our own
# productions directly from a string.
grammar1 = nltk.CFG.fromstring("""
  S -> NP VP
  VP -> V NP | V NP PP
  PP -> P NP
  V -> "saw" | "ate" | "walked"
  NP -> "John" | "Mary" | "Bob" | Det N | Det N PP
  Det -> "a" | "an" | "the" | "my"
  N -> "man" | "dog" | "cat" | "telescope" | "park"
  P -> "in" | "on" | "by" | "with"
  """)
rd_parser = nltk.RecursiveDescentParser(grammar1)
sent = ["Mary", "saw", "Bob"]
# Print every parse the recursive descent parser finds for the sentence.
for tree in rd_parser.parse(sent):
    print(tree)

(S (NP Mary) (VP (V saw) (NP Bob)))


In [None]:
# A grammar is said to be recursive if a category occurring on the left-hand side
# of a production also appears on its right-hand side,
# like Nom -> Adj Nom | N below.
# NOTE(review): grammar2 is only defined here; no cell visible in this section parses with it.
grammar2 = nltk.CFG.fromstring("""
  S  -> NP VP
  NP -> Det Nom | PropN
  Nom -> Adj Nom | N     
  VP -> V Adj | V NP | V S | V NP PP
  PP -> P NP
  PropN -> 'Buster' | 'Chatterer' | 'Joe'
  Det -> 'the' | 'a'
  N -> 'bear' | 'squirrel' | 'tree' | 'fish' | 'log'
  Adj  -> 'angry' | 'frightened' |  'little' | 'tall'
  V ->  'chased'  | 'saw' | 'said' | 'thought' | 'was' | 'put'
  P -> 'on'
  """)

In [6]:
# Parsing with a context-free grammar.
# Two simple parsing algorithms are covered: recursive descent and shift-reduce.
#
# Recursive descent parsing:
# the top-level goal is to find an S. The production S -> NP VP lets the parser
# replace that goal with two subgoals -- find an NP, then find a VP -- and each
# of these can in turn be replaced by sub-subgoals.
sent = ["Mary", "saw", "a", "dog"]
rd_parser = nltk.RecursiveDescentParser(grammar1)
for tree in rd_parser.parse(sent):
    print(tree)

(S (NP Mary) (VP (V saw) (NP (Det a) (N dog))))


In [7]:
# Recursive descent parsing has three key shortcomings:
#   1. Left-recursive productions like NP -> NP PP send it into an infinite loop.
#   2. The parser wastes a lot of time considering words and structures that do
#      not correspond to the input sentence.
#   3. Backtracking may discard parsed constituents that have to be rebuilt later.
# Top-down parsers use the grammar to predict the input before inspecting it.
#
# A better approach is bottom-up parsing, such as a shift-reduce parser.
# It looks for sequences of words and phrases that match the right-hand side of
# a grammar production and replaces them with the left-hand side, until the
# whole sentence has been reduced to an S.
#
# A shift-reduce parser can reach a dead end and fail to find any parse: when
# that happens, no input remains and the stack holds items that cannot be
# reduced to an S.
sent = ["Mary", "saw", "a", "dog"]
sr_parser = nltk.ShiftReduceParser(grammar1)
for tree in sr_parser.parse(sent):
    print(tree)

(S (NP Mary) (VP (V saw) (NP (Det a) (N dog))))


In [None]:
# The left-corner parser
# This is a hybrid of the top-down and bottom-up approaches we have seen above.
