# Constituency-Based Parsing

In [None]:
# -*- coding: utf-8 -*-
"""
Created on Tue Aug 16 06:55:49 2016

@author: DIP
"""

sentence = 'The brown fox is quick and he is jumping over the lazy dog'

# set java path
# JAVAHOME is set to the java executable on this machine before the Stanford
# parser wrapper is created.
import os
java_path = r'C:\Program Files\Java\jdk1.8.0_144\bin\java.exe'
os.environ['JAVAHOME'] = java_path

from nltk.parse.stanford import StanfordParser

# Directory of the unpacked Stanford parser release and the version embedded
# in its models jar file name. Both jars live in the same directory, so the
# location is stated once. To use another release, change these two values
# (e.g. the 2015-04-20 release under E:/stanford/ ships
# stanford-parser-3.5.2-models.jar).
stanford_parser_dir = 'C:/Software/StanfordNLP/stanford-parser-full-2016-10-31'
stanford_models_version = '3.7.0'

scp = StanfordParser(
    path_to_jar=stanford_parser_dir + '/stanford-parser.jar',
    path_to_models_jar=(stanford_parser_dir + '/stanford-parser-'
                        + stanford_models_version + '-models.jar'))

# materialise the parses into a list so the first tree can be indexed here
# and reused by the following cell
result = list(scp.raw_parse(sentence))
# print() with a single argument behaves the same on Python 2 and Python 3
print(result[0])


In [2]:
# visualise the first parse tree in `result` (the Stanford parser output)
# NOTE(review): draw() presumably opens a separate GUI window and so needs a
# desktop session -- confirm before running headless
result[0].draw()


In [3]:
import nltk
from nltk.grammar import Nonterminal
from nltk.corpus import treebank

# parsed (tree-annotated) sentences of the NLTK treebank corpus; used below
# as the training data for the grammar
training_set = treebank.parsed_sents()

# show one annotated sentence as a bracketed tree
# print() with a single argument behaves the same on Python 2 and Python 3
print(training_set[1])


(S
  (NP-SBJ (NNP Mr.) (NNP Vinken))
  (VP
    (VBZ is)
    (NP-PRD
      (NP (NN chairman))
      (PP
        (IN of)
        (NP
          (NP (NNP Elsevier) (NNP N.V.))
          (, ,)
          (NP (DT the) (NNP Dutch) (VBG publishing) (NN group))))))
  (. .))


In [4]:
# collect every distinct production rule used by the annotated training trees
treebank_productions = list({
    rule
    for parsed_sent in training_set
    for rule in parsed_sent.productions()
})

# peek at the first ten rules
treebank_productions[:10]
  


[VBZ -> 'cites',
 VBD -> 'spurned',
 PRN -> , ADVP-TMP ,,
 NNP -> 'ACCOUNT',
 JJ -> '36-day',
 NP-SBJ-2 -> NN,
 JJ -> 'unpublished',
 NP-SBJ-1 -> NNP,
 JJ -> 'elusive',
 NNS -> 'Lids']

In [5]:
# add one lexical production (POS tag -> word) for each tagged token in the
# treebank corpus; a production is appended per token occurrence, so repeated
# (word, tag) pairs are kept as duplicates.
#
# The production is built directly rather than by formatting "(TAG word)" and
# re-parsing it with Tree.fromstring: the string round-trip is slower and
# would mis-parse any token that contains brackets or whitespace.
# NOTE: this extends treebank_productions in place, so re-running the cell
# appends the same productions again.
treebank_productions.extend(
    nltk.grammar.Production(Nonterminal(tag), [word])
    for word, tag in treebank.tagged_words()
)


In [6]:
# induce a PCFG with start symbol S from the collected productions
treebank_grammar = nltk.grammar.induce_pcfg(
    start=Nonterminal('S'),
    productions=treebank_productions,
)


In [7]:
# construct a Viterbi parser over the induced grammar
viterbi_parser = nltk.ViterbiParser(grammar=treebank_grammar)


In [8]:
# split the sample sentence into word tokens
tokens = nltk.word_tokenize(text=sentence)


In [9]:
# get parse tree for sample sentence
# This first attempt is expected to fail: the grammar built from the treebank
# does not contain some of the sample sentence's words, and the parser raises
# ValueError. The error is caught and displayed so that the notebook can still
# be executed top to bottom; `result` keeps its previous value in that case.
try:
    result = list(viterbi_parser.parse(tokens))
except ValueError as err:
    print('ValueError: %s' % err)


ValueError: Grammar does not cover some of the input words: u"'brown', 'fox', 'lazy', 'dog'".

Unfortunately, we get an error when we try to parse our sample sentence tokens
with our newly built parser. The reason is quite clear from the error: some of the words
in our sample sentence are not covered by the treebank-based grammar because they
are not present in our treebank corpus. Because this constituency-based grammar
uses POS tags and phrase tags to build the tree based on the training data, we will add the
tokens and POS tags for our sample sentence to our grammar and rebuild the parser:

In [10]:
# get tokens and their POS tags
# pattern.en's `tag` function is imported under the alias `pos_tagger`
from pattern.en import tag as pos_tagger
# tag the raw sentence string; the result is consumed later as (word, tag) pairs
tagged_sent = pos_tagger(sentence)


In [11]:
# show the (word, POS tag) pairs produced for the sample sentence
# print() with a single argument behaves the same on Python 2 and Python 3
print(tagged_sent)


[(u'The', u'DT'), (u'brown', u'JJ'), (u'fox', u'NN'), (u'is', u'VBZ'), (u'quick', u'JJ'), (u'and', u'CC'), (u'he', u'PRP'), (u'is', u'VBZ'), (u'jumping', u'VBG'), (u'over', u'IN'), (u'the', u'DT'), (u'lazy', u'JJ'), (u'dog', u'NN')]


In [12]:
# extend productions for sample sentence tokens
# One lexical production (POS tag -> word) is appended per tagged token so
# that a grammar rebuilt from treebank_productions covers every word of the
# sample sentence.
#
# The production is built directly rather than by formatting "(TAG word)" and
# re-parsing it with Tree.fromstring, which would mis-parse a token containing
# brackets or whitespace. This form also removes the mixed tab/space
# indentation of the original loop, which Python 3 rejects with a TabError.
# NOTE: this extends treebank_productions in place, so re-running the cell
# appends the same productions again.
treebank_productions.extend(
    nltk.grammar.Production(Nonterminal(tag), [word])
    for word, tag in tagged_sent
)


In [13]:
# rebuild grammar
# treebank_productions now also holds the sample sentence's lexical
# productions, so the induced PCFG covers all of its words
treebank_grammar = nltk.grammar.induce_pcfg(Nonterminal('S'),
                                            treebank_productions)

# rebuild parser
viterbi_parser = nltk.ViterbiParser(treebank_grammar)

# get parse tree for sample sentence
result = list(viterbi_parser.parse(tokens))

# print() with a single argument behaves the same on Python 2 and Python 3
print(result[0])


(S
  (NP-SBJ-163 (DT The) (JJ brown) (NN fox))
  (VP
    (VBZ is)
    (PRT (JJ quick))
    (S
      (CC and)
      (NP-SBJ (PRP he))
      (VP
        (VBZ is)
        (PP-1
          (VBG jumping)
          (NP (IN over) (DT the) (JJ lazy) (NN dog))))))) (p=2.02604e-48)


In [14]:
# visualise the first parse tree in `result` (the Viterbi parser output)
# NOTE(review): draw() presumably opens a separate GUI window and so needs a
# desktop session -- confirm before running headless
result[0].draw()                  