In [88]:
from __future__ import division
import sys
from pprint import pprint
from collections import defaultdict
import nltk
#nltk file on my L drive
nltk.data.path.append("L:\\nltk_data\\")
from nltk.corpus import treebank
from nltk import ConditionalFreqDist, Nonterminal, FreqDist
from fixesNLTK3 import *
from BetterICP import *
from nltk import InsideChartParser

## Main body of code ##
# Extracting tagged sentences using NLTK libraries
psents = treebank.parsed_sents()
# Comment out the following 3 lines if you get tired of seeing them
print ("\n 1st parsed sentence: {} \n".format(psents[0]))
print ("\n Productions in the 1st parsed sentence: \n")
pprint(psents[0].productions())

grammar = parse_pgrammar("""
    # Grammatical productions.
     S -> NP VP [1.0]
     NP -> Pro [0.1] | Det N [0.3] | N [0.5] | NP PP [0.1]
     VP -> Vi [0.05] | Vt NP [0.9] | VP PP [0.05]
     Det -> Art [1.0]
     PP -> Prep NP [1.0]
   # Lexical productions.
     Pro -> "i" [0.3] | "we" [0.1] | "you" [0.1] | "he" [0.3] | "she" [0.2]
     Art -> "a" [0.4] | "an" [0.1] | "the" [0.5]
     Prep -> "with" [0.7] | "in" [0.3]
     N -> "salad" [0.4] | "fork" [0.3] | "mushrooms" [0.3]
     Vi -> "sneezed" [0.5] | "ran" [0.5]
     Vt -> "eat" [0.2] | "eats" [0.2] | "ate" [0.2] | "see" [0.2] | "saw" [0.2]
    """)

sentence1 = "he ate salad"
sentence2 = "he ate salad with mushrooms"
sentence3 = "he ate salad with a fork"



 1st parsed sentence: (S
  (NP-SBJ
    (NP (NNP Pierre) (NNP Vinken))
    (, ,)
    (ADJP (NP (CD 61) (NNS years)) (JJ old))
    (, ,))
  (VP
    (MD will)
    (VP
      (VB join)
      (NP (DT the) (NN board))
      (PP-CLR (IN as) (NP (DT a) (JJ nonexecutive) (NN director)))
      (NP-TMP (NNP Nov.) (CD 29))))
  (. .)) 


 Productions in the 1st parsed sentence: 

[S -> NP-SBJ VP .,
 NP-SBJ -> NP , ADJP ,,
 NP -> NNP NNP,
 NNP -> 'Pierre',
 NNP -> 'Vinken',
 , -> ',',
 ADJP -> NP JJ,
 NP -> CD NNS,
 CD -> '61',
 NNS -> 'years',
 JJ -> 'old',
 , -> ',',
 VP -> MD VP,
 MD -> 'will',
 VP -> VB NP PP-CLR NP-TMP,
 VB -> 'join',
 NP -> DT NN,
 DT -> 'the',
 NN -> 'board',
 PP-CLR -> IN NP,
 IN -> 'as',
 NP -> DT JJ NN,
 DT -> 'a',
 JJ -> 'nonexecutive',
 NN -> 'director',
 NP-TMP -> NNP CD,
 NNP -> 'Nov.',
 CD -> '29',
 . -> '.']


## Question 4

In [89]:
# Un-comment the following 2 non-comment lines
# when working on `PCFG Parser` section in the lab.
## Initialize a parser with our toy probabilistic grammar
##  (it will have 'S' as the start symbol),
##  and parse a sentence
sppc=BetterICP(grammar)
sppc.parse(sentence1.split())

****
(S (NP (Pro he)) (VP (Vt ate) (NP (N salad)))) (p=0.00108)9.855(9.855)
****
1 total parses found


<list_iterator at 0x218f9610b38>

In [90]:
# Parse some more complex sentences
sppc.parse(sentence2.split())
sppc.parse(sentence3.split())



****
(S
  (NP (Pro he))
  (VP
    (Vt ate)
    (NP (NP (N salad)) (PP (Prep with) (NP (N mushrooms)))))) (p=1.134e-05)16.43(16.43)
****
****
(S
  (NP (Pro he))
  (VP
    (VP (Vt ate) (NP (N salad)))
    (PP (Prep with) (NP (N mushrooms))))) (p=5.67e-06)17.43(17.43)
****
2 total parses found
****
(S
  (NP (Pro he))
  (VP
    (Vt ate)
    (NP
      (NP (N salad))
      (PP (Prep with) (NP (Det (Art a)) (N fork)))))) (p=2.7216e-06)18.49(18.49)
****
****
(S
  (NP (Pro he))
  (VP
    (VP (Vt ate) (NP (N salad)))
    (PP (Prep with) (NP (Det (Art a)) (N fork))))) (p=1.3608e-06)19.49(19.49)
****
2 total parses found


<list_iterator at 0x218f9864748>

## Question a)

In [91]:
import re
regexp = re.compile(r'^NNP|(NP-*)')
def spec_cnt(prods):
    count =0
    myList=[]
    for i in range(0,len(prods)):
        word = str(prods[i].lhs())
        if regexp.search(word) and word != "NNP":
            if word not in myList:
                myList.append(word)
            count +=1
            #print("i:{}\tcost:{}\tlhs:{}\trhs:{}".format(i,prods[i].cost(), prods[i].lhs(), prods[i].rhs()))
    print("len 'NP or variant include WHNP*':{}".format(count))
    for w in myList:
        print(w)

In [92]:
prods = get_costed_productions(psents)
ppg=PCFG(Nonterminal('NP'), prods)
print("len 'S':{}".format(len(ppg.productions(Nonterminal('S')))))
print("len 'NP':{}".format(len(ppg.productions(Nonterminal('NP')))))
spec_cnt(prods)

len 'S':772
len 'NP':1490
len 'NP or variant include WHNP*':3347
WHNP
NP
NNPS
WHNP-1
WHNP-8
NP-EXT
NP-PRD
NP-SBJ-1
NP-TTL-SBJ
NP-SBJ-20
NP-LGS
NP-SBJ-2
NP-SBJ
NP-TTL
NP-SBJ-35
WHNP-14
NP-3
NP-SBJ-104
WHNP-256
NP-SBJ-156
WHNP-33
NP-2
WHNP-231
NP-LOC
NP-TMP
NP-SBJ-60
NP-SBJ-3
NP-SBJ-108
WHNP-121
WHNP-41
WHNP-31
NP-SBJ-123
NP-SBJ-122
NP-ADV
WHNP-158
NP-SBJ-81
WHNP-2
WHNP-97
WHNP-136
NP-SBJ-11
NP-SBJ-134
WHNP-56
WHNP-6
WHNP-36
WHNP-28
NP-SBJ-79
WHNP-124
NP-SBJ-14
WHNP-232
NP-TMP-CLR
NP-SBJ-38
WHNP-203
NP-SBJ-48
WHNP-4
WHNP-35
NP-6
WHNP-234
WHNP-57
NP-SBJ-139
NP-HLN
NP-SBJ-27
NP-SBJ-80
NP-SBJ-46
WHNP-100
NP-SBJ-43
NP-SBJ-8
NP-SBJ-73
NP-SBJ-36
WHNP-83
WHNP-63
NP-SBJ-54
WHNP-122
WHNP-64
WHNP-217
WHNP-75
NP-1
WHNP-245
NP-SBJ-49
WHNP-13
WHNP-66
WHNP-240
WHNP-117
WHNP-152
WHNP-51
WHNP-3
WHNP-49
NP-SBJ-50
NP-SBJ-22
NP-SBJ-155
WHNP-11
NP-SBJ-6
WHNP-102
WHNP-110
NP-TMP-2
WHNP-214
NP-SBJ-164
NP-SBJ-4
WHNP-236
NP-SBJ-47
WHNP-199
WHNP-156
WHNP-58
WHNP-139
WHNP-142
NP-SBJ-68
NP-SBJ-149
NP-SBJ-102
NP-SB

## B

In [93]:
ppc=BetterICP(ppg,1000)
print ("beam = 1000")
ppc.parse("the men".split(),True,3)

beam = 1000
0 total parses found


<list_iterator at 0x21895bd1630>

In [94]:

ppc.beam(1050)
print ("beam = 1050")
ppc.parse("the men".split(),True,3)


beam = 1050
****
(NP (JJ the) (NP (NNS men))) (p=2.50811e-12)38.54(38.54)
****
1 total parses found


<list_iterator at 0x218f93bcc18>

In [None]:

#ppc.trace(3)


In [95]:
ppc.beam(1200)
print ("beam = 1200")


beam = 1200


In [96]:
ppc.parse("the men".split(),True,3)

****
(NP (DT the) (NP (NNS men))) (p=2.89457e-09)28.36(28.36)
****
****
(NP (JJ the) (NP (NNS men))) (p=2.50811e-12)38.54(38.54)
****
****
(NP (NNP the) (NP (NNS men))) (p=3.10995e-13)41.55(41.55)
****
3 total parses found


<list_iterator at 0x2188b72c668>

In [129]:
b=1042
ppc.beam(b)
print ("beam ={}".format(b))
ppc.parse("the men".split(),True,3)

beam =1042
****
(NP (JJ the) (NP (NNS men))) (p=2.50811e-12)38.54(38.54)
****
1 total parses found


<list_iterator at 0x218ff6d3550>