## Part 1: Probabilistic context-free grammars

In [629]:
#importing libraries
import steven_tree
from collections import defaultdict

In [630]:
#example tree provided
t = steven_tree.Tree.from_string('''(S
    (NP
        (DT the)
        (NN teacher)
    )
    (S|
        (VP
            (MD will)
            (VP
                (VB lecture)
                (VP|
                    (NP
                        (NN today)
                    )
                    (PP
                        (IN in)
                        (NP
                            (DT the)
                            (NP|
                                (NN lecture)
                                (NN hall)
                            )
                        )
                    )
                )
            )
        )
        (. .)
    )
)''')

In [631]:
for x in t.collapse_unary().chomsky_normal_form().terminals():
    print(x)

the
teacher
will
lecture
today
in
the
lecture
hall
.


In [632]:
for (mother, daughters) in t.productions():
    print mother
    print daughters

S
('NP', 'S|')
NP
('DT', 'NN')
DT
('the',)
NN
('teacher',)
S|
('VP', '.')
VP
('MD', 'VP')
MD
('will',)
VP
('VB', 'VP|')
VB
('lecture',)
VP|
('NP', 'PP')
NP
('NN',)
NN
('today',)
PP
('IN', 'NP')
IN
('in',)
NP
('DT', 'NP|')
DT
('the',)
NP|
('NN', 'NN')
NN
('lecture',)
NN
('hall',)
.
('.',)


In [633]:
#additional functions for creating the grammar
def prob_productions(dict_productions):
    new_dict = defaultdict(lambda: defaultdict(float))
    for key in dict_productions:
        denom = sum(dict_productions[key].values())
        for value in dict_productions[key]:
            new_dict[key][value] = dict_productions[key][value]/denom
    return new_dict

def combine_dict_productions(dict_productions1, dict_productions2):
    new_dictionary = defaultdict(lambda: defaultdict(float))
    for key in dict_productions1:
        for value in dict_productions1[key]:
            new_dictionary[key][value] += dict_productions1[key][value]
    for key in dict_productions2:
        for value in dict_productions2[key]:
            new_dictionary[key][value] += dict_productions2[key][value]
    return new_dictionary

def make_pretty_dict(grammar_dictionary):
    new_dictionary = defaultdict(lambda: defaultdict(float))
    for key in grammar_dictionary:
        new_item = ""
        for value in grammar_dictionary[key]:
            print(value)
            new_item = ""
            for item in value:
                #print(item)
                #new_item = ""
                new_item+=item
                new_item+= " "
            print(new_item)
            new_dictionary[key][new_item[:-1]] = grammar_dictionary[key][value]
    return new_dictionary


In [634]:
t.collapse_unary().chomsky_normal_form().dict_productions()

defaultdict(<function steven_tree.<lambda>>,
            {'.': defaultdict(float, {('.',): 1.0}),
             'DT': defaultdict(float, {('the',): 2.0}),
             'IN': defaultdict(float, {('in',): 1.0}),
             'MD': defaultdict(float, {('will',): 1.0}),
             'NN': defaultdict(float,
                         {('hall',): 1.0,
                          ('lecture',): 1.0,
                          ('teacher',): 1.0,
                          ('today',): 1.0}),
             'NP': defaultdict(float,
                         {('DT', 'NN'): 1.0,
                          ('DT', 'NP|'): 1.0,
                          ('NN',): 1.0}),
             'NP|': defaultdict(float, {('NN', 'NN'): 1.0}),
             'PP': defaultdict(float, {('IN', 'NP'): 1.0}),
             'S': defaultdict(float, {('NP', 'S|'): 1.0}),
             'S|': defaultdict(float, {('VP', '.'): 1.0}),
             'VB': defaultdict(float, {('lecture',): 1.0}),
             'VP': defaultdict(float, {('MD', 'VP

In [635]:
x = prob_productions(t.dict_productions())
x2 = make_pretty_dict(x)
for key in x2:
    for value in x2[key]:
        print(key + " -> " + str(value) + " " + str(x2[key][value]))

('will',)
will 
('lecture',)
lecture 
('teacher',)
teacher 
('today',)
today 
('lecture',)
lecture 
('hall',)
hall 
('VP', '.')
VP . 
('NP', 'PP')
NP PP 
('IN', 'NP')
IN NP 
('.',)
. 
('MD', 'VP')
MD VP 
('VB', 'VP|')
VB VP| 
('NN', 'NN')
NN NN 
('NP', 'S|')
NP S| 
('in',)
in 
('DT', 'NN')
DT NN 
('NN',)
NN 
('DT', 'NP|')
DT NP| 
('the',)
the 
MD -> will 1.0
VB -> lecture 1.0
NN -> lecture 0.25
NN -> teacher 0.25
NN -> today 0.25
NN -> hall 0.25
S| -> VP . 1.0
VP| -> NP PP 1.0
PP -> IN NP 1.0
. -> . 1.0
VP -> MD VP 0.5
VP -> VB VP| 0.5
NP| -> NN NN 1.0
S -> NP S| 1.0
IN -> in 1.0
NP -> DT NN 0.333333333333
NP -> DT NP| 0.333333333333
NP -> NN 0.333333333333
DT -> the 1.0


In [636]:
trees = []
f = open("wsj-normalized.psd", "r")
x = steven_tree.Tree.from_stream(f)
for item in x:
    trees.append(item)
f.close()

In [637]:
trees[0]

(TOP
    (NP-SBJ
        (NP
            (NNP <NNP>)
            (NNP <NNP>)
        )
        (, ,)
        (ADJP
            (NP
                (CD <CD>)
                (NNS years)
            )
            (JJ old)
        )
        (, ,)
    )
    (VP
        (MD will)
        (VP
            (VB join)
            (NP
                (DT the)
                (NN board)
            )
            (PP-CLR
                (IN as)
                (NP
                    (DT a)
                    (JJ nonexecutive)
                    (NN director)
                )
            )
            (NP-TMP
                (NNP <NNP>)
                (CD <CD>)
            )
        )
    )
    (. .)
)

In [638]:
trees[0].collapse_unary().chomsky_normal_form()

(TOP
    (NP-SBJ
        (NP
            (NNP <NNP>)
            (NNP <NNP>)
        )
        (NP-SBJ|<,&ADJP>
            (, ,)
            (NP-SBJ|<ADJP&,>
                (ADJP
                    (NP
                        (CD <CD>)
                        (NNS years)
                    )
                    (JJ old)
                )
                (, ,)
            )
        )
    )
    (TOP|<VP&.>
        (VP
            (MD will)
            (VP
                (VB join)
                (VP|<NP&PP-CLR>
                    (NP
                        (DT the)
                        (NN board)
                    )
                    (VP|<PP-CLR&NP-TMP>
                        (PP-CLR
                            (IN as)
                            (NP
                                (DT a)
                                (NP|<JJ&NN>
                                    (JJ nonexecutive)
                                    (NN director)
                                )
     

In [639]:
len(trees)

14898

In [640]:
chomsky_normed_trees = []
i = 0
for my_tree in trees:
    #print(my_tree)
    if i %100 == 0:
        print(i)
    i+=1
    new_tree = my_tree.collapse_unary().chomsky_normal_form()
    chomsky_normed_trees.append(new_tree)
    #print(new_tree)

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400
6500
6600
6700
6800
6900
7000
7100
7200
7300
7400
7500
7600
7700
7800
7900
8000
8100
8200
8300
8400
8500
8600
8700
8800
8900
9000
9100
9200
9300
9400
9500
9600
9700
9800
9900
10000
10100
10200
10300
10400
10500
10600
10700
10800
10900
11000
11100
11200
11300
11400
11500
11600
11700
11800
11900
12000
12100
12200
12300
12400
12500
12600
12700
12800
12900
13000
13100
13200
13300
13400
13500
13600
13700
13800
13900
14000
14100
14200
14300
14400
14500
14600
14700
14800


In [641]:
test = chomsky_normed_trees[0]
prev_dict = test.dict_productions()
print(prev_dict)
i = 1
for my_tree in chomsky_normed_trees[1:]:
    #print(my_tree.collapse_unary().chomsky_normal_form())
    if i%100 == 0:
        print(i)
    #print(next_tree)
    next_tree_dict = my_tree.dict_productions()
    prev_dict = combine_dict_productions(prev_dict, next_tree_dict)
    i+=1

defaultdict(<function <lambda> at 0x12d0882a8>, {'VP|<PP-CLR&NP-TMP>': defaultdict(<type 'float'>, {('PP-CLR', 'NP-TMP'): 1.0}), 'TOP|<VP&.>': defaultdict(<type 'float'>, {('VP', '.'): 1.0}), 'NP-SBJ|<ADJP&,>': defaultdict(<type 'float'>, {('ADJP', ','): 1.0}), 'ADJP': defaultdict(<type 'float'>, {('NP', 'JJ'): 1.0}), 'TOP': defaultdict(<type 'float'>, {('NP-SBJ', 'TOP|<VP&.>'): 1.0}), 'NP-TMP': defaultdict(<type 'float'>, {('NNP', 'CD'): 1.0}), 'JJ': defaultdict(<type 'float'>, {('nonexecutive',): 1.0, ('old',): 1.0}), 'DT': defaultdict(<type 'float'>, {('the',): 1.0, ('a',): 1.0}), 'NN': defaultdict(<type 'float'>, {('board',): 1.0, ('director',): 1.0}), ',': defaultdict(<type 'float'>, {(',',): 2.0}), '.': defaultdict(<type 'float'>, {('.',): 1.0}), 'NP': defaultdict(<type 'float'>, {('CD', 'NNS'): 1.0, ('DT', 'NN'): 1.0, ('NNP', 'NNP'): 1.0, ('DT', 'NP|<JJ&NN>'): 1.0}), 'NNS': defaultdict(<type 'float'>, {('years',): 1.0}), 'NNP': defaultdict(<type 'float'>, {('<NNP>',): 3.0}), 'VB

In [642]:
len(prev_dict.keys())

3960

In [643]:
rules = 0
for key in prev_dict:
    rules+= len(prev_dict[key].values())
rules

30363

In [644]:
print(prob_productions(prev_dict))



#### A paragraph talking about your approach, what worked, what didn't, etc.
I used Steven's code from tree.py, since I believe I have a small error in my version of tree.py--I think I do not check for three daughters of the head of the whole tree. I made one edit to Steven's file (which I've renamed "steven_tree.py")--I added a function to the Tree class called `dict_productions()`. I then used this to create the grammar, by also using three more functions in this jupyter notebook: `prob_productions()`,  `combine_dict_productions()`, and `make_pretty_dict()`. `combine_dict_productions()` allowed me to join together the productions from two trees. `prob_productions()` changed the count structure of the dictionary of productions to a probability. And finally in `make_pretty_dict()` I changed the tuples of the productions to combined strings with a space inbetween them, so that it would better match the expected output, and be easier to work with.

When I tested this on the tree in the homework example, I got the expected output. And more importantly, when I trained it on the Wall Street Journal corpus, I believe I got the intended output. I ended up having 30,363 productions in my grammar.

## Part 2: The CYK algorithm

In [645]:
#the main algorithm:
def prob_cyk(words_in_sentence, grammar):
    whole_table = list()
    x = 0
    #making the table format
    while x<len(words_in_sentence):
        table = list()
        for i in range(x, len(words_in_sentence)):
            #table.append(dict())
            table.append((str(x) + "," + str(i+1),dict()))
        whole_table.append(table)
        x+=1
    #print(whole_table)
    #filling in the table    
    for j in range(0, len(words_in_sentence)):
        for key in grammar:
            if words_in_sentence[j] in grammar[key]: 
                whole_table[j][0][1][key] = (grammar[key][words_in_sentence[j]], [])
    for i in range(1, len(words_in_sentence)):
        y = 1
        for x in range(i-1, -1, -1):
            a = y
            b = 1
            while b<=y:
                for key in whole_table[x][y-a][1]:
                    #print(key)
                    #print(whole_table[x][y-a][1][key])
                    for other_key in whole_table[x+b][y-b][1]:
                        for grammar_key in grammar:
                            for value in grammar[grammar_key]:
                                if key + " " + other_key == value:
                                    #check if higher
                                    #print("uh")
                                    #print(not whole_table[x][y][1])
                                    if not whole_table[x][y][1]:
                                        whole_table[x][y][1][grammar_key] = ((whole_table[x][y-a][1][key][0] * whole_table[x+b][y-b][1][other_key][0] * grammar[grammar_key][value]), [x, y-a, x+b, y-b])
                                    else:
                                        for yet_another_key in whole_table[x][y][1]:
                                            #print(whole_table[x][y][1][yet_another_key])
                                            if (whole_table[x][y-a][1][key][0] * whole_table[x+b][y-b][1][other_key][0] * grammar[grammar_key][value]) > whole_table[x][y][1][yet_another_key][0]:
                                                del whole_table[x][y][1][yet_another_key]
                                                whole_table[x][y][1][grammar_key] = ((whole_table[x][y-a][1][key][0] * whole_table[x+b][y-b][1][other_key][0] * grammar[grammar_key][value]), [x, y-a, x+b, y-b])
                a = a-1
                b = b+1
               #now should be able to access the right things...
                
            y+= 1
    return(whole_table)
        

# a function for printing my results
def print_table_nicely(table):
    s = ""
    for ls in table:
        #print("{:10d}".format(str(ls)))
        print(str(ls).ljust(40))
        print("")
        #s += "        "

In [646]:
#the J&M example
example_sentence = ["the", "flight", "includes", "a", "meal"]

example_grammar = defaultdict(lambda: defaultdict(float))
example_grammar['S']['NP VP'] = .8
example_grammar['NP']['Det N'] = .3
example_grammar['VP']['V NP'] = .2
example_grammar['V']['includes'] = .05
example_grammar['Det']['the'] = .4
example_grammar['Det']['a'] = .4
example_grammar['N']['meal'] = .01
example_grammar['N']['flight'] = .02 

In [647]:
example_grammar

defaultdict(<function __main__.<lambda>>,
            {'Det': defaultdict(float, {'a': 0.4, 'the': 0.4}),
             'N': defaultdict(float, {'flight': 0.02, 'meal': 0.01}),
             'NP': defaultdict(float, {'Det N': 0.3}),
             'S': defaultdict(float, {'NP VP': 0.8}),
             'V': defaultdict(float, {'includes': 0.05}),
             'VP': defaultdict(float, {'V NP': 0.2})})

In [648]:
test = prob_cyk(example_sentence, example_grammar)

In [649]:
print_table_nicely(test)

[('0,1', {'Det': (0.4, [])}), ('0,2', {'NP': (0.0024, [0, 0, 1, 0])}), ('0,3', {}), ('0,4', {}), ('0,5', {'S': (2.304e-08, [0, 1, 2, 2])})]

[('1,2', {'N': (0.02, [])}), ('1,3', {}), ('1,4', {}), ('1,5', {})]

[('2,3', {'V': (0.05, [])}), ('2,4', {}), ('2,5', {'VP': (1.2e-05, [2, 0, 3, 1])})]

[('3,4', {'Det': (0.4, [])}), ('3,5', {'NP': (0.0012, [3, 0, 4, 0])})]

[('4,5', {'N': (0.01, [])})]            



In [653]:
def make_tree(parse_table, label):
    if label not in parse_table[0][-1][1].keys():
        return 1
    return 0
    #print(parse_table[0][-1][1])
    #next_indicies = parse_table[0][-1][1]['S'][1]
    #left = parse_table[next_indicies[0]][next_indicies[1]][1]
    #right = parse_table[next_indicies[2]][next_indicies[3]][1]
    #print(left)
    #print(right)
    
    

In [654]:
make_tree(test, 'S')

0

In [581]:
bigger_trees = []
f = open("bigger_treebank_2.psd", "r")
print(f)
x = steven_tree.Tree.from_stream(f)
for item in x:
    bigger_trees.append(item)
f.close()

<open file 'bigger_treebank_2.psd', mode 'r' at 0x1279b0e40>


In [582]:
len(bigger_trees)

1187

In [656]:
bigger_chomsky_normed_trees = []
i = 0
for my_tree in bigger_trees:
    #print(my_tree)
    if i %100 == 0:
        print(i)
    i+=1
    new_tree = my_tree.collapse_unary().chomsky_normal_form()
    bigger_chomsky_normed_trees.append(new_tree)
    #print(new_tree)

0
100
200
300
400
500
600
700
800
900
1000
1100


In [657]:
start = bigger_chomsky_normed_trees[0]
gram = start.dict_productions()
i = 1
for my_tree in bigger_chomsky_normed_trees[1:]:
    #print(my_tree.collapse_unary().chomsky_normal_form())
    if i%100 == 0:
        print(i)
    #print(next_tree)
    next_tree_dict = my_tree.dict_productions()
    gram = combine_dict_productions(gram, next_tree_dict)
    i+=1

100
200
300
400
500
600
700
800
900
1000
1100


In [658]:
bigger_prob = prob_productions(gram)

In [659]:
bigger_prob['TOP']

defaultdict(float,
            {('-LRB-', 'TOP|<NP-SBJ&VP>'): 0.002527379949452401,
             (':', 'TOP|<CC&NP-SBJ>'): 0.0008424599831508003,
             (':', 'TOP|<NNP&NNP>'): 0.0008424599831508003,
             (':', 'TOP|<NP&,>'): 0.0008424599831508003,
             ('ADJP-TMP', 'TOP|<,&NP-SBJ>'): 0.0008424599831508003,
             ('ADVP', 'TOP|<,&NP-SBJ-1>'): 0.0008424599831508003,
             ('ADVP', 'TOP|<,&NP-SBJ>'): 0.017691659646166806,
             ('ADVP', 'TOP|<,&PP>'): 0.0008424599831508003,
             ('ADVP', 'TOP|<,&SBAR-TMP>'): 0.0008424599831508003,
             ('ADVP', 'TOP|<NP-SBJ&VP>'): 0.0016849199663016006,
             ('ADVP', 'TOP|<PP-LOC&,>'): 0.0008424599831508003,
             ('ADVP-LOC', 'TOP|<,&NP-SBJ>'): 0.0016849199663016006,
             ('ADVP-LOC', 'TOP|<,&PP-LOC>'): 0.0008424599831508003,
             ('ADVP-MNR', 'TOP|<NP-SBJ&VP>'): 0.0008424599831508003,
             ('ADVP-TMP', 'TOP|<,&NP-SBJ>'): 0.010109519797809604,
             

In [660]:
end_wsj_trees = []
f = open("end_of_wsj.psd", "r")
x = steven_tree.Tree.from_stream(f)
for item in x:
    end_wsj_trees.append(item)
f.close()

In [661]:
end_wsj_sentences = []
i = 0
for my_tree in end_wsj_trees:
    #print(my_tree)
    if i %100 == 0:
        print(i)
    i+=1
    sentence = []
    new_tree = my_tree.terminals()
    for item in new_tree:
        sentence.append(str(item))
    end_wsj_sentences.append(sentence)
    #print(new_tree)

0


In [662]:
end_wsj_sentences

[['and',
  'at',
  'the',
  '<NNP>',
  ',',
  'meanwhile',
  ',',
  'new',
  '<NNP>',
  '<NNP>',
  '<NNP>',
  '<NNP>',
  'has',
  'said',
  '0',
  'he',
  'wants',
  'the',
  '<CD>',
  'sides',
  'to',
  'hammer',
  'out',
  'their',
  'own',
  'plan',
  '.'],
 ['the',
  '<CD>',
  'additional',
  'defendants',
  'were',
  "n't",
  'parties',
  'to',
  'the',
  'settlement',
  '.'],
 ['a',
  'trial',
  'on',
  'criminal',
  'allegations',
  'against',
  'the',
  'company',
  'and',
  'the',
  'same',
  '<CD>',
  'former',
  'executives',
  'began',
  '<NNP>',
  '<CD>',
  'in',
  'federal',
  'court',
  'for',
  'the',
  '<NNP>',
  'of',
  '<NNP>',
  '.'],
 ['the', 'company', 'and', 'its', 'executives', 'deny', 'the', 'charges', '.'],
 ['the',
  '<NNP>',
  '<NNP>',
  'originally',
  'suspended',
  'the',
  'company',
  '<NNP>',
  '<CD>',
  ',',
  '<CD>',
  ',',
  'and',
  'has',
  'been',
  'renewing',
  'the',
  'ban',
  'ever',
  'since',
  '.'],
 ['<NNP>',
  '<NNP>',
  'reported',
  '

In [663]:
bigger_prob = make_pretty_dict(bigger_prob)

('JJR', 'CD')
JJR CD 
(':', 'TOP|<CC&S>')
: TOP|<CC&S> 
('my',)
my 
('its',)
its 
('his',)
his 
('her',)
her 
('their',)
their 
('your',)
your 
('our',)
our 
(',', 'PP-LOC|<CC&PP>')
, PP-LOC|<CC&PP> 
('WHADVP', 'S')
WHADVP S 
('SBAR', 'SBAR|<,&CC>')
SBAR SBAR|<,&CC> 
('-NONE-', 'S')
-NONE- S 
('IN', 'SBAR|<``&S>')
IN SBAR|<``&S> 
('SBAR', 'SBAR|<CC&SBAR>')
SBAR SBAR|<CC&SBAR> 
('IN', 'S')
IN S 
('CC', 'NP-PRD|<JJ&NNS>')
CC NP-PRD|<JJ&NNS> 
('CD', 'CD')
CD CD 
('IN', 'NP')
IN NP 
('NN', 'TOP|<NN&.>')
NN TOP|<NN&.> 
(',', 'JJ')
, JJ 
(',', 'ADJP|<JJ&,>')
, ADJP|<JJ&,> 
(',', 'ADJP|<JJ&CC>')
, ADJP|<JJ&CC> 
('NNP', 'NP|<IN&DT>')
NNP NP|<IN&DT> 
('CC', 'ADJP-PRD|<,&PP>')
CC ADJP-PRD|<,&PP> 
('ADVP-MNR', 'PP-TMP')
ADVP-MNR PP-TMP 
('ADVP-MNR', 'VP|<PP-TMP&,>')
ADVP-MNR VP|<PP-TMP&,> 
('NP-PRD', 'VP|<ADVP-TMP&,>')
NP-PRD VP|<ADVP-TMP&,> 
('NP-PRD', 'ADVP-TMP')
NP-PRD ADVP-TMP 
(',', 'S|<ADVP&ADVP=3>')
, S|<ADVP&ADVP=3> 
('ADVP-CLR', 'VP|<PP-TMP&PP-TMP>')
ADVP-CLR VP|<PP-TMP&PP-TMP> 
('ADVP-C

('further',)
further 
('insane',)
insane 
('bilingual',)
bilingual 
('unchanged',)
unchanged 
('64-year-old',)
64-year-old 
('standardized',)
standardized 
('chinese',)
chinese 
('political',)
political 
('normal',)
normal 
('continuous',)
continuous 
('second-largest',)
second-largest 
('complicated',)
complicated 
('cost-control',)
cost-control 
('money-market',)
money-market 
('effective',)
effective 
('investment-grade',)
investment-grade 
('profitable',)
profitable 
('automatic',)
automatic 
('miami-based',)
miami-based 
('sentimental',)
sentimental 
('deceptive',)
deceptive 
('notable',)
notable 
('three-month',)
three-month 
('84-year-old',)
84-year-old 
('27-year',)
27-year 
('cutthroat',)
cutthroat 
('playful',)
playful 
('beaten',)
beaten 
('asset-backed',)
asset-backed 
('british',)
british 
('high-grade',)
high-grade 
('sound',)
sound 
('coincident',)
coincident 
('sluggish',)
sluggish 
('particular',)
particular 
('necessary',)
necessary 
('nearby',)
nearby 
('outstanding'

estimated 
('continued',)
continued 
('stressed',)
stressed 
('lifted',)
lifted 
('withdrawn',)
withdrawn 
('followed',)
followed 
('concerned',)
concerned 
('indicated',)
indicated 
('held',)
held 
('diluted',)
diluted 
('prolonged',)
prolonged 
('mixed',)
mixed 
('been',)
been 
('grown',)
grown 
('organized',)
organized 
('integrated',)
integrated 
('complained',)
complained 
('designated',)
designated 
('skyrocketed',)
skyrocketed 
('retired',)
retired 
('sustained',)
sustained 
('limited',)
limited 
('regarded',)
regarded 
('set',)
set 
('respected',)
respected 
('risen',)
risen 
('insured',)
insured 
('played',)
played 
('resulted',)
resulted 
('gained',)
gained 
('trained',)
trained 
('offset',)
offset 
('renewed',)
renewed 
('combined',)
combined 
('kept',)
kept 
('rumored',)
rumored 
('pointed',)
pointed 
('opened',)
opened 
('lost',)
lost 
('annualized',)
annualized 
('retained',)
retained 
('tanked',)
tanked 
('seen',)
seen 
('bribed',)
bribed 
('closed',)
closed 
('spurred',

VBN NP|<NNP&NNP> 
('VBN', 'NP|<NNP&NN>')
VBN NP|<NNP&NN> 
('IN', 'NP')
IN NP 
('IN', 'PP-PRP|<IN&``>')
IN PP-PRP|<IN&``> 
('JJ', 'PP')
JJ PP 
('IN', 'PP-PRP|<IN&NP>')
IN PP-PRP|<IN&NP> 
('ADVP', 'PP-PRP|<IN&IN>')
ADVP PP-PRP|<IN&IN> 
('b',)
b 
('JJ', 'NNS')
JJ NNS 
('IN', 'NP')
IN NP 
(':', 'TOP|<SQ&.>')
: TOP|<SQ&.> 
('DT', 'NAC-LOC|<NNP&,>')
DT NAC-LOC|<NNP&,> 
('NNP', 'NAC-LOC|<,&NNP>')
NNP NAC-LOC|<,&NNP> 
('S', 'VP|<ADVP-TMP&,>')
S VP|<ADVP-TMP&,> 
('IN', 'NP')
IN NP 
('PP-LOC', 'PP-TMP')
PP-LOC PP-TMP 
(',', 'SBAR-PRP')
, SBAR-PRP 
('JJ', 'NP-EXT|<CD&NNS>')
JJ NP-EXT|<CD&NNS> 
(',', 'VP')
, VP 
(',', 'NP')
, NP 
('JJ', 'NP-SBJ|<JJ&NN>')
JJ NP-SBJ|<JJ&NN> 
('JJ', 'NP-SBJ|<JJ&NNS>')
JJ NP-SBJ|<JJ&NNS> 
('PP-TMP', 'VP')
PP-TMP VP 
('-RRB-', 'VP')
-RRB- VP 
('IN', 'NP-SBJ+QP|<CD&IN>')
IN NP-SBJ+QP|<CD&IN> 
('IN', 'CD')
IN CD 
('NNS', 'NP|<,&NNS>')
NNS NP|<,&NNS> 
('NNS', 'NP|<,&NN>')
NNS NP|<,&NN> 
('NNS', 'NP|<,&FW>')
NNS NP|<,&FW> 
(',', 'SBAR-TMP')
, SBAR-TMP 
('VBN', 'NNS')
VBN N

('VBN', 'NP-EXT')
VBN NP-EXT 
('VBD', 'VP|<NP&PP>')
VBD VP|<NP&PP> 
('VBZ', 'VP|<PP-LOC&SBAR-ADV>')
VBZ VP|<PP-LOC&SBAR-ADV> 
('VB', 'VP|<NP&PP-DTV>')
VB VP|<NP&PP-DTV> 
('MD', 'VP|<ADVP&VP>')
MD VP|<ADVP&VP> 
('VBN', 'VP|<NP-EXT&NP-TMP>')
VBN VP|<NP-EXT&NP-TMP> 
('VBD', 'VP|<PP-LOC&PP-CLR>')
VBD VP|<PP-LOC&PP-CLR> 
('VBD', 'SBAR-TMP')
VBD SBAR-TMP 
('ADVP-MNR', 'VBN')
ADVP-MNR VBN 
('VBZ', 'VP|<ADVP-TMP&NP-PRD>')
VBZ VP|<ADVP-TMP&NP-PRD> 
('VBD', 'VP|<PP-DIR&SBAR-PRP>')
VBD VP|<PP-DIR&SBAR-PRP> 
('VB', 'VP|<ADJP-PRD&SBAR-ADV>')
VB VP|<ADJP-PRD&SBAR-ADV> 
('VBD', 'VP|<PP-EXT&PP-DIR>')
VBD VP|<PP-EXT&PP-DIR> 
('VBZ', 'VP|<ADVP&NP-PRD>')
VBZ VP|<ADVP&NP-PRD> 
('VBD', 'VP|<NP-TMP&PP-LOC>')
VBD VP|<NP-TMP&PP-LOC> 
('VBG', 'VP|<ADVP-CLR&PP-CLR>')
VBG VP|<ADVP-CLR&PP-CLR> 
('VBN', 'VP|<PP-DIR&PP-DIR>')
VBN VP|<PP-DIR&PP-DIR> 
('VBD', 'VP|<NP&PP-DIR>')
VBD VP|<NP&PP-DIR> 
('VBG', 'VP|<PP-DIR&,>')
VBG VP|<PP-DIR&,> 
('VBD', 'VP|<ADVP-MNR&PP-LOC>')
VBD VP|<ADVP-MNR&PP-LOC> 
('VBG', 'VP|<PRT&PP-

NP TOP|<:&SQ> 
('NP', 'NP')
NP NP 
('CC', 'TOP|<MD&NP-SBJ>')
CC TOP|<MD&NP-SBJ> 
('NP-SBJ', 'TOP|<RB&VP>')
NP-SBJ TOP|<RB&VP> 
('NNP', 'TOP|<NNP&.>')
NNP TOP|<NNP&.> 
('S', 'TOP|<:&S>')
S TOP|<:&S> 
('ADVP-MNR', 'TOP|<NP-SBJ&VP>')
ADVP-MNR TOP|<NP-SBJ&VP> 
('NP-SBJ', 'TOP|<PRN&VP>')
NP-SBJ TOP|<PRN&VP> 
('PP-TMP', 'TOP|<PRN&NP-SBJ>')
PP-TMP TOP|<PRN&NP-SBJ> 
('CC', 'TOP|<VBD&NP-SBJ>')
CC TOP|<VBD&NP-SBJ> 
('MD', 'TOP|<NP-SBJ&ADVP>')
MD TOP|<NP-SBJ&ADVP> 
('NP-SBJ', 'TOP|<ADVP-TMP&VP>')
NP-SBJ TOP|<ADVP-TMP&VP> 
('NP-TTL-SBJ', 'TOP|<VP&.>')
NP-TTL-SBJ TOP|<VP&.> 
('PP', 'TOP|<,&``>')
PP TOP|<,&``> 
('S', 'TOP|<,&IN>')
S TOP|<,&IN> 
('VBG', 'TOP|<NNS&.>')
VBG TOP|<NNS&.> 
('PP-TMP', 'TOP|<,&PP>')
PP-TMP TOP|<,&PP> 
('PP', 'TOP|<,&SBAR-ADV>')
PP TOP|<,&SBAR-ADV> 
('NP-SBJ', 'TOP|<DT&VP>')
NP-SBJ TOP|<DT&VP> 
('CC', 'TOP|<``&NP-SBJ>')
CC TOP|<``&NP-SBJ> 
('NNPS', 'NNP')
NNPS NNP 
('ADVP-LOC', 'TOP|<,&NP-SBJ>')
ADVP-LOC TOP|<,&NP-SBJ> 
('NP', 'TOP|<:&RB>')
NP TOP|<:&RB> 
('SBAR-ADV', 'TOP|<

RB JJ 
('IN', 'PP')
IN PP 
('NNP',)
NNP 
('NP', 'JJR')
NP JJR 
('NP', 'IN')
NP IN 
('DT', 'RBR')
DT RBR 
('IN', 'NN')
IN NN 
('RB', 'NP')
RB NP 
('RB', 'PP')
RB PP 
('NP-ADV', 'RBR')
NP-ADV RBR 
('RB', 'ADVP|<IN&NN>')
RB ADVP|<IN&NN> 
('JJS',)
JJS 
('IN', 'JJS')
IN JJS 
('CD', 'ADVP|<TO&CD>')
CD ADVP|<TO&CD> 
('IN', 'NP')
IN NP 
('JJ', 'PP')
JJ PP 
('JJ',)
JJ 
('IN',)
IN 
('IN', 'DT')
IN DT 
('RBR',)
RBR 
('RBR', 'RB')
RBR RB 
('RBR', 'IN')
RBR IN 
('VBN', 'PP')
VBN PP 
('RB',)
RB 
('RB', 'RBR')
RB RBR 
('RB', 'RB')
RB RB 
('IN', 'RB')
IN RB 
('ADVP', 'PP')
ADVP PP 
('JJR',)
JJR 
('PP-TMP', ',')
PP-TMP , 
('NP', "''")
NP '' 
('PP', 'ADJP-PRD|<``&PP>')
PP ADJP-PRD|<``&PP> 
('CD', 'NP-TMP-CLR|<,&CD>')
CD NP-TMP-CLR|<,&CD> 
('NNP', 'CD')
NNP CD 
('CC', 'VP|<VBD&NP>')
CC VP|<VBD&NP> 
('ADVP', 'SBAR')
ADVP SBAR 
('RB', 'PP')
RB PP 
('RB',)
RB 
('RB', 'JJR')
RB JJR 
('RB', 'RB')
RB RB 
('IN',)
IN 
('ADVP', 'ADVP-TMP|<CC&ADVP>')
ADVP ADVP-TMP|<CC&ADVP> 
('NP', 'JJR')
NP JJR 
('NP', 'IN')
NP I

therapies 
('restaurants',)
restaurants 
('stands',)
stands 
('exits',)
exits 
('billions',)
billions 
('disclosures',)
disclosures 
('towns',)
towns 
('friends',)
friends 
('projects',)
projects 
('subskills',)
subskills 
('drivers',)
drivers 
('diseases',)
diseases 
('hours',)
hours 
('booklets',)
booklets 
('advertisements',)
advertisements 
('headquarters',)
headquarters 
('ballplayers',)
ballplayers 
('market-makers',)
market-makers 
('shares',)
shares 
('uses',)
uses 
('standards',)
standards 
('efforts',)
efforts 
('parties',)
parties 
('guests',)
guests 
('thrifts',)
thrifts 
('assets',)
assets 
('rivals',)
rivals 
('pounds',)
pounds 
('iras',)
iras 
('services',)
services 
('dollars',)
dollars 
('scientists',)
scientists 
('municipalities',)
municipalities 
('skills',)
skills 
('blocks',)
blocks 
('areas',)
areas 
('tours',)
tours 
('derivatives',)
derivatives 
('hands',)
hands 
('speeches',)
speeches 
('documents',)
documents 
('careers',)
careers 
('dividends',)
dividends 
(

sketch 
('cheating',)
cheating 
('rival',)
rival 
('volume',)
volume 
('showing',)
showing 
('organization',)
organization 
('membership',)
membership 
('cost',)
cost 
('sunlight',)
sunlight 
('party',)
party 
('culprit',)
culprit 
('surge',)
surge 
('contractor',)
contractor 
('downside',)
downside 
('commission',)
commission 
('gunship',)
gunship 
('woman',)
woman 
('leader',)
leader 
('gain',)
gain 
('text',)
text 
('funding',)
funding 
('symbol',)
symbol 
('instance',)
instance 
('connection',)
connection 
('compensation',)
compensation 
('press',)
press 
('production',)
production 
('veal',)
veal 
('alternative',)
alternative 
('law',)
law 
('assembly-line',)
assembly-line 
('encore',)
encore 
('suit',)
suit 
('interview',)
interview 
('set',)
set 
('weakening',)
weakening 
('fret',)
fret 
('challenge',)
challenge 
('hardware',)
hardware 
('technique',)
technique 
('dividend',)
dividend 
('fund',)
fund 
('request',)
request 
('downfall',)
downfall 
('factory',)
factory 
('strike',

creditor 
('carrier',)
carrier 
('street',)
street 
('equity',)
equity 
('victim',)
victim 
('rhythm',)
rhythm 
('core',)
core 
('pianist\\/bassoonist\\/composer',)
pianist\/bassoonist\/composer 
('departure',)
departure 
('debate',)
debate 
('profit',)
profit 
('equipment',)
equipment 
('shareholder',)
shareholder 
('binge',)
binge 
('subtraction',)
subtraction 
('bell-ringer',)
bell-ringer 
('ballpark',)
ballpark 
('competition',)
competition 
('cadet',)
cadet 
('chassis',)
chassis 
('downturn',)
downturn 
('bassist',)
bassist 
('extent',)
extent 
('gasoline',)
gasoline 
('news',)
news 
('tv',)
tv 
('shipbuilding',)
shipbuilding 
('majority',)
majority 
('parking',)
parking 
('story',)
story 
('energy',)
energy 
('asbestos',)
asbestos 
('food-industry',)
food-industry 
('participant',)
participant 
('corn',)
corn 
('freeway',)
freeway 
('tire',)
tire 
('music',)
music 
('benchmark',)
benchmark 
('wife',)
wife 
('slide',)
slide 
('focus',)
focus 
('astonishment',)
astonishment 
('trig

('JJ', 'NP|<JJ&JJ>')
JJ NP|<JJ&JJ> 
('PRP$', 'NP|<NNS&POS>')
PRP$ NP|<NNS&POS> 
('JJ', 'NP')
JJ NP 
('PRP$', 'NP|<CD&CC>')
PRP$ NP|<CD&CC> 
('NP', 'NP|<:&ADVP-LOC>')
NP NP|<:&ADVP-LOC> 
('NP', 'NP-ADV')
NP NP-ADV 
('DT', 'NP|<NNS&CC>')
DT NP|<NNS&CC> 
('NP', 'NP|<PP&PP>')
NP NP|<PP&PP> 
('NNS', 'POS')
NNS POS 
('DT', 'NP|<JJS&RB>')
DT NP|<JJS&RB> 
('NP', 'NP|<PP-LOC&VP>')
NP NP|<PP-LOC&VP> 
('NNP', 'NP|<NN&VBN>')
NNP NP|<NN&VBN> 
('PRP$', 'NP|<NNP&NNP>')
PRP$ NP|<NNP&NNP> 
('JJR', 'NN')
JJR NN 
('JJ', 'NP|<NN&NNP>')
JJ NP|<NN&NNP> 
('NN', 'NP|<CC&NN>')
NN NP|<CC&NN> 
('JJ',)
JJ 
('NNP', 'NNS')
NNP NNS 
('NP', 'NP|<CONJP&NP>')
NP NP|<CONJP&NP> 
('DT', 'NP|<VBN&NNP>')
DT NP|<VBN&NNP> 
('NP', 'NP|<:&NP+NP>')
NP NP|<:&NP+NP> 
('NN', 'POS')
NN POS 
('CD', 'CD')
CD CD 
('CD', 'NP|<NN&NNS>')
CD NP|<NN&NNS> 
('NP', 'NP|<,&PP>')
NP NP|<,&PP> 
('NNS',)
NNS 
('NP', 'NP|<,&ADVP-TMP>')
NP NP|<,&ADVP-TMP> 
('CD', 'NP|<VBN&NNS>')
CD NP|<VBN&NNS> 
('NNPS', 'NP|<NNP&NNPS>')
NNPS NP|<NNP&NNPS> 
('NP', '

In [665]:
for sentence in end_wsj_sentences:
    #print(sentence)
    #print("")
    test = prob_cyk(sentence, bigger_prob)
    #print(test)
    #print("")
    print(make_tree(test, 'TOP'))
    print("")

1

1

1

1

1

1

1

1

1

1

1

1

1

1

1

1

1

1

1

1



#### A paragraph talking about your approach, what worked, what didn't, etc.
My code seems to be on the right track, but it's not getting any of the sentences from end_of_wsj to parse. When I run it for a simple example above ("the flight includes a meal" from J&M), I feel very confident that the code is parsing correctly. However, there seems to be an issue once I get to the larger grammar. I am not sure what's going on--perhaps a rounding issue from not using log prob? Maybe a different bug somewhere? 