In [1]:
import nltk
import re

In [4]:
# CFG for the example sentence: both NP (Det N PP) and VP (VP PP) have a rule that takes a PP, so a trailing PP can attach at either level.
>>> groucho_grammar = nltk.CFG.fromstring( """ 
... S -> NP VP 
... PP -> P NP 
... NP -> Det N | Det N PP | 'I' 
... VP -> V NP | VP PP 
... Det -> 'an' | 'my' 
... N -> 'elephant' | 'pajamas' 
... V -> 'shot' 
... P -> 'in' 
... """ )

In [5]:
# Token list for the example sentence; text[1] is 'shot'.
text = ['I', 'shot', 'an', 'elephant', 'in', 'my', 'pajamas']
# Last expression of the cell: the productions the grammar returns for that token.
groucho_grammar.productions(rhs=text[1])

[V -> 'shot']

In [14]:
def init_wfst(tokens, grammar):
    """Create the initial well-formed substring table (WFST) for ``tokens``.

    The table is a square list of lists with ``len(tokens) + 1`` rows and
    columns, initially filled with ``None``.  For every position ``i`` the
    cell ``[i][i + 1]`` receives the left-hand side of the first production
    returned by ``grammar.productions(rhs=tokens[i])``.

    Only the first production returned for a token is used; any further
    ones are ignored.  If no production is returned for a token, taking the
    first one fails (``IndexError`` when the result is an empty list).

    :param tokens: sequence of input tokens.
    :param grammar: object exposing ``productions(rhs=...)`` whose results
        expose ``lhs()``.
    :return: the newly built table.
    """
    size = len(tokens) + 1
    # Each row is built separately so the rows never share storage.
    table = [[None] * size for _ in range(size)]
    for pos, word in enumerate(tokens):
        lexical = grammar.productions(rhs=word)
        table[pos][pos + 1] = lexical[0].lhs()
    return table

def complete_wfst(wfst, tokens, grammar, trace=False):
    """Fill in the WFST cells that cover spans of two or more tokens.

    For every span width from 2 up to ``len(tokens)`` and every split point
    inside a span, the categories of the two sub-spans are looked up as a
    pair among the grammar's right-hand sides; on a match the span's cell is
    set to the corresponding left-hand side.  When several split points
    match, the last one examined determines the cell's final value.

    The input table is not modified: the work is done on a row-by-row copy,
    which is returned.  This keeps the initial table usable (for example,
    for re-display) after the completed one has been computed.

    Limitation: right-hand sides are indexed in a plain dict, so when two
    productions share the same right-hand side only the later one is kept.

    :param wfst: table as produced by ``init_wfst`` (list of row lists).
    :param tokens: the input tokens; only their count is used here.
    :param grammar: object exposing ``productions()`` whose results expose
        ``rhs()`` and ``lhs()``.
    :param trace: when true, print one line per cell assignment.
    :return: a new table with the longer spans filled in.
    """
    # Right-hand side -> left-hand side; later duplicates overwrite earlier ones.
    index = dict((p.rhs(), p.lhs()) for p in grammar.productions())
    numtokens = len(tokens)
    # Copy each row so the caller's table keeps its original contents.
    chart = [list(row) for row in wfst]
    for span in range(2, numtokens + 1):
        for start in range(numtokens + 1 - span):
            end = start + span
            for mid in range(start + 1, end):
                nt1, nt2 = chart[start][mid], chart[mid][end]
                if nt1 and nt2 and (nt1, nt2) in index:
                    chart[start][end] = index[(nt1, nt2)]
                    if trace:
                        print("[%s] %3s [%s] %3s [%s] ==> [%s] %3s [%s]" %
                              (start, nt1, mid, nt2, end, start, index[(nt1, nt2)], end))
    return chart

def display(wfst, tokens):
    """Print the WFST as a text grid.

    A blank line and a header row with the column numbers ``1 .. len(wfst)-1``
    come first.  Each following line shows one row index (``0 .. len(wfst)-2``)
    and that row's cells for those columns, each left-justified to width 4;
    empty (falsy) cells are shown as ``'.'``.

    :param wfst: square table (list of row lists).
    :param tokens: not used by this function; kept for the call signature.
    :return: None; the grid is written to standard output.
    """
    size = len(wfst)
    header = ' '.join("%-4d" % col for col in range(1, size))
    print('\nWFST ' + header)
    for row in range(size - 1):
        print("%d " % row, end=" ")
        for col in range(1, size):
            cell = wfst[row][col] or '.'
            print("%-4s" % cell, end=" ")
        print()

# Split the example sentence into tokens, seed the table with the
# categories of the single words, and show that initial table.
tokens = "I shot an elephant in my pajamas".split()
wfst0 = init_wfst(tokens, groucho_grammar)
display(wfst0, tokens)


WFST 1    2    3    4    5    6    7   
0  NP   .    .    .    .    .    .    
1  .    V    .    .    .    .    .    
2  .    .    Det  .    .    .    .    
3  .    .    .    N    .    .    .    
4  .    .    .    .    P    .    .    
5  .    .    .    .    .    Det  .    
6  .    .    .    .    .    .    N    


In [15]:
# Fill in the cells for the multi-token spans, then render the result.
wfst1 = complete_wfst(wfst0, tokens, groucho_grammar)
display(wfst1, tokens)


WFST 1    2    3    4    5    6    7   
0  NP   .    .    S    .    .    S    
1  .    V    .    VP   .    .    VP   
2  .    .    Det  NP   .    .    .    
3  .    .    .    N    .    .    .    
4  .    .    .    .    P    .    PP   
5  .    .    .    .    .    Det  NP   
6  .    .    .    .    .    .    N    


In [17]:
# Dependency grammar over the words of the example sentence; 'in' is listed under both 'shot' and 'elephant'.
>>> groucho_dep_grammar = nltk.DependencyGrammar.fromstring( """ 
... 'shot' -> 'I' | 'elephant' | 'in' 
... 'elephant' -> 'an' | 'in' 
... 'in' -> 'pajamas' 
... 'pajamas' -> 'my' 
... """ )
>>> print (groucho_dep_grammar)

Dependency grammar with 7 productions
  'shot' -> 'I'
  'shot' -> 'elephant'
  'shot' -> 'in'
  'elephant' -> 'an'
  'elephant' -> 'in'
  'in' -> 'pajamas'
  'pajamas' -> 'my'


In [18]:
from nltk.corpus import treebank

# Take the first parsed sentence of the 'wsj_0001.mrg' file and print it.
t = treebank.parsed_sents('wsj_0001.mrg')[0]
print(t)

(S
  (NP-SBJ
    (NP (NNP Pierre) (NNP Vinken))
    (, ,)
    (ADJP (NP (CD 61) (NNS years)) (JJ old))
    (, ,))
  (VP
    (MD will)
    (VP
      (VB join)
      (NP (DT the) (NN board))
      (PP-CLR (IN as) (NP (DT a) (JJ nonexecutive) (NN director)))
      (NP-TMP (NNP Nov.) (CD 29))))
  (. .))


In [19]:
def filter(tree):
    """Tell whether ``tree`` is a 'VP' node with an 'S' subtree as a direct child.

    Only children that are ``nltk.Tree`` instances are considered; other
    children (such as plain strings) are ignored.

    NOTE: once this cell has run, the name ``filter`` refers to this
    function and no longer to the builtin ``filter`` in this namespace.

    :param tree: a tree-like node exposing ``label()`` and iteration over
        its children.
    :return: True when the node's label is 'VP' and at least one child
        subtree is labelled 'S', otherwise False.
    """
    subtree_labels = []
    for node in tree:
        if isinstance(node, nltk.Tree):
            subtree_labels.append(node.label())
    is_vp = tree.label() == 'VP'
    return is_vp and 'S' in subtree_labels

In [20]:
from collections import defaultdict

# table maps "noun1-prep-noun2" -> attachment label -> set of verbs.
entries = nltk.corpus.ppattach.attachments('training')
table = defaultdict(lambda: defaultdict(set))
for entry in entries:
    key = '-'.join((entry.noun1, entry.prep, entry.noun2))
    table[key][entry.attachment].add(entry.verb)

# Report only the keys that were recorded with more than one attachment label.
for key in sorted(table):
    if len(table[key]) <= 1:
        continue
    print(key, 'N:', sorted(table[key]['N']), 'V:', sorted(table[key]['V']))

%-below-level N: ['left'] V: ['be']
%-from-year N: ['was'] V: ['declined', 'dropped', 'fell', 'grew', 'increased', 'plunged', 'rose', 'was']
%-in-August N: ['was'] V: ['climbed', 'fell', 'leaping', 'rising', 'rose']
%-in-September N: ['increased'] V: ['climbed', 'declined', 'dropped', 'edged', 'fell', 'grew', 'plunged', 'rose', 'slipped']
%-in-week N: ['declined'] V: ['was']
%-to-% N: ['add', 'added', 'backed', 'be', 'cut', 'go', 'grow', 'increased', 'increasing', 'is', 'offer', 'plummet', 'reduce', 'rejected', 'rise', 'risen', 'shaved', 'wants', 'yield', 'zapping'] V: ['fell', 'rise', 'slipped']
%-to-million N: ['declining'] V: ['advanced', 'climbed', 'cutting', 'declined', 'declining', 'dived', 'dropped', 'edged', 'fell', 'gained', 'grew', 'increased', 'jump', 'jumped', 'plunged', 'rising', 'rose', 'slid', 'slipped', 'soared', 'tumbled']
1-to-21 N: ['dropped'] V: ['dropped']
1-to-33 N: ['gained'] V: ['dropped', 'fell', 'jumped']
1-to-4 N: ['added'] V: ['gained']
1-to-47 N: ['jumped']

In [21]:
# Last expression of the cell: the parsed sentences of the Sinica Treebank corpus reader.
nltk.corpus.sinica_treebank.parsed_sents()

[Tree('NP', [Tree('Neu', ['一'])]), Tree('NP', [Tree('Nad', ['友情'])]), ...]