## The rule-based approach:
通过代码，人工设定语法规则：

In [18]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [1]:
# Context Free Grammer
from nltk import CFG

In [6]:
toy_grammer = CFG.fromstring(
"""
S -> NP VP
VP -> V NP
V -> 'eats' | 'drinks'
NP -> DET N
Det -> 'a' | 'an' | 'the'
N -> 'president' | 'Obama' | 'apple' | 'coke'
""")

# 一个完整的句子 =                限定词 + 名词（词组） + 动词（词组） + 限定词 + 名词（词组）
# S= NP + VP = NP + (V + NP) = (DET  +   N    ) +   (V + (        DET +    N    ))

In [7]:
toy_grammer.productions()

[S -> NP VP,
 VP -> V NP,
 V -> 'eats',
 V -> 'drinks',
 NP -> DET N,
 Det -> 'a',
 Det -> 'an',
 Det -> 'the',
 N -> 'president',
 N -> 'Obama',
 N -> 'apple',
 N -> 'coke']

### 以上给定的规则可生成的句子有：
* President eats apple
* Obama drinks coke

但同样的语法也可能生成无意义的句子：
* Apple eats coke
* President drinks Obama

# Different types of parsers (skim)

## Unit tests for the  CFG class

In [9]:
from nltk import Nonterminal, nonterminals, Production, CFG

In [10]:
nt1 = Nonterminal("NP")
nt2 = Nonterminal("VP")
nt1.symbol()

'NP'

In [11]:
nt1 == Nonterminal('NP')

True

In [12]:
nt1 == nt2

False

In [20]:
S, NP, VP, PP = nonterminals('S, NP, VP, PP')
N, V, P, DT = nonterminals('N, V, P, DT')

In [21]:
S.symbol()
VP.symbol()

'S'

'VP'

In [28]:
prod1 = Production(S, [NP, VP])
prod2 = Production(NP, [DT, NP])
prod1.lhs() #Return the left-hand side of this ``Production``.
prod2.rhs()

S

(DT, NP)

In [29]:
prod1 == Production(S, [NP, VP])
prod1 == prod2

True

False

In [31]:
grammer = CFG.fromstring("""
    S -> NP VP
    PP -> P NP
    NP -> 'the' N | PP | 'the' N PP
    VP -> V NP | V PP | V NP PP
    N -> 'cat' | 'dog' |'rug'
    V -> 'chased'| 'sat'
    P -> 'in'|'on'
    """)

In [32]:
grammar.productions()

[S -> NP VP,
 PP -> P NP,
 NP -> 'the' N,
 NP -> N PP,
 NP -> 'the' N PP,
 VP -> V NP,
 VP -> V PP,
 VP -> V NP PP,
 N -> 'cat',
 N -> 'dog',
 N -> 'rug',
 V -> 'chased',
 V -> 'sat',
 P -> 'in',
 P -> 'on']

## Recursive Descent Parser

In [35]:
from nltk.parse import RecursiveDescentParser
rd = RecursiveDescentParser(grammar)

In [36]:
# 分别解析歧义句和非歧义句
sentence1 = 'the cat chased the dog'
sentence2 = 'the cat chased the dog on the rug'
tokens1 = sentence1.split()
tokens2 = sentence2.split()

In [38]:
for t in rd.parse(tokens1):
    print(t)

(S (NP the (N cat)) (VP (V chased) (NP the (N dog))))


In [39]:
for t in rd.parse(tokens2):
    print(t)

(S
  (NP the (N cat))
  (VP (V chased) (NP the (N dog) (PP (P on) (NP the (N rug))))))
(S
  (NP the (N cat))
  (VP (V chased) (NP the (N dog)) (PP (P on) (NP the (N rug)))))


### sr (Shift Reduce Parser) 

In [40]:
from nltk.parse import ShiftReduceParser
sr = ShiftReduceParser(grammer)

In [41]:
for t in sr.parse(tokens1):
    print(t)

(S (NP the (N cat)) (VP (V chased) (NP the (N dog))))


In [42]:
for t in sr.parse(tokens2):
    print(t)     # 只会返回唯一的之前的解析

**当有多个可能的shift or reduce 操作可供选择时，sr 解析器采用heuristics 来做决策。而对于给定的语法，会选择错误的操作。**


## Chart Parser

In [43]:
import nltk

In [47]:
# First, we test tracing with a short sentence
nltk.parse.chart.demo(2, print_times=False, trace=1,sent='I saw a dog', numparses=1)

* Sentence:
I saw a dog
['I', 'saw', 'a', 'dog']

* Strategy: Bottom-up

|.    I    .   saw   .    a    .   dog   .|
|[---------]         .         .         .| [0:1] 'I'
|.         [---------]         .         .| [1:2] 'saw'
|.         .         [---------]         .| [2:3] 'a'
|.         .         .         [---------]| [3:4] 'dog'
|>         .         .         .         .| [0:0] NP -> * 'I'
|[---------]         .         .         .| [0:1] NP -> 'I' *
|>         .         .         .         .| [0:0] S  -> * NP VP
|>         .         .         .         .| [0:0] NP -> * NP PP
|[--------->         .         .         .| [0:1] S  -> NP * VP
|[--------->         .         .         .| [0:1] NP -> NP * PP
|.         >         .         .         .| [1:1] Verb -> * 'saw'
|.         [---------]         .         .| [1:2] Verb -> 'saw' *
|.         >         .         .         .| [1:1] VP -> * Verb NP
|.         >         .         .         .| [1:1] VP -> * Verb
|.         [--------->

In [48]:
# then we test the different parsing Strategies. Note that the number of edges differ between the strategies.
# Top - Down
nltk.parse.chart.demo(1,print_times=False, trace=0, sent='I saw John with a dog',numparses=2)

* Sentence:
I saw John with a dog
['I', 'saw', 'John', 'with', 'a', 'dog']

* Strategy: Top-down

Nr edges in chart: 48
(S
  (NP I)
  (VP (Verb saw) (NP (NP John) (PP with (NP (Det a) (Noun dog))))))
(S
  (NP I)
  (VP (VP (Verb saw) (NP John)) (PP with (NP (Det a) (Noun dog)))))



In [49]:
# Bottom - up
nltk.parse.chart.demo(2,print_times=False, trace=0, sent='I saw John with a dog', numparses=2)

* Sentence:
I saw John with a dog
['I', 'saw', 'John', 'with', 'a', 'dog']

* Strategy: Bottom-up

Nr edges in chart: 53
(S
  (NP I)
  (VP (VP (Verb saw) (NP John)) (PP with (NP (Det a) (Noun dog)))))
(S
  (NP I)
  (VP (Verb saw) (NP (NP John) (PP with (NP (Det a) (Noun dog))))))



## Regexp parser

In [1]:
import nltk
from nltk.chunk import regexp

In [3]:
chunk_rules = regexp.ChunkRule('<.*>+','chunk everything')

In [2]:
reg_parser = regexp.RegexpParser('''
 NP: {<DT>? <JJ>* <NN>*}
  P: {<IN>}             
  V: {<V.*>}     
 PP: {<P> <NP>}          
 VP: {<V> <NP|PP>*}  
  ''')

In [3]:
test_sent = "Mr. Obama played a big role in the Health insurance bill"
tokens = nltk.word_tokenize(test_sent)
test_sent_pos = nltk.pos_tag(tokens)
paresed_out = reg_parser.parse(test_sent_pos)

In [4]:
print(paresed_out)

(S
  Mr./NNP
  Obama/NNP
  (VP
    (V played/VBD)
    (NP a/DT big/JJ role/NN)
    (PP (P in/IN) (NP the/DT)))
  Health/NNP
  (NP insurance/NN bill/NN))


## Dependency parser 

### Stanford Parser [Very useful]

http://nlp.stanford.edu:8080/parser/index.jsp

In [5]:
from nltk.parse.stanford import StanfordParser
english_parser = StanfordParser('stanford-parser.jar','stanford-parser-3.4-models.jar')
english_parser.raw_parse_sents(("this is the english parser test"))

LookupError: Could not find stanford-parser\.jar jar file at stanford-parser.jar