## The rule-based approach:
通过代码，人工设定语法规则：

In [18]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [1]:
# Context Free Grammer
from nltk import CFG

In [6]:
toy_grammer = CFG.fromstring(
"""
S -> NP VP
VP -> V NP
V -> 'eats' | 'drinks'
NP -> DET N
Det -> 'a' | 'an' | 'the'
N -> 'president' | 'Obama' | 'apple' | 'coke'
""")

# 一个完整的句子 =                限定词 + 名词（词组） + 动词（词组） + 限定词 + 名词（词组）
# S= NP + VP = NP + (V + NP) = (DET  +   N    ) +   (V + (        DET +    N    ))

In [7]:
toy_grammer.productions()

[S -> NP VP,
 VP -> V NP,
 V -> 'eats',
 V -> 'drinks',
 NP -> DET N,
 Det -> 'a',
 Det -> 'an',
 Det -> 'the',
 N -> 'president',
 N -> 'Obama',
 N -> 'apple',
 N -> 'coke']

### 以上给定的规则可生成的句子有：
* President eats apple
* Obama drinks coke

但同样的语法也可能生成无意义的句子：
* Apple eats coke
* President drinks Obama

# Different types of parsers (skim)

## Unit tests for the  CFG class

In [9]:
from nltk import Nonterminal, nonterminals, Production, CFG

In [10]:
nt1 = Nonterminal("NP")
nt2 = Nonterminal("VP")
nt1.symbol()

'NP'

In [11]:
nt1 == Nonterminal('NP')

True

In [12]:
nt1 == nt2

False

In [20]:
S, NP, VP, PP = nonterminals('S, NP, VP, PP')
N, V, P, DT = nonterminals('N, V, P, DT')

In [21]:
S.symbol()
VP.symbol()

'S'

'VP'

In [28]:
prod1 = Production(S, [NP, VP])
prod2 = Production(NP, [DT, NP])
prod1.lhs() #Return the left-hand side of this ``Production``.
prod2.rhs()

S

(DT, NP)

In [29]:
prod1 == Production(S, [NP, VP])
prod1 == prod2

True

False

In [31]:
grammer = CFG.fromstring("""
    S -> NP VP
    PP -> P NP
    NP -> 'the' N | PP | 'the' N PP
    VP -> V NP | V PP | V NP PP
    N -> 'cat' | 'dog' |'rug'
    V -> 'chased'| 'sat'
    P -> 'in'|'on'
    """)

In [32]:
grammar.productions()

[S -> NP VP,
 PP -> P NP,
 NP -> 'the' N,
 NP -> N PP,
 NP -> 'the' N PP,
 VP -> V NP,
 VP -> V PP,
 VP -> V NP PP,
 N -> 'cat',
 N -> 'dog',
 N -> 'rug',
 V -> 'chased',
 V -> 'sat',
 P -> 'in',
 P -> 'on']

## Recursive Descent Parser

In [35]:
from nltk.parse import RecursiveDescentParser
rd = RecursiveDescentParser(grammar)

In [36]:
# 分别解析歧义句和非歧义句
sentence1 = 'the cat chased the dog'
sentence2 = 'the cat chased the dog on the rug'
tokens1 = sentence1.split()
tokens2 = sentence2.split()

In [38]:
for t in rd.parse(tokens1):
    print(t)

(S (NP the (N cat)) (VP (V chased) (NP the (N dog))))


In [39]:
for t in rd.parse(tokens2):
    print(t)

(S
  (NP the (N cat))
  (VP (V chased) (NP the (N dog) (PP (P on) (NP the (N rug))))))
(S
  (NP the (N cat))
  (VP (V chased) (NP the (N dog)) (PP (P on) (NP the (N rug)))))


### sr (Shift Reduce Parser) 

In [40]:
from nltk.parse import ShiftReduceParser
sr = ShiftReduceParser(grammer)

In [41]:
for t in sr.parse(tokens1):
    print(t)

(S (NP the (N cat)) (VP (V chased) (NP the (N dog))))


In [42]:
for t in sr.parse(tokens2):
    print(t)     # 只会返回唯一的之前的解析

**当有多个可能的shift or reduce 操作可供选择时，sr 解析器采用heuristics 来做决策。而对于给定的语法，会选择错误的操作。**


## Chart Parser

In [43]:
import nltk

In [47]:
# First, we test tracing with a short sentence
nltk.parse.chart.demo(2, print_times=False, trace=1,sent='I saw a dog', numparses=1)

* Sentence:
I saw a dog
['I', 'saw', 'a', 'dog']

* Strategy: Bottom-up

|.    I    .   saw   .    a    .   dog   .|
|[---------]         .         .         .| [0:1] 'I'
|.         [---------]         .         .| [1:2] 'saw'
|.         .         [---------]         .| [2:3] 'a'
|.         .         .         [---------]| [3:4] 'dog'
|>         .         .         .         .| [0:0] NP -> * 'I'
|[---------]         .         .         .| [0:1] NP -> 'I' *
|>         .         .         .         .| [0:0] S  -> * NP VP
|>         .         .         .         .| [0:0] NP -> * NP PP
|[--------->         .         .         .| [0:1] S  -> NP * VP
|[--------->         .         .         .| [0:1] NP -> NP * PP
|.         >         .         .         .| [1:1] Verb -> * 'saw'
|.         [---------]         .         .| [1:2] Verb -> 'saw' *
|.         >         .         .         .| [1:1] VP -> * Verb NP
|.         >         .         .         .| [1:1] VP -> * Verb
|.         [--------->

In [48]:
# then we test the different parsing Strategies. Note that the number of edges differ between the strategies.
# Top - Down
nltk.parse.chart.demo(1,print_times=False, trace=0, sent='I saw John with a dog',numparses=2)

* Sentence:
I saw John with a dog
['I', 'saw', 'John', 'with', 'a', 'dog']

* Strategy: Top-down

Nr edges in chart: 48
(S
  (NP I)
  (VP (Verb saw) (NP (NP John) (PP with (NP (Det a) (Noun dog))))))
(S
  (NP I)
  (VP (VP (Verb saw) (NP John)) (PP with (NP (Det a) (Noun dog)))))



In [49]:
# Bottom - up
nltk.parse.chart.demo(2,print_times=False, trace=0, sent='I saw John with a dog', numparses=2)

* Sentence:
I saw John with a dog
['I', 'saw', 'John', 'with', 'a', 'dog']

* Strategy: Bottom-up

Nr edges in chart: 53
(S
  (NP I)
  (VP (VP (Verb saw) (NP John)) (PP with (NP (Det a) (Noun dog)))))
(S
  (NP I)
  (VP (Verb saw) (NP (NP John) (PP with (NP (Det a) (Noun dog))))))



## Regexp parser

In [1]:
import nltk
from nltk.chunk import regexp

In [3]:
chunk_rules = regexp.ChunkRule('<.*>+','chunk everything')

In [2]:
reg_parser = regexp.RegexpParser('''
 NP: {<DT>? <JJ>* <NN>*}
  P: {<IN>}             
  V: {<V.*>}     
 PP: {<P> <NP>}          
 VP: {<V> <NP|PP>*}  
  ''')

In [3]:
test_sent = "Mr. Obama played a big role in the Health insurance bill"
tokens = nltk.word_tokenize(test_sent)
test_sent_pos = nltk.pos_tag(tokens)
paresed_out = reg_parser.parse(test_sent_pos)

In [4]:
print(paresed_out)

(S
  Mr./NNP
  Obama/NNP
  (VP
    (V played/VBD)
    (NP a/DT big/JJ role/NN)
    (PP (P in/IN) (NP the/DT)))
  Health/NNP
  (NP insurance/NN bill/NN))


## Dependency parser 

### Stanford Parser [Very useful]

http://nlp.stanford.edu:8080/parser/index.jsp

In [5]:
from nltk.parse.stanford import StanfordParser
english_parser = StanfordParser('stanford-parser.jar','stanford-parser-3.4-models.jar')
english_parser.raw_parse_sents(("this is the english parser test"))

LookupError: Could not find stanford-parser\.jar jar file at stanford-parser.jar

## Chunking (skim)

In [4]:
import nltk
from nltk.chunk.regexp import *
test_sent = "The prime minister announced he had asked the chief government whip, Philip Ruddock, to call\
            a special party room meeting for 9am on Monday to consider the spill motion."
tokens = nltk.word_tokenize(test_sent)
test_sent_pos = nltk.pos_tag(tokens)

In [5]:
rule_vp = ChunkRule(r'(<VB.*>)?(<VB.*>)+(<PRP>)?','Chunk VPs')
parser_vp = RegexpChunkParser([rule_vp], chunk_label='VP')
print(parser_vp.parse(test_sent_pos))

(S
  The/DT
  prime/JJ
  minister/NN
  (VP announced/VBD he/PRP)
  (VP had/VBD asked/VBN)
  the/DT
  chief/JJ
  government/NN
  whip/NN
  ,/,
  Philip/NNP
  Ruddock/NNP
  ,/,
  to/TO
  (VP call/VB)
  a/DT
  special/JJ
  party/NN
  room/NN
  meeting/NN
  for/IN
  9am/CD
  on/IN
  Monday/NNP
  to/TO
  (VP consider/VB)
  the/DT
  spill/NN
  motion/NN
  ./.)


In [7]:
rule_np = ChunkRule(r'(<DT>?<RB>?)?<JJ|CD>*(<JJ|CD><,>)*(<NN.*>)+','Chunk NPs')
parser_np = RegexpChunkParser([rule_np],chunk_label="NP")
print(parser_np.parse(test_sent_pos))

(S
  (NP The/DT prime/JJ minister/NN)
  announced/VBD
  he/PRP
  had/VBD
  asked/VBN
  (NP the/DT chief/JJ government/NN whip/NN)
  ,/,
  (NP Philip/NNP Ruddock/NNP)
  ,/,
  to/TO
  call/VB
  (NP a/DT special/JJ party/NN room/NN meeting/NN)
  for/IN
  9am/CD
  on/IN
  (NP Monday/NNP)
  to/TO
  consider/VB
  (NP the/DT spill/NN motion/NN)
  ./.)


# Information extraction
![Information extraction pipeline](pipeline.png)

### Named entity Recognization

In [10]:
import nltk
from nltk.chunk import ne_chunk

In [17]:
f = open('text_data.txt')
text=f.read()
text

'Tests showed that the chemical fipronil, which can harm people\'s kidneys, liver and thyroid glands, was found in eggs from the Netherlands.\nFipronil is used to treat lice and ticks in chickens.\nOne German official said up to 10 million of the contaminated eggs may have been sold in Germany.\nChristian Meyer, the agriculture minister for Lower Saxony, told German television that there was a risk to children if they ate two of the eggs a day.\nAbout 180 poultry farms in the Netherlands have been temporarily shut in recent days while investigations are held.\nMeanwhile, European supermarkets have moved to halt the distribution of eggs from the affected batches.\nHowever, Aldi - which has close to 4,000 stores in Germany - is the first retailer to stop selling all eggs as a precaution.\n"This is merely a precaution, there is no reason to assume there are any health risks," Aldi said in a statement.\nA spokeswoman for Aldi UK told the BBC its eggs were all British and were not affected 

In [19]:
sentences = nltk.sent_tokenize(text)
sentences

["Tests showed that the chemical fipronil, which can harm people's kidneys, liver and thyroid glands, was found in eggs from the Netherlands.",
 'Fipronil is used to treat lice and ticks in chickens.',
 'One German official said up to 10 million of the contaminated eggs may have been sold in Germany.',
 'Christian Meyer, the agriculture minister for Lower Saxony, told German television that there was a risk to children if they ate two of the eggs a day.',
 'About 180 poultry farms in the Netherlands have been temporarily shut in recent days while investigations are held.',
 'Meanwhile, European supermarkets have moved to halt the distribution of eggs from the affected batches.',
 'However, Aldi - which has close to 4,000 stores in Germany - is the first retailer to stop selling all eggs as a precaution.',
 '"This is merely a precaution, there is no reason to assume there are any health risks," Aldi said in a statement.',
 'A spokeswoman for Aldi UK told the BBC its eggs were all Britis

In [23]:
tokenized_sentences = [ nltk.word_tokenize(sentence) for sentence in sentences]
tokenized_sentences[:2]

[['Tests',
  'showed',
  'that',
  'the',
  'chemical',
  'fipronil',
  ',',
  'which',
  'can',
  'harm',
  'people',
  "'s",
  'kidneys',
  ',',
  'liver',
  'and',
  'thyroid',
  'glands',
  ',',
  'was',
  'found',
  'in',
  'eggs',
  'from',
  'the',
  'Netherlands',
  '.'],
 ['Fipronil',
  'is',
  'used',
  'to',
  'treat',
  'lice',
  'and',
  'ticks',
  'in',
  'chickens',
  '.']]

In [26]:
tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
tagged_sentences[:2]

[[('Tests', 'NNS'),
  ('showed', 'VBD'),
  ('that', 'IN'),
  ('the', 'DT'),
  ('chemical', 'NN'),
  ('fipronil', 'NN'),
  (',', ','),
  ('which', 'WDT'),
  ('can', 'MD'),
  ('harm', 'VB'),
  ('people', 'NNS'),
  ("'s", 'POS'),
  ('kidneys', 'NNS'),
  (',', ','),
  ('liver', 'NN'),
  ('and', 'CC'),
  ('thyroid', 'JJ'),
  ('glands', 'NNS'),
  (',', ','),
  ('was', 'VBD'),
  ('found', 'VBN'),
  ('in', 'IN'),
  ('eggs', 'NNS'),
  ('from', 'IN'),
  ('the', 'DT'),
  ('Netherlands', 'NNP'),
  ('.', '.')],
 [('Fipronil', 'NNP'),
  ('is', 'VBZ'),
  ('used', 'VBN'),
  ('to', 'TO'),
  ('treat', 'VB'),
  ('lice', 'NN'),
  ('and', 'CC'),
  ('ticks', 'NNS'),
  ('in', 'IN'),
  ('chickens', 'NNS'),
  ('.', '.')]]

In [27]:
for sent in tagged_sentences:
    print(nltk.ne_chunk(sent))

(S
  Tests/NNS
  showed/VBD
  that/IN
  the/DT
  chemical/NN
  fipronil/NN
  ,/,
  which/WDT
  can/MD
  harm/VB
  people/NNS
  's/POS
  kidneys/NNS
  ,/,
  liver/NN
  and/CC
  thyroid/JJ
  glands/NNS
  ,/,
  was/VBD
  found/VBN
  in/IN
  eggs/NNS
  from/IN
  the/DT
  (GPE Netherlands/NNP)
  ./.)
(S
  (GPE Fipronil/NNP)
  is/VBZ
  used/VBN
  to/TO
  treat/VB
  lice/NN
  and/CC
  ticks/NNS
  in/IN
  chickens/NNS
  ./.)
(S
  One/CD
  (GPE German/JJ)
  official/NN
  said/VBD
  up/RB
  to/TO
  10/CD
  million/CD
  of/IN
  the/DT
  contaminated/VBN
  eggs/NNS
  may/MD
  have/VB
  been/VBN
  sold/VBN
  in/IN
  (GPE Germany/NNP)
  ./.)
(S
  (GPE Christian/JJ)
  (PERSON Meyer/NNP)
  ,/,
  the/DT
  agriculture/NN
  minister/NN
  for/IN
  (PERSON Lower/NNP Saxony/NNP)
  ,/,
  told/VBD
  (GPE German/JJ)
  television/NN
  that/IN
  there/EX
  was/VBD
  a/DT
  risk/NN
  to/TO
  children/NNS
  if/IN
  they/PRP
  ate/VBP
  two/CD
  of/IN
  the/DT
  eggs/NNS
  a/DT
  day/NN
  ./.)
(S
  About/IN
  180/C

## Relation extraction

In [28]:
import re

In [30]:
#组织与位置之间的关系已经被定义好了，我们要提取的是这些模式的所有组合
IN = re.compile(r'.*\bin\b(?!\b.+ing)')  # 这个pattern大致是提取 in 前后的名词
for doc in nltk.corpus.ieer.parsed_docs('NYT_19980315'):
    for rel in nltk.sem.extract_rels('ORG', 'LOC', doc, corpus='ieer', pattern=IN): # Organization and location
        print(nltk.sem.rtuple(rel))

[ORG: 'WHYY'] 'in' [LOC: 'Philadelphia']
[ORG: 'McGlashan &AMP; Sarrail'] 'firm in' [LOC: 'San Mateo']
[ORG: 'Freedom Forum'] 'in' [LOC: 'Arlington']
[ORG: 'Brookings Institution'] ', the research group in' [LOC: 'Washington']
[ORG: 'Idealab'] ', a self-described business incubator based in' [LOC: 'Los Angeles']
[ORG: 'Open Text'] ', based in' [LOC: 'Waterloo']
[ORG: 'WGBH'] 'in' [LOC: 'Boston']
[ORG: 'Bastille Opera'] 'in' [LOC: 'Paris']
[ORG: 'Omnicom'] 'in' [LOC: 'New York']
[ORG: 'DDB Needham'] 'in' [LOC: 'New York']
[ORG: 'Kaplan Thaler Group'] 'in' [LOC: 'New York']
[ORG: 'BBDO South'] 'in' [LOC: 'Atlanta']
[ORG: 'Georgia-Pacific'] 'in' [LOC: 'Atlanta']
