In [None]:
import nltk, re, pprint

## 1 Chunking:

#### Phrase chunking and tag patterns using regex:

In [6]:
# Tag pattern for an NP chunk: an optional determiner, any number of
# adjectives, then a single noun.
grammar = "NP: {<DT>?<JJ>*<NN>}"

# POS-tagged example sentence as (word, tag) pairs.
sentence = [
    ("the", "DT"),
    ("little", "JJ"),
    ("yellow", "JJ"),
    ("dog", "NN"),
    ("barked", "VBD"),
    ("at", "IN"),
    ("the", "DT"),
    ("cat", "NN"),
]

# Build the chunk parser from the grammar and chunk the sentence.
cp = nltk.RegexpParser(grammar)
result = cp.parse(sentence)
print(result)

(S
  (NP the/DT little/JJ yellow/JJ dog/NN)
  barked/VBD
  at/IN
  (NP the/DT cat/NN))


In [7]:
# Draw the chunk tree held in `result` graphically.
# NOTE(review): Tree.draw() presumably opens a separate GUI window, so this
# cell needs a display to run — confirm before executing headless.
result.draw()

#### Tag pattern combination:

In [24]:
# Tagged example sentence; it starts with a proper noun (NNP) and "her" carries
# the PP$ tag that the first grammar rule accepts in the determiner slot.
sentence = [
    ("Rapunzel", "NNP"),
    ("let", "VBD"),
    ("down", "RP"),
    ("her", "PP$"),
    ("long", "JJ"),
    ("golden", "JJ"),
    ("hair", "NN"),
]

# Two tag patterns listed under the same NP label.
grammar = r"""
    NP: {<DT|PP\$>?<JJ>*<NN.*>} # determiner/ possesive, adjective and noun
        {<NNP>+}                # proper nouns
"""

cp = nltk.RegexpParser(grammar)
result = cp.parse(sentence)

In [25]:
# Draw the chunk tree held in `result` graphically.
# NOTE(review): Tree.draw() presumably opens a separate GUI window, so this
# cell needs a display to run — confirm before executing headless.
result.draw()

##### Overlapping matches:

In [162]:
# The pattern consumes exactly two nouns per chunk.
grammar = "NP: {<NN>{2}}  # Chunk two consecutive nouns"
cp = nltk.RegexpParser(grammar)

# Three nouns in a row: two overlapping two-noun matches are possible, but
# only one of them can become a chunk.
nouns = [
    ("money", "NN"),
    ("market", "NN"),
    ("fund", "NN"),
]
print(cp.parse(nouns))

(S (NP money/NN market/NN) fund/NN)


#### Exploring text corpora:

Search text corpora using a tag pattern:

In [158]:
# Tag pattern: a verb tag, a TO tag, then another verb tag.
grammar = 'CHUNK: {<V.*> <TO> <V.*>}'

brown = nltk.corpus.brown
cp = nltk.RegexpParser(grammar)

# Chunk the first 50 tagged Brown sentences and print every CHUNK subtree.
for tagged_sent in brown.tagged_sents()[:50]:
    tree = cp.parse(tagged_sent)
    for chunk in tree.subtrees(filter=lambda t: t.label() == 'CHUNK'):
        print(chunk)

(CHUNK combined/VBN to/TO achieve/VB)
(CHUNK continue/VB to/TO place/VB)
(CHUNK serve/VB to/TO protect/VB)
(CHUNK wanted/VBD to/TO wait/VB)


In [170]:
def find_chunks(grammar, tagged_sents):
    """Print every chunk that ``grammar`` finds in ``tagged_sents``.

    Args:
        grammar: Chunk grammar string of the form ``"LABEL: {<pattern>}"``.
            The text before the first ``:`` is used as the chunk label to
            report; surrounding whitespace (such as the leading newline of a
            triple-quoted grammar) is ignored.
        tagged_sents: Iterable of sentences, each a sequence of
            ``(word, tag)`` pairs.

    Returns:
        None. Each matching subtree is printed.
    """
    # Derive the label once, outside the loops, and strip it: without the strip
    # a multi-line grammar gives a label with leading whitespace that never
    # equals the label stored on the parsed subtrees, so nothing is printed.
    chunk_label = grammar.split(':')[0].strip()
    cp = nltk.RegexpParser(grammar)
    for sent in tagged_sents:
        tree = cp.parse(sent)
        for subtree in tree.subtrees():
            if subtree.label() == chunk_label:
                print(subtree)
                
# Pattern: a run of four or more tags that start with "N".
grammar = "NOUNS: {<N.*>{4,}}"

# `brown` is bound in an earlier cell (nltk.corpus.brown).
tagged_sents = brown.tagged_sents()[:50]

find_chunks(grammar, tagged_sents)

(NOUNS Court/NN-TL Judge/NN-TL Durwood/NP Pye/NP)
(NOUNS Mayor-nominate/NN-TL Ivan/NP Allen/NP Jr./NP)
(NOUNS Georgia's/NP$ automobile/NN title/NN law/NN)
(NOUNS State/NN-TL Welfare/NN-TL Department's/NN$-TL handling/NN)
(NOUNS Fulton/NP-TL Tax/NN-TL Commissioner's/NN$-TL Office/NN-TL)
(NOUNS Mayor/NN-TL William/NP B./NP Hartsfield/NP)
(NOUNS Mrs./NP J./NP M./NP Cheshire/NP)
(NOUNS E./NP Pelham/NP Rd./NN-TL Aj/NN)
(NOUNS
  State/NN-TL
  Party/NN-TL
  Chairman/NN-TL
  James/NP
  W./NP
  Dorsey/NP)
(NOUNS Texas/NP Sen./NN-TL John/NP Tower/NP)


#### Chinking:

In [171]:
# Same tagged sentence as in the first chunking example.
sentence = [
    ("the", "DT"),
    ("little", "JJ"),
    ("yellow", "JJ"),
    ("dog", "NN"),
    ("barked", "VBD"),
    ("at", "IN"),
    ("the", "DT"),
    ("cat", "NN"),
]

# First chunk the whole sentence, then chink (remove) VBD/IN runs from it.
grammar = r"""
    NP:
        {<.*>+}       # Chunk everything
        }<VBD|IN>+{   # Chink sequences of VBD and IN
"""

cp = nltk.RegexpParser(grammar)
print(cp.parse(sentence))

(S
  (NP the/DT little/JJ yellow/JJ dog/NN)
  barked/VBD
  at/IN
  (NP the/DT cat/NN))


## 2 Developing and evaluating chunkers:

In [172]:
# Sample sentence in a three-column, one-token-per-line layout:
# word, POS tag, chunk tag. Chunk tags use a B-/I- prefix plus the chunk
# type (NP, VP, PP here), or a bare O.
text = '''
he PRP B-NP
accepted VBD B-VP
the DT B-NP
position NN I-NP
of IN B-PP
vice NN B-NP
chairman NN I-NP
of IN B-PP
Carlyle NNP B-NP
Group NNP I-NP
, , O
a DT B-NP
merchant NN I-NP
banking NN I-NP
concern NN I-NP
. . O
'''

In [174]:
# Convert the column-formatted `text` into a chunk tree and draw it.
# Only 'NP' and 'PP' are passed as chunk_types, although `text` also has a B-VP tag.
# NOTE(review): .draw() presumably opens a separate GUI window — confirm.
nltk.chunk.conllstr2tree(text, chunk_types=['NP', 'PP']).draw()

#### Simple evaluation and baselines:

In [179]:
# NOTE(review): this import sits mid-notebook; consider moving it to the top import cell.
from nltk.corpus import conll2000
cp = nltk.RegexpParser("") # baseline: an empty grammar has no chunk rules, so no NP chunks are proposed
# Gold-standard test sentences, loaded with NP as the only chunk type.
test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
print(cp.evaluate(test_sents))

ChunkParse score:
    IOB Accuracy:  43.4%%
    Precision:      0.0%%
    Recall:         0.0%%
    F-Measure:      0.0%%


In [182]:
# Naive NP chunker: one or more consecutive tags whose first letter is
# C, D, J, N or P (the character class [CDJNP] followed by anything).
grammar = r"NP: {<[CDJNP].*>+}"
cp = nltk.RegexpParser(grammar)
# Score against `test_sents`, which is loaded in an earlier cell.
print(cp.evaluate(test_sents))

ChunkParse score:
    IOB Accuracy:  87.7%%
    Precision:     70.6%%
    Recall:        67.8%%
    F-Measure:     69.2%%


In [244]:
class UnigramChunker(nltk.ChunkParserI):
    """Chunker backed by an ``nltk.UnigramTagger`` over POS tags.

    The tagger is trained on (POS tag, chunk tag) pairs, so at parse time each
    token's chunk tag is looked up from its POS tag alone; the words themselves
    are not used for the decision.
    """

    def __init__(self, train_sents):
        """Train the tagger on chunked sentences.

        Args:
            train_sents: Iterable of chunk trees that
                ``nltk.chunk.tree2conlltags`` can convert to
                (word, POS tag, chunk tag) triples.
        """
        train_data = []
        for chunk_tree in train_sents:
            triples = nltk.chunk.tree2conlltags(chunk_tree)
            # Drop the word: the tagger learns POS tag -> chunk tag.
            train_data.append([(pos, iob) for _word, pos, iob in triples])
        self.tagger = nltk.UnigramTagger(train_data)

    def parse(self, sentence):
        """Chunk a POS-tagged sentence.

        Args:
            sentence: Sequence of (word, POS tag) pairs; it is iterated twice.

        Returns:
            The tree built by ``nltk.chunk.conlltags2tree`` from the
            (word, POS tag, chunk tag) triples.
        """
        pos_sequence = [pos for _word, pos in sentence]
        iob_sequence = [iob for _pos, iob in self.tagger.tag(pos_sequence)]
        conlltags = []
        for (word, pos), iob in zip(sentence, iob_sequence):
            conlltags.append((word, pos, iob))
        return nltk.chunk.conlltags2tree(conlltags)

In [245]:
# Load the NP-chunked training and test splits of conll2000.
train_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])
test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])

# Train on the training split, then score on the test split.
# The variable name's spelling is kept as-is because a later cell refers to it.
unigram_chuncker = UnigramChunker(train_sents)
print(unigram_chuncker.evaluate(test_sents))

ChunkParse score:
    IOB Accuracy:  92.9%%
    Precision:     79.9%%
    Recall:        86.8%%
    F-Measure:     83.2%%


In [255]:
# Chunk one tagged sentence and show it as (word, POS tag, chunk tag) triples.
# NOTE(review): the chunker was trained on conll2000 sentences, while this
# sentence comes from the Brown corpus and carries its tags (e.g. AT, NP-TL).
# POS tags unseen in training presumably end up as 'O', which would explain the
# mostly-'O' result — confirm whether a conll2000 sentence was intended here.
sentence = nltk.corpus.brown.tagged_sents()[0]
nltk.chunk.tree2conlltags(unigram_chuncker.parse(sentence))

[('The', 'AT', 'O'),
 ('Fulton', 'NP-TL', 'O'),
 ('County', 'NN-TL', 'O'),
 ('Grand', 'JJ-TL', 'O'),
 ('Jury', 'NN-TL', 'O'),
 ('said', 'VBD', 'O'),
 ('Friday', 'NR', 'O'),
 ('an', 'AT', 'O'),
 ('investigation', 'NN', 'B-NP'),
 ('of', 'IN', 'O'),
 ("Atlanta's", 'NP$', 'O'),
 ('recent', 'JJ', 'B-NP'),
 ('primary', 'NN', 'I-NP'),
 ('election', 'NN', 'I-NP'),
 ('produced', 'VBD', 'O'),
 ('``', '``', 'O'),
 ('no', 'AT', 'O'),
 ('evidence', 'NN', 'B-NP'),
 ("''", "''", 'O'),
 ('that', 'CS', 'O'),
 ('any', 'DTI', 'O'),
 ('irregularities', 'NNS', 'B-NP'),
 ('took', 'VBD', 'O'),
 ('place', 'NN', 'B-NP'),
 ('.', '.', 'O')]