In [1]:
# Exercise: 7-2
# Write a tag pattern to match noun phrases containing plural head nouns, e.g., many/JJ researchers/NNS, two/CD
# weeks/NNS, both/DT new/JJ positions/NNS. Try to do this by generalizing the tag pattern that handled singular
# noun phrases.

import nltk

grammar = r"""
    NP: {<DT>?<CD|JJ><NN.>}
"""
cp = nltk.RegexpParser(grammar)

In [2]:
nps =[[("many", "JJ"), ("researchers", "NNS")],
      [("two", "CD"), ("weeks", "NNS")],
      [("both", "DT"), ("new", "JJ"), ("positions", "NNS")]]

for n in nps:
    print(cp.parse(n))

(S (NP many/JJ researchers/NNS))
(S (NP two/CD weeks/NNS))
(S (NP both/DT new/JJ positions/NNS))


In [3]:
# Exercise: 7-3
# Pick one of the three chunk types in the CoNLL-2000 Chunking Corpus. Inspect the data and try to observe any
# patterns in the POS tag sequences that make up this kind of chunk. Develop a simple chunker using the regular
# expression chunker nltk.RegexpParser. Discuss any tag sequences that are difficult to chunk reliably.

from nltk.corpus import conll2000
for i in range(20):
    print(i, conll2000.chunked_sents('train.txt', chunk_types = ['VP'])[i])

0 (S
  Confidence/NN
  in/IN
  the/DT
  pound/NN
  (VP is/VBZ widely/RB expected/VBN to/TO take/VB)
  another/DT
  sharp/JJ
  dive/NN
  if/IN
  trade/NN
  figures/NNS
  for/IN
  September/NNP
  ,/,
  due/JJ
  for/IN
  release/NN
  tomorrow/NN
  ,/,
  (VP fail/VB to/TO show/VB)
  a/DT
  substantial/JJ
  improvement/NN
  from/IN
  July/NNP
  and/CC
  August/NNP
  's/POS
  near-record/JJ
  deficits/NNS
  ./.)
1 (S
  Chancellor/NNP
  of/IN
  the/DT
  Exchequer/NNP
  Nigel/NNP
  Lawson/NNP
  's/POS
  restated/VBN
  commitment/NN
  to/TO
  a/DT
  firm/NN
  monetary/JJ
  policy/NN
  (VP has/VBZ helped/VBN to/TO prevent/VB)
  a/DT
  freefall/NN
  in/IN
  sterling/NN
  over/IN
  the/DT
  past/JJ
  week/NN
  ./.)
2 (S
  But/CC
  analysts/NNS
  (VP reckon/VBP)
  underlying/VBG
  support/NN
  for/IN
  sterling/NN
  (VP has/VBZ been/VBN eroded/VBN)
  by/IN
  the/DT
  chancellor/NN
  's/POS
  failure/NN
  (VP to/TO announce/VB)
  any/DT
  new/JJ
  policy/NN
  measures/NNS
  in/IN
  his/PRP$
  Mansio

In [4]:
grammar = r"VP: {<[VRMT].*>+}"
cp = nltk.RegexpParser(grammar)
test_sents = conll2000.chunked_sents('test.txt', chunk_types = ['VP'])
print(cp.evaluate(test_sents))

ChunkParse score:
    IOB Accuracy:  94.3%%
    Precision:     64.2%%
    Recall:        80.4%%
    F-Measure:     71.4%%


In [5]:
grammar = r"VP: {<VB.>?<RB>*<MD>?<VB.>?<TO>?<VB.>}"
cp = nltk.RegexpParser(grammar)
test_sents = conll2000.chunked_sents('test.txt', chunk_types = ['VP'])
print(cp.evaluate(test_sents))

ChunkParse score:
    IOB Accuracy:  92.3%%
    Precision:     74.4%%
    Recall:        66.8%%
    F-Measure:     70.4%%


In [6]:
grammar = r"VP: {<VB.>?<RB.>*<MD>?<VB.>?<TO>?<VB.>}"
cp = nltk.RegexpParser(grammar)
test_sents = conll2000.chunked_sents('test.txt', chunk_types = ['VP'])
print(cp.evaluate(test_sents))

ChunkParse score:
    IOB Accuracy:  92.4%%
    Precision:     72.7%%
    Recall:        66.9%%
    F-Measure:     69.7%%


In [7]:
grammar = r"VP: {<VB.>?<RB>*<MD>?<VB.>?<TO>?<MD>?<RB>*<VB.>}"
cp = nltk.RegexpParser(grammar)
test_sents = conll2000.chunked_sents('test.txt', chunk_types = ['VP'])
print(cp.evaluate(test_sents))

ChunkParse score:
    IOB Accuracy:  92.3%%
    Precision:     74.6%%
    Recall:        66.9%%
    F-Measure:     70.5%%


In [8]:
class UnigramChunker(nltk.ChunkParserI):
    def __init__(self, train_sents):
        train_data = [[(t,c) for w,t,c in nltk.chunk.tree2conlltags(sent)]
                      for sent in train_sents]
        self.tagger = nltk.UnigramTagger(train_data)

    def parse(self, sentence):
        pos_tags = [pos for (word,pos) in sentence]
        tagged_pos_tags = self.tagger.tag(pos_tags)
        chunktags = [chunktag for (pos, chunktag) in tagged_pos_tags]
        conlltags = [(word, pos, chunktag) for ((word,pos),chunktag)
                     in zip(sentence, chunktags)]
        return nltk.chunk.conlltags2tree(conlltags)

test_sents = conll2000.chunked_sents('test.txt', chunk_types = ['VP'])
train_sents = conll2000.chunked_sents('train.txt', chunk_types = ['VP'])
unigram_chunker = UnigramChunker(train_sents)
print(unigram_chunker.evaluate(test_sents))

ChunkParse score:
    IOB Accuracy:  94.3%%
    Precision:     60.5%%
    Recall:        74.2%%
    F-Measure:     66.7%%


In [9]:
class BigramChunker(nltk.ChunkParserI):
    def __init__(self, train_sents):
        train_data = [[(t,c) for w,t,c in nltk.chunk.tree2conlltags(sent)]
                      for sent in train_sents]
        self.tagger = nltk.BigramTagger(train_data)

    def parse(self, sentence):
        pos_tags = [pos for (word,pos) in sentence]
        tagged_pos_tags = self.tagger.tag(pos_tags)
        chunktags = [chunktag for (pos, chunktag) in tagged_pos_tags]
        conlltags = [(word, pos, chunktag) for ((word,pos),chunktag)
                     in zip(sentence, chunktags)]
        return nltk.chunk.conlltags2tree(conlltags)

bigram_chunker = BigramChunker(train_sents)
print(bigram_chunker.evaluate(test_sents))

ChunkParse score:
    IOB Accuracy:  96.5%%
    Precision:     75.1%%
    Recall:        85.7%%
    F-Measure:     80.0%%


In [10]:
# Exercise: 7-4
# An early definition of chunk was the material that occurs between chinks. Develop a chunker that starts by
# putting the whole sentence in a single chunk, and then does the rest of its work solely by chinking. Determine
# which tags (or tag sequences) are most likely to make up chinks with the help of your own utility program.
# Compare the performance and simplicity of this approach relative to a chunker based entirely on chunk rules.

import nltk, re
from nltk.corpus import conll2000

grammar = r"""
  NP:
    {<.*>+}
    }<VB.|IN>+{
  """

cp = nltk.RegexpParser(grammar)
test_sents = conll2000.chunked_sents('test.txt', chunk_types = ['NP'])
print(cp.evaluate(test_sents))

ChunkParse score:
    IOB Accuracy:  65.5%%
    Precision:     32.6%%
    Recall:        26.0%%
    F-Measure:     28.9%%


In [11]:
cp.evaluate(test_sents).incorrect()[20:40]

[ImmutableTree('NP', [('$', '$'), ('47', 'CD'), (',', ',')]),
 ImmutableTree('NP', [('well', 'RB')]),
 ImmutableTree('NP', [('--', ':'), ('Bruce', 'NNP'), ('Kafaroff', 'NNP'), ('.', '.')]),
 ImmutableTree('NP', [('the', 'DT'), ('magazine', 'NN'), ('.', '.')]),
 ImmutableTree('NP', [('1', 'CD'), ('3\\/8', 'CD'), ('to', 'TO'), ('58', 'CD'), ('1\\/2', 'CD'), (';', ':'), ('Coca-Cola', 'NNP'), ('Co.', 'NNP')]),
 ImmutableTree('NP', [('a', 'DT'), ('moderate', 'JJ'), ('3.5', 'CD'), ('million', 'CD'), ('ounces', 'NNS'), ('.', '.')]),
 ImmutableTree('NP', [('an', 'DT')]),
 ImmutableTree('NP', [('medical', 'JJ'), ('therapy', 'NN'), ('.', '.')]),
 ImmutableTree('NP', [(',', ',')]),
 ImmutableTree('NP', [('America', 'NNP'), ("'s", 'POS'), ('long', 'JJ'), ('history', 'NN')]),
 ImmutableTree('NP', [('investors', 'NNS'), ('to', 'TO'), ('focus', 'VB')]),
 ImmutableTree('NP', [('mostly', 'RB'), ('to', 'TO')]),
 ImmutableTree('NP', [('1936', 'CD'), (',', ','), ('John', 'NNP'), ('Maynard', 'NNP'), ('Keyn

In [12]:
grammar = r"""
  NP:
    {<.*>+}
    }<VB.|IN|,|.|CC|TO|''|MD|``|RB|>+{
  """

cp = nltk.RegexpParser(grammar)
test_sents = conll2000.chunked_sents('test.txt', chunk_types = ['NP'])
print(cp.evaluate(test_sents))

ChunkParse score:
    IOB Accuracy:  85.6%%
    Precision:     64.8%%
    Recall:        68.0%%
    F-Measure:     66.3%%


In [13]:
# Exercise: 7-5
# Write a tag pattern to cover noun phrases that contain gerunds, e.g., the/DT receiving/VBG end/NN, assistant/NN
# managing/VBG editor/NN. Add these patterns to the grammar, one per line. Test your work using some tagged
# sentences of your own devising.

grammar = """
    NP: {<DT><VBG><NN>}    # chunk determiner, gerund, and noun
        {<NN><VBG><NN>}    # chunk noun, gerund, and noun
"""
cp = nltk.RegexpParser(grammar)
sentences = [[("the", "DT"), ("receiving", "VBG"), ("end", "NN")],
             [("assistant", "NN"),  ("managing", "VBG"),  ("editor", "NN")]]

for sent in sentences:
    print(cp.parse(sent))

(S (NP the/DT receiving/VBG end/NN))
(S (NP assistant/NN managing/VBG editor/NN))


In [14]:
import re

grammar = r"""
    NP: {<DT|NN><VBG><NN>}    # chunk determiner/noun, gerund, and noun
"""
cp = nltk.RegexpParser(grammar)
sentences = [[("the", "DT"), ("receiving", "VBG"), ("end", "NN")],
             [("assistant", "NN"),  ("managing", "VBG"),  ("editor", "NN")]]

for sent in sentences:
    print(cp.parse(sent))

(S (NP the/DT receiving/VBG end/NN))
(S (NP assistant/NN managing/VBG editor/NN))


In [15]:
sentences = [[("a", "DT"), ("thriving", "VBG"), ("metropolis", "NN")],
             [("temporary", "NN"),  ("acting", "VBG"),  ("director", "NN")]]

for sent in sentences:
    print(cp.parse(sent))

(S (NP a/DT thriving/VBG metropolis/NN))
(S (NP temporary/NN acting/VBG director/NN))


In [16]:
# Exercise: 7-6
# Write one or more tag patterns to handle coordinated noun phrases, e.g., July/NNP and/CC August/NNP, all/DT
# your/PRP$ managers/NNS and/CC supervisors/NNS, company/NN courts/NNS and/CC adjudicators/NNS.

grammar = """
    NP: {<DT>?<PRP.>?<NN.*>+<CC><NN.>} # Chunk coordinated noun phrases
"""
cp = nltk.RegexpParser(grammar)
sentences = [[("July", "NNP"),  ("and", "CC"), ("August", "NNP")],
             [("all", "DT"), ("your", "PRP$"), ("managers", "NNS"),
              ("and", "CC"), ("supervisors", "NNS")],
             [("company", "NN"), ("courts", "NNS"),
              ("and", "CC"), ("adjudicators", "NNS")]]

for sent in sentences:
    print(cp.parse(sent))

(S (NP July/NNP and/CC August/NNP))
(S (NP all/DT your/PRP$ managers/NNS and/CC supervisors/NNS))
(S (NP company/NN courts/NNS and/CC adjudicators/NNS))


In [17]:
# Exercise: 7-7
# Carry out the following evaluation tasks for any of the chunkers you have developed earlier. (Note that most
# chunking corpora contain some internal inconsistencies, such that any reasonable rule-based approach will
# produce errors.)
# a: Evaluate your chunker on 100 sentences from a chunked corpus, and report the precision, recall, and F-measure.

grammar = r"VP: {<VB.>?<RB>*<MD>?<VB.>?<TO>?<MD>?<RB>*<VB.>}"
cp = nltk.RegexpParser(grammar)
test_sents = conll2000.chunked_sents('test.txt'[:100], chunk_types = ['VP'])
print(cp.evaluate(test_sents))

ChunkParse score:
    IOB Accuracy:  92.3%%
    Precision:     74.6%%
    Recall:        66.9%%
    F-Measure:     70.5%%


In [18]:
# b: Use the chunkscore.missed() and chunkscore.incorrect() methods to identify the errors made by your chunker.
# Discuss.

cp.evaluate(test_sents).missed()[:20]

[ImmutableTree('VP', [("'ll", 'MD'), ('have', 'VB'), ('to', 'TO'), ('see', 'VB')]),
 ImmutableTree('VP', [('to', 'TO'), ('bid', 'VB')]),
 ImmutableTree('VP', [('declined', 'VBD'), ('to', 'TO'), ('comment', 'VB')]),
 ImmutableTree('VP', [('are', 'VBP'), ('experiencing', 'VBG')]),
 ImmutableTree('VP', [('to', 'TO'), ('yield', 'VB')]),
 ImmutableTree('VP', [('to', 'TO'), ('fuel', 'VB')]),
 ImmutableTree('VP', [('says', 'NNS')]),
 ImmutableTree('VP', [('said', 'VBD')]),
 ImmutableTree('VP', [('frequently', 'RB'), ('attempting', 'VBG'), ('to', 'TO'), ('enforce', 'VB')]),
 ImmutableTree('VP', [('would', 'MD'), ('not', 'RB'), ('honor', 'VB')]),
 ImmutableTree('VP', [('to', 'TO'), ('arrest', 'VB')]),
 ImmutableTree('VP', [('anticipate', 'VB'), ('filing', 'NN')]),
 ImmutableTree('VP', [('will', 'MD'), ('continue', 'VB')]),
 ImmutableTree('VP', [('has', 'VBZ'), ('forced', 'VBN')]),
 ImmutableTree('VP', [('show', 'VBP')]),
 ImmutableTree('VP', [('proceed', 'VB')]),
 ImmutableTree('VP', [('will', 

In [19]:
cp.evaluate(test_sents).incorrect()[:20]

[ImmutableTree('VP', [('proposed', 'VBN')]),
 ImmutableTree('VP', [('abated', 'VBN')]),
 ImmutableTree('VP', [('managing', 'VBG')]),
 ImmutableTree('VP', [('according', 'VBG')]),
 ImmutableTree('VP', [('stabilizing', 'VBG')]),
 ImmutableTree('VP', [('including', 'VBG')]),
 ImmutableTree('VP', [('based', 'VBN')]),
 ImmutableTree('VP', [('including', 'VBG')]),
 ImmutableTree('VP', [('continued', 'VBD')]),
 ImmutableTree('VP', [('already', 'RB'), ('sells', 'VBZ')]),
 ImmutableTree('VP', [('realized', 'VBN'), ('had', 'VBN')]),
 ImmutableTree('VP', [('point', 'VBP'), ('represents', 'VBZ')]),
 ImmutableTree('VP', [('shortchanged', 'VBN')]),
 ImmutableTree('VP', [('did', 'VBD')]),
 ImmutableTree('VP', [('established', 'VBN')]),
 ImmutableTree('VP', [('built', 'VBN')]),
 ImmutableTree('VP', [('simply', 'RB'), ('reflects', 'VBZ')]),
 ImmutableTree('VP', [('following', 'VBG')]),
 ImmutableTree('VP', [('are', 'VBP'), ('toiling', 'VBG')]),
 ImmutableTree('VP', [('have', 'VBP'), ('been', 'VBN'), ('

In [20]:
# c: Compare the performance of your chunker to the baseline chunker discussed in the evaluation section of
# this chapter.

cp = nltk.RegexpParser("")
test_sents = conll2000.chunked_sents('test.txt'[:100], chunk_types = ['VP'])
print(cp.evaluate(test_sents))

ChunkParse score:
    IOB Accuracy:  84.6%%
    Precision:      0.0%%
    Recall:         0.0%%
    F-Measure:      0.0%%


In [21]:
# Exercise: 7-9
# Sometimes a word is incorrectly tagged, e.g., the head noun in 12/CD or/CC so/RB cases/VBZ. Instead of requiring
# manual correction of tagger output, good chunkers are able to work with the erroneous output of taggers. Look
# for other examples of correctly chunked noun phrases with incorrect tags.

misses = []

for (i, sent) in enumerate(conll2000.chunked_sents('train.txt')):
    for subtree in sent:
        # only want subtrees, so use `try-except` to eliminate
        # single nodes
        try:
            subtree.label()
        except AttributeError:
            pass
        else:
            if subtree.label() == 'NP':
                # set flag only for NPs
                flag = True
                # exclude subtrees that have nouns, pronouns,
                # numbers, relative pronouns, etc...
                for leaf in subtree.leaves():
                    if re.match(r'NN|PRP|CD|WP|EX|DT', leaf[1]):
                        flag = False

        # the flag will only still be True if the subtree is a NP
        # and doesn't have a noun, pronoun, etc...
        # if it does have a verb, though, we'll want to
        # inspect it
        if flag == True and re.match(r'VB.*', leaf[1]):
            # print the id and the subtree
            misses.append( (i, subtree) )
            # reset flag
            flag = False

In [22]:
for m in misses[:10]:
    print(m)

(293, Tree('NP', [('estimates', 'VBZ')]))
(324, Tree('NP', [("'s", 'POS'), ('holding', 'VBG')]))
(498, Tree('NP', [('operating', 'VBG'), ('results', 'VBZ')]))
(524, Tree('NP', [("'s", 'POS'), ('backing', 'VBG')]))
(587, Tree('NP', [('offers', 'VBZ')]))
(611, Tree('NP', [("'s", 'VBZ')]))
(725, Tree('NP', [("'s", 'POS'), ('standing', 'VBG')]))
(827, Tree('NP', [('around', 'IN'), ('$', '$'), ('5', 'VBG')]))
(867, Tree('NP', [('trading', 'VBG')]))
(876, Tree('NP', [('employees', 'VBZ')]))


In [23]:
print(conll2000.chunked_sents('train.txt')[293])

(S
  (PP As/IN)
  usual/JJ
  ,/,
  (NP estimates/VBZ)
  (PP on/IN)
  (NP the/DT fickle/JJ report/NN)
  (VP are/VBP)
  wide/JJ
  ,/,
  (VP running/VBG)
  (PP from/IN)
  (NP a/DT drop/NN)
  (PP of/IN)
  (NP 3.5/CD %/NN)
  (PP to/TO)
  (NP a/DT gain/NN)
  (PP of/IN)
  (NP 1.6/CD %/NN)
  ./.)


In [24]:
# Exercise: 7-10
# The bigram chunker scores about 90% accuracy. Study its errors and try to work out why it doesn’t get 100%
# accuracy. Experiment with trigram chunking. Are you able to improve the performance any more?

class BigramChunker(nltk.ChunkParserI):
    def __init__(self, train_sents):
        train_data = [[(t,c) for w,t,c in nltk.chunk.tree2conlltags(sent)]
                       for sent in train_sents]
        self.tagger = nltk.BigramTagger(train_data)


    def parse(self, sentence):
        pos_tags = [pos for (word, pos) in sentence]
        tagged_pos_tags = self.tagger.tag(pos_tags)
        chunktags = [chunktag for (pos, chunktag) in tagged_pos_tags]
        conlltags = [(word, pos, chunktag) for ((word, pos), chunktag)
                     in zip(sentence, chunktags)]
        return nltk.chunk.conlltags2tree(conlltags)

In [25]:
test_sents = conll2000.chunked_sents('test.txt')
train_sents = conll2000.chunked_sents('train.txt')
bigram_chunker = BigramChunker(train_sents)
print(bigram_chunker.evaluate(test_sents))

ChunkParse score:
    IOB Accuracy:  89.3%%
    Precision:     81.2%%
    Recall:        86.2%%
    F-Measure:     83.6%%


In [26]:
cp.evaluate(test_sents).missed()[:20]

[ImmutableTree('NP', [('Showtime', 'NNP')]),
 ImmutableTree('NP', [('beef', 'NN'), ('and', 'CC'), ('pork', 'NN')]),
 ImmutableTree('NP', [('him', 'PRP')]),
 ImmutableTree('NP', [('returns', 'NNS')]),
 ImmutableTree('PP', [('on', 'IN')]),
 ImmutableTree('NP', [('the', 'DT'), ('government', 'NN'), ('lawsuit', 'NN')]),
 ImmutableTree('VP', [('was', 'VBD'), ('dismissed', 'VBN')]),
 ImmutableTree('NP', [('Charges', 'NNS')]),
 ImmutableTree('PP', [('Of', 'IN')]),
 ImmutableTree('NP', [('no', 'DT'), ('plans', 'NNS')]),
 ImmutableTree('PP', [('with', 'IN')]),
 ImmutableTree('VP', [('told', 'VBD')]),
 ImmutableTree('VP', [('frequently', 'RB'), ('attempting', 'VBG'), ('to', 'TO'), ('enforce', 'VB')]),
 ImmutableTree('NP', [('the', 'DT'), ('company', 'NN')]),
 ImmutableTree('NP', [('Houston-based', 'NNP'), ('Maryland', 'NNP'), ('Club', 'NNP'), ('Foods', 'NNPS')]),
 ImmutableTree('VP', [('were', 'VBD')]),
 ImmutableTree('NP', [('Union', 'NNP'), ('Carbide', 'NNP')]),
 ImmutableTree('VP', [('accepti

In [27]:
cp.evaluate(test_sents).incorrect()[:20]

[]

In [28]:
class TrigramChunker(nltk.ChunkParserI):
    def __init__(self, train_sents):
        train_data = [[(t,c) for w,t,c in nltk.chunk.tree2conlltags(sent)]
                       for sent in train_sents]
        self.tagger = nltk.TrigramTagger(train_data)


    def parse(self, sentence):
        pos_tags = [pos for (word, pos) in sentence]
        tagged_pos_tags = self.tagger.tag(pos_tags)
        chunktags = [chunktag for (pos, chunktag) in tagged_pos_tags]
        conlltags = [(word, pos, chunktag) for ((word, pos), chunktag)
                     in zip(sentence, chunktags)]
        return nltk.chunk.conlltags2tree(conlltags)

In [29]:
trigram_chunker = TrigramChunker(train_sents)
print(trigram_chunker.evaluate(test_sents))

ChunkParse score:
    IOB Accuracy:  87.7%%
    Precision:     81.0%%
    Recall:        84.4%%
    F-Measure:     82.6%%


In [30]:
# Exercise: 7-11
# Apply the n-gram and Brill tagging methods to IOB chunk tagging. Instead of assigning POS tags to words,
# here we will assign IOB tags to the POS tags. E.g., if the tag DT (determiner) often occurs at the start of a
# chunk, it will be tagged B (begin). Evaluate the performance of these chunking methods relative to the regular
# expression chunking methods covered in this chapter.

import sys

from nltk import tbl, untag
from nltk.tag.brill_trainer import BrillTaggerTrainer

from nltk.corpus import conll2000
from nltk.chunk.util import tree2conlltags
from nltk.tag import DefaultTagger

def get_templates():

    pos10 = [[POS([0])],
             [POS([-1])],
             [POS([1])],
             [POS([-1]), POS([0])],
             [POS([0]), POS([1])],
             [POS([-1]), POS([1])],
             [POS([-2]), POS([-1])],
             [POS([1]), POS([2])],
             [POS([-1, -2, -3])],
             [POS([1, 2, 3])]]

    iobs5 = [[IOB([0])],
             [IOB([-1]), IOB([0])],
             [IOB([0]), IOB([1])],
             [IOB([-2]), IOB([-1])],
             [IOB([1]), IOB([2])]]

    templates = [tbl.Template(*pos + iob) for pos in pos10 for iob in iobs5]

    return templates

def build_multifeature_corpus():

    def tuple2dict_featureset(sent, tagnames = ("word", "pos", "trueiob")):
        return (dict(zip(tagnames, t)) for t in sent)

    def tag_tokens(tokens):
        return [(t, t["trueiob"]) for t in tokens]

    train_sents = conll2000.chunked_sents('train.txt')
    conlltagged_sents = (tree2conlltags(sent)
                        for sent in train_sents)
    conlltagged_tokens = (tuple2dict_featureset(sent)
                        for sent in conlltagged_sents)
    conlltagged_sequences = (tag_tokens(sent)
                            for sent in conlltagged_tokens)

    return conlltagged_sequences

class POS(tbl.Feature):
    @staticmethod
    def extract_property(tokens, index):
        return tokens[index][0]["pos"]

class IOB(tbl.Feature):
    @staticmethod
    def extract_property(tokens, index):
        return tokens[index][1]



class MyInitialTagger(DefaultTagger):
    def choose_tag(self, tokens, index, history):
        tokens_ = [t["word"] for t in tokens]
        return super().choose_tag(tokens_, index, history)


templates = get_templates()
trainon = 100

corpus = list(build_multifeature_corpus())
train, test = corpus[:trainon], corpus[trainon:]

print(train[0], "\n")

initial_tagger = MyInitialTagger('O')
print(initial_tagger.tag(untag(train[0])), "\n")

trainer = BrillTaggerTrainer(initial_tagger, templates, trace = 3)
tagger = trainer.train(train)

taggedtest = tagger.tag_sents([untag(t) for t in test])
print(test[0])
print(initial_tagger.tag(untag(test[0])))
print(taggedtest[0])
print()

tagger.print_template_statistics()

[({'word': 'Confidence', 'pos': 'NN', 'trueiob': 'B-NP'}, 'B-NP'), ({'word': 'in', 'pos': 'IN', 'trueiob': 'B-PP'}, 'B-PP'), ({'word': 'the', 'pos': 'DT', 'trueiob': 'B-NP'}, 'B-NP'), ({'word': 'pound', 'pos': 'NN', 'trueiob': 'I-NP'}, 'I-NP'), ({'word': 'is', 'pos': 'VBZ', 'trueiob': 'B-VP'}, 'B-VP'), ({'word': 'widely', 'pos': 'RB', 'trueiob': 'I-VP'}, 'I-VP'), ({'word': 'expected', 'pos': 'VBN', 'trueiob': 'I-VP'}, 'I-VP'), ({'word': 'to', 'pos': 'TO', 'trueiob': 'I-VP'}, 'I-VP'), ({'word': 'take', 'pos': 'VB', 'trueiob': 'I-VP'}, 'I-VP'), ({'word': 'another', 'pos': 'DT', 'trueiob': 'B-NP'}, 'B-NP'), ({'word': 'sharp', 'pos': 'JJ', 'trueiob': 'I-NP'}, 'I-NP'), ({'word': 'dive', 'pos': 'NN', 'trueiob': 'I-NP'}, 'I-NP'), ({'word': 'if', 'pos': 'IN', 'trueiob': 'O'}, 'O'), ({'word': 'trade', 'pos': 'NN', 'trueiob': 'B-NP'}, 'B-NP'), ({'word': 'figures', 'pos': 'NNS', 'trueiob': 'I-NP'}, 'I-NP'), ({'word': 'for', 'pos': 'IN', 'trueiob': 'B-PP'}, 'B-PP'), ({'word': 'September', 'pos': '

In [31]:
# Exercise: 7-12
# We saw in Chapter 5 that it is possible to establish an upper limit to tagging performance by looking for
# ambiguous n-grams, which are n-grams that are tagged in more than one possible way in the training data. Apply the
# same method to determine an upper bound on the performance of an n-gram chunker.

train_sents = conll2000.chunked_sents('train.txt')
cfd = nltk.ConditionalFreqDist(
           ((x[2], y[2], z[0]), z[2])
           for sent in train_sents
           for x, y, z in nltk.trigrams(tree2conlltags(sent)))
ambiguous_contexts = [c for c in cfd.conditions() if len(cfd[c]) > 1]
sum(cfd[c].N() for c in ambiguous_contexts) / cfd.N()

0.35955948727207077

In [32]:
train_sents = conll2000.chunked_sents('train.txt')
cfd = nltk.ConditionalFreqDist(
           ((x[1], y[1], z[0]), z[2])
           for sent in train_sents
           for x, y, z in nltk.trigrams(tree2conlltags(sent)))
ambiguous_contexts = [c for c in cfd.conditions() if len(cfd[c]) > 1]
sum(cfd[c].N() for c in ambiguous_contexts) / cfd.N()

0.22214427565573983

In [33]:
train_sents = conll2000.chunked_sents('train.txt')
cfd = nltk.ConditionalFreqDist(
           ((x[1], x[2], y[1], y[2], z[0]), z[2])
           for sent in train_sents
           for x, y, z in nltk.trigrams(tree2conlltags(sent)))
ambiguous_contexts = [c for c in cfd.conditions() if len(cfd[c]) > 1]
sum(cfd[c].N() for c in ambiguous_contexts) / cfd.N()

0.15939442395481393

In [34]:
train_sents = conll2000.chunked_sents('train.txt')
cfd = nltk.ConditionalFreqDist(
           ((x[2], y[0]), y[2])
           for sent in train_sents
           for x, y in nltk.bigrams(tree2conlltags(sent)))
ambiguous_contexts = [c for c in cfd.conditions() if len(cfd[c]) > 1]
sum(cfd[c].N() for c in ambiguous_contexts) / cfd.N()

0.43632113851206417

In [35]:
train_sents = conll2000.chunked_sents('train.txt')
cfd = nltk.ConditionalFreqDist(
           ((x[1], y[0]), y[2])
           for sent in train_sents
           for x, y in nltk.bigrams(tree2conlltags(sent)))
ambiguous_contexts = [c for c in cfd.conditions() if len(cfd[c]) > 1]
sum(cfd[c].N() for c in ambiguous_contexts) / cfd.N()

0.3850516048542588

In [36]:
train_sents = conll2000.chunked_sents('train.txt')
cfd = nltk.ConditionalFreqDist(
           ((x[1], x[2], y[0]), y[2])
           for sent in train_sents
           for x, y in nltk.bigrams(tree2conlltags(sent)))
ambiguous_contexts = [c for c in cfd.conditions() if len(cfd[c]) > 1]
sum(cfd[c].N() for c in ambiguous_contexts) / cfd.N()

0.27115601777199183

In [37]:
train_sents = conll2000.chunked_sents('train.txt')
cfd = nltk.ConditionalFreqDist(
           ((x[0], x[1], x[2], y[0], y[1], y[2], z[0]), z[2])
           for sent in train_sents
           for x, y, z in nltk.trigrams(tree2conlltags(sent)))
ambiguous_contexts = [c for c in cfd.conditions() if len(cfd[c]) > 1]
sum(cfd[c].N() for c in ambiguous_contexts) / cfd.N()

0.005591519872075929

In [38]:
# Exercise: 7-13
# Pick one of the three chunk types in the CoNLL Chunking Corpus. Write functions to do the following tasks for
# your chosen type:
# a: List all the tag sequences that occur with each instance of this chunk type.

np_tags = []

for (i, sent) in enumerate(conll2000.chunked_sents('train.txt')):
    for subtree in sent:
        # only want subtrees, so use `try-except` to eliminate
        # single nodes
        try:
            subtree.label()
        except AttributeError:
            pass
        else:
            if subtree.label() == 'NP':
                # concatenating a string with all the POS tags
                subtree_tag = ""
                for t in tree2conlltags(subtree):
                    if subtree_tag == "":
                        subtree_tag += t[1]
                    else:
                        subtree_tag += "/" + t[1]
                np_tags.append(subtree_tag)

len(set(np_tags))

2283

In [39]:
# b: Count the frequency of each tag sequence, and produce a ranked list in order of decreasing frequency;
# each line should consist of an integer (the frequency) and the tag sequence.

fd = nltk.FreqDist(np_tags)
for tag, value in fd.most_common(50):
    print("{:>4} {}".format(value, tag))

7223 DT/NN
3802 PRP
3282 NNS
3249 NNP
3245 NN
2642 NNP/NNP
2119 DT/JJ/NN
1722 JJ/NNS
1173 DT/NNS
1143 JJ/NN
1012 NN/NNS
 930 WDT
 921 DT/NN/NN
 866 CD
 830 CD/NN
 824 $/CD/CD
 690 CD/NNS
 677 NNP/NNP/NNP
 624 PRP$/NN
 552 POS/NN
 540 DT
 509 WP
 463 DT/NNP
 454 NN/NN
 446 $/CD
 399 DT/NNP/NN
 355 PRP$/NNS
 313 JJ/NN/NNS
 311 DT/NNP/NNP
 277 DT/JJ/NN/NN
 276 DT/JJ/NNS
 220 NNP/NNP/NNP/NNP
 204 POS/NNS
 200 CD/CD
 195 PRP$/JJ/NN
 189 EX
 183 NNP/CD
 183 NNP/NNS
 182 DT/JJ/JJ/NN
 171 POS/JJ/NN
 161 DT/NN/NNS
 158 DT/NNP/NNP/NNP
 149 JJ/JJ/NN
 146 JJ/JJ/NNS
 141 DT/VBN/NN
 124 DT/CD/NNS
 124 IN
 115 NNP/NN
 113 NNP/CC/NNP
 113 POS/NN/NN


In [40]:
grammar = r"""
    NP: {<DT|PRP$|POS|$|WP|EX>?<JJ.*>*<CD>*<NN.*|WDT>*<CD>*}
"""
cp = nltk.RegexpParser(grammar)
test_sents = conll2000.chunked_sents('test.txt'[:100], chunk_types = ['NP'])
print(cp.evaluate(test_sents))

ChunkParse score:
    IOB Accuracy:  87.6%%
    Precision:     71.9%%
    Recall:        74.1%%
    F-Measure:     73.0%%


In [41]:
# Exercise: 7-16
# The Penn Treebank Corpus sample contains a section of tagged Wall Street Journal text that has been
# chunked into noun phrases. The format uses square brackets, and we have encountered it several times in this
# chapter. The corpus can be accessed using: for sent in nltk.corpus.treebank_chunk.chunked_sents(fileid).
# These are flat trees, just as we got using nltk.corpus.conll2000.chunked_sents().
# a: The functions nltk.tree.pprint() and nltk.chunk.tree2conllstr() can be used to create Treebank and IOB
# strings from a tree. Write functions chunk2brackets() and chunk2iob() that take a single chunk tree as their
# sole argument, and return the required multiline string representation.

nltk.tree.Tree.pprint(nltk.corpus.treebank_chunk.chunked_sents()[0])

(S
  (NP Pierre/NNP Vinken/NNP)
  ,/,
  (NP 61/CD years/NNS)
  old/JJ
  ,/,
  will/MD
  join/VB
  (NP the/DT board/NN)
  as/IN
  (NP a/DT nonexecutive/JJ director/NN Nov./NNP 29/CD)
  ./.)


In [42]:
print(nltk.corpus.treebank_chunk.chunked_sents()[0])

(S
  (NP Pierre/NNP Vinken/NNP)
  ,/,
  (NP 61/CD years/NNS)
  old/JJ
  ,/,
  will/MD
  join/VB
  (NP the/DT board/NN)
  as/IN
  (NP a/DT nonexecutive/JJ director/NN Nov./NNP 29/CD)
  ./.)


In [43]:
for sent in nltk.corpus.treebank_chunk.chunked_sents()[:10]:
    nltk.tree.Tree.pprint(sent)
    print(sent)

(S
  (NP Pierre/NNP Vinken/NNP)
  ,/,
  (NP 61/CD years/NNS)
  old/JJ
  ,/,
  will/MD
  join/VB
  (NP the/DT board/NN)
  as/IN
  (NP a/DT nonexecutive/JJ director/NN Nov./NNP 29/CD)
  ./.)
(S
  (NP Pierre/NNP Vinken/NNP)
  ,/,
  (NP 61/CD years/NNS)
  old/JJ
  ,/,
  will/MD
  join/VB
  (NP the/DT board/NN)
  as/IN
  (NP a/DT nonexecutive/JJ director/NN Nov./NNP 29/CD)
  ./.)
(S
  (NP Mr./NNP Vinken/NNP)
  is/VBZ
  (NP chairman/NN)
  of/IN
  (NP Elsevier/NNP N.V./NNP)
  ,/,
  (NP the/DT Dutch/NNP publishing/VBG group/NN)
  ./.)
(S
  (NP Mr./NNP Vinken/NNP)
  is/VBZ
  (NP chairman/NN)
  of/IN
  (NP Elsevier/NNP N.V./NNP)
  ,/,
  (NP the/DT Dutch/NNP publishing/VBG group/NN)
  ./.)
(S
  (NP Rudolph/NNP Agnew/NNP)
  ,/,
  (NP 55/CD years/NNS)
  old/JJ
  and/CC
  (NP former/JJ chairman/NN)
  of/IN
  (NP Consolidated/NNP Gold/NNP Fields/NNP PLC/NNP)
  ,/,
  was/VBD
  named/VBN
  (NP a/DT nonexecutive/JJ director/NN)
  of/IN
  (NP this/DT British/JJ industrial/JJ conglomerate/NN)
  ./.)
(S
  

In [44]:
print(nltk.chunk.tree2conllstr(nltk.corpus.treebank_chunk.chunked_sents()[0]))

Pierre NNP B-NP
Vinken NNP I-NP
, , O
61 CD B-NP
years NNS I-NP
old JJ O
, , O
will MD O
join VB O
the DT B-NP
board NN I-NP
as IN O
a DT B-NP
nonexecutive JJ I-NP
director NN I-NP
Nov. NNP I-NP
29 CD I-NP
. . O


In [45]:
print(conll2000.chunked_sents('train.txt')[0])

(S
  (NP Confidence/NN)
  (PP in/IN)
  (NP the/DT pound/NN)
  (VP is/VBZ widely/RB expected/VBN to/TO take/VB)
  (NP another/DT sharp/JJ dive/NN)
  if/IN
  (NP trade/NN figures/NNS)
  (PP for/IN)
  (NP September/NNP)
  ,/,
  due/JJ
  (PP for/IN)
  (NP release/NN)
  (NP tomorrow/NN)
  ,/,
  (VP fail/VB to/TO show/VB)
  (NP a/DT substantial/JJ improvement/NN)
  (PP from/IN)
  (NP July/NNP and/CC August/NNP)
  (NP 's/POS near-record/JJ deficits/NNS)
  ./.)


In [None]:
# b: Write command-line conversion utilities bracket2iob.py and iob2bracket.py that take a file in Treebank or
# CoNLL format (respectively) and convert it to the other format. (Obtain some raw Treebank or CoNLL data from the
# NLTK Corpora, save it to a file, and then use for line in open(filename) to access it from Python.)