20. The bigram chunker scores about 90% accuracy. Study its errors and try to work out why it doesn't get 100% accuracy. Experiment with trigram chunking. Are you able to improve the performance any further?

UNIGRAM CHUNKING

In [1]:
import nltk
from nltk import *
from nltk.corpus import conll2000

In [2]:
class UnigramChunker(nltk.ChunkParserI):
    """Chunker that picks an IOB chunk tag for each token from its POS tag.

    A ``nltk.UnigramTagger`` is trained on (POS tag, chunk tag) pairs, so the
    tagger's "words" are POS tags and its "tags" are chunk tags.
    """

    def __init__(self, train_sents):
        """Train the underlying unigram tagger.

        Args:
            train_sents: iterable of chunk trees; each one is flattened with
                ``nltk.chunk.tree2conlltags`` into (word, pos, chunk) triples,
                of which only the (pos, chunk) pairs are kept.
        """
        train_data = [
            [(pos, chunk) for _word, pos, chunk in nltk.chunk.tree2conlltags(sent)]
            for sent in train_sents
        ]
        self.tagger = nltk.UnigramTagger(train_data)
        # Sanity-check output: show the first training sequence. Guarded so an
        # empty training set no longer raises IndexError.
        if train_data:
            print(train_data[0])

    def parse(self, sentence):
        """Chunk a POS-tagged sentence.

        Args:
            sentence: sequence of (word, pos) pairs.

        Returns:
            The tree built by ``nltk.chunk.conlltags2tree`` from the
            (word, pos, chunk tag) triples predicted for ``sentence``.
        """
        pos_tags = [pos for (_word, pos) in sentence]
        # The tagger returns (pos, chunk tag) pairs; only the chunk tag is needed.
        chunktags = [chunktag for (_pos, chunktag) in self.tagger.tag(pos_tags)]
        conlltags = [
            (word, pos, chunktag)
            for ((word, pos), chunktag) in zip(sentence, chunktags)
        ]
        return nltk.chunk.conlltags2tree(conlltags)

In [3]:
# CoNLL-2000 chunking corpus, restricted to NP chunks, for training and testing.
train_sents, test_sents = (
    conll2000.chunked_sents(split, chunk_types=['NP'])
    for split in ('train.txt', 'test.txt')
)

In [4]:
# Train the unigram chunker on the NP-chunked training sentences.
unigram_chunker = UnigramChunker(train_sents=train_sents)

[('NN', 'B-NP'), ('IN', 'O'), ('DT', 'B-NP'), ('NN', 'I-NP'), ('VBZ', 'O'), ('RB', 'O'), ('VBN', 'O'), ('TO', 'O'), ('VB', 'O'), ('DT', 'B-NP'), ('JJ', 'I-NP'), ('NN', 'I-NP'), ('IN', 'O'), ('NN', 'B-NP'), ('NNS', 'I-NP'), ('IN', 'O'), ('NNP', 'B-NP'), (',', 'O'), ('JJ', 'O'), ('IN', 'O'), ('NN', 'B-NP'), ('NN', 'B-NP'), (',', 'O'), ('VB', 'O'), ('TO', 'O'), ('VB', 'O'), ('DT', 'B-NP'), ('JJ', 'I-NP'), ('NN', 'I-NP'), ('IN', 'O'), ('NNP', 'B-NP'), ('CC', 'I-NP'), ('NNP', 'I-NP'), ('POS', 'B-NP'), ('JJ', 'I-NP'), ('NNS', 'I-NP'), ('.', 'O')]


In [5]:
# evaluate() is deprecated in NLTK in favour of accuracy(gold).
print(unigram_chunker.accuracy(test_sents))


ChunkParse score:
    IOB Accuracy:  92.9%%
    Precision:     79.9%%
    Recall:        86.8%%
    F-Measure:     83.2%%


BIGRAM CHUNKING

In [6]:
import nltk
from nltk import word_tokenize
from nltk.util import ngrams
from collections import Counter

In [7]:
class BigramChunker(nltk.ChunkParserI):
    """Chunker that predicts IOB chunk tags with a ``nltk.BigramTagger``.

    The tagger is trained on (POS tag, chunk tag) pairs, so the tagger's
    "words" are POS tags and its "tags" are chunk tags.
    """

    def __init__(self, train_sents):
        """Train the underlying bigram tagger.

        Args:
            train_sents: iterable of chunk trees; each one is flattened with
                ``nltk.chunk.tree2conlltags`` into (word, pos, chunk) triples,
                of which only the (pos, chunk) pairs are kept.
        """
        # Single pass over the corpus: the word-only list that used to be built
        # alongside this one was never read.
        t_c_tuples_list = [
            [(pos, chunk) for _word, pos, chunk in nltk.chunk.tree2conlltags(sent)]
            for sent in train_sents
        ]
        # Sanity-check output: first training sequence followed by a blank
        # line. Guarded so an empty training set no longer raises IndexError.
        if t_c_tuples_list:
            print(t_c_tuples_list[0])
            print()

        self.tagger = nltk.BigramTagger(t_c_tuples_list)

    def parse(self, sentence):
        """Chunk a POS-tagged sentence.

        Args:
            sentence: sequence of (word, pos) pairs.

        Returns:
            The tree built by ``nltk.chunk.conlltags2tree`` from the
            (word, pos, chunk tag) triples predicted for ``sentence``.
        """
        pos_tags = [pos for (_word, pos) in sentence]
        # The tagger returns (pos, chunk tag) pairs; only the chunk tag is needed.
        chunktags = [chunktag for (_pos, chunktag) in self.tagger.tag(pos_tags)]
        conlltags = [
            (word, pos, chunktag)
            for ((word, pos), chunktag) in zip(sentence, chunktags)
        ]
        return nltk.chunk.conlltags2tree(conlltags)

In [8]:
# Train the bigram chunker on the NP-chunked training sentences.
bigram_chunker = BigramChunker(train_sents=train_sents)

[('NN', 'B-NP'), ('IN', 'O'), ('DT', 'B-NP'), ('NN', 'I-NP'), ('VBZ', 'O'), ('RB', 'O'), ('VBN', 'O'), ('TO', 'O'), ('VB', 'O'), ('DT', 'B-NP'), ('JJ', 'I-NP'), ('NN', 'I-NP'), ('IN', 'O'), ('NN', 'B-NP'), ('NNS', 'I-NP'), ('IN', 'O'), ('NNP', 'B-NP'), (',', 'O'), ('JJ', 'O'), ('IN', 'O'), ('NN', 'B-NP'), ('NN', 'B-NP'), (',', 'O'), ('VB', 'O'), ('TO', 'O'), ('VB', 'O'), ('DT', 'B-NP'), ('JJ', 'I-NP'), ('NN', 'I-NP'), ('IN', 'O'), ('NNP', 'B-NP'), ('CC', 'I-NP'), ('NNP', 'I-NP'), ('POS', 'B-NP'), ('JJ', 'I-NP'), ('NNS', 'I-NP'), ('.', 'O')]



In [9]:
# evaluate() is deprecated in NLTK in favour of accuracy(gold).
print(bigram_chunker.accuracy(test_sents))


ChunkParse score:
    IOB Accuracy:  93.3%%
    Precision:     82.3%%
    Recall:        86.8%%
    F-Measure:     84.5%%


TRIGRAM CHUNKING

In [10]:
class TrigramChunker(nltk.ChunkParserI):
    """Chunker that predicts IOB chunk tags with a ``nltk.TrigramTagger``.

    The tagger is trained on (POS tag, chunk tag) pairs, so the tagger's
    "words" are POS tags and its "tags" are chunk tags.
    """

    def __init__(self, train_sents):
        """Train the underlying trigram tagger.

        Args:
            train_sents: iterable of chunk trees; each one is flattened with
                ``nltk.chunk.tree2conlltags`` into (word, pos, chunk) triples,
                of which only the (pos, chunk) pairs are kept.
        """
        # Single pass over the corpus: the word-only list that used to be built
        # alongside this one was never read.
        t_c_tuples_list = [
            [(pos, chunk) for _word, pos, chunk in nltk.chunk.tree2conlltags(sent)]
            for sent in train_sents
        ]
        # Sanity-check output: first training sequence followed by a blank
        # line. Guarded so an empty training set no longer raises IndexError.
        if t_c_tuples_list:
            print(t_c_tuples_list[0])
            print()

        self.tagger = nltk.TrigramTagger(t_c_tuples_list)

    def parse(self, sentence):
        """Chunk a POS-tagged sentence.

        Args:
            sentence: sequence of (word, pos) pairs.

        Returns:
            The tree built by ``nltk.chunk.conlltags2tree`` from the
            (word, pos, chunk tag) triples predicted for ``sentence``.
        """
        pos_tags = [pos for (_word, pos) in sentence]
        # The tagger returns (pos, chunk tag) pairs; only the chunk tag is needed.
        chunktags = [chunktag for (_pos, chunktag) in self.tagger.tag(pos_tags)]
        conlltags = [
            (word, pos, chunktag)
            for ((word, pos), chunktag) in zip(sentence, chunktags)
        ]
        return nltk.chunk.conlltags2tree(conlltags)

In [11]:
# Train the trigram chunker on the NP-chunked training sentences.
trigram_chunker = TrigramChunker(train_sents=train_sents)

[('NN', 'B-NP'), ('IN', 'O'), ('DT', 'B-NP'), ('NN', 'I-NP'), ('VBZ', 'O'), ('RB', 'O'), ('VBN', 'O'), ('TO', 'O'), ('VB', 'O'), ('DT', 'B-NP'), ('JJ', 'I-NP'), ('NN', 'I-NP'), ('IN', 'O'), ('NN', 'B-NP'), ('NNS', 'I-NP'), ('IN', 'O'), ('NNP', 'B-NP'), (',', 'O'), ('JJ', 'O'), ('IN', 'O'), ('NN', 'B-NP'), ('NN', 'B-NP'), (',', 'O'), ('VB', 'O'), ('TO', 'O'), ('VB', 'O'), ('DT', 'B-NP'), ('JJ', 'I-NP'), ('NN', 'I-NP'), ('IN', 'O'), ('NNP', 'B-NP'), ('CC', 'I-NP'), ('NNP', 'I-NP'), ('POS', 'B-NP'), ('JJ', 'I-NP'), ('NNS', 'I-NP'), ('.', 'O')]



In [12]:
# evaluate() is deprecated in NLTK in favour of accuracy(gold).
print(trigram_chunker.accuracy(test_sents))


ChunkParse score:
    IOB Accuracy:  93.3%%
    Precision:     82.5%%
    Recall:        86.8%%
    F-Measure:     84.6%%
