Creating features for feeding into max entropy classifier.

In [7]:
import string
from nltk.stem.snowball import SnowballStemmer
 

def tags_since_dt(sentence, i):
    tags = set()
    for word, pos in sentence[:i]:
        if pos == 'DT':
            tags = set()
        else:
            tags.add(pos)
    return '+'.join(sorted(tags))
    
def features(sentence, i, history):
    word, pos = sentence[i]
    # init the stemmer
    stemmer = SnowballStemmer('english')
    if i == 0:
        prevword, prevpos = "<START>", "<START>"
        prevprevword, prevprevpos = "<START2>", "<START2>"
        history = "<START> <START2>"
        i = i + 2
        previob = history[i - 2]
    elif i == 1:
        prevprevword, prevprevpos = "<START2>", "<START2>"
        prevword, prevpos = sentence[i-1]
        i = i + 1
        history = "<START>"
        previob = history[i - 1]
    else:
        prevword, prevpos = sentence[i-1]
        prevprevword, prevprevpos = sentence[i-2]
        previob = history[i - 1]
    if i == len(sentence)-1:
        nextword, nextpos = "<END>", "<END>"
        nextnextword, nextnextpos = "<END2>", "<END2>"
        previob = history[i - 1]
    elif i == len(sentence)-2:
        nextnextword, nextnextpos = "<END2>", "<END2>"
        nextword, nextpos = sentence[i+1]
        previob = history[i - 1]
    else:
        nextword, nextpos = sentence[i+1]
        nextnextword, nextnextpos = sentence[i+2]
        previob = history[i - 1]
    capitalized = word[0] in string.ascii_uppercase
 
    #prevallcaps = prevword == prevword.capitalize()
    prevcapitalized = prevword[0] in string.ascii_uppercase
 
    #nextallcaps = prevword == prevword.capitalize()
    nextcapitalized = nextword[0] in string.ascii_uppercase
    return {"pos": pos,
            "word": word,
            "lemma": stemmer.stem(word[0]),
            #"isnumeric": word[0].isdigit,
            "prevpos": prevpos,
            "prevprevpos": prevprevpos,
            "nextpos": nextpos,
            "previob": previob,
            "nextnextpos": nextnextpos,
            "prevpos+pos": "%s+%s" % (prevpos, pos),
            "prevprevpost+prevpos+pos": "%s+%s+%s" % (prevprevpos, prevpos, pos),
            "pos+nextpos": "%s+%s" % (pos, nextpos),
            "pos+nextpos+nextnextpos": "%s+%s+%s" % (pos, nextpos, nextnextpos),
            "capitalized": capitalized,
            "prevcapitalized": prevcapitalized,
            "nextcapitalized": nextcapitalized,
            "tags-since-dt": tags_since_dt(sentence, i)
           }

Creating the chunker classifier with custom features as mentioned above

In [8]:
import nltk
class NamedEntityChunkTagger(nltk.TaggerI):

    def __init__(self, train_sents):
        train_set = []
        for tagged_sent in train_sents:
            untagged_sent = nltk.tag.untag(tagged_sent)
            #untagged_sent = [(word,tag) for word,tag,iob in sent]

            history = []
            for i, (word, tag) in enumerate(tagged_sent):
                featureset = features(untagged_sent, i, history)
                train_set.append( (featureset, tag) )
                history.append(tag)
        nltk.config_megam('C:\megam_0.92\megam.exe')
        self.classifier = nltk.MaxentClassifier.train( 
            train_set, algorithm='megam', trace=0)

    def tag(self, sentence):
        history = []
        for i, word in enumerate(sentence):
            featureset = features(sentence, i, history)
            tag = self.classifier.classify(featureset)
            history.append(tag)
        return zip(sentence, history)

class NamedEntityChunker(nltk.ChunkParserI):
    def __init__(self, train_sents):
        tagged_sents = [[((w,t),c) for (w,t,c) in
                         nltk.chunk.tree2conlltags(sent)]
                        for sent in train_sents]
        self.tagger = NamedEntityChunkTagger(tagged_sents)

    def parse(self, sentence):
        tagged_sents = self.tagger.tag(sentence)
        conlltags = [(w,t,c) for ((w,t),c) in tagged_sents]
        return nltk.chunk.conlltags2tree(conlltags)

Reading the data pre formatted and tagged (POS + IOB)  gold master data with ConllCorpusReader and creating list of sentences in Conll IOB format

In [9]:
import os
import collections
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk.chunk import conlltags2tree, tree2conlltags
import string
from nltk.stem.snowball import SnowballStemmer

from nltk import pos_tag, word_tokenize

  
from nltk.corpus import ConllCorpusReader
my_corpus = ConllCorpusReader('.\dataset4', '.*\.txt', columntypes=('words', 'pos','chunk'), encoding="utf-8")





sents = list(my_corpus.iob_sents())

sentences = []

for sent in sents:
    reader = [((word,tag),iob) for word,tag,iob in sent]
    sentences.append(reader)




splitting the data in training and test data set

In [10]:
from sklearn.model_selection import train_test_split
test_pct=0.3
training_samples, test_samples = train_test_split(sentences,test_size=test_pct)
print(len(training_samples))
print(len(test_samples))


47903
20531


Training the classifier on training data set

In [11]:
chunker = NamedEntityChunker(training_samples)


evaluating the classifier on test data set

In [12]:
score = chunker.evaluate([conlltags2tree([(w, t, iob) for (w, t), iob in iobs]) for iobs in test_samples])


In [14]:
print(score.accuracy())

0.8813096808256099
