In [1]:
import util
import nltk

In [2]:
# Load the labeled corpus through the project-local `util` helper, then split it.
# NOTE(review): how `make_train_test` chooses the split (random vs. fixed) is not
# visible here — confirm it is deterministic so the scores below are reproducible.
labeled_tokens = util.get_labeled_tokens()
# `train` and `test` are consumed later as iterables of (token, label) pairs.
train, test = util.make_train_test(labeled_tokens)

    

In [3]:
def featurize_bigram(tokens):
    """Generate word-bigram features for a stream of labeled tokens.

    tokens: iterable of ``(token, label)`` pairs.

    Yields one ``(features, label)`` pair per input pair, where ``features``
    is a dict such as ``{"word_n": "dog", "word_nm1": "the"}``:
    ``word_n`` is the current token and ``word_nm1`` the token before it.
    The very first token has no predecessor, so its ``word_nm1`` is the
    padding string ``"</S>"``. Labels are passed through unchanged.
    """
    previous = "</S>"  # padding used as the predecessor of the first token
    for word, label in tokens:
        features = {"word_n": word, "word_nm1": previous}
        yield features, label
        previous = word
            
    

In [4]:
# Materialize the word-bigram (features, label) pairs for both splits.
trainfeat, testfeat = (list(featurize_bigram(split)) for split in (train, test))


In [5]:
# Show the first five (features, label) pairs as a sanity check.
trainfeat[:5]

[({'word_n': '<S>', 'word_nm1': '</S>'}, 'O'),
 ({'word_n': 'थः ', 'word_nm1': '<S>'}, 'B-EC'),
 ({'word_n': 'नं', 'word_nm1': 'थः '}, 'I-EC'),
 ({'word_n': 'छम्ह', 'word_nm1': 'नं'}, 'I-EC'),
 ({'word_n': 'शक्तिशाली', 'word_nm1': 'छम्ह'}, 'I-EC')]

In [6]:
# Naive Bayes over word-bigram features.
classifier = nltk.NaiveBayesClassifier.train(trainfeat)

# Classify each feature dict; the labels stored next to them are ignored here.
train_pred = [classifier.classify(feats) for feats, gold in trainfeat]
test_pred = [classifier.classify(feats) for feats, gold in testfeat]

# Split the (token, label) pairs into parallel sequences for the evaluator.
train_toks, train_true = zip(*train)
test_toks, test_true = zip(*test)

# Only the test split is scored in this cell.
util.conlleval(test_toks, test_true, test_pred)

processed 711 tokens with 33 phrases; found: 113 phrases; correct: 8.
accuracy:  71.87%; precision:   7.08%; recall:  24.24%; FB1:  10.96
               EC: precision:   7.08%; recall:  24.24%; FB1:  10.96  113



('711', '33', '113', '8', '71.87', '7.08', '24.24', '10.96')

In [7]:
def featurize_wordandtag_bigram(tokens, classify=False):
    """Generate word-bigram plus previous-label features for labeled tokens.

    tokens: iterable of ``(token, label)`` pairs.
    classify: optional callable taking a feature dict and returning a label.
        When it is falsy (the default), the label from ``tokens`` is used.
        When given, each token's label is replaced by ``classify(features)``,
        so the next token's ``lab_nm1`` is a predicted label rather than the
        one supplied in ``tokens``.

    Yields one ``(features, label)`` pair per input pair. ``features`` holds
    ``word_n`` (current token), ``word_nm1`` (previous token) and ``lab_nm1``
    (previous label). For the first token these start out as the padding
    string ``"</S>"`` and the label ``"O"``.
    """
    last_word, last_label = "</S>", "O"  # padding for the first token
    for word, gold in tokens:
        features = {
            "word_n": word,
            "word_nm1": last_word,
            "lab_nm1": last_label,
        }
        # With a classifier, feed its own prediction forward instead of the
        # label supplied with the token.
        label = classify(features) if classify else gold
        yield features, label
        last_word, last_label = word, label
            
    

In [8]:
# Word-bigram + previous-label features; the previous label comes from the data.
trainfeat = [*featurize_wordandtag_bigram(train)]
testfeat = [*featurize_wordandtag_bigram(test)]
classifier = nltk.NaiveBayesClassifier.train(trainfeat)

# Score the training split.
train_pred = [classifier.classify(feats) for feats, gold in trainfeat]
train_toks, train_true = zip(*train)
util.conlleval(train_toks, train_true, train_pred)

# Score the test split.
test_pred = [classifier.classify(feats) for feats, gold in testfeat]
test_toks, test_true = zip(*test)
util.conlleval(test_toks, test_true, test_pred)

processed 3997 tokens with 168 phrases; found: 169 phrases; correct: 153.
accuracy:  99.57%; precision:  90.53%; recall:  91.07%; FB1:  90.80
               EC: precision:  90.53%; recall:  91.07%; FB1:  90.80  169

processed 711 tokens with 33 phrases; found: 40 phrases; correct: 21.
accuracy:  97.33%; precision:  52.50%; recall:  63.64%; FB1:  57.53
               EC: precision:  52.50%; recall:  63.64%; FB1:  57.53  40



('711', '33', '40', '21', '97.33', '52.50', '63.64', '57.53')

#### The previous evaluation is not honest because it assumes we know the true preceding tags, as opposed to the predicted preceding tags!

In [9]:
trainfeat = [*featurize_wordandtag_bigram(train)]
classifier = nltk.NaiveBayesClassifier.train(trainfeat)

# Over-optimistic: every feature dict carries the known preceding label.
train_pred = [classifier.classify(feats) for feats, gold in trainfeat]
train_toks, train_true = zip(*train)
util.conlleval(train_toks, train_true, train_pred)

# Fairer: the preceding label fed to the classifier is its own prediction.
# Note that `train_pred` now holds (features, predicted_label) pairs.
train_pred = [*featurize_wordandtag_bigram(train, classify=classifier.classify)]
util.conlleval(train_toks, train_true, [label for feats, label in train_pred])

# Same fair procedure on the test split.
test_pred = [*featurize_wordandtag_bigram(test, classify=classifier.classify)]
test_toks, test_true = zip(*test)
util.conlleval(test_toks, test_true, [label for feats, label in test_pred])

processed 3997 tokens with 168 phrases; found: 169 phrases; correct: 153.
accuracy:  99.57%; precision:  90.53%; recall:  91.07%; FB1:  90.80
               EC: precision:  90.53%; recall:  91.07%; FB1:  90.80  169

processed 3997 tokens with 168 phrases; found: 163 phrases; correct: 153.
accuracy:  97.25%; precision:  93.87%; recall:  91.07%; FB1:  92.45
               EC: precision:  93.87%; recall:  91.07%; FB1:  92.45  163

processed 711 tokens with 33 phrases; found: 33 phrases; correct: 19.
accuracy:  84.39%; precision:  57.58%; recall:  57.58%; FB1:  57.58
               EC: precision:  57.58%; recall:  57.58%; FB1:  57.58  33



('711', '33', '33', '19', '84.39', '57.58', '57.58', '57.58')

In [10]:
# Print the features with the largest label likelihood ratios for the
# Naive Bayes `classifier` trained in the previous cell.
classifier.show_most_informative_features()

Most Informative Features
                word_nm1 = 'धकाः'              O : B-EC   =     32.3 : 1.0
                word_nm1 = 'धका'               O : I-EC   =     29.6 : 1.0
                  word_n = 'धका'            I-EC : O      =     17.4 : 1.0
                  word_n = 'धकाः'           I-EC : O      =     11.6 : 1.0
                word_nm1 = '<S>'            B-EC : O      =     11.1 : 1.0
                 lab_nm1 = 'I-EC'           I-EC : O      =     10.4 : 1.0
                word_nm1 = 'नं'                O : B-EC   =      7.8 : 1.0
                word_nm1 = 'हे'             I-EC : B-EC   =      7.7 : 1.0
                  word_n = 'आदिवासी'        I-EC : O      =      7.0 : 1.0
                  word_n = 'गुलि'           I-EC : O      =      7.0 : 1.0


### Try maxent

In [11]:

trainfeat = [*featurize_wordandtag_bigram(train)]
classifier = nltk.MaxentClassifier.train(trainfeat)

# Over-optimistic: every feature dict carries the known preceding label.
train_pred = [classifier.classify(feats) for feats, gold in trainfeat]
train_toks, train_true = zip(*train)
util.conlleval(train_toks, train_true, train_pred)

# Fairer: the preceding label fed to the classifier is its own prediction.
# Note that `train_pred` now holds (features, predicted_label) pairs.
train_pred = [*featurize_wordandtag_bigram(train, classify=classifier.classify)]
util.conlleval(train_toks, train_true, [label for feats, label in train_pred])

# Same fair procedure on the test split.
test_pred = [*featurize_wordandtag_bigram(test, classify=classifier.classify)]
test_toks, test_true = zip(*test)
util.conlleval(test_toks, test_true, [label for feats, label in test_pred])

  ==> Training (100 iterations)

      Iteration    Log Likelihood    Accuracy
      ---------------------------------------
             1          -1.09861        0.479
             2          -0.48843        0.922
             3          -0.35555        0.981
             4          -0.28188        0.987
             5          -0.23479        0.990
             6          -0.20185        0.991
             7          -0.17739        0.994
             8          -0.15846        0.995
             9          -0.14335        0.996
            10          -0.13098        0.997
            11          -0.12067        0.997
            12          -0.11192        0.997
            13          -0.10442        0.997
            14          -0.09790        0.997
            15          -0.09218        0.997
            16          -0.08712        0.998
            17          -0.08262        0.998
            18          -0.07857        0.998
            19          -0.07493        0.998
 

('711', '33', '41', '26', '83.12', '63.41', '78.79', '70.27')