In [1]:
import util
import nltk

In [2]:
# Load the labeled corpus and split it into train and test portions using the
# project-local `util` helpers. Later cells iterate both splits as
# (token, label) pairs.
labeled_tokens = util.get_labeled_tokens()
train, test = util.make_train_test(labeled_tokens)

    

In [3]:
def featurize_bigram(tokens):
    """Turn a stream of (token, label) pairs into (features, label) pairs.

    This is a generator. For every input pair it yields a feature dict of
    the form ``{"word_n": <current token>, "word_nm1": <previous token>}``
    together with the unchanged label.

    The first token has no predecessor, so ``"</S>"`` (the end-of-sentence
    marker) is used as padding for its ``"word_nm1"``.
    """
    previous_word = "</S>"  # padding: behave as if a sentence has just ended
    for word, label in tokens:
        features = {"word_n": word, "word_nm1": previous_word}
        previous_word = word
        yield features, label
            
    

In [4]:
# Materialize the feature generators so the resulting lists can be sliced,
# used for training, and iterated more than once.
trainfeat = list(featurize_bigram(train))
testfeat = list(featurize_bigram(test))


In [5]:
# Peek at the first five (features, label) pairs as a sanity check.
trainfeat[:5]

[({'word_n': '<S>', 'word_nm1': '</S>'}, 'O'),
 ({'word_n': 'थः ', 'word_nm1': '<S>'}, 'B-EC'),
 ({'word_n': 'नं', 'word_nm1': 'थः '}, 'I-EC'),
 ({'word_n': 'छम्ह', 'word_nm1': 'नं'}, 'I-EC'),
 ({'word_n': 'शक्तिशाली', 'word_nm1': 'छम्ह'}, 'I-EC')]

In [6]:
# Baseline: Naive Bayes over the word-bigram features (current + previous word).
classifier = nltk.NaiveBayesClassifier.train(trainfeat)

# Unzip the (token, label) pairs into parallel tuples for the scorer.
train_toks, train_true = zip(*train)
test_toks, test_true = zip(*test)

# Predict one label per feature dict; the gold label in each pair is ignored.
train_pred = [classifier.classify(feats) for feats, _ in trainfeat]
test_pred = [classifier.classify(feats) for feats, _ in testfeat]

# Only the test split is scored in this cell.
util.conlleval(test_toks, test_true, test_pred)

processed 711 tokens with 33 phrases; found: 113 phrases; correct: 8.
accuracy:  71.87%; precision:   7.08%; recall:  24.24%; FB1:  10.96
               EC: precision:   7.08%; recall:  24.24%; FB1:  10.96  113



('711', '33', '113', '8', '71.87', '7.08', '24.24', '10.96')

In [7]:
def featurize_wordandtag_bigram(tokens, classify=False):
    """Turn (token, label) pairs into (features, label) pairs with label history.

    This is a generator. Each feature dict looks like
    ``{"word_n": <current token>, "word_nm1": <previous token>,
    "lab_nm1": <previous label>}``.

    For the first token the previous token is padded with ``"</S>"`` and the
    previous label with ``"O"``.

    tokens   -- iterable of (token, label) pairs.
    classify -- optional callable taking a feature dict and returning a label.
                When given (truthy), the input label is discarded: the
                callable's result is yielded instead and also becomes
                ``"lab_nm1"`` for the next token, so each prediction is
                conditioned on the previous prediction rather than on the
                gold label.
    """
    last_word, last_label = "</S>", "O"
    for word, label in tokens:
        features = {
            "word_n": word,
            "word_nm1": last_word,
            "lab_nm1": last_label,
        }
        if classify:
            # "Honest" mode: replace the gold label with the model's own guess.
            label = classify(features)
        last_word, last_label = word, label
        yield features, label
            
    

In [8]:
# Naive Bayes again, now with the previous label added as a feature.
trainfeat = list(featurize_wordandtag_bigram(train))
testfeat = list(featurize_wordandtag_bigram(test))
classifier = nltk.NaiveBayesClassifier.train(trainfeat)

# Score the training split. These features were built without `classify`,
# so every "lab_nm1" is the gold label of the preceding token.
train_toks, train_true = zip(*train)
train_pred = [classifier.classify(feats) for feats, _ in trainfeat]
util.conlleval(train_toks, train_true, train_pred)

# Score the test split the same way (again with gold previous labels).
test_toks, test_true = zip(*test)
test_pred = [classifier.classify(feats) for feats, _ in testfeat]
util.conlleval(test_toks, test_true, test_pred)

processed 3997 tokens with 168 phrases; found: 169 phrases; correct: 153.
accuracy:  99.57%; precision:  90.53%; recall:  91.07%; FB1:  90.80
               EC: precision:  90.53%; recall:  91.07%; FB1:  90.80  169

processed 711 tokens with 33 phrases; found: 40 phrases; correct: 21.
accuracy:  97.33%; precision:  52.50%; recall:  63.64%; FB1:  57.53
               EC: precision:  52.50%; recall:  63.64%; FB1:  57.53  40



('711', '33', '40', '21', '97.33', '52.50', '63.64', '57.53')

#### The previous evaluation is not honest, because the features use the true preceding tags rather than the predicted preceding tags!

In [9]:
trainfeat = list(featurize_wordandtag_bigram(train))
classifier = nltk.NaiveBayesClassifier.train(trainfeat)

# Over-optimistic scoring: each feature dict carries the gold label of the
# preceding token.
train_toks, train_true = zip(*train)
train_pred = [classifier.classify(feats) for feats, _ in trainfeat]
util.conlleval(train_toks, train_true, train_pred)

# Fairer scoring: the featurizer feeds each prediction back in as the next
# token's previous label. Note that `train_pred` now holds
# (features, predicted_label) pairs rather than bare labels.
train_pred = list(featurize_wordandtag_bigram(train, classify=classifier.classify))
util.conlleval(train_toks, train_true, [label for _, label in train_pred])

# Same fair procedure on the test split.
test_toks, test_true = zip(*test)
test_pred = list(featurize_wordandtag_bigram(test, classify=classifier.classify))
util.conlleval(test_toks, test_true, [label for _, label in test_pred])

processed 3997 tokens with 168 phrases; found: 169 phrases; correct: 153.
accuracy:  99.57%; precision:  90.53%; recall:  91.07%; FB1:  90.80
               EC: precision:  90.53%; recall:  91.07%; FB1:  90.80  169

processed 3997 tokens with 168 phrases; found: 163 phrases; correct: 153.
accuracy:  97.25%; precision:  93.87%; recall:  91.07%; FB1:  92.45
               EC: precision:  93.87%; recall:  91.07%; FB1:  92.45  163

processed 711 tokens with 33 phrases; found: 33 phrases; correct: 19.
accuracy:  84.39%; precision:  57.58%; recall:  57.58%; FB1:  57.58
               EC: precision:  57.58%; recall:  57.58%; FB1:  57.58  33



('711', '33', '33', '19', '84.39', '57.58', '57.58', '57.58')

In [10]:
# Print the features the Naive Bayes classifier trained above relies on most.
classifier.show_most_informative_features()

Most Informative Features
                word_nm1 = 'धकाः'              O : B-EC   =     32.3 : 1.0
                word_nm1 = 'धका'               O : I-EC   =     29.6 : 1.0
                  word_n = 'धका'            I-EC : O      =     17.4 : 1.0
                  word_n = 'धकाः'           I-EC : O      =     11.6 : 1.0
                word_nm1 = '<S>'            B-EC : O      =     11.1 : 1.0
                 lab_nm1 = 'I-EC'           I-EC : O      =     10.4 : 1.0
                word_nm1 = 'नं'                O : B-EC   =      7.8 : 1.0
                word_nm1 = 'हे'             I-EC : B-EC   =      7.7 : 1.0
                  word_n = 'आदिवासी'        I-EC : O      =      7.0 : 1.0
                  word_n = 'गुलि'           I-EC : O      =      7.0 : 1.0


धकाः - dhaka, dhakāḥ1  (irr. form of dhāye)  1. quotation marker: having said that etc.  2. (with dat.) for s.o.'s benefit

नं - na/no? na5 part.  1. na ... na  denoting doubt, uncertainty: va na vayi na mavayi. Who knows whether he will come or not.  2. nearly, almost

हे -
haṂ1 interj., part.  1. yes, indeed  2. marker of reported speech: vaṃ chanta vā dhāla haṂ.  He told you to come, he said. 3. expression of surprise; haṂyā khaṂ  rumour
haḥ3  interj.  expression of approval used by older people to younger people
hu n.  wheat (goblins' language)
huṂ1  interj.  yes, well
he1 emph.part.  indeed, definitely, really: thuthāy jike dhebā he madu. Right here, I really don't have any money.

आदिवासी - tribal? https://hi.wikipedia.org/wiki/%E0%A4%86%E0%A4%A6%E0%A4%BF%E0%A4%B5%E0%A4%BE%E0%A4%B8%E0%A5%80

गुलि - how much https://www.chegg.com/flashcards/newar-phrases-c1d04688-212d-4ee4-a77b-b350cabdf6dc/deck
gathe (var. gay) pron.  how; ~ki adv.  such as, for instance; ~khese mine  to feel uneasy; ~gathe adv.  how come, don't know (how it happened); ~jaka pron.  how; ~bhanaṃ adv.  how much; ~yānā pron.  how; ~hana adv.  as expected
 gapāy pron.  1. how much, to what extent  2. to such an extent (cf. apāy, thapāy); ~cvaḥ pron.  to what extent; ~dhaṃ pron.  how big; ~hākaḥ pron.  how long

### Try maxent

In [21]:

# Same features as before, but with a maximum-entropy classifier.
trainfeat = list(featurize_wordandtag_bigram(train))
classifier = nltk.MaxentClassifier.train(trainfeat)

# Over-optimistic scoring: each feature dict carries the gold label of the
# preceding token.
train_toks, train_true = zip(*train)
train_pred = [classifier.classify(feats) for feats, _ in trainfeat]
util.conlleval(train_toks, train_true, train_pred)

# Fairer scoring: the featurizer feeds each prediction back in as the next
# token's previous label. Note that `train_pred` now holds
# (features, predicted_label) pairs rather than bare labels.
train_pred = list(featurize_wordandtag_bigram(train, classify=classifier.classify))
util.conlleval(train_toks, train_true, [label for _, label in train_pred])

# Same fair procedure on the test split.
test_toks, test_true = zip(*test)
test_pred = list(featurize_wordandtag_bigram(test, classify=classifier.classify))
util.conlleval(test_toks, test_true, [label for _, label in test_pred])

  ==> Training (100 iterations)

      Iteration    Log Likelihood    Accuracy
      ---------------------------------------
             1          -1.09861        0.479
             2          -0.48843        0.922
             3          -0.35555        0.981
             4          -0.28188        0.987
             5          -0.23479        0.990
             6          -0.20185        0.991
             7          -0.17739        0.994
             8          -0.15846        0.995
             9          -0.14335        0.996
            10          -0.13098        0.997
            11          -0.12067        0.997
            12          -0.11192        0.997
            13          -0.10442        0.997
            14          -0.09790        0.997
            15          -0.09218        0.997
            16          -0.08712        0.998
            17          -0.08262        0.998
            18          -0.07857        0.998
            19          -0.07493        0.998
 

('711', '33', '41', '26', '83.12', '63.41', '78.79', '70.27')

In [22]:
# Print the 100 most informative features of the Maxent classifier trained above.
classifier.show_most_informative_features(100)

  20.548 word_n=='न्ह्यथसें ' and label is 'B-EC'
  14.491 word_n=='नेवाःत ' and label is 'B-EC'
  11.732 word_n=='बुद्धजयन्ती ' and label is 'B-EC'
  11.243 word_n==']तःगु' and label is 'I-EC'
 -10.890 word_nm1=='धकाः' and label is 'B-EC'
  10.462 word_n=='छुं ' and label is 'B-EC'
  10.364 word_n=='वय्कलं ' and label is 'B-EC'
  -9.529 word_nm1=='धकाः' and label is 'I-EC'
   9.171 word_nm1=='थहां' and label is 'O'
   9.076 word_n=='आः ' and label is 'B-EC'
   8.858 word_nm1=='नीस्वनेधुंकल' and label is 'O'
   8.858 word_nm1=='दइमखु' and label is 'O'
   8.858 word_nm1=='वइतिनि' and label is 'O'
   8.549 word_nm1=='धका:' and label is 'O'
   8.490 word_nm1=='न्ह्याःवःगु' and label is 'O'
   8.325 word_n=='ख्यायेगु' and label is 'I-EC'
   8.325 word_n==']दयेकूगु' and label is 'I-EC'
   8.325 word_n==']न्हिया' and label is 'I-EC'
   8.325 word_n==']उल्लेख' and label is 'I-EC'
   8.325 word_n=='आधिकारीक' and label is 'I-EC'
   8.325 word_n==']सीक' and label is 'I-EC'
  -7.952 word_nm1=='धक

In [None]:
न्ह्यथसें
nhe-thane v.t.  1. to mention  2. to propose, to suggest, to raise a question: jiṃ thva khaṂ suṃ āju pākheṃ nhethane. I shall raise this matter with one of the elders.

नेवाःत
nevāḥ n.anim. (-vāla-)  Newar; ~ākhaḥ n. (-khala-)  Newari script; ~khaṂ n.  Newari language; ~jā n.  kind of `bali' offering: boiled rice rolled into a ball; ~bhāy n. (-bhāsa-)  Newari language

बुद्धजयन्ती
https://en.wikipedia.org/wiki/Vesak

]तःगु
? tegu

धकाः
dhaka

छुं
chu (var. chū) pron.  what, which: chu khaḥ?  How are you?  chaṃ chu bicāḥ thva khaṂy yānā?  What do you think about this?
    
वय्कलं
? vayākathaṃ adv.  abruptly
धकाः dhaka

थहां
thathe (var. thay) adv.  in this way

thatheṃ adv.  immediately, instantly; ~he adv.  id.

thatheka adv.  like this

tha-thene v.i.  1. to reach, to approach the top  2. to get promoted

thathe|-dakhāḥ adv.  exactly; ~bhanaṃ adv.  thus, in that way; ~he adv.  exactly like this

आः
aa

नीस्वनेधुंकल
? neesvanedhunkal

दइमखु
? daṂkaḥmi (var. dakaḥmi) n.anim.  mason, brick-layer; ~kā n. (-kā)  line of bricks; ~cupi n. (-pu)  tool with a blade for cutting and shaping bricks; ~jyā n.  masonry; ~nāyaḥ n.anim. (-yeka-)  foreman of masons

वइतिनि
धका:
न्ह्याःवःगु

In [16]:
# Error analysis: list every training token whose predicted label differs
# from the gold label, as "index gold predicted token".
#
# This expects `train_pred` to hold (features, predicted_label) pairs, i.e.
# the value left behind by the fair/honest evaluation step of an earlier
# cell; run that cell first.
#
# The condition used to read `tr != pr or i`, which is true for every index
# after 0 and therefore printed nearly the whole training set instead of
# only the mismatches.
for i, (tr, pr, tok) in enumerate(zip(train_true, [pred for _, pred in train_pred], train_toks)):
    if tr != pr:
        print(i, tr, pr, tok)
                           

434 I-EC O नं
435 I-EC O मोह
577 O I-EC </S>
578 O I-EC <S>
579 O B-EC अथे
580 O I-EC हे
581 O I-EC सक्व
582 O I-EC नाप
583 O I-EC स्वापू
584 O I-EC दुगु
585 O I-EC ग्रन्थ
586 O I-EC मणिशैल
587 O I-EC महावरणय्
588 O I-EC धाःसा
813 O I-EC </S>
814 O I-EC <S>
3309 I-EC O हे
3310 I-EC O स्वीकार
3311 I-EC O मयात
3312 I-EC O धाःसा


In [18]:
# NOTE(review): scratch arithmetic; what 700 and 147 stand for is not stated
# anywhere in the notebook — TODO confirm, or delete this cell.
700-147

553

In [27]:
# The previous featurizer was not really a bigram model: it only exposed the
# two words (and the previous label) as separate features. This version adds
# joint features that combine them.
def featurize_wordandtag_bigram2(tokens, classify=False):
    """Turn (token, label) pairs into (features, label) pairs with joint features.

    This is a generator. Besides the individual features ``"word_n"``,
    ``"word_n-1"`` and ``"lab_n-1"``, each feature dict contains their
    comma-joined combinations: ``"word_n-1,word_n"``, ``"lab_n-1,word_n"``,
    ``"lab_n-1,word_n-1"`` and ``"lab_n-1,word_n-1,word_n"``.

    For the first token the previous token is padded with ``"</S>"`` and the
    previous label with ``"O"``.

    tokens   -- iterable of (token, label) pairs.
    classify -- optional callable taking a feature dict and returning a label.
                When given (truthy), the input label is discarded: the
                callable's result is yielded instead and also becomes the
                previous label for the next token, so each prediction is
                conditioned on the previous prediction rather than on the
                gold label.
    """
    last_word, last_label = "</S>", "O"
    for word, label in tokens:
        features = {
            "word_n": word,
            "word_n-1": last_word,
            "word_n-1,word_n": ",".join((last_word, word)),
            "lab_n-1": last_label,
            "lab_n-1,word_n": ",".join((last_label, word)),
            "lab_n-1,word_n-1": ",".join((last_label, last_word)),
            "lab_n-1,word_n-1,word_n": ",".join((last_label, last_word, word)),
        }
        if classify:
            # "Honest" mode: replace the gold label with the model's own guess.
            label = classify(features)
        last_word, last_label = word, label
        yield features, label
            

In [28]:
# try again
# Maxent once more, this time on the joint bigram features.
trainfeat = list(featurize_wordandtag_bigram2(train))
classifier = nltk.MaxentClassifier.train(trainfeat)

# Over-optimistic scoring: each feature dict carries the gold label of the
# preceding token.
train_toks, train_true = zip(*train)
train_pred = [classifier.classify(feats) for feats, _ in trainfeat]
util.conlleval(train_toks, train_true, train_pred)

# Fairer scoring: the featurizer feeds each prediction back in as the next
# token's previous label. Note that `train_pred` now holds
# (features, predicted_label) pairs rather than bare labels.
train_pred = list(featurize_wordandtag_bigram2(train, classify=classifier.classify))
util.conlleval(train_toks, train_true, [label for _, label in train_pred])

# Same fair procedure on the test split.
test_toks, test_true = zip(*test)
test_pred = list(featurize_wordandtag_bigram2(test, classify=classifier.classify))
util.conlleval(test_toks, test_true, [label for _, label in test_pred])

  ==> Training (100 iterations)

      Iteration    Log Likelihood    Accuracy
      ---------------------------------------
             1          -1.09861        0.479
             2          -0.43712        0.951
             3          -0.30547        0.997
             4          -0.23673        0.998
             5          -0.19374        0.999
             6          -0.16415        0.999
             7          -0.14249        0.999
             8          -0.12594        0.999
             9          -0.11287        0.999
            10          -0.10229        0.999
            11          -0.09354        0.999
            12          -0.08618        0.999
            13          -0.07991        0.999
            14          -0.07450        0.999
            15          -0.06978        0.999
            16          -0.06563        0.999
            17          -0.06195        0.999
            18          -0.05867        0.999
            19          -0.05573        0.999
 

('711', '33', '37', '27', '89.73', '72.97', '81.82', '77.14')

In [29]:
# Print the 50 most informative features of the joint-feature Maxent classifier.
classifier.show_most_informative_features(50)

  -9.862 word_n-1=='धकाः' and label is 'B-EC'
  -4.649 word_n-1=='नं' and label is 'B-EC'
  -4.252 word_n-1=='हे' and label is 'B-EC'
  -4.170 word_n-1=='धकाः' and label is 'I-EC'
   4.103 word_n-1,word_n=='हे,थ्व ' and label is 'B-EC'
   4.103 lab_n-1,word_n-1,word_n=='O,हे,थ्व ' and label is 'B-EC'
   3.972 word_n=='न्ह्यथसें ' and label is 'B-EC'
   3.972 word_n-1,word_n=='धकाः,न्ह्यथसें ' and label is 'B-EC'
   3.972 lab_n-1,word_n=='O,न्ह्यथसें ' and label is 'B-EC'
   3.972 lab_n-1,word_n-1,word_n=='O,धकाः,न्ह्यथसें ' and label is 'B-EC'
  -3.705 lab_n-1,word_n-1=='I-EC,धकाः' and label is 'I-EC'
   3.677 word_n-1,word_n=='हे,वय्कलं ' and label is 'B-EC'
   3.677 lab_n-1,word_n-1,word_n=='O,हे,वय्कलं ' and label is 'B-EC'
   3.442 lab_n-1,word_n-1,word_n=='I-EC,दु,</S>' and label is 'O'
  -3.429 lab_n-1,word_n=='I-EC,हे' and label is 'O'
   3.392 word_n=='नेवाःत ' and label is 'B-EC'
   3.392 word_n-1,word_n=='नं,नेवाःत ' and label is 'B-EC'
   3.392 lab_n-1,word_n=='O,नेवाःत ' an