In [1]:
!ls

README.md                    lined_up.txt
abe_simple_naive_bayes.ipynb new_dedup.txt
[31mconlleval.pl[m[m                 newfile.txt
exploreNewa.ipynb            tmp.txt
[34mfor_ml[m[m                       tmp.txt~
good_length.txt              tmp1.txt
just_ec.conll                tmp1.txt~
ki.txt                       total_true_emb
ki_true.txt


In [2]:
# Read the tab-separated CoNLL file and wrap every sentence in <S> ... </S>
# marker tokens (both labeled "O").
infilename = "just_ec.conll"
labeled_tokens = [("<S>", "O")]  # beginning-of-sentence marker for the first sentence
# Explicit UTF-8 (the tokens are Devanagari text) instead of the platform
# default, and a context manager so the file handle is always closed.
with open(infilename, encoding="utf-8") as infile:
    for line in infile:
        try:
            # A token line has exactly three tab-separated fields; only the
            # first two (token, label) are kept.
            tok, lab, _ = line.split("\t")
        except ValueError:
            # Any line without exactly three fields (presumably the blank line
            # between sentences) is treated as a sentence boundary.
            labeled_tokens.append(("</S>", "O"))  # end of sentence marker
            labeled_tokens.append(("<S>", "O"))  # beginning of sentence marker
        else:
            labeled_tokens.append((tok, lab))


In [3]:
# Look at the tokens at indices 3990-4009 to find a sentence boundary at which
# the data can be split.
list(enumerate(labeled_tokens[3990:4010]))

[(0, ('(ल्याटिन', 'I-EC')),
 (1, ('भायः', 'I-EC')),
 (2, ('नोभेमःगुं)', 'I-EC')),
 (3, ('धका', 'I-EC')),
 (4, ('धागु', 'O')),
 (5, ('खः', 'O')),
 (6, ('</S>', 'O')),
 (7, ('<S>', 'O')),
 (8, ('मुस्मांतय् ', 'B-EC')),
 (9, ('बाहुल्यता', 'I-EC')),
 (10, ('दुगु', 'I-EC')),
 (11, ('नेपालगञ्जय्', 'I-EC')),
 (12, ('‘मुस्लिम', 'I-EC')),
 (13, ('विरोधी', 'I-EC')),
 (14, ('सरकार', 'I-EC')),
 (15, ('मूर्दावाद', 'I-EC')),
 (16, ('!’', 'I-EC')),
 (17, ('धकाः', 'I-EC')),
 (18, ('च्वयातःगु', 'O')),
 (19, ('ब्यानर', 'O'))]

In [4]:
# Confirm that the token at index 3996 is an end-of-sentence marker.
labeled_tokens[3996] # end of a sentence

('</S>', 'O')

In [5]:
# Split right after the end-of-sentence marker at index 3996, so the training
# set ends with </S> and the test set starts with <S>.
SPLIT_INDEX = 3997
# Guard against the input file changing: the split must not cut a sentence in half.
assert labeled_tokens[SPLIT_INDEX - 1] == ("</S>", "O"), "split index must follow an end-of-sentence marker"
train, test = labeled_tokens[:SPLIT_INDEX], labeled_tokens[SPLIT_INDEX:]

In [6]:
# Sanity check: show the last training token and the first test token.
print(train[-1:])
print(test[0])

[('</S>', 'O')]
('<S>', 'O')


In [7]:
import nltk

In [8]:
# cf. the NLTK book, chapter 6 ("Learning to Classify Text"): https://www.nltk.org/book/ch06.html

In [9]:
def featfn_just_word(tok):
    """Build the featureset for one token: the token itself under the key "word"."""
    return dict(word=tok)

In [10]:
# Build (featureset, label) pairs through the shared feature function, so that
# training uses exactly the same feature extraction as later classify() calls.
trainfeat = [(featfn_just_word(tok), lab) for (tok, lab) in train]
testfeat  = [(featfn_just_word(tok), lab) for (tok, lab) in test]
len(trainfeat), len(testfeat)

(3997, 711)

In [11]:
# Fit NLTK's Naive Bayes classifier on the (featureset, label) training pairs.
classifier = nltk.NaiveBayesClassifier.train(trainfeat)

In [12]:
# Show the featureset produced for the sentence-start marker.
featfn_just_word('<S>')

{'word': '<S>'}

In [13]:
# Predicted label for the sentence-start marker.
classifier.classify(featfn_just_word('<S>'))

'O'

In [14]:
# Predicted label for a single example word.
classifier.classify(featfn_just_word('छगू'))

'I-EC'

In [15]:
# prediction/inference: classify the featureset of every training and test item
train_pred = [classifier.classify(feats) for feats, _ in trainfeat]
test_pred = [classifier.classify(feats) for feats, _ in testfeat]

In [16]:
import subprocess
import sys

In [17]:
# Unzip the (token, label) pairs into parallel tuples of tokens and gold labels.
train_toks, train_true = zip(*train)
test_toks, test_true = zip(*test)

In [18]:
# First five tokens, gold labels and predicted labels, as three parallel sequences.
train_toks[:5], train_true[:5], train_pred[:5]

(('<S>', 'थः ', 'नं', 'छम्ह', 'शक्तिशाली'),
 ('O', 'B-EC', 'I-EC', 'I-EC', 'I-EC'),
 ['O', 'O', 'O', 'I-EC', 'I-EC'])

In [19]:
# The same five items, one (token, gold, predicted) row each.
[row for row in zip(train_toks[:5], train_true[:5], train_pred[:5])]

[('<S>', 'O', 'O'),
 ('थः ', 'B-EC', 'O'),
 ('नं', 'I-EC', 'O'),
 ('छम्ह', 'I-EC', 'I-EC'),
 ('शक्तिशाली', 'I-EC', 'I-EC')]

In [20]:
# Print the first five (token, gold, predicted) rows, space-separated.
for row in zip(train_toks[:5], train_true[:5], train_pred[:5]):
    print(*row)

<S> O O
थः  B-EC O
नं I-EC O
छम्ह I-EC I-EC
शक्तिशाली I-EC I-EC


In [21]:
# Headache: feeding the predictions to the conlleval.pl scoring script by hand.
# Each input line is "token<TAB>gold<TAB>predicted"; "-d \t" tells the script
# that the columns are tab-delimited.
rows = "".join(
    "\t".join([tok, true, pred]) + "\n"
    for tok, true, pred in zip(train_toks, train_true, train_pred)
)
# A single process is enough: communicate() writes the input, closes stdin,
# reads all of stdout and waits for the script to exit, so there is no
# pass-through `cat` process to leave behind and no hand-managed pipe.
conlleval = subprocess.Popen(["./conlleval.pl", "-d", "\t"],
                             stdin=subprocess.PIPE, stdout=subprocess.PIPE)
output = conlleval.communicate(rows.encode("utf-8"))[0]  # raw bytes from the script
print(output.decode("utf-8") )


processed 3997 tokens with 168 phrases; found: 572 phrases; correct: 19.
accuracy:  84.14%; precision:   3.32%; recall:  11.31%; FB1:   5.14
               EC: precision:   3.32%; recall:  11.31%; FB1:   5.14  572



In [22]:
import re

# The two summary lines that conlleval.pl prints first, e.g.
#   processed 3997 tokens with 168 phrases; found: 572 phrases; correct: 19.
#   accuracy:  84.14%; precision:   3.32%; recall:  11.31%; FB1:   5.14
_CONLLEVAL_SUMMARY = re.compile(
    r"processed (\d+) tokens with (\d+) phrases; *found: (\d*) phrases; *correct: (\d+)\.\n"
    r"accuracy: *(\d*\.\d*)%; *precision: *(\d*\.\d*)%; *recall: +(\d*\.\d*)%; *FB1: *(\d*\.\d*)"
)


def evaluate(toks, trues, preds):
    """Score predictions with ./conlleval.pl and return its summary figures.

    Writes one "token<TAB>gold<TAB>predicted" line per item to the script,
    prints the script's full report, and parses the overall summary.

    Args:
        toks: sequence of token strings.
        trues: sequence of gold label strings, parallel to ``toks``.
        preds: sequence of predicted label strings, parallel to ``toks``.

    Returns:
        A tuple of eight strings, exactly as printed by the script:
        (num_tokens, num_phrases, num_found, num_correct,
         accuracy, precision, recall, fb1).

    Raises:
        ValueError: if the script's output does not contain the summary lines
            (for example because the script failed).
    """
    rows = "".join(
        "\t".join([tok, true, pred]) + "\n"
        for tok, true, pred in zip(toks, trues, preds)
    )
    # One process is enough: communicate() writes the input, closes stdin,
    # reads all of stdout and waits for the script, so nothing is left running.
    conlleval = subprocess.Popen(["./conlleval.pl", "-d", "\t"],
                                 stdin=subprocess.PIPE, stdout=subprocess.PIPE)
    output = conlleval.communicate(rows.encode("utf-8"))[0].decode("utf-8")
    print(output)
    match = _CONLLEVAL_SUMMARY.search(output)
    if match is None:
        raise ValueError("could not find the conlleval summary in the output: %r" % output)
    return match.groups()


In [23]:
# Score the training-set predictions; m holds the parsed summary tuple.
m = evaluate(train_toks, train_true, train_pred)

processed 3997 tokens with 168 phrases; found: 572 phrases; correct: 19.
accuracy:  84.14%; precision:   3.32%; recall:  11.31%; FB1:   5.14
               EC: precision:   3.32%; recall:  11.31%; FB1:   5.14  572



In [24]:
# Display the parsed summary tuple returned by evaluate().
m

('3997', '168', '572', '19', '84.14', '3.32', '11.31', '5.14')

In [25]:
# Score the predictions on the held-out test set.
evaluate(test_toks, test_true, test_pred)

processed 711 tokens with 33 phrases; found: 126 phrases; correct: 0.
accuracy:  52.88%; precision:   0.00%; recall:   0.00%; FB1:   0.00
               EC: precision:   0.00%; recall:   0.00%; FB1:   0.00  126



('711', '33', '126', '0', '52.88', '0.00', '0.00', '0.00')

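I.e., very bad performance: on the test set the word-only Naive Bayes model finds 126 phrases, none of them correct (precision, recall and FB1 are all 0.00%), and token-level accuracy is only 52.88%.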