# Train Name Entity Recognition (NER) Model

In [1]:
import pycrfsuite
from itertools import chain
import json
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import classification_report

## Load Corpus

In [2]:
train_corpus_text = open("../corpus/id_beritagar-ner-train.conllu", "r").read()
test_corpus_text = open("../corpus/id_beritagar-ner-test.conllu", "r").read()

In [3]:
train_corpus_text = train_corpus_text.split("\n\n")
test_corpus_text = test_corpus_text.split("\n\n")

In [4]:
print("Total Corpus Train: %d " % len(train_corpus_text))
print("Total Corpus Test: %d " % len(test_corpus_text))

Total Corpus Train: 2500 
Total Corpus Test: 500 


In [5]:
train_corpus = []
for corpus in train_corpus_text:
    train_corpus.append([line.split("\t") for line in corpus.split("\n")[2:]])

test_corpus = []
for corpus in test_corpus_text:
    test_corpus.append([line.split("\t") for line in corpus.split("\n")[2:]])

In [6]:
train_corpus[0]

[['1', 'Celah', 'NN', '_', 'O'],
 ['2', 'itulah', 'NN', '_', 'O'],
 ['3', 'yang', 'SC', '_', 'O'],
 ['4', 'digunakan', 'VB', '_', 'O'],
 ['5', 'hakim', 'NN', '_', 'O'],
 ['6', 'MK', 'NNP', '_', 'U-ORG'],
 ['7', 'untuk', 'SC', '_', 'O'],
 ['8', 'mempersilakan', 'VB', '_', 'O'],
 ['9', 'kuasa', 'NN', '_', 'O'],
 ['10', 'hukum', 'NN', '_', 'O'],
 ['11', 'paslon', 'NN', '_', 'O'],
 ['12', '02', 'CD', '_', 'O'],
 ['13', 'untuk', 'SC', '_', 'O'],
 ['14', 'membacakan', 'VB', '_', 'O'],
 ['15', 'permohonan', 'NN', '_', 'O'],
 ['16', 'yang', 'SC', '_', 'O'],
 ['17', 'sudah', 'MD', '_', 'O'],
 ['18', 'dikoreksi', 'VB', '_', 'O'],
 ['19', '.', 'Z', '_', 'O']]

In [7]:
test_corpus[0]

[['1', '"', 'Z', '_', 'O'],
 ['2', 'Banyak', 'CD', '_', 'O'],
 ['3', 'populasi', 'NN', '_', 'O'],
 ['4', 'dengan', 'IN', '_', 'O'],
 ['5', 'pertumbuhan', 'NN', '_', 'O'],
 ['6', 'tercepat', 'JJ', '_', 'O'],
 ['7', 'terjadi', 'VB', '_', 'O'],
 ['8', 'di', 'IN', '_', 'O'],
 ['9', 'negara-negara', 'NN', '_', 'O'],
 ['10', 'termiskin', 'JJ', '_', 'O'],
 ['11', 'di', 'IN', '_', 'O'],
 ['12', 'dunia', 'NN', '_', 'O'],
 ['13', ',', 'Z', '_', 'O'],
 ['14', '"', 'Z', '_', 'O'],
 ['15', 'kata', 'VB', '_', 'O'],
 ['16', 'Spoorenberg', 'NNP', '_', 'U-PERSON'],
 ['17', '.', 'Z', '_', 'O']]

## Features Extraction 

In [8]:
def word2features(sent, i):
    id_, word, postag, head, ner = sent[i]
    features = [
        'bias',
        'word.lower=' + word.lower(),
        'word[-4:]=' + word[-4:],
        'word[-3:]=' + word[-3:],
        'word[-2:]=' + word[-2:],
        'word.isalnum=%s' % word.isalnum(),
        'word.isalpha=%s' % word.isalpha(),
        'word.isdecimal=%s' % word.isdecimal(),
        'word.isnumeric=%s' % word.isnumeric(),
        'word.isspace=%s' % word.isspace(),
        'word.isupper=%s' % word.isupper(),
        'word.istitle=%s' % word.istitle(),
        'word.isdigit=%s' % word.isdigit(),
    ]
    if i > 0:
        id_1, word1, postag1, head1, ner1  = sent[i-1]
        features.extend([
            '-1:word.lower=' + word1.lower(),
            '-1:word.isalnum=%s' % word1.isalnum(),
            '-1:word.isalpha=%s' % word1.isalpha(),
            '-1:word.isdecimal=%s' % word1.isdecimal(),
            '-1:word.isnumeric=%s' % word1.isnumeric(),
            '-1:word.isspace=%s' % word1.isspace(),
            '-1:word.isupper=%s' % word1.isupper(),
            '-1:word.istitle=%s' % word1.istitle(),
            '-1:word.isdigit=%s' % word1.isdigit(),
        ])
    else:
        features.append('BOS')
        
    if i < len(sent)-1:
        id_1, word1, postag1, head1, ner1 = sent[i+1]
        features.extend([
            '+1:word.lower=' + word1.lower(),
            '+1:word.isalnum=%s' % word1.isalnum(),
            '+1:word.isalpha=%s' % word1.isalpha(),
            '+1:word.isdecimal=%s' % word1.isdecimal(),
            '+1:word.isnumeric=%s' % word1.isnumeric(),
            '+1:word.isspace=%s' % word1.isspace(),
            '+1:word.isupper=%s' % word1.isupper(),
            '+1:word.istitle=%s' % word1.istitle(),
            '+1:word.isdigit=%s' % word1.isdigit(),
        ])
    else:
        features.append('EOS')
                
    return features

def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [ner.replace("U-","B-").replace("L-", "I-") for id_, word, tag, head, ner in sent]

def sent2tokens(sent):
    return [word for id_, word, tag, head, ner in sent]

In [9]:
sent2features(train_corpus[0])[:1]

[['bias',
  'word.lower=celah',
  'word[-4:]=elah',
  'word[-3:]=lah',
  'word[-2:]=ah',
  'word.isalnum=True',
  'word.isalpha=True',
  'word.isdecimal=False',
  'word.isnumeric=False',
  'word.isspace=False',
  'word.isupper=False',
  'word.istitle=True',
  'word.isdigit=False',
  'BOS',
  '+1:word.lower=itulah',
  '+1:word.isalnum=True',
  '+1:word.isalpha=True',
  '+1:word.isdecimal=False',
  '+1:word.isnumeric=False',
  '+1:word.isspace=False',
  '+1:word.isupper=False',
  '+1:word.istitle=False',
  '+1:word.isdigit=False']]

## Train Model

In [10]:
%%time
X_train = [sent2features(s) for s in train_corpus]
y_train = [sent2labels(s) for s in train_corpus]

X_test = [sent2features(s) for s in test_corpus]
y_test = [sent2labels(s) for s in test_corpus]

CPU times: user 1.17 s, sys: 108 ms, total: 1.28 s
Wall time: 1.32 s


In [11]:
%%time
trainer = pycrfsuite.Trainer(verbose=False)

for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)

CPU times: user 2.06 s, sys: 83.3 ms, total: 2.14 s
Wall time: 2.27 s


In [12]:
trainer.set_params({
    'c1': 1.0,   # coefficient for L1 penalty
    'c2': 1e-3,  # coefficient for L2 penalty
    'max_iterations': 50,  # stop earlier

    # include transitions that are possible, but not observed
    'feature.possible_transitions': True
})

In [13]:
trainer.params()

['feature.minfreq',
 'feature.possible_states',
 'feature.possible_transitions',
 'c1',
 'c2',
 'max_iterations',
 'num_memories',
 'epsilon',
 'period',
 'delta',
 'linesearch',
 'max_linesearch']

## Save Model

In [14]:
%%time
trainer.train('../model/id_beritagar-ner.crfsuite')

CPU times: user 7.4 s, sys: 162 ms, total: 7.56 s
Wall time: 7.8 s


In [15]:
!ls -lh ../model/id_beritagar-ner.crfsuite

-rw-r--r--  1 abdulaziz  staff   316K Jun 25 15:48 ../model/id_beritagar-ner.crfsuite


## Load Model

In [16]:
tagger = pycrfsuite.Tagger()
tagger.open('../model/id_beritagar-ner.crfsuite')

<contextlib.closing at 0x128e23f28>

## Test Model

In [17]:
example_sent = test_corpus[100]
print(' '.join(sent2tokens(example_sent)), end='\n\n')

print("Predicted:", ' '.join(tagger.tag(sent2features(example_sent))))
print("Correct:  ", ' '.join(sent2labels(example_sent)))

Badan Meteorologi , Klimatologi , dan Geofisika ( BMKG ) memprakirakan , seluruh wilayah Sulawesi Barat berawan pagi ini , Senin ( 17 / 06 / 2019 ) . Sementara , berawan diprediksi akan terjadi di sebagian besar wilayah ini pada siang hari .

Predicted: B-ORG I-ORG I-ORG I-ORG I-ORG I-ORG I-ORG O B-ORG O O O O O B-GPE I-GPE O O O O O O O O O O O O O O O O O O O O O O O O O O O O
Correct:   B-ORG I-ORG I-ORG I-ORG I-ORG I-ORG I-ORG O B-ORG O O O O O B-GPE I-GPE O O O O O O O O O O O O O O O O O O O O O O O O O O O O


In [18]:
all_ner = []
ner = []
for i, a in enumerate(tagger.tag(sent2features(example_sent))):
    b = example_sent[i]
    if a != "O":
        split_ner = a.split("-")
        if split_ner[0] in ["B", "U"]:
            if len(ner)>0:
                all_ner.append((" ".join([z[0] for z in ner]), " ".join(set([z[1] for z in ner]))))
            ner = []
            ner.append((b[1],split_ner[1]))
        else:
            ner.append((b[1],split_ner[1]))
all_ner.append((" ".join([z[0] for z in ner]), " ".join(set([z[1] for z in ner]))))
print(all_ner)

[('Badan Meteorologi , Klimatologi , dan Geofisika', 'ORG'), ('BMKG', 'ORG'), ('Sulawesi Barat', 'GPE')]


## Evaluation Model

In [19]:
def bio_classification_report(y_true, y_pred):
    """
    Classification report for a list of BIO-encoded sequences.
    It computes token-level metrics and discards "O" labels.
    
    Note that it requires scikit-learn 0.15+ (or a version from github master)
    to calculate averages properly!
    """
    lb = LabelBinarizer()
    y_true_combined = lb.fit_transform(list(chain.from_iterable(y_true)))
    y_pred_combined = lb.transform(list(chain.from_iterable(y_pred)))
        
    tagset = set(lb.classes_) - {'O'}
    tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1])
    class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}
    
    return classification_report(
        y_true_combined,
        y_pred_combined,
        labels = [class_indices[cls] for cls in tagset],
        target_names = tagset
    )

In [20]:
%%time
y_pred = [tagger.tag(xseq) for xseq in X_test]

CPU times: user 369 ms, sys: 87.6 ms, total: 457 ms
Wall time: 662 ms


In [22]:
print(bio_classification_report(y_test, y_pred))

              precision    recall  f1-score   support

     B-EVENT       0.88      0.69      0.77        51
     I-EVENT       0.80      0.57      0.67        70
       B-GPE       0.90      0.87      0.89       359
       I-GPE       0.89      0.90      0.90       184
      B-MERK       0.94      0.58      0.71        26
       B-ORG       0.87      0.85      0.86       507
       I-ORG       0.88      0.83      0.86       434
    B-PERSON       0.84      0.78      0.81       362
    I-PERSON       0.78      0.76      0.77       156
   B-PRODUCT       0.71      0.29      0.42        17
   I-PRODUCT       0.67      0.08      0.14        26

   micro avg       0.87      0.81      0.84      2192
   macro avg       0.83      0.66      0.71      2192
weighted avg       0.86      0.81      0.83      2192
 samples avg       0.10      0.10      0.10      2192

