# Train Part-of-Speech Tagging (POS Tag) Model

In [1]:
import pycrfsuite
from itertools import chain
import json
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import classification_report

## Load Corpus

In [2]:
# Read both corpus files in full. The context managers guarantee that the file
# handles are closed, and the explicit encoding keeps decoding independent of
# the platform default.
with open("../corpus/id_beritagar-ner-train.conllu", "r", encoding="utf-8") as train_file:
    train_corpus_text = train_file.read()
with open("../corpus/id_beritagar-ner-test.conllu", "r", encoding="utf-8") as test_file:
    test_corpus_text = test_file.read()

In [3]:
# Cut each raw corpus string into sentence blocks; blocks are separated by an
# empty line. Both names are rebound from str to list of str.
train_corpus_text, test_corpus_text = (
    raw_text.split("\n\n") for raw_text in (train_corpus_text, test_corpus_text)
)

In [4]:
# Report how many blank-line-separated sentence blocks each split contains.
print("Total Corpus Train: %d " % len(train_corpus_text))
print("Total Corpus Test: %d " % len(test_corpus_text))

Total Corpus Train: 2500 
Total Corpus Test: 500 


In [5]:
def parse_corpus(sentence_blocks, n_header_lines=2):
    """Convert raw sentence blocks into lists of token rows.

    Parameters
    ----------
    sentence_blocks : iterable of str
        One string per sentence, with one token per line and tab-separated columns.
    n_header_lines : int, optional
        Number of leading lines to drop from every block before reading tokens
        (presumably the per-sentence comment lines -- confirm against the corpus files).

    Returns
    -------
    list of list of list of str
        One entry per sentence; each sentence is a list of rows and each row is
        the list of column values of one token.
    """
    sentences = []
    for block in sentence_blocks:
        rows = [
            line.split("\t")
            for line in block.split("\n")[n_header_lines:]
            if line.strip()  # ignore blank lines, e.g. a trailing newline at end of file
        ]
        # A block without token lines (such as the empty string left after the
        # final separator) would only add an empty training sequence.
        if rows:
            sentences.append(rows)
    return sentences


train_corpus = parse_corpus(train_corpus_text)
test_corpus = parse_corpus(test_corpus_text)

In [6]:
# Inspect the first parsed training sentence (one list of tab-separated columns per token).
train_corpus[0]

[['1', 'Celah', 'NN', '_', 'O'],
 ['2', 'itulah', 'NN', '_', 'O'],
 ['3', 'yang', 'SC', '_', 'O'],
 ['4', 'digunakan', 'VB', '_', 'O'],
 ['5', 'hakim', 'NN', '_', 'O'],
 ['6', 'MK', 'NNP', '_', 'U-ORG'],
 ['7', 'untuk', 'SC', '_', 'O'],
 ['8', 'mempersilakan', 'VB', '_', 'O'],
 ['9', 'kuasa', 'NN', '_', 'O'],
 ['10', 'hukum', 'NN', '_', 'O'],
 ['11', 'paslon', 'NN', '_', 'O'],
 ['12', '02', 'CD', '_', 'O'],
 ['13', 'untuk', 'SC', '_', 'O'],
 ['14', 'membacakan', 'VB', '_', 'O'],
 ['15', 'permohonan', 'NN', '_', 'O'],
 ['16', 'yang', 'SC', '_', 'O'],
 ['17', 'sudah', 'MD', '_', 'O'],
 ['18', 'dikoreksi', 'VB', '_', 'O'],
 ['19', '.', 'Z', '_', 'O']]

In [7]:
# Inspect the first parsed test sentence (one list of tab-separated columns per token).
test_corpus[0]

[['1', '"', 'Z', '_', 'O'],
 ['2', 'Banyak', 'CD', '_', 'O'],
 ['3', 'populasi', 'NN', '_', 'O'],
 ['4', 'dengan', 'IN', '_', 'O'],
 ['5', 'pertumbuhan', 'NN', '_', 'O'],
 ['6', 'tercepat', 'JJ', '_', 'O'],
 ['7', 'terjadi', 'VB', '_', 'O'],
 ['8', 'di', 'IN', '_', 'O'],
 ['9', 'negara-negara', 'NN', '_', 'O'],
 ['10', 'termiskin', 'JJ', '_', 'O'],
 ['11', 'di', 'IN', '_', 'O'],
 ['12', 'dunia', 'NN', '_', 'O'],
 ['13', ',', 'Z', '_', 'O'],
 ['14', '"', 'Z', '_', 'O'],
 ['15', 'kata', 'VB', '_', 'O'],
 ['16', 'Spoorenberg', 'NNP', '_', 'U-PERSON'],
 ['17', '.', 'Z', '_', 'O']]

## Feature Extraction

In [8]:
# Names of the ``str`` predicate methods turned into boolean features, in the
# order in which they are emitted.
_WORD_PREDICATES = (
    'isalnum',
    'isalpha',
    'isdecimal',
    'isnumeric',
    'isspace',
    'isupper',
    'istitle',
    'isdigit',
)


def _predicate_features(word, prefix=''):
    """Return one ``<prefix>word.<predicate>=<bool>`` feature per predicate in ``_WORD_PREDICATES``."""
    return [
        '%sword.%s=%s' % (prefix, name, getattr(word, name)())
        for name in _WORD_PREDICATES
    ]


def word2features(sent, i):
    """Build the feature strings describing the token at position ``i`` of ``sent``.

    Parameters
    ----------
    sent : sequence of rows
        Each row is an indexable sequence whose element at index 1 is the word
        form. Any further columns are ignored, so rows need at least two columns.
    i : int
        Position of the token inside ``sent``.

    Returns
    -------
    list of str
        ``'bias'``, the lower-cased form, the last 4/3/2 characters and the
        character-class flags of the token itself, followed by the lower-cased
        form and flags of the previous token (or ``'BOS'`` when there is none)
        and of the next token (or ``'EOS'`` when there is none).
    """
    word = sent[i][1]
    features = [
        'bias',
        'word.lower=' + word.lower(),
        'word[-4:]=' + word[-4:],
        'word[-3:]=' + word[-3:],
        'word[-2:]=' + word[-2:],
    ]
    features.extend(_predicate_features(word))

    if i > 0:
        previous_word = sent[i - 1][1]
        features.append('-1:word.lower=' + previous_word.lower())
        features.extend(_predicate_features(previous_word, '-1:'))
    else:
        # First token of the sentence.
        features.append('BOS')

    if i < len(sent) - 1:
        next_word = sent[i + 1][1]
        features.append('+1:word.lower=' + next_word.lower())
        features.extend(_predicate_features(next_word, '+1:'))
    else:
        # Last token of the sentence.
        features.append('EOS')

    return features

def sent2features(sent):
    """Return the feature list of every token in ``sent``, in sentence order."""
    per_token_features = []
    for position in range(len(sent)):
        per_token_features.append(word2features(sent, position))
    return per_token_features

def sent2labels(sent):
    """Return the third column (the tag used as training label) of every row in ``sent``.

    Every row must unpack into exactly five values.
    """
    labels = []
    for _id, _form, pos_tag, _head, _ner in sent:
        labels.append(pos_tag)
    return labels

def sent2tokens(sent):
    """Return the word form (column index 1) of every row in ``sent``.

    Only the word column is read, so rows may carry any number of additional
    columns; this lets the function be used on rows that have no tag columns.
    """
    return [row[1] for row in sent]

In [9]:
# Show the feature list generated for the first token of the first training sentence.
sent2features(train_corpus[0])[:1]

[['bias',
  'word.lower=celah',
  'word[-4:]=elah',
  'word[-3:]=lah',
  'word[-2:]=ah',
  'word.isalnum=True',
  'word.isalpha=True',
  'word.isdecimal=False',
  'word.isnumeric=False',
  'word.isspace=False',
  'word.isupper=False',
  'word.istitle=True',
  'word.isdigit=False',
  'BOS',
  '+1:word.lower=itulah',
  '+1:word.isalnum=True',
  '+1:word.isalpha=True',
  '+1:word.isdecimal=False',
  '+1:word.isnumeric=False',
  '+1:word.isspace=False',
  '+1:word.isupper=False',
  '+1:word.istitle=False',
  '+1:word.isdigit=False']]

## Train Model

In [10]:
%%time
# Turn every sentence into its per-token feature lists (X) and its tag sequence (y).
X_train = list(map(sent2features, train_corpus))
y_train = list(map(sent2labels, train_corpus))

X_test = list(map(sent2features, test_corpus))
y_test = list(map(sent2labels, test_corpus))

CPU times: user 1.04 s, sys: 81.4 ms, total: 1.12 s
Wall time: 1.13 s


In [11]:
%%time
# Create a CRFsuite trainer with verbose output turned off.
trainer = pycrfsuite.Trainer(verbose=False)

# Register every training sentence as one (feature sequence, label sequence) pair.
for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)

CPU times: user 1.77 s, sys: 35.8 ms, total: 1.81 s
Wall time: 1.84 s


In [12]:
# Configure the training hyperparameters before training starts.
trainer.set_params({
    'c1': 1.0,   # coefficient for L1 penalty
    'c2': 1e-3,  # coefficient for L2 penalty
    'max_iterations': 50,  # cap the number of training iterations (stop earlier)

    # include transitions that are possible, but not observed
    'feature.possible_transitions': True
})

In [13]:
# List the parameter names exposed by the trainer.
trainer.params()

['feature.minfreq',
 'feature.possible_states',
 'feature.possible_transitions',
 'c1',
 'c2',
 'max_iterations',
 'num_memories',
 'epsilon',
 'period',
 'delta',
 'linesearch',
 'max_linesearch']

## Save Model

In [14]:
%%time
# Train the CRF model; the path passed here is the model file that is listed and re-opened further down.
trainer.train('../model/id_beritagar-postag.crfsuite')

CPU times: user 12.5 s, sys: 64.5 ms, total: 12.6 s
Wall time: 12.6 s


In [15]:
# Check that the model file exists and see its size.
!ls -lh ../model/id_beritagar-postag.crfsuite

-rw-r--r--  1 abdulaziz  staff   418K Jun 25 15:47 ../model/id_beritagar-postag.crfsuite


## Load Model

In [16]:
# Open the trained model file in a fresh tagger for inference.
tagger = pycrfsuite.Tagger()
tagger.open('../model/id_beritagar-postag.crfsuite')

<contextlib.closing at 0x121d4a240>

## Test Model

In [17]:
# Pick one test sentence and print its tokens, followed by a blank line.
example_sent = test_corpus[100]
print(' '.join(sent2tokens(example_sent)), end='\n\n')

# Compare the tags predicted by the model with the tags stored in the corpus.
print("Predicted:", ' '.join(tagger.tag(sent2features(example_sent))))
print("Correct:  ", ' '.join(sent2labels(example_sent)))

Badan Meteorologi , Klimatologi , dan Geofisika ( BMKG ) memprakirakan , seluruh wilayah Sulawesi Barat berawan pagi ini , Senin ( 17 / 06 / 2019 ) . Sementara , berawan diprediksi akan terjadi di sebagian besar wilayah ini pada siang hari .

Predicted: NNP NNP Z NNP Z CC NNP Z NNP Z VB Z CD NN NNP NNP VB NN PR Z NNP Z CD Z CD Z CD Z Z SC Z NN NN MD VB IN NN JJ NN PR IN NN NN Z
Correct:   NNP NNP Z NNP Z CC NNP Z NNP Z VB Z CD NN NNP NNP VB NN PR Z NNP Z CD Z CD Z CD Z Z SC Z NN NN MD VB IN NN JJ NN PR IN NN NN Z


## Evaluate Model

In [18]:
def postag_classification_report(y_true, y_pred):
    """
    Build a token-level, per-tag classification report for tag sequences.

    ``y_true`` and ``y_pred`` are iterables of tag sequences (one sequence per
    sentence). Both are flattened, binarized with a ``LabelBinarizer`` fitted
    on the flattened ``y_true`` tags, and passed to scikit-learn's
    ``classification_report``, whose result is returned.

    A tag literally named "O" is left out of the report. The remaining tags
    are ordered by the part after their first "-" (when present) and then by
    the part before it; tags without "-" are ordered by their full name.

    Note carried over from the original docstring: scikit-learn 0.15+ is
    required for the averages to be calculated properly.
    """
    gold_tags = list(chain.from_iterable(y_true))
    predicted_tags = list(chain.from_iterable(y_pred))

    binarizer = LabelBinarizer()
    gold_matrix = binarizer.fit_transform(gold_tags)
    predicted_matrix = binarizer.transform(predicted_tags)

    def _sort_key(tag):
        # "B-ORG" -> ["ORG", "B"]; "NN" -> ["NN"]
        return tag.split('-', 1)[::-1]

    reported_tags = sorted(set(binarizer.classes_) - {'O'}, key=_sort_key)

    # Map every class known to the binarizer to its column in the binarized matrices.
    column_of = {}
    for column, cls in enumerate(binarizer.classes_):
        column_of[cls] = column

    return classification_report(
        gold_matrix,
        predicted_matrix,
        labels=[column_of[tag] for tag in reported_tags],
        target_names=reported_tags,
    )

In [19]:
%%time
# Predict a tag sequence for every test sentence.
y_pred = list(map(tagger.tag, X_test))

CPU times: user 334 ms, sys: 11.6 ms, total: 345 ms
Wall time: 346 ms


In [20]:
# Print the per-tag classification report for the test-set predictions.
print(postag_classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          CC       0.99      0.99      0.99       477
          CD       0.98      0.98      0.98      1044
          DT       1.00      1.00      1.00        25
          FW       0.95      0.62      0.75        65
          IN       0.99      0.99      0.99      1273
          JJ       0.95      0.90      0.92       498
          MD       0.99      0.99      0.99       315
         NEG       0.98      0.97      0.98       122
          NN       0.95      0.97      0.96      3861
         NND       0.95      0.90      0.93        42
         NNP       0.98      0.98      0.98      2763
          OD       0.93      0.88      0.90        32
          PR       1.00      1.00      1.00       380
         PRP       0.99      0.98      0.98       125
          RB       0.96      0.96      0.96       490
          RP       0.96      0.96      0.96        23
          SC       0.99      0.97      0.98       585
         SYM       1.00    