# Preamble

In [1]:
from flair.datasets import ColumnCorpus
from flair.embeddings import FlairEmbeddings
from flair.embeddings import TokenEmbeddings
from flair.embeddings import StackedEmbeddings
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer
from typing import List
import numpy as np
import os
import torch
import random

In [2]:
# Folder handed to ColumnCorpus below; relative to the notebook's working directory.
PATH_SPOTTING_DATASET = "../../data/concept-spotting/infoboxes/"
# Folder passed as the first argument to trainer.train() in the training cell.
PATH_FLAIR_FOLDER = "../../data/flair-models/infoboxes/"

# Infobox-Spotter: Training

In [3]:
def set_seed(seed):
    """Seed every random number generator this notebook relies on.

    Seeds NumPy, Python's ``random`` module and PyTorch (CPU and all CUDA
    devices), exports ``PYTHONHASHSEED``, and switches cuDNN to deterministic,
    non-benchmarking mode.
    (https://pytorch.org/docs/stable/notes/randomness.html)

    Args:
        seed: Integer seed applied to all generators.
    """
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    # Seed every CUDA device, not only the currently selected one.
    torch.cuda.manual_seed_all(seed)
    # NOTE: hash randomisation is fixed when the interpreter starts, so this
    # value only takes effect in processes launched after this call.
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Trade cuDNN autotuning for repeatable kernel selection.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [4]:
# Name of the label column the tagger is built around.
tag_type = "chunk_BIO"
# Maps each column index of the dataset files to its Flair column name.
columns = dict(enumerate(['text', 'pos', 'chunk_BIO']))

corpus = ColumnCorpus(data_folder=PATH_SPOTTING_DATASET,
                      column_format=columns)
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
print(corpus)

2020-10-16 11:24:04,000 Reading data from ../../data/concept-spotting/infoboxes
2020-10-16 11:24:04,001 Train: ../../data/concept-spotting/infoboxes/train.txt
2020-10-16 11:24:04,002 Dev: ../../data/concept-spotting/infoboxes/dev.txt
2020-10-16 11:24:04,003 Test: ../../data/concept-spotting/infoboxes/test.txt
Corpus: 179 train + 38 dev + 39 test sentences


In [5]:
# The seed is reset before each construction step below.
set_seed(42)
forward_lm = FlairEmbeddings('news-forward')
backward_lm = FlairEmbeddings('news-backward')
embedding_types: List[TokenEmbeddings] = [forward_lm, backward_lm]
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

set_seed(42)
tagger_config = dict(
    hidden_size=128,
    embeddings=embeddings,
    tag_dictionary=tag_dictionary,
    tag_type=tag_type,
    use_crf=True,
    dropout=0.25,
    rnn_layers=3,
)
tagger: SequenceTagger = SequenceTagger(**tagger_config)

set_seed(42)
trainer: ModelTrainer = ModelTrainer(tagger, corpus)

In [6]:
# Reset the seed right before training starts.
set_seed(42)
train_options = dict(
    learning_rate=0.2,
    mini_batch_size=16,
    max_epochs=20,
    shuffle=True,
    num_workers=0,
)
result = trainer.train(PATH_FLAIR_FOLDER, **train_options)

2020-08-14 09:54:02,007 ----------------------------------------------------------------------------------------------------
2020-08-14 09:54:02,008 Evaluation method: MICRO_F1_SCORE
2020-08-14 09:54:02,380 ----------------------------------------------------------------------------------------------------
2020-08-14 09:54:02,831 epoch 1 - iter 0/12 - loss 18.98071289
2020-08-14 09:54:05,017 epoch 1 - iter 1/12 - loss 15.99734879
2020-08-14 09:54:07,015 epoch 1 - iter 2/12 - loss 12.33985551
2020-08-14 09:54:09,030 epoch 1 - iter 3/12 - loss 10.40538406
2020-08-14 09:54:11,148 epoch 1 - iter 4/12 - loss 9.71126137
2020-08-14 09:54:13,257 epoch 1 - iter 5/12 - loss 9.60899353
2020-08-14 09:54:15,314 epoch 1 - iter 6/12 - loss 9.13590976
2020-08-14 09:54:17,575 epoch 1 - iter 7/12 - loss 8.58234262
2020-08-14 09:54:19,666 epoch 1 - iter 8/12 - loss 8.31953790
2020-08-14 09:54:22,065 epoch 1 - iter 9/12 - loss 8.16345787
2020-08-14 09:54:24,104 epoch 1 - iter 10/12 - loss 7.78753927
2020-

2020-08-14 09:57:03,610 epoch 7 - iter 11/12 - loss 1.76534521
2020-08-14 09:57:05,379 ----------------------------------------------------------------------------------------------------
2020-08-14 09:57:05,381 EPOCH 7 done: loss 1.7653 - lr 0.2000 - bad epochs 1
2020-08-14 09:57:06,349 DEV : loss 0.7082177400588989 - score 0.9714
2020-08-14 09:57:07,520 TEST : loss 1.1993913650512695 - score 0.9052
2020-08-14 09:57:09,039 ----------------------------------------------------------------------------------------------------
2020-08-14 09:57:09,155 epoch 8 - iter 0/12 - loss 0.80568194
2020-08-14 09:57:11,135 epoch 8 - iter 1/12 - loss 0.74763250
2020-08-14 09:57:13,030 epoch 8 - iter 2/12 - loss 0.67963624
2020-08-14 09:57:14,988 epoch 8 - iter 3/12 - loss 0.81095186
2020-08-14 09:57:16,889 epoch 8 - iter 4/12 - loss 0.78555005
2020-08-14 09:57:18,831 epoch 8 - iter 5/12 - loss 0.94282862
2020-08-14 09:57:20,760 epoch 8 - iter 6/12 - loss 1.18258132
2020-08-14 09:57:22,720 epoch 8 - ite

2020-08-14 09:59:59,839 epoch 14 - iter 5/12 - loss 0.53312579
2020-08-14 10:00:01,729 epoch 14 - iter 6/12 - loss 0.52798381
2020-08-14 10:00:03,673 epoch 14 - iter 7/12 - loss 0.48291794
2020-08-14 10:00:05,847 epoch 14 - iter 8/12 - loss 0.49231795
2020-08-14 10:00:07,783 epoch 14 - iter 9/12 - loss 0.45564792
2020-08-14 10:00:09,776 epoch 14 - iter 10/12 - loss 0.47498730
2020-08-14 10:00:11,677 epoch 14 - iter 11/12 - loss 0.52425854
2020-08-14 10:00:13,537 ----------------------------------------------------------------------------------------------------
2020-08-14 10:00:13,539 EPOCH 14 done: loss 0.5243 - lr 0.1000 - bad epochs 2
2020-08-14 10:00:14,512 DEV : loss 0.4634469449520111 - score 0.9714
2020-08-14 10:00:15,668 TEST : loss 0.8091835975646973 - score 0.9519
2020-08-14 10:00:17,175 ----------------------------------------------------------------------------------------------------
2020-08-14 10:00:17,281 epoch 15 - iter 0/12 - loss 1.71253371
2020-08-14 10:00:19,262 epo

2020-08-14 10:02:58,981 ----------------------------------------------------------------------------------------------------
2020-08-14 10:02:58,982 Testing using best model ...
2020-08-14 10:02:58,988 loading file ../../data/flair-data/infoboxes/best-model.pt
2020-08-14 10:03:00,718 0.9468	0.957	0.9519
2020-08-14 10:03:00,721 
MICRO_AVG: acc 0.9082 - f1-score 0.9519
MACRO_AVG: acc 0.9082 - f1-score 0.9519
C          tp: 89 - fp: 5 - fn: 4 - tn: 89 - precision: 0.9468 - recall: 0.9570 - accuracy: 0.9082 - f1-score: 0.9519
2020-08-14 10:03:00,723 ----------------------------------------------------------------------------------------------------


In [7]:
# Regression check against the score logged by the reference run (0.9519,
# printed to four decimals). Compare with a tolerance instead of exact float
# equality so sub-rounding differences do not trip the check, and report the
# observed value on failure.
EXPECTED_TEST_SCORE = 0.9519
assert abs(result['test_score'] - EXPECTED_TEST_SCORE) < 5e-5, (
    f"test_score {result['test_score']!r} differs from expected {EXPECTED_TEST_SCORE}"
)