# Preamble

In [1]:
from flair.datasets import ColumnCorpus
from flair.embeddings import FlairEmbeddings
from flair.embeddings import TokenEmbeddings
from flair.embeddings import StackedEmbeddings
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer
from typing import List
import numpy as np
import os
import torch
import random

In [2]:
# Folder containing the column-formatted list-spotting dataset; it is passed
# to ColumnCorpus further down. The path is relative, so it is resolved
# against the directory the notebook kernel is started from.
PATH_SPOTTING_DATASET = "../../data/concept-spotting/lists/"
# Folder handed to trainer.train() as its first argument.
# NOTE(review): the saved training log shows best-model.pt being loaded from
# "../../data/flair-data/lists/", which does not match this value -- the
# stored output may come from an earlier run; confirm and re-run if needed.
PATH_FLAIR_FOLDER = "../../data/flair-models/lists/"

# List-Spotter: Training

In [3]:
def set_seed(seed):
    """Seed the random number generators used in this notebook.

    Based on the PyTorch reproducibility notes
    (https://pytorch.org/docs/stable/notes/randomness.html).

    Args:
        seed: Integer seed handed to NumPy, Python's ``random`` module and
            PyTorch (default generator and current CUDA device). Its string
            form is also written to the ``PYTHONHASHSEED`` environment
            variable.

    Returns:
        None. The function only has side effects on global RNG state,
        ``os.environ`` and the cuDNN backend flags.
    """
    seeders = (
        np.random.seed,
        random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # NOTE(review): CPython reads PYTHONHASHSEED at interpreter start-up, so
    # this assignment probably only influences processes spawned afterwards,
    # not the hash randomisation of the running kernel -- confirm.
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Request deterministic cuDNN kernels and disable its benchmark mode.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

In [4]:
# Label column the tagger is trained on; it is also the last column of the
# data files, so the column map below is built from the same name.
tag_type = "chunk_BIO"

# Column index -> column name, i.e. {0: 'text', 1: 'pos', 2: 'chunk_BIO'}.
# (Column 1 presumably holds part-of-speech tags -- verify against the data.)
columns = dict(enumerate(("text", "pos", tag_type)))

# Load the dataset from PATH_SPOTTING_DATASET and collect the set of labels
# that occur in the `tag_type` column.
corpus = ColumnCorpus(PATH_SPOTTING_DATASET, columns)
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

# Show the corpus summary as a quick sanity check.
print(corpus)

2020-10-16 11:24:22,193 Reading data from ../../data/concept-spotting/lists
2020-10-16 11:24:22,194 Train: ../../data/concept-spotting/lists/train.txt
2020-10-16 11:24:22,195 Dev: ../../data/concept-spotting/lists/dev.txt
2020-10-16 11:24:22,195 Test: ../../data/concept-spotting/lists/test.txt
Corpus: 358 train + 76 dev + 78 test sentences


In [5]:
# The seed is reset before each construction step below (embeddings, tagger,
# trainer) -- presumably so that each step starts from the same RNG state
# even when the cell is edited or partially re-run.

# 1) Token representation: forward and backward Flair language-model
#    embeddings, combined into a single stacked embedding.
set_seed(42)
forward_embedding = FlairEmbeddings('news-forward')
backward_embedding = FlairEmbeddings('news-backward')
embedding_types: List[TokenEmbeddings] = [forward_embedding, backward_embedding]
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

# 2) Sequence tagger on top of the stacked embeddings, predicting the labels
#    collected in `tag_dictionary` for the `tag_type` column.
set_seed(42)
tagger: SequenceTagger = SequenceTagger(
    hidden_size=128,
    embeddings=embeddings,
    tag_dictionary=tag_dictionary,
    tag_type=tag_type,
    use_crf=True,
    dropout=0.25,
    rnn_layers=2,
)

# 3) Trainer that couples the tagger with the corpus loaded earlier.
set_seed(42)
trainer: ModelTrainer = ModelTrainer(tagger, corpus)

In [6]:
# Hyper-parameters of the training run, collected in one place so they are
# easy to read and tweak.
train_options = {
    "learning_rate": 0.3,
    "mini_batch_size": 16,
    "max_epochs": 20,
    "shuffle": True,
    "num_workers": 0,
}

# Reset the RNGs right before training, then train and write the run's
# artefacts under PATH_FLAIR_FOLDER. `result` is inspected by the check in
# the following cell (via result['test_score']).
set_seed(42)
result = trainer.train(PATH_FLAIR_FOLDER, **train_options)

2020-08-14 09:45:27,946 ----------------------------------------------------------------------------------------------------
2020-08-14 09:45:27,948 Evaluation method: MICRO_F1_SCORE
2020-08-14 09:45:28,579 ----------------------------------------------------------------------------------------------------
2020-08-14 09:45:28,817 epoch 1 - iter 0/23 - loss 8.79233646
2020-08-14 09:45:30,617 epoch 1 - iter 2/23 - loss 6.16552440
2020-08-14 09:45:32,498 epoch 1 - iter 4/23 - loss 5.29194376
2020-08-14 09:45:34,572 epoch 1 - iter 6/23 - loss 5.03094617
2020-08-14 09:45:36,524 epoch 1 - iter 8/23 - loss 4.69931691
2020-08-14 09:45:38,505 epoch 1 - iter 10/23 - loss 4.50073297
2020-08-14 09:45:40,560 epoch 1 - iter 12/23 - loss 4.20511235
2020-08-14 09:45:42,548 epoch 1 - iter 14/23 - loss 3.93507350
2020-08-14 09:45:44,569 epoch 1 - iter 16/23 - loss 3.93161163
2020-08-14 09:45:46,344 epoch 1 - iter 18/23 - loss 3.62402451
2020-08-14 09:45:48,263 epoch 1 - iter 20/23 - loss 3.53260322
2020

2020-08-14 09:48:06,104 epoch 7 - iter 20/23 - loss 0.71229163
2020-08-14 09:48:07,764 epoch 7 - iter 22/23 - loss 0.73737930
2020-08-14 09:48:09,227 ----------------------------------------------------------------------------------------------------
2020-08-14 09:48:09,229 EPOCH 7 done: loss 0.7374 - lr 0.3000 - bad epochs 3
2020-08-14 09:48:10,168 DEV : loss 0.33759820461273193 - score 0.8304
2020-08-14 09:48:11,181 TEST : loss 0.5285789370536804 - score 0.8945
Epoch     6: reducing learning rate of group 0 to 1.5000e-01.
2020-08-14 09:48:11,185 ----------------------------------------------------------------------------------------------------
2020-08-14 09:48:11,267 epoch 8 - iter 0/23 - loss 0.05725226
2020-08-14 09:48:12,917 epoch 8 - iter 2/23 - loss 0.66863814
2020-08-14 09:48:14,580 epoch 8 - iter 4/23 - loss 0.59104909
2020-08-14 09:48:16,275 epoch 8 - iter 6/23 - loss 0.53983421
2020-08-14 09:48:17,979 epoch 8 - iter 8/23 - loss 0.49011072
2020-08-14 09:48:19,657 epoch 8 - i

2020-08-14 09:50:31,600 epoch 14 - iter 6/23 - loss 0.47111191
2020-08-14 09:50:33,069 epoch 14 - iter 8/23 - loss 0.45031701
2020-08-14 09:50:34,687 epoch 14 - iter 10/23 - loss 0.44408365
2020-08-14 09:50:36,299 epoch 14 - iter 12/23 - loss 0.43639244
2020-08-14 09:50:37,938 epoch 14 - iter 14/23 - loss 0.42225286
2020-08-14 09:50:39,570 epoch 14 - iter 16/23 - loss 0.41832784
2020-08-14 09:50:41,207 epoch 14 - iter 18/23 - loss 0.41061636
2020-08-14 09:50:42,871 epoch 14 - iter 20/23 - loss 0.38532212
2020-08-14 09:50:44,443 epoch 14 - iter 22/23 - loss 0.36137966
2020-08-14 09:50:45,854 ----------------------------------------------------------------------------------------------------
2020-08-14 09:50:45,855 EPOCH 14 done: loss 0.3614 - lr 0.1500 - bad epochs 2
2020-08-14 09:50:46,808 DEV : loss 0.3322215676307678 - score 0.8735
2020-08-14 09:50:47,907 TEST : loss 0.6000284552574158 - score 0.8844
2020-08-14 09:50:47,912 ------------------------------------------------------------

2020-08-14 09:52:57,957 EPOCH 20 done: loss 0.3158 - lr 0.0375 - bad epochs 0
2020-08-14 09:52:58,905 DEV : loss 0.3610283136367798 - score 0.8902
2020-08-14 09:53:00,003 TEST : loss 0.6045132279396057 - score 0.9154
2020-08-14 09:53:03,163 ----------------------------------------------------------------------------------------------------
2020-08-14 09:53:03,168 Testing using best model ...
2020-08-14 09:53:03,173 loading file ../../data/flair-data/lists/best-model.pt
2020-08-14 09:53:06,737 0.92	0.9109	0.9154
2020-08-14 09:53:06,741 
MICRO_AVG: acc 0.844 - f1-score 0.9154
MACRO_AVG: acc 0.844 - f1-score 0.9154
C          tp: 92 - fp: 8 - fn: 9 - tn: 92 - precision: 0.9200 - recall: 0.9109 - accuracy: 0.8440 - f1-score: 0.9154
2020-08-14 09:53:06,743 ----------------------------------------------------------------------------------------------------


In [7]:
# Regression check: the final test score must reproduce the value reported in
# the training log (f1-score 0.9154). The comparison is done after rounding to
# the four decimals shown in the log instead of using exact float equality, so
# it does not break if the score is returned unrounded (tp=92, fp=8, fn=9
# gives 184/201 = 0.91542...). A message is attached so a failure shows the
# score that was actually obtained.
assert round(result['test_score'], 4) == 0.9154, (
    "unexpected test score: {!r} (expected 0.9154)".format(result['test_score'])
)