# Preamble

In [1]:
from flair.datasets import ColumnCorpus
from flair.embeddings import FlairEmbeddings
from flair.embeddings import CharacterEmbeddings
from flair.embeddings import TokenEmbeddings
from flair.embeddings import StackedEmbeddings
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer
from typing import List
import numpy as np
import os
import torch
import random

In [2]:
# Folder containing the concept-spotting corpus; it is passed to ColumnCorpus
# as the data folder (the recorded log shows train.txt, dev.txt and test.txt
# being read from it).
PATH_SPOTTING_DATASET = "../../data/concept-spotting/sentences/"
# Folder handed to ModelTrainer.train() as its first argument.
# NOTE(review): presumably where flair writes checkpoints and logs - confirm.
PATH_FLAIR_FOLDER = "../../data/flair-models/sentences/"

# Sentence-Spotter: Training

In [3]:
def set_seed(seed):
    """Seed every random number generator used by this notebook.

    Follows the PyTorch reproducibility notes
    (https://pytorch.org/docs/stable/notes/randomness.html).

    Args:
        seed: Integer seed applied to NumPy, Python's ``random`` module and
            PyTorch (CPU and all CUDA devices).

    Side effects:
        Sets ``PYTHONHASHSEED`` in ``os.environ`` and switches cuDNN to
        deterministic mode with auto-tuning (benchmark) disabled.
    """
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    # Seed every CUDA device rather than only the current one; the call is
    # silently ignored when CUDA is not available.
    torch.cuda.manual_seed_all(seed)
    # NOTE: the interpreter reads PYTHONHASHSEED at start-up, so this does not
    # change string hashing in the already-running kernel; it only reaches
    # processes launched afterwards that inherit the environment.
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Trade cuDNN speed for run-to-run determinism.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [5]:
# Label layer the tagger is trained on; it is also the name given to the third
# file column below, so it is defined once and reused.
tag_type = "chunk_BIO"

# Map each file column index to the annotation layer it holds.
columns = {0: 'text', 1: 'pos', 2: tag_type}

corpus = ColumnCorpus(PATH_SPOTTING_DATASET, column_format=columns)
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

# Print the split sizes (train / dev / test sentence counts).
print(corpus)

2020-10-16 11:22:55,358 Reading data from ../../data/concept-spotting/sentences
2020-10-16 11:22:55,359 Train: ../../data/concept-spotting/sentences/train.txt
2020-10-16 11:22:55,360 Dev: ../../data/concept-spotting/sentences/dev.txt
2020-10-16 11:22:55,361 Test: ../../data/concept-spotting/sentences/test.txt
Corpus: 583 train + 127 dev + 122 test sentences


In [6]:
# The seed is reset before each construction step (embeddings, tagger,
# trainer) - presumably so that each component starts from the same RNG state
# regardless of how many random draws the previous step consumed.
set_seed(42)
embedding_types: List[TokenEmbeddings] = [
    CharacterEmbeddings(),
    FlairEmbeddings('news-forward'),
    FlairEmbeddings('news-backward'),
]
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

set_seed(42)
tagger: SequenceTagger = SequenceTagger(
    embeddings=embeddings,
    tag_dictionary=tag_dictionary,
    tag_type=tag_type,
    hidden_size=64,
    rnn_layers=2,
    dropout=0.25,
    use_crf=True,
)

set_seed(42)
trainer: ModelTrainer = ModelTrainer(tagger, corpus)

In [7]:
# Reset the RNGs immediately before training so that batch shuffling starts
# from a known state.
set_seed(42)

# Train for a fixed 20 epochs; outputs go to PATH_FLAIR_FOLDER.
result = trainer.train(
    PATH_FLAIR_FOLDER,
    max_epochs=20,
    learning_rate=0.2,
    mini_batch_size=32,
    shuffle=True,
    num_workers=0,
)

2020-08-14 10:08:57,226 ----------------------------------------------------------------------------------------------------
2020-08-14 10:08:57,228 Evaluation method: MICRO_F1_SCORE
2020-08-14 10:08:57,674 ----------------------------------------------------------------------------------------------------
2020-08-14 10:08:59,212 epoch 1 - iter 0/19 - loss 76.63780212
2020-08-14 10:09:02,610 epoch 1 - iter 1/19 - loss 60.40762711
2020-08-14 10:09:05,838 epoch 1 - iter 2/19 - loss 49.04114660
2020-08-14 10:09:09,761 epoch 1 - iter 3/19 - loss 46.01602268
2020-08-14 10:09:13,178 epoch 1 - iter 4/19 - loss 41.76025848
2020-08-14 10:09:16,692 epoch 1 - iter 5/19 - loss 39.94511986
2020-08-14 10:09:20,100 epoch 1 - iter 6/19 - loss 37.34616525
2020-08-14 10:09:23,520 epoch 1 - iter 7/19 - loss 36.07658005
2020-08-14 10:09:26,893 epoch 1 - iter 8/19 - loss 34.50795958
2020-08-14 10:09:30,326 epoch 1 - iter 9/19 - loss 33.74957237
2020-08-14 10:09:33,922 epoch 1 - iter 10/19 - loss 32.8784101

2020-08-14 10:14:35,028 EPOCH 5 done: loss 12.5421 - lr 0.2000 - bad epochs 0
2020-08-14 10:14:39,275 DEV : loss 8.877015113830566 - score 0.5352
2020-08-14 10:14:43,299 TEST : loss 9.536831855773926 - score 0.549
2020-08-14 10:14:44,794 ----------------------------------------------------------------------------------------------------
2020-08-14 10:14:45,969 epoch 6 - iter 0/19 - loss 10.20831108
2020-08-14 10:14:48,877 epoch 6 - iter 1/19 - loss 11.12036514
2020-08-14 10:14:51,968 epoch 6 - iter 2/19 - loss 11.04925283
2020-08-14 10:14:54,938 epoch 6 - iter 3/19 - loss 11.73008513
2020-08-14 10:14:57,867 epoch 6 - iter 4/19 - loss 11.69302883
2020-08-14 10:15:00,739 epoch 6 - iter 5/19 - loss 11.82952261
2020-08-14 10:15:03,823 epoch 6 - iter 6/19 - loss 11.96185044
2020-08-14 10:15:06,520 epoch 6 - iter 7/19 - loss 12.13456595
2020-08-14 10:15:09,174 epoch 6 - iter 8/19 - loss 12.17293178
2020-08-14 10:15:12,067 epoch 6 - iter 9/19 - loss 12.33402014
2020-08-14 10:15:15,453 epoch 6

2020-08-14 10:20:10,598 ----------------------------------------------------------------------------------------------------
2020-08-14 10:20:10,600 EPOCH 10 done: loss 9.9495 - lr 0.2000 - bad epochs 2
2020-08-14 10:20:15,296 DEV : loss 6.116652488708496 - score 0.5562
2020-08-14 10:20:19,904 TEST : loss 6.259420394897461 - score 0.562
2020-08-14 10:20:19,908 ----------------------------------------------------------------------------------------------------
2020-08-14 10:20:21,076 epoch 11 - iter 0/19 - loss 7.32911921
2020-08-14 10:20:24,181 epoch 11 - iter 1/19 - loss 10.61243176
2020-08-14 10:20:27,313 epoch 11 - iter 2/19 - loss 10.33816767
2020-08-14 10:20:30,380 epoch 11 - iter 3/19 - loss 9.87300146
2020-08-14 10:20:33,564 epoch 11 - iter 4/19 - loss 10.32144613
2020-08-14 10:20:36,666 epoch 11 - iter 5/19 - loss 9.94738221
2020-08-14 10:20:39,882 epoch 11 - iter 6/19 - loss 10.16236530
2020-08-14 10:20:42,956 epoch 11 - iter 7/19 - loss 9.84806770
2020-08-14 10:20:46,206 epoc

2020-08-14 10:25:55,492 epoch 15 - iter 17/19 - loss 8.69073171
2020-08-14 10:25:57,846 epoch 15 - iter 18/19 - loss 8.62791325
2020-08-14 10:25:59,752 ----------------------------------------------------------------------------------------------------
2020-08-14 10:25:59,755 EPOCH 15 done: loss 8.6279 - lr 0.2000 - bad epochs 0
2020-08-14 10:26:04,132 DEV : loss 4.742476463317871 - score 0.6116
2020-08-14 10:26:08,170 TEST : loss 4.897988319396973 - score 0.6007
2020-08-14 10:26:08,175 ----------------------------------------------------------------------------------------------------
2020-08-14 10:26:09,483 epoch 16 - iter 0/19 - loss 5.40030003
2020-08-14 10:26:12,644 epoch 16 - iter 1/19 - loss 10.02645159
2020-08-14 10:26:15,928 epoch 16 - iter 2/19 - loss 8.45272064
2020-08-14 10:26:19,014 epoch 16 - iter 3/19 - loss 7.83067083
2020-08-14 10:26:21,876 epoch 16 - iter 4/19 - loss 8.03633423
2020-08-14 10:26:24,858 epoch 16 - iter 5/19 - loss 8.04019324
2020-08-14 10:26:27,972 epoc

2020-08-14 10:31:34,814 epoch 20 - iter 15/19 - loss 7.51337883
2020-08-14 10:31:37,779 epoch 20 - iter 16/19 - loss 7.48747699
2020-08-14 10:31:40,437 epoch 20 - iter 17/19 - loss 7.56146963
2020-08-14 10:31:42,658 epoch 20 - iter 18/19 - loss 7.41155660
2020-08-14 10:31:44,185 ----------------------------------------------------------------------------------------------------
2020-08-14 10:31:44,188 EPOCH 20 done: loss 7.4116 - lr 0.2000 - bad epochs 1
2020-08-14 10:31:49,436 DEV : loss 5.327756881713867 - score 0.6275
2020-08-14 10:31:54,372 TEST : loss 5.582951545715332 - score 0.6504
2020-08-14 10:31:55,783 ----------------------------------------------------------------------------------------------------
2020-08-14 10:31:55,785 Testing using best model ...
2020-08-14 10:31:55,790 loading file ../../data/flair-data/texts/best-model.pt
2020-08-14 10:32:01,137 0.6227	0.6724	0.6466
2020-08-14 10:32:01,145 
MICRO_AVG: acc 0.4777 - f1-score 0.6466
MACRO_AVG: acc 0.4777 - f1-score 0.64

In [8]:
# Regression check against the micro-averaged F1 reported in the recorded run
# (0.6466). Compare at four decimals instead of exact float equality so an
# unrounded score does not fail spuriously, and report the actual value when
# the check fails.
assert round(result['test_score'], 4) == 0.6466, (
    f"test_score {result['test_score']!r} does not match the expected 0.6466"
)