In [2]:
import flair

In [3]:
from flair.data import Sentence
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer
from flair.data import Corpus
from flair.datasets import ColumnCorpus

In [4]:
# Column layout of the data files: column 0 holds the token text,
# column 1 holds its NER tag.
columns = {0: 'text', 1: 'ner'}

# Folder that contains the train / dev / test files
data_folder = '../data/'

# Load the three splits into a single flair corpus
corpus: Corpus = ColumnCorpus(
    data_folder,
    columns,
    train_file='abstracts-eudract-train.bio',
    dev_file='abstracts-eudract-dev.bio',
    test_file='abstracts-eudract-test.bio',
)

2020-09-24 18:54:45,139 Reading data from ../data
2020-09-24 18:54:45,141 Train: ../data/abstracts-eudract-train.bio
2020-09-24 18:54:45,142 Dev: ../data/abstracts-eudract-dev.bio
2020-09-24 18:54:45,143 Test: ../data/abstracts-eudract-test.bio


In [5]:
# Print the first sentence of the dev split with its NER tags shown inline
first_dev_sentence = corpus.dev[0]
print(first_dev_sentence.to_tagged_string('ner'))

Efectos del trandolapril <B-CHEM> en monoterapia <B-PROC> y asociado con verapamil <B-CHEM> , sobre la presión <B-PROC> arterial <I-PROC> , albuminuria <B-DISO> y control metabólico en pacientes hipertensos <B-DISO> con diabetes <B-DISO> tipo <I-DISO> 2 <I-DISO> y albuminuria <B-DISO>


In [6]:
# Name of the annotation layer the tagger will predict
tag_type = 'ner'

# Build the tag dictionary for that layer from the corpus
tag_dictionary = corpus.make_tag_dictionary(tag_type)

In [7]:
# Import embeddings
from flair.embeddings import FlairEmbeddings, StackedEmbeddings, WordEmbeddings

In [8]:
# Word vectors loaded from a local gensim file
# (skip-gram, 100 dimensions, going by the file name)
custom_embeddings = WordEmbeddings(
    "../data/embeddings/EMEA+scielo-es_skipgram_w=10_dim=100_minfreq=1_neg=10_lr=1e-4.gensim"
)

# Pretrained Flair embeddings for Spanish, one model per direction
embeddings_fw = FlairEmbeddings('spanish-forward')
embeddings_bw = FlairEmbeddings('spanish-backward')

# Stack the three embeddings so they are used together
stacked_embeddings = StackedEmbeddings(
    embeddings=[custom_embeddings, embeddings_fw, embeddings_bw]
)


In [9]:
# Build the sequence tagger: a bidirectional LSTM over the stacked
# embeddings, with a CRF as the final layer (use_crf=True).
tagger: SequenceTagger = SequenceTagger(
    embeddings=stacked_embeddings,
    hidden_size=256,
    tag_type=tag_type,
    tag_dictionary=tag_dictionary,
    use_crf=True,
)

# Print the model architecture
print(tagger)

SequenceTagger(
  (embeddings): StackedEmbeddings(
    (list_embedding_0): WordEmbeddings('../data/embeddings/EMEA+scielo-es_skipgram_w=10_dim=100_minfreq=1_neg=10_lr=1e-4.gensim')
    (list_embedding_1): FlairEmbeddings(
      (lm): LanguageModel(
        (drop): Dropout(p=0.5, inplace=False)
        (encoder): Embedding(275, 100)
        (rnn): LSTM(100, 2048)
        (decoder): Linear(in_features=2048, out_features=275, bias=True)
      )
    )
    (list_embedding_2): FlairEmbeddings(
      (lm): LanguageModel(
        (drop): Dropout(p=0.5, inplace=False)
        (encoder): Embedding(275, 100)
        (rnn): LSTM(100, 2048)
        (decoder): Linear(in_features=2048, out_features=275, bias=True)
      )
    )
  )
  (word_dropout): WordDropout(p=0.05)
  (locked_dropout): LockedDropout(p=0.5)
  (embedding2nn): Linear(in_features=4196, out_features=4196, bias=True)
  (rnn): LSTM(4196, 256, batch_first=True, bidirectional=True)
  (linear): Linear(in_features=512, out_features=12, bias=

In [10]:
%%time
# Training creates a `resources` directory in the current working directory.
# It holds the training logs, the loss information and the predictions on
# the test set together with their confidence scores.
trainer: ModelTrainer = ModelTrainer(tagger, corpus)

# Keep this call as the last expression so the cell displays the returned dict.
trainer.train(
    'resources/taggers/flair-ner-model10',
    max_epochs=100,
    learning_rate=0.1,
    mini_batch_size=32,  # Same as in the BERT paper (Devlin et al. 2019)
    embeddings_storage_mode='gpu',
)

2020-09-24 18:54:59,328 ----------------------------------------------------------------------------------------------------
2020-09-24 18:54:59,329 Model: "SequenceTagger(
  (embeddings): StackedEmbeddings(
    (list_embedding_0): WordEmbeddings('../data/embeddings/EMEA+scielo-es_skipgram_w=10_dim=100_minfreq=1_neg=10_lr=1e-4.gensim')
    (list_embedding_1): FlairEmbeddings(
      (lm): LanguageModel(
        (drop): Dropout(p=0.5, inplace=False)
        (encoder): Embedding(275, 100)
        (rnn): LSTM(100, 2048)
        (decoder): Linear(in_features=2048, out_features=275, bias=True)
      )
    )
    (list_embedding_2): FlairEmbeddings(
      (lm): LanguageModel(
        (drop): Dropout(p=0.5, inplace=False)
        (encoder): Embedding(275, 100)
        (rnn): LSTM(100, 2048)
        (decoder): Linear(in_features=2048, out_features=275, bias=True)
      )
    )
  )
  (word_dropout): WordDropout(p=0.05)
  (locked_dropout): LockedDropout(p=0.5)
  (embedding2nn): Linear(in_features=

  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


2020-09-24 18:56:43,311 ----------------------------------------------------------------------------------------------------
2020-09-24 18:56:47,584 epoch 2 - iter 39/393 - loss 4.29395855 - samples/sec: 292.33 - lr: 0.100000
2020-09-24 18:56:51,658 epoch 2 - iter 78/393 - loss 4.16873676 - samples/sec: 306.63 - lr: 0.100000
2020-09-24 18:56:56,237 epoch 2 - iter 117/393 - loss 4.18519009 - samples/sec: 272.77 - lr: 0.100000
2020-09-24 18:57:00,647 epoch 2 - iter 156/393 - loss 4.15668739 - samples/sec: 283.18 - lr: 0.100000
2020-09-24 18:57:05,102 epoch 2 - iter 195/393 - loss 4.13506569 - samples/sec: 280.41 - lr: 0.100000
2020-09-24 18:57:09,714 epoch 2 - iter 234/393 - loss 4.12640215 - samples/sec: 270.84 - lr: 0.100000
2020-09-24 18:57:14,181 epoch 2 - iter 273/393 - loss 4.08401344 - samples/sec: 279.60 - lr: 0.100000
2020-09-24 18:57:18,611 epoch 2 - iter 312/393 - loss 4.02061641 - samples/sec: 281.99 - lr: 0.100000
2020-09-24 18:57:23,218 epoch 2 - iter 351/393 - loss 3.99433

{'test_score': 0.8305665008720368,
 'dev_score_history': [0.611360530938903,
  0.6725804316165761,
  0.7181359128794793,
  0.7220084773394195,
  0.7390802502885608,
  0.7481191694252182,
  0.7448587294806859,
  0.769827370527896,
  0.7408897302413631,
  0.7659777235094406,
  0.7734779086433601,
  0.7652962515114874,
  0.7713833799732328,
  0.7743246473822616,
  0.7911524473856794,
  0.7927071424273422,
  0.7936214751877848,
  0.796275293977198,
  0.793810765947356,
  0.8017803792451679,
  0.8022426717045523,
  0.8028143163046804,
  0.8058905520877521,
  0.8090229221134554,
  0.8101143306515033,
  0.8102956352819766,
  0.8066864051320011,
  0.8101003954974141,
  0.8113793927803012,
  0.8192843782055149,
  0.8155529742617839,
  0.8110433847257081,
  0.815065594583157,
  0.8148193063303419,
  0.8167507810622446,
  0.8194706419227259,
  0.8227104896798014,
  0.8233585476550681,
  0.8210067236053062,
  0.8233284529038556,
  0.8210526315789475,
  0.8238932686476653,
  0.8191463708705089,
  0