In [None]:
# Required if used in google Colab
!pip install flair

Collecting flair
  Downloading flair-0.10-py3-none-any.whl (322 kB)
[K     |████████████████████████████████| 322 kB 5.7 MB/s 
[?25hCollecting bpemb>=0.3.2
  Downloading bpemb-0.3.3-py3-none-any.whl (19 kB)
Collecting gdown==3.12.2
  Downloading gdown-3.12.2.tar.gz (8.2 kB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting wikipedia-api
  Downloading Wikipedia-API-0.5.4.tar.gz (18 kB)
Collecting more-itertools~=8.8.0
  Downloading more_itertools-8.8.0-py3-none-any.whl (48 kB)
[K     |████████████████████████████████| 48 kB 4.5 MB/s 
Collecting mpld3==0.3
  Downloading mpld3-0.3.tar.gz (788 kB)
[K     |████████████████████████████████| 788 kB 37.6 MB/s 
Collecting konoha<5.0.0,>=4.0.0
  Downloading konoha-4.6.5-py3-none-any.whl (20 kB)
Collecting sqlitedict>=1.6.0
  Downloading sqlitedict-1.7.0.tar.gz (28 kB)
Collecting segtok>=1.5.7
  Downloading segtok-1.5.10.

In [None]:
# Google Colab only: mount Google Drive at /content/drive so that files stored
# on Drive become readable from the notebook. Needed only when running on Colab
# with the data files kept in Google Drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from flair.data import Corpus
from flair.datasets import ColumnCorpus

# Column layout of the data files: column 0 holds the token text, column 1 the NER tag.
columns = {0: 'text', 1: 'ner'}

# Folder containing train.txt, dev.txt and test.txt.
# NOTE(review): the recorded output of this cell shows the data being read from
# /content/drive/MyDrive/lv-ner-flair-edit - confirm this path matches where the files live.
data_folder = 'resources/taggers/lv-ner-flair-edit'

# Build the corpus from the three column-formatted split files in data_folder.
corpus: Corpus = ColumnCorpus(
    data_folder,
    columns,
    train_file='train.txt',
    dev_file='dev.txt',
    test_file='test.txt',
)

# Show a short summary of the loaded splits.
print(corpus)

2021-12-14 22:17:19,272 Reading data from /content/drive/MyDrive/lv-ner-flair-edit
2021-12-14 22:17:19,273 Train: /content/drive/MyDrive/lv-ner-flair-edit/train.txt
2021-12-14 22:17:19,283 Dev: /content/drive/MyDrive/lv-ner-flair-edit/dev.txt
2021-12-14 22:17:19,284 Test: /content/drive/MyDrive/lv-ner-flair-edit/test.txt
Corpus: 6873 train + 2242 dev + 2310 test sentences


In [None]:
#Embeddings

#these are deprecated, but still work
#from flair.embeddings import BertEmbeddings

# recommended up-to-date alternative to BertEmbeddings
from flair.embeddings import TransformerWordEmbeddings

from flair.models import SequenceTagger

from flair.trainers import ModelTrainer

In [None]:

# Label type used when building the label dictionary; it matches the 'ner'
# column name declared in the corpus column layout.
label_type = 'ner'

In [None]:
# Build the label dictionary for `label_type` from the corpus and print it.
# All other labels were removed from the datasets beforehand, so only the
# labels that remain in the data end up in the dictionary.
label_dict = corpus.make_label_dictionary(label_type=label_type)
print(label_dict)

2021-12-14 22:17:40,928 Computing label dictionary. Progress:


100%|██████████| 6873/6873 [00:00<00:00, 11611.53it/s]

2021-12-14 22:17:41,597 Corpus contains the labels: ner (#112978)
2021-12-14 22:17:41,601 Created (for label 'ner') Dictionary with 4 tags: <unk>, O, B-person, I-person
Dictionary with 4 tags: <unk>, O, B-person, I-person





In [None]:
# Initializing the embeddings, which triggers the download from Huggingface.
# Only the last transformer layer is used, each word is represented by its
# first subtoken, and the transformer weights are updated during training.

embeddings = TransformerWordEmbeddings(model='FFZG-cleopatra/bert-emoji-latvian-twitter',
                                       layers="-1",
                                       subtoken_pooling="first",
                                       fine_tune=True,
                                       # Capped at 512: the loaded BERT model has only 512 position
                                       # embeddings, so a longer subtoken window cannot be encoded
                                       # and would fail with an out-of-range index.
                                       model_max_length=512
                                       )

Downloading:   0%|          | 0.00/40.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.31k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/972k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/681M [00:00<?, ?B/s]

In [None]:
# Initializing the Tagger to be trained.
# CRF, RNN and embedding reprojection are all switched off, so the tagger is
# built directly on top of the fine-tuned transformer embeddings.
# NOTE(review): hidden_size is presumably unused when use_rnn=False - confirm.

tagger = SequenceTagger(hidden_size=256,
                        embeddings=embeddings,
                        tag_dictionary=label_dict,
                        # Reuse label_type so the tagger always predicts the same
                        # label type the dictionary was built for.
                        tag_type=label_type,
                        use_crf=False,
                        use_rnn=False,
                        reproject_embeddings=False,
                        )

In [None]:
# Model trainer initialization: binds the tagger to the corpus it will be trained on.
trainer = ModelTrainer(tagger, corpus)

In [None]:
# Fine-tune the tagger. Hyperparameters follow the recommendations in the Flair
# documentation. data_folder is passed as the first argument, so it is reused
# here as well as for loading the corpus.
# Left as a bare expression so the returned result dictionary is displayed.
trainer.fine_tune(
    data_folder,
    learning_rate=5.0e-6,
    mini_batch_size=4,
    mini_batch_chunk_size=4,
    write_weights=True,
    checkpoint=True,  # checkpoint enabled
)


2021-12-14 22:19:35,960 ----------------------------------------------------------------------------------------------------
2021-12-14 22:19:35,968 Model: "SequenceTagger(
  (embeddings): TransformerWordEmbeddings(
    (model): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(119547, 768)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, 

{'dev_loss_history': [tensor(0.0187, device='cuda:0'),
  tensor(0.0138, device='cuda:0'),
  tensor(0.0141, device='cuda:0'),
  tensor(0.0142, device='cuda:0'),
  tensor(0.0139, device='cuda:0'),
  tensor(0.0145, device='cuda:0'),
  tensor(0.0146, device='cuda:0'),
  tensor(0.0146, device='cuda:0'),
  tensor(0.0148, device='cuda:0'),
  tensor(0.0149, device='cuda:0')],
 'dev_score_history': [0.8712241653418125,
  0.9357495881383854,
  0.9425478767693588,
  0.9457627118644069,
  0.9489795918367347,
  0.9390142021720969,
  0.9453924914675769,
  0.9531914893617021,
  0.9529411764705882,
  0.9512605042016807],
 'test_score': 0.9390557939914163,
 'train_loss_history': [0.2599629068349757,
  0.082351806254405,
  0.07535523442458815,
  0.07415776804786306,
  0.07233249365501175,
  0.07075565277462117,
  0.07029246700411478,
  0.06912414421478975,
  0.06846908077632015,
  0.06630205454114527]}