In [None]:
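# Required only when running in Google Colab: uncomment the line below to install Flair.
# The recorded run used flair 0.10; pin that version (flair==0.10) to reproduce it.
#!pip install flair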

Collecting flair
  Downloading flair-0.10-py3-none-any.whl (322 kB)
[?25l[K     |█                               | 10 kB 19.6 MB/s eta 0:00:01[K     |██                              | 20 kB 23.2 MB/s eta 0:00:01[K     |███                             | 30 kB 18.6 MB/s eta 0:00:01[K     |████                            | 40 kB 15.3 MB/s eta 0:00:01[K     |█████                           | 51 kB 5.6 MB/s eta 0:00:01[K     |██████                          | 61 kB 6.1 MB/s eta 0:00:01[K     |███████                         | 71 kB 5.5 MB/s eta 0:00:01[K     |████████▏                       | 81 kB 6.1 MB/s eta 0:00:01[K     |█████████▏                      | 92 kB 6.1 MB/s eta 0:00:01[K     |██████████▏                     | 102 kB 5.4 MB/s eta 0:00:01[K     |███████████▏                    | 112 kB 5.4 MB/s eta 0:00:01[K     |████████████▏                   | 122 kB 5.4 MB/s eta 0:00:01[K     |█████████████▏                  | 133 kB 5.4 MB/s eta 0:00:01[K    

In [None]:
# Required only when running in Google Colab with the data files stored in Google Drive
#from google.colab import drive
#drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from flair.data import Corpus
from flair.datasets import ColumnCorpus

In [None]:
# Column layout of the data files: column index -> annotation layer name.
columns_rus = {
    0: 'text',
    1: 'pos',
    2: 'pos2',
    3: 'ner',
}

# Folder holding train.txt / dev.txt / test.txt.
# Data from https://competitions.codalab.org/competitions/36044#participate-get_data
data_folder_rus = 'resources/taggers/rus-ner-flair-edit'

# Load the three splits from the data folder using the column layout above.
corpus_rus: Corpus = ColumnCorpus(
    data_folder_rus,
    columns_rus,
    train_file='train.txt',
    dev_file='dev.txt',
    test_file='test.txt',
)

# Show the number of sentences per split as a quick sanity check.
print(corpus_rus)

2021-12-15 00:11:13,000 Reading data from /content/drive/MyDrive/rus-ner-flair-edit
2021-12-15 00:11:13,003 Train: /content/drive/MyDrive/rus-ner-flair-edit/train.txt
2021-12-15 00:11:13,006 Dev: /content/drive/MyDrive/rus-ner-flair-edit/dev.txt
2021-12-15 00:11:13,008 Test: /content/drive/MyDrive/rus-ner-flair-edit/test.txt
Corpus: 14501 train + 800 dev + 799 test sentences


In [None]:
#Embeddings

#these are deprecated, but still work
#from flair.embeddings import BertEmbeddings

# recommended, up-to-date alternative to BertEmbeddings
from flair.embeddings import TransformerWordEmbeddings

from flair.models import SequenceTagger

from flair.trainers import ModelTrainer

In [None]:
# Annotation layer to predict; it is used below to build the label dictionary.
label_type = "ner"

In [None]:
# Build the dictionary of tags for the target layer from the corpus.
# The datasets were stripped of all other labels beforehand.
label_dict_rus = corpus_rus.make_label_dictionary(
    label_type=label_type,
)
print(label_dict_rus)

2021-12-15 00:11:36,383 Computing label dictionary. Progress:


100%|██████████| 14501/14501 [00:01<00:00, 10881.12it/s]

2021-12-15 00:11:37,784 Corpus contains the labels: pos (#230304), pos2 (#230304), ner (#230304)





2021-12-15 00:11:37,789 Created (for label 'ner') Dictionary with 4 tags: <unk>, O, B-PER, I-PER
Dictionary with 4 tags: <unk>, O, B-PER, I-PER


In [None]:
# Initializing the embeddings, which triggers the download from Hugging Face.
#
# model_max_length is the tokenizer's sequence limit in subtokens, NOT the
# hidden size. This BERT model has 512 position embeddings (the hidden size is
# 768), so the limit must not exceed 512: a larger value lets sentences of
# 513+ subtokens through unsplit, and they overflow the position table.
embeddings = TransformerWordEmbeddings(model='DeepPavlov/rubert-base-cased',
                                       layers="-1",               # use only the last transformer layer
                                       subtoken_pooling="first",  # a word is represented by its first subtoken
                                       fine_tune=True,            # transformer weights are updated during training
                                       model_max_length=512,
                                       )

Downloading:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/642 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.57M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/681M [00:00<?, ?B/s]

In [None]:
# Tagger to be trained: the fine-tuned transformer embeddings feed the tag
# classifier directly, with the CRF, RNN and re-projection layers switched off.
# NOTE(review): hidden_size is presumably irrelevant while use_rnn=False - confirm
# against the Flair SequenceTagger documentation.
tagger = SequenceTagger(
    embeddings=embeddings,
    tag_dictionary=label_dict_rus,
    tag_type='ner',
    hidden_size=768,
    use_rnn=False,
    use_crf=False,
    reproject_embeddings=False,
)

In [None]:
# Pair the tagger with the corpus it will be trained on.
trainer = ModelTrainer(model=tagger, corpus=corpus_rus)

In [None]:
# Fine-tune the tagger. Hyper-parameters follow the recommendations in the
# Flair documentation; checkpointing is enabled.
# NOTE(review): the first argument is the data folder itself, so training
# output presumably lands next to train/dev/test.txt - confirm this is intended.
# The call is left as the last expression so the returned result is displayed.
trainer.fine_tune(
    data_folder_rus,
    learning_rate=5e-6,
    mini_batch_size=4,
    mini_batch_chunk_size=4,
    checkpoint=True,
    write_weights=True,
)


2021-12-15 00:13:35,023 ----------------------------------------------------------------------------------------------------
2021-12-15 00:13:35,031 Model: "SequenceTagger(
  (embeddings): TransformerWordEmbeddings(
    (model): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(119547, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): 

{'dev_loss_history': [tensor(0.0555, device='cuda:0'),
  tensor(0.0422, device='cuda:0'),
  tensor(0.0595, device='cuda:0'),
  tensor(0.0703, device='cuda:0'),
  tensor(0.0743, device='cuda:0'),
  tensor(0.0791, device='cuda:0'),
  tensor(0.0843, device='cuda:0'),
  tensor(0.0886, device='cuda:0'),
  tensor(0.0940, device='cuda:0'),
  tensor(0.0958, device='cuda:0')],
 'dev_score_history': [0.735576923076923,
  0.7896103896103895,
  0.7719298245614036,
  0.7787114845938375,
  0.8130081300813008,
  0.8148148148148148,
  0.8021390374331551,
  0.8083989501312335,
  0.7913279132791327,
  0.8073878627968337],
 'test_score': 0.7357512953367875,
 'train_loss_history': [0.24591269878505195,
  0.11728439754706656,
  0.10598519772939095,
  0.09152207903503555,
  0.08351352641316465,
  0.07654144208817659,
  0.07376060656721113,
  0.07003329989009709,
  0.06943536581099295,
  0.06934782727571162]}