In [None]:
%%capture
!pip install flair

In [None]:
import torch, flair

# dataset, model and embedding imports
from flair.datasets import UniversalDependenciesCorpus
from flair.embeddings import TransformerWordEmbeddings
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer
from torch.optim.lr_scheduler import OneCycleLR
from flair.data import Sentence
from flair.models import MultiTagger

In [None]:
# Report whether CUDA is visible, then point Flair at the GPU when it is and
# at the CPU otherwise, so the notebook still runs on a CPU-only runtime.
cuda_available = torch.cuda.is_available()
print(cuda_available)
flair.device = torch.device("cuda:0" if cuda_available else "cpu")

True


In [None]:
# Set the device Flair uses as a torch.device, falling back to the CPU when
# no CUDA device is available instead of hard-coding the GPU.
flair.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
# --- Experiment configuration ---
hf_model = "dbmdz/bert-base-turkish-cased"  # transformer model identifier
tag_type = "upos"                           # label type to train and predict
dataset = "boun"                            # treebank short name used in file names

# Root folder under which the treebank and the trained model are stored.
DRIVE_ROOT = "./drive/MyDrive/Colab Notebooks"

# Folder holding the treebank .conllu files.
path = "/".join([DRIVE_ROOT, f"{dataset}-treebank"])

# Folder the trained model is written to. Because `hf_model` contains a "/",
# this resolves to a nested directory ("<dataset>-dbmdz/bert-...-<tag_type>").
output_folder = "/".join([DRIVE_ROOT, f"{dataset}-{hf_model}-{tag_type}"])

In [None]:
# Build the train/dev/test file names from the dataset short name, then load
# the three splits from the treebank folder as a Universal Dependencies corpus.
split_files = {
    f"{split}_file": f"tr_{dataset}-ud-{split}.conllu"
    for split in ("train", "dev", "test")
}
corpus = UniversalDependenciesCorpus(data_folder=path, **split_files)

2022-06-06 20:17:28,741 Reading data from drive/MyDrive/Colab Notebooks/boun-treebank
2022-06-06 20:17:28,744 Train: drive/MyDrive/Colab Notebooks/boun-treebank/tr_boun-ud-train.conllu
2022-06-06 20:17:28,746 Dev: drive/MyDrive/Colab Notebooks/boun-treebank/tr_boun-ud-dev.conllu
2022-06-06 20:17:28,748 Test: drive/MyDrive/Colab Notebooks/boun-treebank/tr_boun-ud-test.conllu


In [None]:
# Collect the label vocabulary for the configured tag type from the corpus.
tag_dictionary = corpus.make_label_dictionary(label_type=tag_type)

2022-06-06 20:17:44,595 Computing label dictionary. Progress:


7803it [00:00, 23650.64it/s]

2022-06-06 20:17:44,936 Dictionary created for label 'upos' with 18 values: NOUN (seen 30027 times), VERB (seen 16690 times), PUNCT (seen 16190 times), ADJ (seen 8395 times), ADV (seen 4871 times), PROPN (seen 4736 times), CCONJ (seen 4287 times), DET (seen 4063 times), PRON (seen 2814 times), ADP (seen 2601 times), NUM (seen 2142 times), AUX (seen 1291 times), INTJ (seen 84 times), X (seen 14 times), SCONJ (seen 7 times), SYM (seen 1 times), PART (seen 1 times)





In [None]:
# Transformer word embeddings: last layer only ("-1"), first-subtoken pooling,
# with fine-tuning and context usage switched on.
embedding_config = {
    "model": hf_model,
    "layers": "-1",
    "subtoken_pooling": "first",
    "fine_tune": True,
    "use_context": True,
}
embeddings = TransformerWordEmbeddings(**embedding_config)

# Sequence tagger on top of the embeddings, configured without CRF, without
# RNN and without re-projecting the embeddings.
tagger_config = {
    "hidden_size": 256,
    "embeddings": embeddings,
    "tag_dictionary": tag_dictionary,
    "tag_type": tag_type,
    "use_crf": False,
    "use_rnn": False,
    "reproject_embeddings": False,
}
tagger = SequenceTagger(**tagger_config)

Downloading:   0%|          | 0.00/60.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/385 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/245k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/424M [00:00<?, ?B/s]

2022-06-06 20:18:26,387 SequenceTagger predicts: Dictionary with 18 tags: <unk>, NOUN, VERB, PUNCT, ADJ, ADV, PROPN, CCONJ, DET, PRON, ADP, NUM, AUX, INTJ, X, SCONJ, SYM, PART


In [None]:
# Wrap the tagger and the corpus in a trainer and fine-tune; the model files
# are written to `output_folder`.
trainer = ModelTrainer(model=tagger, corpus=corpus)

# Kept as the last expression so the returned result is displayed by the cell.
trainer.fine_tune(
    base_path=output_folder,
    learning_rate=5.0e-6,
    mini_batch_size=4,
)

2022-04-28 19:14:30,986 ----------------------------------------------------------------------------------------------------
2022-04-28 19:14:30,992 Model: "SequenceTagger(
  (embeddings): TransformerWordEmbeddings(
    (model): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(32000, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): D

100%|██████████| 245/245 [00:34<00:00,  7.09it/s]

2022-04-28 19:22:59,014 Evaluating as a multi-label problem: False
2022-04-28 19:22:59,121 DEV : loss 0.5654515624046326 - f1-score (micro avg)  0.8445





2022-04-28 19:22:59,165 BAD EPOCHS (no improvement): 4
2022-04-28 19:22:59,174 ----------------------------------------------------------------------------------------------------
2022-04-28 19:23:49,137 epoch 2 - iter 195/1951 - loss 0.78065831 - samples/sec: 15.62 - lr: 0.000005
2022-04-28 19:24:36,643 epoch 2 - iter 390/1951 - loss 0.74918158 - samples/sec: 16.43 - lr: 0.000005
2022-04-28 19:25:24,339 epoch 2 - iter 585/1951 - loss 0.73252949 - samples/sec: 16.36 - lr: 0.000005
2022-04-28 19:26:14,183 epoch 2 - iter 780/1951 - loss 0.71259519 - samples/sec: 15.66 - lr: 0.000005
2022-04-28 19:27:04,939 epoch 2 - iter 975/1951 - loss 0.69873467 - samples/sec: 15.37 - lr: 0.000005
2022-04-28 19:27:55,479 epoch 2 - iter 1170/1951 - loss 0.68832793 - samples/sec: 15.44 - lr: 0.000005
2022-04-28 19:28:44,517 epoch 2 - iter 1365/1951 - loss 0.67733521 - samples/sec: 15.91 - lr: 0.000005
2022-04-28 19:29:33,757 epoch 2 - iter 1560/1951 - loss 0.66625789 - samples/sec: 15.85 - lr: 0.000005
2

100%|██████████| 245/245 [00:41<00:00,  5.86it/s]

2022-04-28 19:32:05,347 Evaluating as a multi-label problem: False
2022-04-28 19:32:05,443 DEV : loss 0.4216236472129822 - f1-score (micro avg)  0.8857





2022-04-28 19:32:05,493 BAD EPOCHS (no improvement): 4
2022-04-28 19:32:05,501 ----------------------------------------------------------------------------------------------------
2022-04-28 19:33:01,070 epoch 3 - iter 195/1951 - loss 0.56080603 - samples/sec: 14.04 - lr: 0.000004
2022-04-28 19:33:54,963 epoch 3 - iter 390/1951 - loss 0.55540410 - samples/sec: 14.48 - lr: 0.000004
2022-04-28 19:34:51,399 epoch 3 - iter 585/1951 - loss 0.55649548 - samples/sec: 13.83 - lr: 0.000004
2022-04-28 19:35:44,805 epoch 3 - iter 780/1951 - loss 0.55084190 - samples/sec: 14.61 - lr: 0.000004
2022-04-28 19:36:37,363 epoch 3 - iter 975/1951 - loss 0.54727650 - samples/sec: 14.85 - lr: 0.000004
2022-04-28 19:37:29,505 epoch 3 - iter 1170/1951 - loss 0.54694277 - samples/sec: 14.97 - lr: 0.000004
2022-04-28 19:38:20,744 epoch 3 - iter 1365/1951 - loss 0.54290320 - samples/sec: 15.23 - lr: 0.000004
2022-04-28 19:39:11,803 epoch 3 - iter 1560/1951 - loss 0.54477695 - samples/sec: 15.28 - lr: 0.000004
2

100%|██████████| 245/245 [01:14<00:00,  3.30it/s]

2022-04-28 19:42:38,526 Evaluating as a multi-label problem: False





2022-04-28 19:42:38,630 DEV : loss 0.36752283573150635 - f1-score (micro avg)  0.899
2022-04-28 19:42:38,662 BAD EPOCHS (no improvement): 4
2022-04-28 19:42:38,671 ----------------------------------------------------------------------------------------------------
2022-04-28 19:43:54,709 epoch 4 - iter 195/1951 - loss 0.50441577 - samples/sec: 10.26 - lr: 0.000004
2022-04-28 19:45:08,936 epoch 4 - iter 390/1951 - loss 0.50076933 - samples/sec: 10.51 - lr: 0.000004
2022-04-28 19:46:26,002 epoch 4 - iter 585/1951 - loss 0.49428978 - samples/sec: 10.12 - lr: 0.000004
2022-04-28 19:47:40,656 epoch 4 - iter 780/1951 - loss 0.49865409 - samples/sec: 10.45 - lr: 0.000004
2022-04-28 19:48:56,200 epoch 4 - iter 975/1951 - loss 0.49385320 - samples/sec: 10.33 - lr: 0.000004
2022-04-28 19:50:09,596 epoch 4 - iter 1170/1951 - loss 0.49341001 - samples/sec: 10.63 - lr: 0.000004
2022-04-28 19:51:23,752 epoch 4 - iter 1365/1951 - loss 0.49244425 - samples/sec: 10.52 - lr: 0.000004
2022-04-28 19:52:35

100%|██████████| 245/245 [00:59<00:00,  4.10it/s]

2022-04-28 19:55:59,526 Evaluating as a multi-label problem: False





2022-04-28 19:55:59,636 DEV : loss 0.35849931836128235 - f1-score (micro avg)  0.9009
2022-04-28 19:55:59,668 BAD EPOCHS (no improvement): 4
2022-04-28 19:55:59,678 ----------------------------------------------------------------------------------------------------
2022-04-28 19:57:08,274 epoch 5 - iter 195/1951 - loss 0.46323278 - samples/sec: 11.38 - lr: 0.000003
2022-04-28 19:58:16,015 epoch 5 - iter 390/1951 - loss 0.45412527 - samples/sec: 11.52 - lr: 0.000003
2022-04-28 19:59:22,702 epoch 5 - iter 585/1951 - loss 0.46123938 - samples/sec: 11.70 - lr: 0.000003
2022-04-28 20:00:27,799 epoch 5 - iter 780/1951 - loss 0.46066088 - samples/sec: 11.99 - lr: 0.000003
2022-04-28 20:01:33,241 epoch 5 - iter 975/1951 - loss 0.46192989 - samples/sec: 11.92 - lr: 0.000003
2022-04-28 20:02:36,483 epoch 5 - iter 1170/1951 - loss 0.46347397 - samples/sec: 12.34 - lr: 0.000003
2022-04-28 20:03:37,504 epoch 5 - iter 1365/1951 - loss 0.46322557 - samples/sec: 12.79 - lr: 0.000003
2022-04-28 20:04:3

100%|██████████| 245/245 [00:38<00:00,  6.34it/s]

2022-04-28 20:07:17,691 Evaluating as a multi-label problem: False
2022-04-28 20:07:17,798 DEV : loss 0.3476703464984894 - f1-score (micro avg)  0.9047





2022-04-28 20:07:17,836 BAD EPOCHS (no improvement): 4
2022-04-28 20:07:17,845 ----------------------------------------------------------------------------------------------------
2022-04-28 20:08:15,004 epoch 6 - iter 195/1951 - loss 0.44238479 - samples/sec: 13.65 - lr: 0.000003
2022-04-28 20:09:08,420 epoch 6 - iter 390/1951 - loss 0.44663474 - samples/sec: 14.61 - lr: 0.000003
2022-04-28 20:10:02,747 epoch 6 - iter 585/1951 - loss 0.44473056 - samples/sec: 14.36 - lr: 0.000003
2022-04-28 20:10:55,217 epoch 6 - iter 780/1951 - loss 0.44069057 - samples/sec: 14.87 - lr: 0.000003
2022-04-28 20:11:48,138 epoch 6 - iter 975/1951 - loss 0.43929999 - samples/sec: 14.74 - lr: 0.000003
2022-04-28 20:12:44,403 epoch 6 - iter 1170/1951 - loss 0.44466709 - samples/sec: 13.87 - lr: 0.000002
2022-04-28 20:13:36,441 epoch 6 - iter 1365/1951 - loss 0.44318935 - samples/sec: 15.00 - lr: 0.000002
2022-04-28 20:14:27,009 epoch 6 - iter 1560/1951 - loss 0.44195227 - samples/sec: 15.43 - lr: 0.000002
2

100%|██████████| 245/245 [00:22<00:00, 11.03it/s]


2022-04-28 20:16:34,005 Evaluating as a multi-label problem: False
2022-04-28 20:16:34,111 DEV : loss 0.3414258360862732 - f1-score (micro avg)  0.9057
2022-04-28 20:16:34,144 BAD EPOCHS (no improvement): 4
2022-04-28 20:16:34,151 ----------------------------------------------------------------------------------------------------
2022-04-28 20:17:26,454 epoch 7 - iter 195/1951 - loss 0.43367734 - samples/sec: 14.92 - lr: 0.000002
2022-04-28 20:18:18,733 epoch 7 - iter 390/1951 - loss 0.42639063 - samples/sec: 14.93 - lr: 0.000002
2022-04-28 20:19:11,210 epoch 7 - iter 585/1951 - loss 0.42541275 - samples/sec: 14.87 - lr: 0.000002
2022-04-28 20:20:03,450 epoch 7 - iter 780/1951 - loss 0.42650478 - samples/sec: 14.94 - lr: 0.000002
2022-04-28 20:20:55,255 epoch 7 - iter 975/1951 - loss 0.42711352 - samples/sec: 15.06 - lr: 0.000002
2022-04-28 20:21:48,078 epoch 7 - iter 1170/1951 - loss 0.42645666 - samples/sec: 14.77 - lr: 0.000002
2022-04-28 20:22:41,339 epoch 7 - iter 1365/1951 - loss

100%|██████████| 245/245 [00:22<00:00, 10.92it/s]

2022-04-28 20:25:41,214 Evaluating as a multi-label problem: False





2022-04-28 20:25:41,327 DEV : loss 0.33738604187965393 - f1-score (micro avg)  0.9067
2022-04-28 20:25:41,364 BAD EPOCHS (no improvement): 4
2022-04-28 20:25:41,372 ----------------------------------------------------------------------------------------------------
2022-04-28 20:26:33,812 epoch 8 - iter 195/1951 - loss 0.40751786 - samples/sec: 14.88 - lr: 0.000002
2022-04-28 20:27:26,780 epoch 8 - iter 390/1951 - loss 0.41690926 - samples/sec: 14.73 - lr: 0.000002
2022-04-28 20:28:19,914 epoch 8 - iter 585/1951 - loss 0.42030728 - samples/sec: 14.69 - lr: 0.000002
2022-04-28 20:29:13,419 epoch 8 - iter 780/1951 - loss 0.41333949 - samples/sec: 14.58 - lr: 0.000001
2022-04-28 20:30:06,066 epoch 8 - iter 975/1951 - loss 0.41523481 - samples/sec: 14.82 - lr: 0.000001
2022-04-28 20:30:57,487 epoch 8 - iter 1170/1951 - loss 0.41194311 - samples/sec: 15.18 - lr: 0.000001
2022-04-28 20:31:50,310 epoch 8 - iter 1365/1951 - loss 0.41268349 - samples/sec: 14.77 - lr: 0.000001
2022-04-28 20:32:4

100%|██████████| 245/245 [00:22<00:00, 10.86it/s]


2022-04-28 20:34:49,572 Evaluating as a multi-label problem: False
2022-04-28 20:34:49,686 DEV : loss 0.3400098979473114 - f1-score (micro avg)  0.9075
2022-04-28 20:34:49,720 BAD EPOCHS (no improvement): 4
2022-04-28 20:34:49,729 ----------------------------------------------------------------------------------------------------
2022-04-28 20:35:41,311 epoch 9 - iter 195/1951 - loss 0.40845524 - samples/sec: 15.13 - lr: 0.000001
2022-04-28 20:36:34,806 epoch 9 - iter 390/1951 - loss 0.40565403 - samples/sec: 14.59 - lr: 0.000001
2022-04-28 20:37:27,179 epoch 9 - iter 585/1951 - loss 0.40681798 - samples/sec: 14.90 - lr: 0.000001
2022-04-28 20:38:21,130 epoch 9 - iter 780/1951 - loss 0.40393714 - samples/sec: 14.46 - lr: 0.000001
2022-04-28 20:39:15,322 epoch 9 - iter 975/1951 - loss 0.40201279 - samples/sec: 14.40 - lr: 0.000001
2022-04-28 20:40:09,229 epoch 9 - iter 1170/1951 - loss 0.40226673 - samples/sec: 14.48 - lr: 0.000001
2022-04-28 20:41:02,228 epoch 9 - iter 1365/1951 - loss

100%|██████████| 245/245 [00:23<00:00, 10.61it/s]


2022-04-28 20:44:06,812 Evaluating as a multi-label problem: False
2022-04-28 20:44:06,938 DEV : loss 0.34189292788505554 - f1-score (micro avg)  0.9091
2022-04-28 20:44:06,975 BAD EPOCHS (no improvement): 4
2022-04-28 20:44:06,982 ----------------------------------------------------------------------------------------------------
2022-04-28 20:44:59,984 epoch 10 - iter 195/1951 - loss 0.41655891 - samples/sec: 14.72 - lr: 0.000001
2022-04-28 20:45:53,768 epoch 10 - iter 390/1951 - loss 0.39995197 - samples/sec: 14.51 - lr: 0.000000
2022-04-28 20:46:46,952 epoch 10 - iter 585/1951 - loss 0.39860256 - samples/sec: 14.67 - lr: 0.000000
2022-04-28 20:47:41,144 epoch 10 - iter 780/1951 - loss 0.39863155 - samples/sec: 14.40 - lr: 0.000000
2022-04-28 20:48:35,147 epoch 10 - iter 975/1951 - loss 0.40142567 - samples/sec: 14.45 - lr: 0.000000
2022-04-28 20:49:28,861 epoch 10 - iter 1170/1951 - loss 0.40026444 - samples/sec: 14.53 - lr: 0.000000
2022-04-28 20:50:21,861 epoch 10 - iter 1365/195

100%|██████████| 245/245 [00:23<00:00, 10.62it/s]


2022-04-28 20:53:23,996 Evaluating as a multi-label problem: False
2022-04-28 20:53:24,107 DEV : loss 0.3404305577278137 - f1-score (micro avg)  0.9095
2022-04-28 20:53:24,147 BAD EPOCHS (no improvement): 4
2022-04-28 20:53:26,022 ----------------------------------------------------------------------------------------------------
2022-04-28 20:53:26,030 Testing using last state of model ...


100%|██████████| 245/245 [00:25<00:00,  9.58it/s]

2022-04-28 20:53:51,678 Evaluating as a multi-label problem: False





2022-04-28 20:53:51,795 0.9101	0.9101	0.9101	0.9101
2022-04-28 20:53:51,798 
Results:
- F-score (micro) 0.9101
- F-score (macro) 0.8612
- Accuracy 0.9101

By class:
              precision    recall  f1-score   support

        NOUN     0.8983    0.9028    0.9005      3736
        VERB     0.9358    0.9345    0.9351      2075
       PUNCT     0.9881    0.9960    0.9920      1996
         ADJ     0.8290    0.7944    0.8113      1080
         ADV     0.8219    0.7639    0.7918       610
         DET     0.9467    0.9735    0.9599       529
       PROPN     0.7351    0.8536    0.7899       478
       CCONJ     0.9776    0.9601    0.9688       501
        PRON     0.9191    0.8903    0.9045       319
         NUM     0.9056    0.8721    0.8885       297
         ADP     0.9437    0.9210    0.9322       291
         AUX     0.9274    0.9651    0.9459       172
        INTJ     0.6000    0.2727    0.3750        11

    accuracy                         0.9101     12095
   macro avg     0.8791

{'dev_loss_history': [0.5654515624046326,
  0.4216236472129822,
  0.36752283573150635,
  0.35849931836128235,
  0.3476703464984894,
  0.3414258360862732,
  0.33738604187965393,
  0.3400098979473114,
  0.34189292788505554,
  0.3404305577278137],
 'dev_score_history': [0.8444591684611562,
  0.885704820274971,
  0.8990392579095577,
  0.9008613549776379,
  0.9046711943018055,
  0.9056650654298493,
  0.906658936557893,
  0.9074871624979295,
  0.9091436143780023,
  0.9094749047540169],
 'test_score': 0.9101281521289789,
 'train_loss_history': [2.0157283904083605,
  0.6517031759100741,
  0.5404005301108655,
  0.4912895114313042,
  0.46433972863404094,
  0.44037913971891385,
  0.42609303984797486,
  0.41429817472205865,
  0.40624075144184024,
  0.399900907472507]}

In [None]:
# Reload the final model saved in the output folder.
# NOTE(review): the recorded output of this cell shows a "-dependency" model
# folder, which does not match the current `tag_type`; re-run to refresh it.
tagger = SequenceTagger.load(f"{output_folder}/final-model.pt")

2022-04-14 20:15:49,425 loading file ./drive/MyDrive/Colab Notebooks/boun-dbmdz/bert-base-turkish-cased-dependency/final-model.pt
2022-04-14 20:15:53,009 SequenceTagger predicts: Dictionary with 42 tags: <unk>, punct, obl, nmod:poss, root, nsubj, amod, obj, conj, advmod, det, acl, case, cc, advcl, compound, flat, nmod, advmod:emph, nummod, ccomp, compound:lvc, cop, csubj, compound:redup, discourse, aux:q, parataxis, aux, mark, iobj, cc:preconj, appos, clf, xcomp, vocative, list, orphan, dislocated, fixed, dep, goeswith


In [None]:
# Example Turkish sentences for a quick qualitative check of the tagger.
# Only the selected one is tagged; the others are ready-made alternatives
# (change the index below to try them).
example_texts = [
    'Kullanıcı kayıt sistemine belirli bir kimlik numarası ve şifresi kendi isim ve soyismine kayıtlı olacak şekilde erişimi olan kişidir. ',
    "Ankara Türkiye'nin başkentidir. ",
    "Trabzon Türkiye'nin bir şehridir. ",
    "E-dilekçe öğrenciler tarafından danışmanlarına gönderilen dilekçelerdir.",
]

# Build a single Sentence for the chosen text and let the tagger annotate it
# in place.
sentence = Sentence(example_texts[-1])
tagger.predict(sentence)

# Print every token of the sentence, one per line.
for token in sentence:
    print(token)

Token[0]: "E-dilekçe" → amod (0.3527)
Token[1]: "öğrenciler" → nmod:poss (0.9901)
Token[2]: "tarafından" → obl (0.9898)
Token[3]: "danışmanlarına" → obl (0.9649)
Token[4]: "gönderilen" → acl (0.9945)
Token[5]: "dilekçelerdir" → root (0.9938)
Token[6]: "." → punct (0.9998)
