In [None]:
import re

In [None]:
def read_corp(file_path):

  with open(file_path, 'r') as f:
    raw_text = f.read()

  raw_docs = re.split(r'\n\t?\n', raw_text)
  token_docs = []
  tag_docs = []
  for doc in raw_docs:
      tokens = []
      tags = []
      for line in doc.split('\n'):
        if line:
          if line.split()[0] != '#': 
            info = line.split('\t')
            tokens.append(info[1])
            tags.append(info[3])
      token_docs.append(tokens)
      tag_docs.append(tags)

  return token_docs, tag_docs

texts, tags = read_corp('/UD_Old_Church_Slavonic-PROIEL/chu_all.conllu')

In [None]:
len(texts)

6339

In [None]:
train_texts = texts[:5071]
train_tags = tags[:5071]
val_texts = texts[5071:]
val_tags = tags[5071:]

In [None]:
unique_tags = set(tag for doc in tags for tag in doc)
tag2id = {tag: id for id, tag in enumerate(unique_tags)}
id2tag = {id: tag for tag, id in tag2id.items()}

In [None]:
task = "pos"
model_checkpoint = "distilbert-base-uncased"
batch_size = 4

In [None]:
from transformers import AutoTokenizer
    
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [None]:
encoded_tags_train = [[tag2id[tag] for tag in doc] for doc in train_tags]

In [None]:
encoded_tags_val = [[tag2id[tag] for tag in doc] for doc in val_tags]

In [None]:
label_all_tokens = True

def tokenize_and_align_labels(texts, tags):
    tokenized_inputs = tokenizer(texts, truncation=True, padding=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(tags):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:

            if word_idx is None:
                label_ids.append(-100)

            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])

            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    return tokenized_inputs, labels

In [None]:
tokenized_train, train_labels = tokenize_and_align_labels(train_texts, encoded_tags_train)

In [None]:
tokenized_val, val_labels = tokenize_and_align_labels(val_texts, encoded_tags_val)

In [None]:
import torch

In [None]:
class CHUDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [None]:
train_dataset = CHUDataset(tokenized_train, train_labels)
val_dataset = CHUDataset(tokenized_val, val_labels)

In [None]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(unique_tags))

Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForTokenClassification: ['vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN t

In [None]:
model_name = model_checkpoint.split("/")[-1]
args = TrainingArguments(
    f"/content/drive/MyDrive/{model_name}-finetuned-1-{task}-chu",
    overwrite_output_dir=True,
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=15,
    weight_decay=0.01,
    push_to_hub=False,
)

In [None]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer)

In [None]:
from datasets import load_metric

In [None]:
metric = load_metric("seqeval")

Downloading:   0%|          | 0.00/2.48k [00:00<?, ?B/s]

In [None]:
import numpy as np

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [id2tag[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [id2tag[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [None]:
trainer = Trainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
trainer.train()

Loading model from /content/drive/MyDrive/distilbert-base-uncased-finetuned-1-pos-chu/checkpoint-11000).
***** Running training *****
  Num examples = 5071
  Num Epochs = 15
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 19020
  Continuing training from checkpoint, will skip to saved global_step
  Continuing training from epoch 8
  Continuing training from global step 11000
  Will skip the first 8 epochs then the first 856 batches in the first epoch. If this takes a lot of time, you can add the `--ignore_data_skip` flag to your launch command, but you will resume the training on data already seen by your model.


  0%|          | 0/856 [00:00<?, ?it/s]

  sequence_length = torch.tensor(batch["input_ids"]).shape[1]
  batch = {k: torch.tensor(v, dtype=torch.int64) for k, v in batch.items()}


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
9,0.2064,0.473198,0.725554,0.676957,0.700414,0.873927
10,0.2563,0.453411,0.742138,0.704704,0.722937,0.884366
11,0.221,0.448164,0.744609,0.726479,0.735432,0.885327
12,0.1907,0.468794,0.742957,0.71968,0.731134,0.885791
13,0.172,0.495118,0.740535,0.722437,0.731374,0.88607
14,0.1549,0.503142,0.744422,0.726571,0.735388,0.88669
15,0.1383,0.521004,0.739975,0.720599,0.730159,0.884273


***** Running Evaluation *****
  Num examples = 1268
  Batch size = 4
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to /content/drive/MyDrive/distilbert-base-uncased-finetuned-1-pos-chu/checkpoint-11500
Configuration saved in /content/drive/MyDrive/distilbert-base-uncased-finetuned-1-pos-chu/checkpoint-11500/config.json
Model weights saved in /content/drive/MyDrive/distilbert-base-uncased-finetuned-1-pos-chu/checkpoint-11500/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/distilbert-base-uncased-finetuned-1-pos-chu/checkpoint-11500/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/distilbert-base-uncased-finetuned-1-pos-chu/checkpoint-11500/special_tokens_map.json
  sequence_length = torch.tensor(batch["input_ids"]).shape[1]
  batch = {k: torch.tensor(v, dtype=torch.int64) for k, v in batch.items()}
Saving model checkpoint to /content/drive/MyDrive/distilbert-base-uncased-finetuned-1-pos-chu/checkpoint-120

TrainOutput(global_step=19020, training_loss=0.08985667565894302, metrics={'train_runtime': 3419.3971, 'train_samples_per_second': 22.245, 'train_steps_per_second': 5.562, 'total_flos': 8891895008395080.0, 'train_loss': 0.08985667565894302, 'epoch': 15.0})

In [None]:
predictions, labels, _ = trainer.predict(val_dataset)
predictions = np.argmax(predictions, axis=2)

true_predictions = [
    [id2tag[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [id2tag[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

results = metric.compute(predictions=true_predictions, references=true_labels)
results

***** Running Prediction *****
  Num examples = 1268
  Batch size = 4
  sequence_length = torch.tensor(batch["input_ids"]).shape[1]
  batch = {k: torch.tensor(v, dtype=torch.int64) for k, v in batch.items()}




{'CONJ': {'f1': 0.7622171203673336,
  'number': 1525,
  'precision': 0.7624671916010499,
  'recall': 0.7619672131147541},
 'DJ': {'f1': 0.6503719447396387,
  'number': 1007,
  'precision': 0.6994285714285714,
  'recall': 0.6077457795431976},
 'DP': {'f1': 0.9630064591896653,
  'number': 861,
  'precision': 0.9738717339667459,
  'recall': 0.9523809523809523},
 'DV': {'f1': 0.8068930562595031,
  'number': 1039,
  'precision': 0.8522483940042827,
  'recall': 0.766121270452358},
 'ERB': {'f1': 0.6942114093959731,
  'number': 2318,
  'precision': 0.6755102040816326,
  'recall': 0.7139775668679896},
 'ET': {'f1': 0.5104895104895104,
  'number': 145,
  'precision': 0.5177304964539007,
  'recall': 0.503448275862069},
 'NTJ': {'f1': 0.8925619834710743,
  'number': 60,
  'precision': 0.8852459016393442,
  'recall': 0.9},
 'OUN': {'f1': 0.6762849413886384,
  'number': 1703,
  'precision': 0.6927339901477833,
  'recall': 0.6605989430416911},
 'RON': {'f1': 0.7123190963642781,
  'number': 1406,
  '