In [None]:
!pip install evaluate seqeval "transformers>=4.30.0"

In [None]:
import os
import numpy as np
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    DataCollatorForTokenClassification,
    TrainingArguments,
    Trainer
)
import evaluate
import torch

from utils.parse import *
from utils.data import ParsemeDataset
from utils.predictions import *
from utils.train import *


In [None]:
def compute_metrics(eval_pred, id2label=None):
    """Score one evaluation pass with the seqeval metric.

    Parameters
    ----------
    eval_pred : tuple
        ``(logits, labels)`` pair; both halves are forwarded unchanged to
        ``align_predictions``.
    id2label : mapping, optional
        Label map handed to ``align_predictions``. When omitted, the map is
        read from the notebook-level ``model`` (``model.config.id2label``),
        which is what the function always did before this parameter existed.
        Bind it with ``functools.partial(compute_metrics, id2label=...)`` to
        use the callback before a global ``model`` has been created.

    Returns
    -------
    dict
        ``precision``, ``recall`` and ``f1`` taken from the metric's
        ``overall_*`` entries, plus ``accuracy`` (``0.0`` when the metric
        does not report ``overall_accuracy``).

    Raises
    ------
    NameError
        If ``id2label`` is not supplied and no global ``model`` is defined.
    """
    # Declared global so the lazily loaded metric is cached in the notebook
    # namespace and reused by every later evaluation call.
    global seqeval

    logits, labels = eval_pred

    if id2label is None:
        try:
            id2label = model.config.id2label
        except NameError as exc:
            # On a fresh kernel `model` is only assigned after training, so
            # fail with an actionable message instead of a bare NameError.
            raise NameError(
                "compute_metrics needs a label map: define a global `model` "
                "before evaluation, or bind one with "
                "functools.partial(compute_metrics, id2label=...)"
            ) from exc

    preds_list, labels_list = align_predictions(logits, labels, id2label)

    # Reuse an existing `seqeval` object if one is already in scope (e.g. from
    # a star import); otherwise load it once via the `evaluate` library.
    try:
        metric = seqeval
    except NameError:
        metric = seqeval = evaluate.load("seqeval")

    results = metric.compute(
        predictions=preds_list,
        references=labels_list
    )

    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results.get("overall_accuracy", 0.0)
    }

In [None]:
# Directory holding the PARSEME subtask 1 Polish (PL) data; the training cell
# joins "train.cupt" and "dev.cupt" onto it.
# NOTE(review): the path is relative to the current working directory and
# looks like a mounted Google Drive folder — confirm the drive is mounted
# before running.
data_dir = "drive/MyDrive/datasets/parseme/subtask1/PL"

In [None]:
# Resolve the two .cupt inputs first so the call below reads as plain config.
train_cupt_path = os.path.join(data_dir, "train.cupt")
dev_cupt_path = os.path.join(data_dir, "dev.cupt")

# `train_model` arrives through the utils star imports (presumably
# utils.train); it receives the data files, the output directory, the base
# checkpoint name and the metrics callback defined in this notebook.
train_model(
    train_file=train_cupt_path,
    dev_file=dev_cupt_path,
    output_dir="pl_model",
    model_name="xlm-roberta-base",
    compute_metrics=compute_metrics,
)

In [None]:
# Same directory name that was passed as `output_dir` to train_model above.
output_dir = "pl_model"

# The tokenizer is loaded from the base checkpoint name, not from output_dir;
# it is not referenced again in the cells visible here.
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
# `model` becomes the notebook-level global that compute_metrics and the
# prediction cell read `config.id2label` from.
model = AutoModelForTokenClassification.from_pretrained(output_dir)

In [None]:
# NOTE(review): this names the same directory as `output_dir` ("pl_model"),
# only spelled with a leading "./" — consider reusing a single variable.
model_dir = "./pl_model"
# Label map taken from the model loaded in the previous cell.
id2label = model.config.id2label

# `fill_cupt_with_predictions` arrives through the utils star imports
# (presumably utils.predictions). Going by the argument names it reads the
# blind test file and writes "pl_prediction.cupt" — confirm against the helper.
# NOTE(review): both file names are relative to the working directory, not to
# `data_dir`; verify the blind test file really lives there.
fill_cupt_with_predictions(
    model_dir,
    "pl.test.blind.cupt",
    "pl_prediction.cupt",
    id2label
)