In [None]:
!pip install evaluate seqeval "transformers>=4.30.0"

In [None]:
import os
import numpy as np
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    DataCollatorForTokenClassification,
    TrainingArguments,
    Trainer
)
import evaluate
import torch

In [None]:
def read_cupt(path):
    """
    Read a .cupt file (CoNLL-U columns plus a trailing MWE column).

    Returns a list of sentences. Each sentence is a list of token dicts
    whose values are the raw column strings:
    {
        "id": "3",
        "form": "ran",
        "lemma": "_",
        ...
        "mwe": "1:VID"
    }

    Sentences are separated by blank lines. Comment lines (starting with
    "#") and multi-word token range lines (ID containing "-") are skipped.
    Rows with fewer than 11 columns are padded with "_" so that files
    lacking the MWE column (or with an empty trailing column) do not raise
    an IndexError.
    """
    field_names = (
        "id", "form", "lemma", "upos", "xpos", "feats",
        "head", "deprel", "deps", "misc", "mwe",
    )

    sentences = []
    current = []

    with open(path, "r", encoding="utf8") as f:
        for line in f:
            line = line.strip()

            # A blank line closes the sentence that is being collected.
            if not line:
                if current:
                    sentences.append(current)
                    current = []
                continue

            if line.startswith("#"):
                continue

            cols = line.split("\t")
            if "-" in cols[0]:  # multi-word token line
                continue

            # strip() above removes trailing empty columns, and some files
            # simply have fewer columns; pad so every key gets a value.
            missing = len(field_names) - len(cols)
            if missing > 0:
                cols = cols + ["_"] * missing

            # zip() stops at the 11 known fields, ignoring any extra columns.
            current.append(dict(zip(field_names, cols)))

    # The file may not end with a blank line.
    if current:
        sentences.append(current)

    return sentences

In [None]:

def parse_mwe_column(col):
    """
    Parse one MWE column value into a list of (mwe_id, type) tuples.

    Handle:
        "_", "*"           → empty
        "1"                → continuation of MWE 1 (no type)
        "1:VID"            → start of MWE 1 type VID
        "1;2"              → continuation of both
        "1;2:VID"          → mix

    The type is None when the entry carries no ":TYPE" suffix. Surrounding
    whitespace and empty entries (e.g. a trailing ";") are ignored, and only
    the first ":" separates the id from the type.

    Raises ValueError if an id is not an integer.
    """
    col = col.strip()
    if col in ("_", "*", ""):
        return []

    entries = []
    for part in col.split(";"):
        part = part.strip()
        if not part:
            # Tolerate stray separators such as "1;".
            continue
        # partition() never fails to unpack, unlike split(":") on "1:a:b".
        num, sep, typ = part.partition(":")
        entries.append((int(num), typ if sep else None))
    return entries


############################################
# 3. Convert CUPT → BIO
############################################

def normalize_label(lbl):
    """
    Reduce a possibly multi-valued label to its first component.

    Everything from the first ";" onwards is discarded, e.g.
    "B-LVC.full;B-VID" becomes "B-LVC.full". A label without ";" is
    returned unchanged.
    """
    first, _, _ = lbl.partition(";")
    return first


def cupt_to_bio(sent):
    """
    Convert one sentence (list of token dicts with an "mwe" key) into a
    list of BIO labels, exactly one label per token.

    - Tokens without MWE entries get "O".
    - The first token that mentions an MWE id gets "B-TYPE"; later tokens
      with that id get "I-TYPE".
    - TYPE is the first explicit type seen for the id anywhere in the
      sentence, or "UNK" when the id never carries a type.
    - When a token belongs to several MWEs, only the label of the first
      listed entry is kept. The other MWEs are still marked as started, so
      their later tokens are labelled "I-".
    """
    # Parse each MWE column once and reuse the result in both passes.
    parsed = [parse_mwe_column(tok["mwe"]) for tok in sent]

    # collect known types (first non-None determines type)
    mwe_types = {}
    for entries in parsed:
        for mwe_id, typ in entries:
            if typ is not None:
                mwe_types.setdefault(mwe_id, typ)

    tags = []
    started = set()

    for entries in parsed:
        if not entries:
            tags.append("O")
            continue

        token_labels = []
        for mwe_id, _ in entries:
            the_type = mwe_types.get(mwe_id, "UNK")
            prefix = "I" if mwe_id in started else "B"
            started.add(mwe_id)
            token_labels.append(f"{prefix}-{the_type}")

        # enforce single selection: keep the first entry's label only
        tags.append(token_labels[0])

    return tags

def extract_label_set(train_sents):
    """
    Return the sorted list of distinct BIO labels found in one list of
    sentences (as returned by read_cupt).

    Delegates to extract_label_set_from so the label-collection logic
    lives in a single place.
    """
    return extract_label_set_from(train_sents)

def extract_label_set_from(*sentence_lists):
    """
    Collect every distinct BIO label occurring in any of the given
    sentence lists and return them as a sorted list.

    Each positional argument is a list of sentences (as returned by
    read_cupt). Labels are produced by cupt_to_bio and passed through
    normalize_label before being de-duplicated.
    """
    unique_labels = {
        normalize_label(tag)
        for sents in sentence_lists
        for sent in sents
        for tag in cupt_to_bio(sent)
    }
    return sorted(unique_labels)

In [None]:
class ParsemeDataset:
    """
    Map-style dataset over parsed .cupt sentences.

    Indexing tokenizes one sentence on the fly and returns the tokenizer
    encoding with an added "labels" entry aligned to the word pieces.
    """

    def __init__(self, sentences, tokenizer, label2id, max_length=256):
        """
        sentences:  list of sentences (lists of token dicts with "form" and "mwe")
        tokenizer:  tokenizer called with is_split_into_words=True; its
                    result must provide word_ids()
        label2id:   mapping from BIO label string to integer id
        max_length: truncation length passed to the tokenizer
        """
        self.sentences = sentences
        self.tokenizer = tokenizer
        self.label2id = label2id
        self.id2label = dict((idx, name) for name, idx in label2id.items())
        self.max_length = max_length
        # Set after the first unseen-label warning so it is printed only once.
        self._unknown_label_warned = False

    def __len__(self):
        """Number of sentences in the dataset."""
        return len(self.sentences)

    def __getitem__(self, idx):
        """Tokenize sentence `idx` and attach word-piece-aligned label ids."""
        sentence = self.sentences[idx]
        forms = [token["form"] for token in sentence]
        word_labels = list(map(normalize_label, cupt_to_bio(sentence)))

        encoding = self.tokenizer(
            forms,
            truncation=True,
            is_split_into_words=True,
            max_length=self.max_length,
            return_offsets_mapping=True,
        )

        # Only the first piece of each word carries the word's label; special
        # tokens and continuation pieces are set to -100.
        aligned = []
        previous = None
        for word_index in encoding.word_ids():
            if word_index is None or word_index == previous:
                aligned.append(-100)
            else:
                name = word_labels[word_index]
                if name not in self.label2id:
                    # fallback: map unseen label to "O"
                    if not self._unknown_label_warned:
                        print(f"Warning: unseen label '{name}' encountered — mapping to 'O'.")
                        self._unknown_label_warned = True
                    name = "O"
                aligned.append(self.label2id[name])
            previous = word_index

        # The offsets were requested from the tokenizer but are not returned.
        encoding.pop("offset_mapping")
        encoding["labels"] = aligned
        return encoding

In [None]:

# Module-level seqeval metric object; compute_metrics() below calls
# seqeval.compute(...) on it, so this cell must run before training.
seqeval = evaluate.load("seqeval")

def clean_pred_label(lbl):
    """Return the part of `lbl` before the first ";" (the whole string if there is none)."""
    head, _, _ = lbl.partition(";")
    return head


def align_predictions(predictions, label_ids, id2label):
    """
    Turn raw scores and gold label ids into parallel lists of label strings.

    predictions: array of scores; the arg-max over the last axis is taken
                 as the predicted label id
    label_ids:   gold label ids, iterated in parallel with the predictions;
                 positions equal to -100 are dropped from both outputs
    id2label:    mapping from integer id to label string

    Returns (batch_preds, batch_labels): two lists with one inner list of
    label strings per sequence.
    """
    best_ids = np.argmax(predictions, axis=-1)

    batch_preds, batch_labels = [], []
    for row_pred, row_gold in zip(best_ids, label_ids):
        # Keep only the positions that carry a real gold label.
        kept = [(int(p), int(g)) for p, g in zip(row_pred, row_gold) if g != -100]
        batch_preds.append([clean_pred_label(id2label[p]) for p, _ in kept])
        batch_labels.append([clean_pred_label(id2label[g]) for _, g in kept])

    return batch_preds, batch_labels


def compute_metrics(eval_pred):
    """
    Metric callback: unpacks (logits, labels) and returns a dict with
    precision, recall, f1 and accuracy computed by seqeval.

    NOTE: reads the module-level `model` (for model.config.id2label) and the
    module-level `seqeval` metric object, so both must exist when this runs.
    """
    logits, gold_ids = eval_pred
    id2label = model.config.id2label
    predicted, reference = align_predictions(logits, gold_ids, id2label)

    scores = seqeval.compute(predictions=predicted, references=reference)

    metrics = {}
    metrics["precision"] = scores["overall_precision"]
    metrics["recall"] = scores["overall_recall"]
    metrics["f1"] = scores["overall_f1"]
    # Accuracy falls back to 0.0 when the key is absent from the result.
    metrics["accuracy"] = scores.get("overall_accuracy", 0.0)
    return metrics

In [None]:
def train_model(train_file, dev_file, output_dir, model_name="xlm-roberta-base", seed=42):
    """
    Fine-tune a token-classification model on .cupt data and save it.

    train_file / dev_file: paths to the training and development .cupt files
    output_dir:            directory for checkpoints, the final model and the tokenizer
    model_name:            pretrained checkpoint to start from
    seed:                  random seed applied before the model is created and
                           passed on to TrainingArguments

    Side effect: the created model is stored in the module-level global
    `model`, which compute_metrics() reads for its id2label mapping.
    """
    import inspect
    from transformers import set_seed

    print("Reading data...")
    train_sents = read_cupt(train_file)
    dev_sents = read_cupt(dev_file)

    print("Extracting labels from train+dev (to avoid unseen labels)...")
    labels = extract_label_set_from(train_sents, dev_sents)
    if "O" not in labels:
        labels = ["O"] + labels  # ensure 'O' present
    label2id = {l: i for i, l in enumerate(labels)}
    id2label = {i: l for l, i in label2id.items()}

    # The classification head is freshly initialised by from_pretrained, so
    # seed before creating the model to make runs repeatable.
    set_seed(seed)

    print("Loading model & tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    global model
    model = AutoModelForTokenClassification.from_pretrained(
        model_name,
        num_labels=len(labels),
        id2label=id2label,
        label2id=label2id,
    )

    print("Preparing datasets...")
    train_dataset = ParsemeDataset(train_sents, tokenizer, label2id)
    dev_dataset = ParsemeDataset(dev_sents, tokenizer, label2id)

    data_collator = DataCollatorForTokenClassification(tokenizer)

    training_kwargs = dict(
        output_dir=output_dir,
        save_strategy="epoch",
        logging_steps=100,
        learning_rate=5e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=5,
        weight_decay=0.01,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        greater_is_better=True,
        report_to="none",
        seed=seed,
    )
    # The evaluation-strategy keyword was renamed between transformers
    # releases; use whichever name the installed version accepts.
    ta_params = inspect.signature(TrainingArguments.__init__).parameters
    strategy_key = "eval_strategy" if "eval_strategy" in ta_params else "evaluation_strategy"
    training_kwargs[strategy_key] = "epoch"
    args = TrainingArguments(**training_kwargs)

    trainer_kwargs = dict(
        model=model,
        args=args,
        train_dataset=train_dataset,
        eval_dataset=dev_dataset,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )
    # Newer Trainer versions take the tokenizer as `processing_class` and
    # warn about `tokenizer`; older ones only know `tokenizer`.
    trainer_params = inspect.signature(Trainer.__init__).parameters
    tokenizer_key = "processing_class" if "processing_class" in trainer_params else "tokenizer"
    trainer_kwargs[tokenizer_key] = tokenizer
    trainer = Trainer(**trainer_kwargs)

    print("Training...")
    trainer.train()
    trainer.save_model(output_dir)
    # Store the tokenizer next to the weights so inference can load both
    # from output_dir regardless of which model_name was used.
    tokenizer.save_pretrained(output_dir)
    print("Done!")


In [None]:
def bio_to_parseme_tags(bio_tags):
    """
    Convert BIO tags (e.g. B-LVC.full, I-LVC.full, O)
    into PARSEME MWE column values:
      - O            → "*"
      - B-TYPE       → "<id>:TYPE" with a fresh id
      - I-TYPE       → "<id>" of the most recent MWE of that TYPE
    Ids are assigned incrementally starting at 1.

    An I-TYPE with no earlier MWE of that TYPE is treated as the start of a
    new MWE. Any tag that is not "O", "B-TYPE" or "I-TYPE" (no "-", unknown
    prefix, or empty type) is mapped to "*".

    The result always has exactly one entry per input tag.
    """
    next_id = 1
    active_mwes = {}  # mwe_type -> id of the most recently opened MWE of that type
    result = []

    for tag in bio_tags:
        prefix, sep, mwe_type = tag.partition("-")

        # "O" and malformed tags: emit the "no MWE" marker instead of
        # crashing or dropping the position.
        if not sep or prefix not in ("B", "I") or not mwe_type:
            result.append("*")
            continue

        if prefix == "I" and mwe_type in active_mwes:
            # Continue existing MWE
            result.append(str(active_mwes[mwe_type]))
            continue

        # "B", or an "I" without a matching open MWE (inconsistent
        # segmentation): start a new MWE.
        active_mwes[mwe_type] = next_id
        result.append(f"{next_id}:{mwe_type}")
        next_id += 1

    return result




def fill_cupt_with_predictions(
    model_dir: str,
    input_cupt_path: str,
    output_cupt_path: str,
    id2label: dict
):
    """
    Load a trained token-classification model from `model_dir`, predict one
    BIO tag per word of `input_cupt_path`, and write a copy of the file to
    `output_cupt_path` whose last column holds the tags converted by
    bio_to_parseme_tags ("*" for O tags).

    model_dir:        directory holding the fine-tuned model (and, ideally, its tokenizer)
    input_cupt_path:  .cupt file to annotate
    output_cupt_path: where the annotated file is written
    id2label:         mapping from label id to BIO label; keys may be ints
                      or numeric strings

    Multi-word-token range rows (ID containing "-") are not fed to the
    model, matching the training reader; their last column is set to "*".
    Words that fall beyond the tokenizer's truncation limit receive "*".
    """

    # Prefer the tokenizer stored with the checkpoint so it matches the
    # model; fall back to the base tokenizer if the directory has none.
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_dir)
    except (OSError, ValueError):
        tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
    model = AutoModelForTokenClassification.from_pretrained(model_dir)
    model.eval()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Accept mappings whose keys are numeric strings (e.g. loaded from JSON).
    id2label = {int(k): v for k, v in id2label.items()}

    # ---- Utilities ---------------------------------------------------------

    def _read_rows(path):
        """Read sentences as lists of rows: str for comments, list of columns otherwise."""
        sents = []
        cur = []
        with open(path, "r", encoding="utf8") as f:
            for line in f:
                line = line.rstrip("\r\n")
                if line.strip() == "":
                    if cur:
                        sents.append(cur)
                        cur = []
                elif line.startswith("#"):
                    cur.append(line)
                else:
                    cur.append(line.split("\t"))
        if cur:
            sents.append(cur)
        return sents

    def _write_rows(sents, path):
        """Write rows back, one blank line after every sentence."""
        with open(path, "w", encoding="utf8") as f:
            for sent in sents:
                for row in sent:
                    if isinstance(row, str):
                        f.write(row + "\n")
                    else:
                        f.write("\t".join(row) + "\n")
                f.write("\n")

    def _is_word_row(row):
        """True for token rows that the training reader would also keep."""
        return isinstance(row, list) and "-" not in row[0]

    # ------------------------------------------------------------------------

    sents = _read_rows(input_cupt_path)
    print(f"Loaded {len(sents)} sentences.")

    truncated = 0

    for sent in sents:

        word_rows = [row for row in sent if _is_word_row(row)]
        tokens = [row[1] for row in word_rows]

        # Range rows are never part of the model input; mark them "no MWE".
        # NOTE(review): "*" on range rows follows the docstring's O-marker
        # convention — confirm against the shared-task format checker.
        for row in sent:
            if isinstance(row, list) and not _is_word_row(row):
                row[-1] = "*"

        # Comment-only sentence: nothing to predict.
        if not tokens:
            continue

        encoded = tokenizer(
            tokens,
            is_split_into_words=True,
            truncation=True,
            return_tensors="pt"
        )

        input_ids = encoded["input_ids"].to(device)
        attention_mask = encoded["attention_mask"].to(device)

        with torch.no_grad():
            logits = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            ).logits

        pred_ids = logits.argmax(dim=-1)[0].tolist()
        word_ids = encoded.word_ids()

        # convert subword predictions → per-token predictions, keeping the
        # prediction of each word's first piece
        first_piece_label = {}
        for pred_id, word_idx in zip(pred_ids, word_ids):
            if word_idx is not None and word_idx not in first_piece_label:
                first_piece_label[word_idx] = id2label[pred_id]

        # Words lost to truncation (or yielding no pieces) default to "O"
        # instead of aborting the whole run.
        if len(first_piece_label) < len(tokens):
            truncated += 1
        token_preds = [first_piece_label.get(i, "O") for i in range(len(tokens))]

        # ---- Convert to PARSEME format ------------------------------------
        parseme_tags = bio_to_parseme_tags(token_preds)

        # ---- Write into last column ---------------------------------------
        for row, tag in zip(word_rows, parseme_tags):
            row[-1] = tag

    if truncated:
        print(f"Note: {truncated} sentence(s) had words without a prediction; those words were tagged '*'.")

    _write_rows(sents, output_cupt_path)
    print(f"Wrote predictions to: {output_cupt_path}")

In [None]:
# Folder that holds train.cupt and dev.cupt for the training call below.
# NOTE(review): the relative "drive/MyDrive" prefix presumably assumes a
# mounted Google Drive in the working directory — confirm before running.
data_dir = "drive/MyDrive/datasets/parseme/subtask1/PL"

In [None]:
# Fine-tune xlm-roberta-base on train.cupt, evaluating on dev.cupt; the
# model is saved to ./pl_model.
train_model(
    train_file=os.path.join(data_dir, "train.cupt"),
    dev_file=os.path.join(data_dir, "dev.cupt"),
    output_dir="pl_model",
    model_name="xlm-roberta-base"
)

Reading data...
Extracting labels from train+dev (to avoid unseen labels)...
Loading model & tokenizer...


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Preparing datasets...
Training...


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0978,0.096509,0.786541,0.735453,0.76014,0.975725
2,0.0563,0.087538,0.745361,0.788227,0.766195,0.97558
3,0.0511,0.08722,0.799866,0.805819,0.802831,0.980233
4,0.0189,0.109395,0.825034,0.829499,0.82726,0.981318
5,0.0131,0.121054,0.849687,0.826116,0.837736,0.982812


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Done!


In [None]:
# Reload the fine-tuned model from disk. This rebinds the module-level
# `model` that compute_metrics() reads.
output_dir = "pl_model"

# The tokenizer is loaded from the base checkpoint name, not from output_dir.
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
model = AutoModelForTokenClassification.from_pretrained(output_dir)

In [None]:
# Annotate the blind test file with the fine-tuned model.
model_dir = "./pl_model"
# Label mapping taken from the model reloaded in the previous cell.
id2label = model.config.id2label

# Input and output paths are relative to the current working directory
# (not to data_dir).
fill_cupt_with_predictions(
    model_dir,
    "pl.test.blind.cupt",
    "pl_prediction.cupt",
    id2label
)

Loaded 1127 sentences.
Wrote predictions to: pl_prediction.cupt
