In [None]:
!pip install transformers
!pip install datasets
!pip install seqeval

In [None]:
# Mount Google Drive at /content/drive; the dataset is later read from
# '/content/drive/My Drive/clad_ner'.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import datasets
import numpy as np
import pandas as pd

from transformers import AutoTokenizer
from transformers import AutoModelForTokenClassification
from transformers import DataCollatorForTokenClassification
from transformers import TrainingArguments
from transformers import Trainer
from transformers import pipeline

from datasets import load_metric
from datasets import load_from_disk
from datasets import Dataset, ClassLabel, Sequence, Features, Value

## Обучение

In [None]:
# Load the NER dataset stored on the mounted Drive. It is indexed by split name
# below ('train', 'validation').
clad_ner = load_from_disk('/content/drive/My Drive/clad_ner')

# Base checkpoint that is fine-tuned in this notebook.
model_checkpoint = 'cointegrated/rubert-tiny'

# The tokenizer must match the base checkpoint; the collator is built from it
# and is handed to the Trainer for batching.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# seqeval metric used by compute_metrics.
metric = load_metric("seqeval")

In [None]:
#huggingface
def align_labels_with_tokens(labels, word_ids):
    """Expand word-level label ids to one label id per sub-word token.

    Args:
        labels: sequence of integer label ids, one per word.
        word_ids: for each token, the index of the word it came from,
            or None for tokens that belong to no word.

    Returns:
        A list as long as ``word_ids``. Tokens with no word get -100. The
        first token of each word gets that word's label. Every further token
        of the same word gets the word's label, bumped by one when it is odd
        (presumably turning a B-X id into the matching I-X id; this relies on
        the label list ordering, TODO confirm).
    """
    aligned = []
    previous = None

    for idx in word_ids:
        starts_new_word = idx != previous
        if starts_new_word:
            previous = idx

        # Tokens without a source word never carry a real label.
        if idx is None:
            aligned.append(-100)
            continue

        tag = labels[idx]
        # Continuation pieces of a word: odd ids move to the next id.
        if not starts_new_word and tag % 2 == 1:
            tag += 1
        aligned.append(tag)

    return aligned

In [None]:
#huggingface
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and attach token-level labels.

    Args:
        examples: batch with a "tokens" column (word lists) and a "ner_tags"
            column (word-level label ids).

    Returns:
        The tokenizer output for the batch, with an extra "labels" entry
        holding one aligned label list per example.
    """
    encoded = tokenizer(
        examples["tokens"],
        is_split_into_words=True,
        truncation=True,
        max_length=512,
    )

    # word_ids(row) maps each token of example `row` back to its source word.
    encoded["labels"] = [
        align_labels_with_tokens(word_labels, encoded.word_ids(row))
        for row, word_labels in enumerate(examples["ner_tags"])
    ]
    return encoded

In [None]:
# Apply tokenization and label alignment in batches. The original columns are
# dropped (names taken from the 'train' split, so the other splits are assumed
# to share them), leaving only what tokenize_and_align_labels returns.
tokenized_datasets = clad_ner.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=clad_ner['train'].column_names,
)

In [None]:
#huggingface
def compute_metrics(eval_preds):
    """Score predictions with seqeval and return the overall metrics.

    Args:
        eval_preds: a ``(logits, labels)`` pair; the predicted class is the
            argmax over the last axis of ``logits``.

    Returns:
        Dict with "precision", "recall", "f1" and "accuracy", taken from the
        metric's overall scores.
    """
    logits, labels = eval_preds
    predicted_ids = np.argmax(logits, axis=-1)

    # Positions labelled -100 are ignored on both sides.
    references = [
        [label_names[tag] for tag in gold_row if tag != -100]
        for gold_row in labels
    ]

    hypotheses = []
    for pred_row, gold_row in zip(predicted_ids, labels):
        kept = [label_names[p] for p, g in zip(pred_row, gold_row) if g != -100]
        hypotheses.append(kept)

    scores = metric.compute(predictions=hypotheses, references=references)

    summary = {}
    summary["precision"] = scores["overall_precision"]
    summary["recall"] = scores["overall_recall"]
    summary["f1"] = scores["overall_f1"]
    summary["accuracy"] = scores["overall_accuracy"]
    return summary

In [None]:
# Label vocabulary comes from the dataset's `ner_tags` feature; a label's
# position in this list is its class id.
label_names = clad_ner['train'].features['ner_tags'].feature.names

# Keep class ids as ints in both directions. Building id2label with str(i) keys
# and inverting it made label2id map every label to a *string* id ("0", "1", ...)
# rather than the integer class index.
id2label = dict(enumerate(label_names))
label2id = {label: idx for idx, label in id2label.items()}

# Load the base checkpoint with a token-classification head sized and named
# after the label maps above.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)

In [None]:
# Training configuration. The first positional argument is the output
# directory where checkpoints are written.
args = TrainingArguments(
    "rubert-tiny-finetuned-ner",
    evaluation_strategy="epoch",  # evaluate once per epoch
    save_strategy="epoch",  # save a checkpoint once per epoch
    learning_rate=2e-5,
    num_train_epochs=10,
    weight_decay=0.01,
)

In [None]:
# Wire the model, tokenized splits, collator and metric function together.
# Evaluation runs on the "validation" split.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

In [None]:
# Run fine-tuning; with the TrainingArguments defined earlier this evaluates
# and saves a checkpoint at the end of every epoch.
trainer.train()

In [None]:
# Optional: archive a checkpoint so it can be downloaded from Colab.
# NOTE(review): the directory below is `rubert-finetuned-ner`, while
# TrainingArguments writes to `rubert-tiny-finetuned-ner` -- confirm the path
# before enabling this command.
#!zip -r rubert-tiny-finetuned-ner.zip rubert-finetuned-ner/checkpoint-18000

## Инференс

In [None]:
# Load a fine-tuned checkpoint for inference. The directory name must match the
# output directory passed to TrainingArguments ("rubert-tiny-finetuned-ner");
# a relative output directory is assumed to resolve under /content, Colab's
# default working directory.
# NOTE(review): `checkpoint-18000` depends on the number of optimisation steps
# in the run -- adjust it if the dataset size, batch size or epoch count change.
model_checkpoint = '/content/rubert-tiny-finetuned-ner/checkpoint-18000'

# aggregation_strategy="simple" makes the pipeline return grouped entities
# rather than one prediction per sub-word token.
token_classifier = pipeline(
    "token-classification", model=model_checkpoint, aggregation_strategy="simple"
)

In [None]:
# Smoke test: run the pipeline on one noisy Russian string (digits and number
# words glued to other text) and print the input next to the predictions.
sample = 'автомобиль8девять5@ в хорошемвосемь ,90пять 1один'
print(sample, token_classifier(sample))