In [None]:
!pip install transformers
!pip install datasets

In [None]:
# Mount Google Drive at /content/drive; the dataset and the base checkpoint
# used later in this notebook are read from paths under "My Drive".
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import torch
import datasets
import numpy as np
import pandas as pd

from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments
from transformers import Trainer
from transformers import pipeline

from datasets import load_metric
from datasets import load_from_disk
from datasets import Dataset, ClassLabel, Sequence, Features, Value

## Обучение

In [None]:
# Base checkpoint and dataset both live on the mounted Google Drive.
model_checkpoint = '/content/drive/My Drive/rubert-finetuned-ner'
clad_class = load_from_disk('/content/drive/My Drive/clad_class')

# Tokenizer and model are restored from the same checkpoint directory.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# A 2-label sequence-classification head is requested; ignore_mismatched_sizes
# lets loading proceed when checkpoint weights do not match that head's shape.
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=2,
    ignore_mismatched_sizes=True,
)

# ROC AUC is the evaluation metric; batches are padded by the collator.
metric = load_metric("roc_auc")
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
def preprocess_function(examples):
    """Tokenize the "sentence" field of a batch.

    Sequences longer than 512 tokens are truncated. Padding is not applied
    here; the module-level ``tokenizer`` is called with truncation only.
    """
    tokenizer_options = {"truncation": True, "max_length": 512}
    return tokenizer(examples["sentence"], **tokenizer_options)

# Apply preprocess_function to the loaded dataset in batched mode.
tokenized_datasets = clad_class.map(preprocess_function, batched=True)

In [None]:
def compute_metrics(eval_preds):
    """Score evaluation predictions with the module-level ``metric``.

    ``eval_preds`` unpacks into ``(logits, labels)``. The logits (a NumPy
    array) are softmaxed over dim 1 and the column at index 1 is passed to
    the metric as the prediction score for each example.

    Returns whatever ``metric.compute`` returns.
    """
    raw_logits, gold_labels = eval_preds
    class_probs = torch.softmax(torch.from_numpy(raw_logits), dim=1)
    positive_scores = class_probs[:, 1].numpy()
    return metric.compute(prediction_scores=positive_scores,
                          references=gold_labels)

In [None]:
# Training configuration: checkpoints and logs go to the
# "rubert-tiny-finetuned-class" directory (relative to the working directory).
args = TrainingArguments(
    "rubert-tiny-finetuned-class",
    evaluation_strategy="epoch",  # run evaluation at the end of every epoch
    save_strategy="epoch",        # write a checkpoint at the end of every epoch
    learning_rate=2e-5,
    num_train_epochs=5,
    weight_decay=0.01,            # a stray non-ASCII character here used to break parsing
)

In [None]:
# Wire the model, training arguments, tokenized splits and metric function
# into a single Trainer instance.
trainer = Trainer(
    args=args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # "train" is used for optimisation, "validation" for per-epoch evaluation.
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
)

In [None]:
# Fine-tune the model, then write the final weights, config and tokenizer to
# args.output_dir. During training the Trainer only writes checkpoint-*
# subfolders, so without this explicit save the output directory itself holds
# no loadable model for the inference section.
train_output = trainer.train()
trainer.save_model()
train_output  # keep the training summary as the cell output

In [None]:
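# Download: uncomment the command below to zip one checkpoint folder for download
# (replace [num] with the number in the checkpoint directory name).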
#!zip -r rubert-tiny-finetuned-class.zip rubert-tiny-finetuned-class/checkpoint-[num]

## Инференс

In [None]:
# Build a classification pipeline from the fine-tuned model directory.
# The directory is expected to contain a saved model and tokenizer.
inference_checkpoint = '/content/rubert-tiny-finetuned-class'

classifier = pipeline(
    task="sentiment-analysis",
    model=inference_checkpoint,
    function_to_apply="softmax",  # convert output logits to class probabilities
)

In [None]:
# Smoke-test the pipeline on the first sentence of the validation split.
first_validation_row = clad_class['validation'][0]
sample = first_validation_row['sentence']
classifier(sample)