In [None]:
!pip install --quiet transformers evaluate datasets baal

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m39.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m30.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.7/61.7 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m21.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m31.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
!pip install --quiet --upgrade accelerate

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/219.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m215.0/219.1 kB[0m [31m7.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m219.1/219.1 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Mount Google Drive at /content/drive so files stored under MyDrive can be read and written
# from this Colab session (Colab-only: `google.colab` is not available elsewhere).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Copy the train / eval CSV files from the mounted Drive folder to the local Colab disk;
# the dataset-loading cell reads them from /content.
!cp "/content/drive/MyDrive/RuSportSum/train.csv" /content/train.csv
!cp "/content/drive/MyDrive/RuSportSum/eval.csv" /content/eval.csv

#### Разметка релевантных новостей с активным обучением

In [None]:
import os
import random
from copy import deepcopy

import numpy as np
import torch.backends
import transformers
import evaluate

from datasets import load_dataset
from tqdm import tqdm
from transformers import BertForSequenceClassification
from transformers import BertTokenizer, TrainingArguments
from transformers import set_seed

from transformers import (
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    PretrainedConfig,
    set_seed,
    Trainer,
    TrainingArguments
)

transformers.utils.logging.set_verbosity_warning()

from baal.active import get_heuristic
from baal.active.dataset.nlp_datasets import (
    active_huggingface_dataset,
    HuggingFaceDatasets,
)
from baal.bayesian.dropout import patch_module
from baal.transformers_trainer_wrapper import BaalTransformersTrainer
from baal.active import get_heuristic, ActiveLearningDataset
from baal.active.dataset.base import Dataset

from typing import List
from pprint import pprint

# Single seed shared by every random number generator seeded below.
SEED = 3

# Seed Python's `random` module and PyTorch explicitly.
random.seed(SEED)
torch.manual_seed(SEED)

# Set the transformers seed to ensure that initial weights are identical
set_seed(SEED)

In [None]:
class CustomHuggingFaceDatasets(Dataset):
    """
    Sentence-pair support for `huggingface.datasets`: (https://github.com/huggingface/datasets).

    The purpose of this wrapper is to separate the labels from the rest of the sample
    information and make the dataset ready to be used by `baal.active.ActiveLearningDataset`.
    Every sample is made of two text fields which are tokenized together as a pair.

    Args:
        dataset (Dataset): a dataset provided by huggingface (anything indexable by column name).
        tokenizer (transformers.PreTrainedTokenizer): a tokenizer provided by huggingface.
            When omitted, no tokenization happens and `input_ids` / `attention_mask`
            are returned as `None`.
        target_key (str): target key used in the dataset's dictionary.
        input_key_1 (str): key of the first text field of the pair.
        input_key_2 (str): key of the second text field of the pair.
        max_seq_len (int): length every tokenized pair is padded / truncated to.
    """

    def __init__(
        self,
        dataset,
        tokenizer=None,
        target_key: str = "label",
        input_key_1: str = "sentence1",
        # The second input must default to the *second* sentence column; defaulting to
        # "sentence1" would silently pair every first sentence with itself.
        input_key_2: str = "sentence2",
        max_seq_len: int = 400,
    ):
        self.dataset = dataset
        self.targets = self.dataset[target_key]
        self.text1 = self.dataset[input_key_1]
        self.text2 = self.dataset[input_key_2]
        # Distinct target values; a sample's class id is the position of its target in this list.
        self.targets_list: List = np.unique(self.targets).tolist()
        self.input_ids, self.attention_masks = (
            self._tokenize(tokenizer, max_seq_len) if tokenizer else ([], [])
        )

    @property
    def num_classes(self):
        """Number of distinct target values found when the dataset was wrapped."""
        return len(self.targets_list)

    def _tokenize(self, tokenizer, max_seq_len):
        """Tokenize all (text1, text2) pairs at once; returns `(input_ids, attention_mask)`."""
        # For speed purposes, we should use fast tokenizers here, but that is up to the caller
        tokenized = tokenizer(
            self.text1,
            self.text2,
            add_special_tokens=True,
            max_length=max_seq_len,
            return_token_type_ids=False,
            padding="max_length",
            return_attention_mask=True,
            return_tensors="pt",
            truncation=True,
        )
        return tokenized["input_ids"], tokenized["attention_mask"]

    def label(self, idx: int, value: int):
        """Label the item.
        Args:
            idx: index to label
            value: Value to label the index.
        """
        self.targets[idx] = value

    def __len__(self):
        return len(self.text1)

    def __getitem__(self, idx):
        """Return one sample as a dict with `input_ids`, `inputs`, `attention_mask` and `label`."""
        # Map the raw target value to its class id.
        target = self.targets_list.index(self.targets[idx])

        return {
            "input_ids": self.input_ids[idx].flatten() if len(self.input_ids) > 0 else None,
            # Human-readable version of the pair (shown to the annotator).
            "inputs": self.text1[idx] + '\n' + self.text2[idx],
            "attention_mask": self.attention_masks[idx].flatten()
            if len(self.attention_masks) > 0
            else None,
            "label": torch.tensor(target, dtype=torch.long),
        }

def custom_active_huggingface_dataset(
    dataset,
    tokenizer=None,
    target_key: str = "label",
    input_key_1: str = "sentence1",
    input_key_2: str = "sentence2",
    max_seq_len: int = 400,
    **kwargs
):
    """
    Build a `baal.active.ActiveLearningDataset` on top of a sentence-pair huggingface dataset.

    Args:
        dataset (torch.utils.data.Dataset): a dataset provided by huggingface.
        tokenizer (transformers.PreTrainedTokenizer): a tokenizer provided by huggingface.
        target_key (str): target key used in the dataset's dictionary.
        input_key_1 (str): key of the first text field of each pair.
        input_key_2 (str): key of the second text field of each pair.
        max_seq_len (int): max length of a sequence to be used for padding the shorter sequences.
        kwargs (Dict): Parameters forwarded to 'ActiveLearningDataset'.
    Returns:
        a baal.active.ActiveLearningDataset object.
    """
    # First wrap the raw dataset so labels are separated from the inputs ...
    wrapped = CustomHuggingFaceDatasets(
        dataset,
        tokenizer=tokenizer,
        target_key=target_key,
        input_key_1=input_key_1,
        input_key_2=input_key_2,
        max_seq_len=max_seq_len,
    )
    # ... then hand it to baal together with any extra options.
    return ActiveLearningDataset(wrapped, **kwargs)

Information on the hyperparameters below:

* epoch: Number of times to run the AL loop
* batch_size: The train and eval batch size for the HF trainer arguments
* model: Hugging Face model
* query_size: Number of samples to query at each AL iteration for labelling
* heuristic: The acquisition function/heuristic used to select the most informative samples
* iterations: The number of MC-Dropout iterations used to estimate the uncertainties
* shuffle_prop: Additional noise to counter selection bias
* learning_epoch: Training epochs for the Hugging Face trainer

In [None]:
# Settings shared by the model, trainer and active-learning cells below.
hyperparams = dict(
    epoch=4,                               # number of active-learning rounds
    batch_size=16,                         # per-device train and eval batch size
    model="DeepPavlov/rubert-base-cased",  # Hugging Face checkpoint to fine-tune
    query_size=50,                         # samples sent for labelling per round
    heuristic="bald",                      # name of the acquisition heuristic
    iterations=10,                         # MC-Dropout passes when predicting on the pool
    shuffle_prop=0.05,                     # second argument given to the heuristic factory
    learning_epoch=2,                      # training epochs inside each round
)

In [None]:
# Detect whether a GPU is available and turn on the cuDNN benchmark mode
use_cuda = torch.cuda.is_available()
torch.backends.cudnn.benchmark = True

# Two-class label mapping stored in the model config (both directions)
label2id = {"LABEL_0": 0, "LABEL_1": 1}
id2label = {0: "LABEL_0", 1: "LABEL_1"}

# Load the sequence-classification model from the configured checkpoint
hf_model = AutoModelForSequenceClassification.from_pretrained(
    hyperparams["model"],
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)

# Tokenizer matching the model checkpoint
tokenizer = AutoTokenizer.from_pretrained(hyperparams["model"])

# Enable dropouts for predictions
hf_model = patch_module(hf_model)

# Move the model to the GPU when there is one, otherwise keep it on the CPU,
# and record the matching cuda flag
hf_model.to("cuda:0" if use_cuda else "cpu")
no_cuda = not use_cuda

Downloading (…)lve/main/config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were n

Downloading (…)okenizer_config.json:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/1.65M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
# Label values an annotator is allowed to enter
label_list = [0, 1]

# Read the train and eval CSV files into one dataset with two splits
data = load_dataset(
    "csv",
    data_files={
        "train": "/content/train.csv",
        "eval": "/content/eval.csv",
    },
)

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-5c97641fdcada1ed/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating eval split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-5c97641fdcada1ed/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
def get_label_from_data(active_dataset, data, target, input1, input2, indexes) -> List[int]:
    """
    Get labels from the data itself; this assumes that you have
    already labelled some samples in your initial dataset.

    Args:
    ----
    active_dataset : Active dataset which consists of train and pool

    data : The raw dataset the active dataset was built from, indexable
    by column name

    target : Name of the column in `data` holding the labels

    input1 : Name of the first text column (unused, kept so existing
    calls keep working)

    input2 : Name of the second text column (unused, kept so existing
    calls keep working)

    indexes : Pool indexes of the points for which labels are to be
    fetched from the data

    Returns:
    ----
    labels: Returns the corresponding labels

    """
    # Once some points have been labelled, part of the pool has become train, so pool
    # indexes no longer match positions in the raw data. Translate them back to indexes
    # into the original data first.
    raw_data_idx = active_dataset._pool_to_oracle_index(indexes)

    # Fetch the label column once instead of once per index: looking it up inside the
    # loop re-reads the whole column for every requested point.
    target_column = data[target]

    return [target_column[idx] for idx in raw_data_idx]

In [None]:
def get_label_human_oracle(active_dataset, indexes, allowed_labels=None) -> tuple:
    """
    Get labels from human oracle. During the AL loop some samples
    will go to the human labeller.

    For every requested pool index the sample text is printed and a label is
    read from standard input. Entering -1 skips the sample; anything that is
    not an integer or not an allowed label is asked again.

    Args:
    ----
    active_dataset : Active dataset which consists of train and pool

    indexes : Pool indexes of the points for which labels are to be
    requested from the annotator

    allowed_labels : Label values the annotator may enter. Defaults to the
    notebook-level `label_list`.

    Returns:
    ----
    labels: Labels entered for the samples that were not skipped

    indexes_upd: `indexes` without the skipped samples, aligned with `labels`

    """
    if allowed_labels is None:
        # Fall back to the label set defined at notebook level.
        allowed_labels = label_list

    # List for corresponding labels
    labels = []

    # Positions inside `indexes` (not pool indexes) of the skipped samples
    skipped = []

    for sample_idx, idx in enumerate(indexes):

        while True:
            print(idx)
            pprint(active_dataset.pool[idx]['inputs'], width=150)
            # Only the conversion of the typed answer is guarded: an error raised while
            # fetching the sample must surface instead of re-prompting forever.
            try:
                label = int(input())
            except ValueError:
                print("Sorry, I didn't understand that.")
                continue
            if label == -1:
                print("Skipping this sample")
                skipped.append(sample_idx)
                break
            if label not in allowed_labels:
                print(f"Allowed labels are {allowed_labels}")
                continue
            labels.append(label)
            break
        print("\n")

    # Drop the skipped samples so the remaining indexes line up with `labels`.
    indexes_upd = np.delete(indexes, skipped)

    return labels, indexes_upd

In [None]:
# The full training split is used; to iterate faster, a subset can be taken instead,
# e.g. data['train'].select(range(10_000)).
small_data = data['train']

# Wrap the training data for active learning and allow labels to be written to it
active_set = custom_active_huggingface_dataset(
    small_data,
    tokenizer=tokenizer,
    target_key='result_match',
    input_key_1='name_body',
    input_key_2='last_part_content',
)
active_set.can_label = True

# The first 700 items of the dataset form the initial labelled set;
# their labels are taken from the data itself.
label_from_data = get_label_from_data(
    active_set,
    small_data,
    'result_match',
    'name_body',
    'last_part_content',
    range(700),
)
active_set.label(range(700), label_from_data)

# The evaluation split is wrapped the same way and used as the test set
valid_set = CustomHuggingFaceDatasets(
    data['eval'],
    tokenizer=tokenizer,
    target_key='result_match',
    input_key_1='name_body',
    input_key_2='last_part_content',
)
test_set = valid_set

In [None]:
# Setup Heuristics: the acquisition function is looked up by name and
# receives the configured shuffle proportion.
heuristic = get_heuristic(
    hyperparams["heuristic"], hyperparams["shuffle_prop"]
)

# Model save checkpoint
# NOTE(review): this value is not referenced by any other cell visible in this notebook.
save_checkpoint = 2

# F1 score, used as the evaluation metric of the trainer.
metric = evaluate.load("f1")

def compute_metrics(eval_pred):
    """Compute F1 from an evaluation `(predictions, labels)` pair.

    The predicted class of each sample is the argmax over axis 1 of `predictions`.
    """
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels)

# Keep track of initial model weights: the active learning loop reloads this
# snapshot after every labelling round.
init_weights = deepcopy(hf_model.state_dict())

# Evaluation and checkpointing both happen once per epoch; checkpoints are written
# to the current working directory and the best model is reloaded when training ends.
training_args = TrainingArguments(
    output_dir=".",
    num_train_epochs=hyperparams["learning_epoch"],
    per_device_train_batch_size=hyperparams["batch_size"],
    per_device_eval_batch_size=hyperparams["batch_size"],
    weight_decay=0.01,
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True
)

# Active Learning Trainer Wrapper: trains on the labelled part of `active_set`
# and evaluates on `test_set`.
baal_trainer = BaalTransformersTrainer(
    model=hf_model,
    args=training_args,
    train_dataset=active_set,
    eval_dataset=test_set,
    tokenizer=None,
    compute_metrics=compute_metrics
)

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

In [None]:
# One entry per labelling round: which items are labelled and the next training set size.
logs = []
# Index of the final round; no new samples are queried after it.
last_epoch = hyperparams["epoch"] - 1

for epoch in tqdm(range(hyperparams["epoch"])):
    # Train on the currently labelled data, using the training arguments
    # the trainer was created with.
    baal_trainer.train()
    print("\n")

    # Validation!
    #eval_metrics = baal_trainer.evaluate()
    #print("\n")

    # After the last round the freshly trained model is kept as is:
    # nothing more is queried and the weights are not reset.
    if (epoch != last_epoch):
      # MC-Dropout to gather uncertainties on the unlabelled pool
      predictions = baal_trainer.predict_on_dataset(
          active_set.pool, iterations=hyperparams["iterations"]
      )
      print("\n")

      # Acquisition: rank the pool with the configured heuristic and keep the top `query_size` items
      top_uncertainty = heuristic(predictions)[: hyperparams.get("query_size", 1)]

      # Send the samples for labelling from human oracle; skipped samples are
      # removed from the returned indexes
      label_from_oracle, points_to_label_oracle = get_label_human_oracle(
          active_set, top_uncertainty
      )

      # Label active dataset
      active_set.label(points_to_label_oracle, label_from_oracle)

      # We reset the model weights to relearn from the new trainset.
      baal_trainer.load_state_dict(init_weights)
      # NOTE(review): presumably makes the trainer build a fresh LR scheduler on the next train() - confirm
      baal_trainer.lr_scheduler = None

      active_logs = {
          "epoch": epoch,
          "labeled_data": active_set.labelled_map,
          "Next Training set size": len(active_set),
      }
      logs.append({**active_logs})



Epoch,Training Loss,Validation Loss,F1
1,0.4494,0.531894,0.796209
2,0.209,0.241835,0.936759




[275-MainThread  ] [baal.transformers_trainer_wrapper:predict_on_dataset_generator:67] [2m2023-05-28T10:48:54.829879Z[0m [[32m[1minfo     [0m] [1mStart Predict                 [0m [36mdataset[0m=[35m11764[0m



  0%|          | 0/736 [00:00<?, ?it/s][A
  0%|          | 1/736 [00:04<53:09,  4.34s/it][A
  0%|          | 2/736 [00:08<51:54,  4.24s/it][A
  0%|          | 3/736 [00:12<51:49,  4.24s/it][A
  1%|          | 4/736 [00:17<52:07,  4.27s/it][A
  1%|          | 5/736 [00:21<52:35,  4.32s/it][A
  1%|          | 6/736 [00:25<53:07,  4.37s/it][A
  1%|          | 7/736 [00:30<53:42,  4.42s/it][A
  1%|          | 8/736 [00:35<54:13,  4.47s/it][A
  1%|          | 9/736 [00:39<54:27,  4.49s/it][A
  1%|▏         | 10/736 [00:44<54:20,  4.49s/it][A
  1%|▏         | 11/736 [00:48<53:59,  4.47s/it][A
  2%|▏         | 12/736 [00:52<53:39,  4.45s/it][A
  2%|▏         | 13/736 [00:57<53:10,  4.41s/it][A
  2%|▏         | 14/736 [01:01<52:45,  4.38s/it][A
  2%|▏         | 15/736 [01:05<52:17,  4.35s/it][A
  2%|▏         | 16/736 [01:10<51:50,  4.32s/it][A
  2%|▏         | 17/736 [01:14<51:26,  4.29s/it][A
  2%|▏         | 18/736 [01:18<51:07,  4.27s/it][A
  3%|▎         | 19/736 [01:2



7266
('Бразилия – Колумбия. Неймар покинул поле на носилках. Нападающий сборной Бразилии Неймар получил травму в матче 1/4 финала чемпионата мира-2014. '
 'Форвард «Барселоны» покинул поле на носилках после удара коленом в спину от колумбийского футболиста. Вместо него на поле вышел Энрике. Бразилия – '
 'Колумбия – 2:1. Как это было\n'
 'Рамос не хуже разных неймаров раскидал соперников на бегу финтами и вырулил атаку своей сборной на финиш 4 в 4! Однако пас в офсайд запортил все '
 'дело. Паулиньо, уходя с поля, показывает, кого не надо забывать держать. Эрнанес вместо него. Суньига с разбега коленом в крестец - приличный '
 'рестлеровский прием выводит Неймара из игры. Энрике вместо Неймара, который на носилках транспортируется прямиком в раздевалку. Блокируют удар '
 'Бакки. Всей командой пошли колумбийцы в атаку. Родригес играет в тело, возможно, своего будущего одноклубника. Несладко пришлось Марсело получить '
 'плечом в грудь от Хамеса. +5 минут. Если будут играть так, как се

 25%|██▌       | 1/4 [1:04:24<3:13:12, 3864.27s/it]





Epoch,Training Loss,Validation Loss,F1
1,0.2467,0.1449,0.970414
2,0.0931,0.188067,0.966862




[275-MainThread  ] [baal.transformers_trainer_wrapper:predict_on_dataset_generator:67] [2m2023-05-28T11:53:23.180679Z[0m [[32m[1minfo     [0m] [1mStart Predict                 [0m [36mdataset[0m=[35m11714[0m



  0%|          | 0/733 [00:00<?, ?it/s][A
  0%|          | 1/733 [00:04<50:20,  4.13s/it][A
  0%|          | 2/733 [00:08<50:35,  4.15s/it][A
  0%|          | 3/733 [00:12<50:58,  4.19s/it][A
  1%|          | 4/733 [00:16<51:26,  4.23s/it][A
  1%|          | 5/733 [00:21<52:00,  4.29s/it][A
  1%|          | 6/733 [00:25<52:28,  4.33s/it][A
  1%|          | 7/733 [00:30<53:03,  4.39s/it][A
  1%|          | 8/733 [00:34<53:40,  4.44s/it][A
  1%|          | 9/733 [00:39<54:15,  4.50s/it][A
  1%|▏         | 10/733 [00:43<54:28,  4.52s/it][A
  2%|▏         | 11/733 [00:48<54:21,  4.52s/it][A
  2%|▏         | 12/733 [00:52<54:00,  4.50s/it][A
  2%|▏         | 13/733 [00:57<53:34,  4.46s/it][A
  2%|▏         | 14/733 [01:01<53:11,  4.44s/it][A
  2%|▏         | 15/733 [01:05<52:42,  4.40s/it][A
  2%|▏         | 16/733 [01:10<52:13,  4.37s/it][A
  2%|▏         | 17/733 [01:14<51:47,  4.34s/it][A
  2%|▏         | 18/733 [01:18<51:23,  4.31s/it][A
  3%|▎         | 19/733 [01:2



2329
('Красич получил травму в матче со «Спартаком». Полузащитник «Фенербахче» Милош Красич получил травму в начале ответного матча плей-офф квалификации '
 'Лиги чемпионов против «Спартака» (первый матч – 1:2). Вместо него на 16-й минуте на поле появился Милослав Стох. Sports.ru ведет текстовую '
 'трансляцию матча «Фенербахче» – «Спартак» .\n'
 'Страшно представить, что было бы, будь на месте Кюйта Крауч или хотя бы пол-Крауча - сантиметров не хватило Дирку, чтобы замкнуть пас на дальней '
 'штанге. Макеев будет доигрывать матч с судорогами. Вдевятером москвичи, по сути. Дзюба сейчас полаялся с Коркмазом - и арбитр предъявил им по '
 'карте. Соу! Двойной удар! ДИКАНЬ. Ногти и заусенцы кончились. Грызу зубы. Дикань сейвит после дальнего выстрела, а затем и намертво забирает '
 'повторную попытку из-за штрафной! Макеева уносят на носилках. Ему бы вернуться поскорей, но не застрять при создании "вне игры" с окаменелой '
 'ногой. Правда, турки уже всем составом в нашей штрафной. Пушным

 50%|█████     | 2/4 [2:07:49<2:07:38, 3829.29s/it]





Epoch,Training Loss,Validation Loss,F1
1,0.4607,0.354096,0.888889
2,0.1513,0.209688,0.951076




[275-MainThread  ] [baal.transformers_trainer_wrapper:predict_on_dataset_generator:67] [2m2023-05-28T12:57:02.128274Z[0m [[32m[1minfo     [0m] [1mStart Predict                 [0m [36mdataset[0m=[35m11664[0m



  0%|          | 0/729 [00:00<?, ?it/s][A
  0%|          | 1/729 [00:04<50:01,  4.12s/it][A
  0%|          | 2/729 [00:08<50:21,  4.16s/it][A
  0%|          | 3/729 [00:12<50:46,  4.20s/it][A
  1%|          | 4/729 [00:16<51:14,  4.24s/it][A
  1%|          | 5/729 [00:21<51:44,  4.29s/it][A
  1%|          | 6/729 [00:25<52:16,  4.34s/it][A
  1%|          | 7/729 [00:30<52:53,  4.40s/it][A
  1%|          | 8/729 [00:34<53:31,  4.45s/it][A
  1%|          | 9/729 [00:39<53:54,  4.49s/it][A
  1%|▏         | 10/729 [00:43<53:57,  4.50s/it][A
  2%|▏         | 11/729 [00:48<53:44,  4.49s/it][A
  2%|▏         | 12/729 [00:52<53:24,  4.47s/it][A
  2%|▏         | 13/729 [00:57<53:01,  4.44s/it][A
  2%|▏         | 14/729 [01:01<52:35,  4.41s/it][A
  2%|▏         | 15/729 [01:05<52:10,  4.38s/it][A
  2%|▏         | 16/729 [01:10<51:45,  4.36s/it][A
  2%|▏         | 17/729 [01:14<51:21,  4.33s/it][A
  2%|▏         | 18/729 [01:18<50:59,  4.30s/it][A
  3%|▎         | 19/729 [01:2



1805
('«Магнитка» – ЦСКА. Коварж сравнял счет за 17 секунд до конца третьего периода и перевел игру в овертайм. Форвард «Магнитки» Ян Коварж сравнял счет '
 'за 17 секунд до окончания третьего периода третьего матча финальной серии Кубка Гагарина с ЦСКА (2:2). Чешскому игроку магнитогорцев помогли '
 'отличиться Данис Зарипов и Александр Семин. Таким образом, игра переходит в овертайм. Sports.ru ведет текстовую онлайн-трансляцию этого матча.\n'
 'Удаление у "Металлурга"! Батыршин за толчок клюшкой на пятаке у своих ворот. Тайм-аут взял Квартальнов. Все или ничего сейчас? гол!!!! Да Коста - '
 '2:3! Дубль!! 7-й гол в плей-офф! Как это было: кистевой бросок с левого края из верха круга вбрасывания оказался неберущимся для Кошечкина! И '
 'продолжает действовать правило финала плей-офф уже в третьем матче - забивший первым проигрывает... 2:3 в матче и 1-2 в финальной серии - ЦСКА '
 'вырывает победу в концовке первого овертайма! В состязание двух терпений и точных бросков сегодня были л

 75%|███████▌  | 3/4 [3:08:53<1:02:34, 3754.12s/it]





Epoch,Training Loss,Validation Loss,F1
1,0.4883,0.430454,0.862921
2,0.1718,0.121726,0.974155


100%|██████████| 4/4 [3:12:08<00:00, 2882.00s/it]








In [None]:
# Reload a saved checkpoint and evaluate it on the test set with the same
# training arguments and metric as during training.
# NOTE(review): the checkpoint path is hard-coded; presumably checkpoint-108 is the best
# checkpoint of the last active learning round - verify before reusing this cell.
m = AutoModelForSequenceClassification.from_pretrained("/content/checkpoint-108")
trainer = BaalTransformersTrainer(
    model=m,
    args=training_args,
    train_dataset=active_set,
    eval_dataset=test_set,
    tokenizer=None,
    compute_metrics=compute_metrics
)
output = trainer.evaluate()
output

{'eval_loss': 0.11491850763559341,
 'eval_f1': 0.9763779527559056,
 'eval_runtime': 9.8966,
 'eval_samples_per_second': 37.791,
 'eval_steps_per_second': 2.425}

In [None]:
# Persist the evaluated checkpoint to Google Drive (the "bald" folder of the project directory).
!cp -r "/content/checkpoint-108" "/content/drive/MyDrive/RuSportSum/bald"