In [2]:
import pandas as pd
import json
import os
import re
import numpy as np
from transformers import AutoTokenizer, AutoModel, Seq2SeqTrainer, Seq2SeqTrainingArguments, T5ForConditionalGeneration, GPT2Tokenizer
import torch.nn.functional as F
import torch.nn as nn
import torch
from datasets import Dataset, load_dataset
import random
from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split
from collections import Counter
from copy import deepcopy
from sklearn.metrics import precision_recall_curve, auc, roc_auc_score, f1_score, accuracy_score
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts, CosineAnnealingLR
from transformers.models.bert.modeling_bert import BertPreTrainedModel
from transformers.models.deberta_v2.modeling_deberta_v2 import DebertaV2PreTrainedModel
from transformers.modeling_outputs import SequenceClassifierOutput
from sklearn.metrics.pairwise import cosine_similarity
from torch.utils.data import Dataset as BaseDataset
from transformers import Adafactor

[2023-08-05 11:54:09,897] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)


2023-08-05 11:54:10.721183: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-08-05 11:54:10.767353: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
def set_seed(seed: int = 42) -> None:
    """Seed the random number generators used in this notebook.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU plus every CUDA device), forces cuDNN into deterministic mode, and
    exports ``PYTHONHASHSEED``.

    Args:
        seed: Value applied to every generator.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every GPU rather than only the current one; this call is
    # silently ignored when CUDA is not available.
    torch.cuda.manual_seed_all(seed)
    # When running on the cuDNN backend, two further options must be set:
    # pick deterministic kernels and disable the autotuner that may select
    # different algorithms from run to run.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Python reads PYTHONHASHSEED only at interpreter start-up, so this does
    # not change hashing in the already-running kernel; it is picked up by
    # processes launched after this point.
    os.environ["PYTHONHASHSEED"] = str(seed)
    print(f"Random seed set as {seed}")


set_seed()

Random seed set as 42


In [4]:
# Load the 'IlyaGusev/gazeta' dataset through `datasets.load_dataset`; the
# result is a DatasetDict with train / test / validation splits.
dataset = load_dataset('IlyaGusev/gazeta')

No config specified, defaulting to: gazeta/default
Reusing dataset gazeta (/home/user/.cache/huggingface/datasets/IlyaGusev___gazeta/default/2.0.0/e2d171980aa248bc22e0af4f8485ad69071fc8e5f3d54a253c71eb434f6694bd)


  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Checkpoint identifier; the same name is used for the tokenizer here and for
# the model weights loaded later in the notebook.
model_name = 'ai-forever/rut5-large'
# Global tokenizer consumed by `prepare_features`.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [8]:
# Display the splits, their columns and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'summary', 'title', 'date', 'url'],
        num_rows: 60964
    })
    test: Dataset({
        features: ['text', 'summary', 'title', 'date', 'url'],
        num_rows: 6793
    })
    validation: Dataset({
        features: ['text', 'summary', 'title', 'date', 'url'],
        num_rows: 6369
    })
})

In [9]:
def prepare_features(examples):
    """Tokenize a batch of articles together with their reference summaries.

    Relies on the module-level ``tokenizer``. Articles are padded/truncated
    to 2048 tokens and summaries to 1024 tokens, so every row in the output
    has the same length.

    Args:
        examples: Batch mapping that provides the "text" and "summary" columns.

    Returns:
        The tokenizer output for the articles, extended with a "labels"
        array holding the summary token ids where padding positions have
        been replaced by -100.
    """
    articles = list(examples["text"])
    summaries = [f'{summary}' for summary in examples['summary']]

    model_inputs = tokenizer(
        articles,
        max_length=2048,
        padding='max_length',
        truncation=True,
        return_tensors='np',
    )
    summary_ids = tokenizer(
        summaries,
        max_length=1024,
        padding="max_length",
        truncation=True,
        return_tensors='np',
    ).input_ids

    # -100 is the label value skipped by the Hugging Face seq2seq loss, so
    # padding tokens do not contribute to training.
    is_padding = summary_ids == tokenizer.pad_token_id
    summary_ids[is_padding] = -100

    model_inputs['labels'] = summary_ids
    return model_inputs

In [10]:
# Tokenize every split in batches with `prepare_features`. The result is
# rebound to `dataset`, so from here on the name refers to the tokenized data.
dataset = dataset.map(prepare_features, batched=True)

  0%|          | 0/61 [00:00<?, ?ba/s]

  0%|          | 0/7 [00:00<?, ?ba/s]

  0%|          | 0/7 [00:00<?, ?ba/s]

In [11]:
# Load the pretrained T5 weights for `model_name`.
# NOTE(review): DEVICE_MAP is not defined in any cell visible in this notebook;
# confirm where it is set, otherwise this line raises NameError on a fresh kernel.
model = T5ForConditionalGeneration.from_pretrained(model_name, device_map=DEVICE_MAP)

Downloading pytorch_model.bin:   0%|          | 0.00/2.95G [00:00<?, ?B/s]

In [12]:
# Fine-tuning configuration. With a per-device batch of 2 and 16 accumulation
# steps, each optimizer step covers 2 * 16 = 32 examples per device.
training_args = Seq2SeqTrainingArguments(
    output_dir='models/rut5large',
    # Optimisation
    optim='adafactor',
    learning_rate=1e-4,
    weight_decay=0.001,
    num_train_epochs=5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=16,
    # Evaluate, checkpoint and log once per epoch; keep a single checkpoint.
    evaluation_strategy='epoch',
    save_strategy='epoch',
    logging_strategy='epoch',
    save_total_limit=1,
    # Evaluation does not run generation.
    predict_with_generate=False,
    # No external experiment tracker.
    report_to='none',
)

In [13]:
# Bind the model, the training configuration and the tokenized train /
# validation splits into a single trainer object.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['validation'],
)

In [None]:
# Start fine-tuning with the model, arguments and datasets held by `trainer`.
trainer.train()