In [36]:
from datasets import load_dataset, Dataset, load_metric
import numpy as np
import nltk
from transformers import AutoTokenizer, T5TokenizerFast, Seq2SeqTrainingArguments, Seq2SeqTrainer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
import torch
import pandas as pd
from transformers.optimization import Adafactor, AdafactorSchedule

# Вариант 1. Самостоятельный fine-tune модели

In [2]:
# Load the `IlyaGusev/gazeta` dataset, pinned to revision v2.0 so reruns read the same data.
gazeta = load_dataset("IlyaGusev/gazeta", revision="v2.0")

In [45]:
# Encode raw text so the model can process it
def encode_data(input_sequences):
  """Tokenize one text or a collection of texts as summarization inputs.

  Every text gets the "summarize: " task prefix and is tokenized with the
  notebook-level `tokenizer`, truncated to the notebook-level `max_input`
  tokens and padded to the longest sequence in the batch.

  Args:
    input_sequences: a single string, or an iterable (list, tuple, ...) of strings.

  Returns:
    The tokenizer output with PyTorch tensors (`input_ids`, `attention_mask`).
  """
  task_prefix = "summarize: "
  # Only a bare string needs wrapping (otherwise the comprehension below would
  # iterate over its characters). Any other iterable of strings is used as-is.
  if isinstance(input_sequences, str):
    input_sequences = [input_sequences]
  encoded = tokenizer(
    [task_prefix + sequence for sequence in input_sequences],
    padding="longest",
    max_length=max_input,
    truncation=True,
    return_tensors="pt",
  )
  return encoded

In [40]:
# Tokenize the dataset
def encode_dataset(dataset, max_output = 64):
    """Tokenize every row of `dataset` into model inputs and labels.

    Args:
        dataset: indexable collection of rows exposing 'text' and 'summary' fields.
        max_output: maximum number of label tokens; longer summaries are truncated.

    Returns:
        A `datasets.Dataset` with 'input_ids', 'attention_mask' and 'labels'
        columns, each holding one variable-length list of ints per row.
    """
    input_ids_rows = []
    attention_mask_rows = []
    label_rows = []
    for i in range(len(dataset)):
        row = dataset[i]
        encoded_row = encode_data(row['text'])
        # encode_data returns a batch of size one; keep that single row as a
        # plain list so rows of different lengths can share a column.
        input_ids_rows.append(encoded_row.input_ids[0].tolist())
        attention_mask_rows.append(encoded_row.attention_mask[0].tolist())
        target_encoding = tokenizer(row['summary'], max_length=max_output, truncation=True)
        # Pad positions (if any) are set to -100, the value the original code
        # used to mask labels.
        label_rows.append([
            -100 if token_id == tokenizer.pad_token_id else token_id
            for token_id in target_encoding.input_ids
        ])
    data = Dataset.from_dict({
        'input_ids': input_ids_rows,
        'attention_mask': attention_mask_rows,
        'labels': label_rows,
    })
    return data

In [50]:
# Number of rows in the training split.
len(gazeta['train'])

60964

In [None]:
# Tokenize the test split.
# NOTE(review): encode_dataset -> encode_data reads the notebook-level `tokenizer` and
# `max_input`, which are only defined in cells further down; on a fresh kernel those
# cells have to run before this one.
encoded_gazeta = encode_dataset(gazeta['test'])

In [21]:
# Alternative dataset
urukhan = load_dataset('UrukHan/t5-russian-summarization' )
urukhan_train = urukhan['train']
# Evaluate on a 2% sample of the test split; the fixed seed keeps the sample
# identical across reruns so metrics stay comparable.
urukhan_test = urukhan['test'].train_test_split(test_size=0.02, seed=42)['test']

Downloading metadata:   0%|          | 0.00/1.04k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/49.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/49.4M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/49.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/49.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/10.4M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/452275 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/23804 [00:00<?, ? examples/s]

In [4]:
# ROUGE metric object used by compute_metrics.
# NOTE(review): the captured output shows a warning pointing at this call — presumably a
# deprecation notice for load_metric; confirm against the installed `datasets` version.
metric = load_metric("rouge")
# Punkt data is required by nltk.sent_tokenize, which compute_metrics calls.
nltk.download('punkt')

  metric = load_metric("rouge")
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Boris\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [18]:
model_name = 'UrukHan/t5-russian-summarization' # Model name on the HuggingFace Hub
max_input = 1024  # Maximum length of the input text (in tokens)
max_output  = 64  # Maximum length of the generated summary (in tokens)
batch_size = 8  # Used as both the per-device train and eval batch size
output_dir = 'tmp_trainer'  # Passed to Seq2SeqTrainingArguments as output_dir

In [11]:
# Load the tokenizer and the seq2seq model from the `model_name` checkpoint.
tokenizer = T5TokenizerFast.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
# Collator for seq2seq batches, built from the tokenizer and the model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [12]:
# Set the model config's max_length to `max_output` tokens.
model.config.max_length = max_output

In [13]:
# Splits handed to the trainer as train / eval datasets.
# NOTE(review): these are the raw gazeta splits, not the output of encode_dataset —
# confirm the trainer receives tokenized columns (input_ids / attention_mask / labels).
train = gazeta['train']
test = gazeta['test']

In [14]:
def compute_metrics(eval_pred):
  """Compute ROUGE F-measures (in percent) and mean generated length.

  Args:
    eval_pred: pair `(predictions, labels)` of token-id arrays; `predictions`
      may also be a tuple whose first element holds the generated ids.

  Returns:
    Dict mapping each ROUGE key to its mid F-measure * 100, plus "gen_len"
    (mean count of non-pad tokens per prediction), all rounded to 4 decimals.

  NOTE(review): the ROUGE implementation's default tokenizer may discard
  non-Latin characters, which would distort scores for Russian text — verify.
  """
  predictions, labels = eval_pred
  # Some models return extra outputs next to the generated ids.
  if isinstance(predictions, tuple):
    predictions = predictions[0]
  pad_id = tokenizer.pad_token_id
  # -100 is a padding/ignore marker, not a vocabulary id: swap it for the pad
  # token before decoding, for predictions as well as labels.
  predictions = np.where(predictions != -100, predictions, pad_id)
  labels = np.where(labels != -100, labels, pad_id)
  decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
  decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
  # Put every sentence on its own line before scoring.
  decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
  decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]
  result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
  result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
  prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
  result["gen_len"] = np.mean(prediction_lens)

  return {k: round(v, 4) for k, v in result.items()}

In [None]:
# WARNING: CUDA is required (fp16=True below). Google Colab can be used.
# pip install transformers[torch]

# To train on the alternative dataset instead, uncomment:
# test = urukhan_test
# train = urukhan_train

# NOTE(review): `train` / `test` are assigned from the raw `gazeta` splits earlier in
# the notebook; the trainer needs tokenized columns (input_ids / attention_mask /
# labels, e.g. what encode_dataset builds) — confirm before running.

training_args = Seq2SeqTrainingArguments(
  output_dir = output_dir,
  evaluation_strategy='steps',
  #learning_rate=2e-5,
  eval_steps=5000,
  save_steps=5000,
  num_train_epochs=1,
  predict_with_generate=True,  # evaluate on generated text so compute_metrics can score it
  per_device_train_batch_size=batch_size,
  per_device_eval_batch_size=batch_size,
  fp16=True,
  save_total_limit=2,
  #generation_max_length=256,
  #generation_num_beams=4,
  # NOTE(review): presumably has no effect, because a ready-made optimizer
  # (with weight_decay=0.0) is handed to the trainer below — confirm.
  weight_decay=0.005,
  #logging_dir='logs',
)

# Optimizer: Adafactor with a fixed external learning rate
# (relative_step / scale_parameter / warmup_init are all disabled).
optimizer = Adafactor(
    model.parameters(),
    lr=1e-5,
    eps=(1e-30, 1e-3),
    clip_threshold=1.0,
    decay_rate=-0.8,
    beta1=None,
    weight_decay=0.0,
    relative_step=False,
    scale_parameter=False,
    warmup_init=False,
)
lr_scheduler = AdafactorSchedule(optimizer)

trainer = Seq2SeqTrainer(
  model=model,
  args=training_args,
  train_dataset = train,
  eval_dataset = test,
  # Use the seq2seq collator built next to the model so that labels are padded
  # together with the inputs; it was created but never passed to the trainer.
  data_collator = data_collator,
  optimizers = (optimizer, lr_scheduler),
  tokenizer = tokenizer,
  compute_metrics=compute_metrics
)

In [None]:
# Run fine-tuning; training_args sets evaluation and checkpointing to every 5000 steps.
trainer.train()

In [None]:
# Summarize a few raw test articles with the model.
input_sequences = [gazeta['test'][i]['text'] for i in range(3)]

# encode_data returns a tokenizer encoding; move it to the model's device and
# unpack it so generate() receives input_ids and attention_mask as keywords.
encoded = encode_data(input_sequences).to(model.device)
predicts = model.generate(**encoded)

decoded = tokenizer.batch_decode(predicts, skip_special_tokens=True)
decoded

# Вариант 2. Просто используем предобученную модель

In [None]:
# Stand-alone alternative: summarize with the pretrained FRED-T5 summarizer.
# NOTE: this cell rebinds `tokenizer` and `model`, replacing the objects used by the
# fine-tuning cells above.
import torch
from transformers import GPT2Tokenizer, T5ForConditionalGeneration 
tokenizer = GPT2Tokenizer.from_pretrained('RussianNLP/FRED-T5-Summarizer',eos_token='</s>')
model = T5ForConditionalGeneration.from_pretrained('RussianNLP/FRED-T5-Summarizer')
# Fall back to CPU when no CUDA device is available instead of failing in .to().
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

# Prompt: the '<LM>' prefix and an instruction line, followed by the text to shorten.
input_text='<LM> Сократи текст.\n Текст (от лат. textus — ткань; сплетение, сочетание) — зафиксированная на каком-либо материальном носителе человеческая мысль; в общем плане связная и полная последовательность символов. Существуют две основные трактовки понятия «текст»: имманентная (расширенная, философски нагруженная) и репрезентативная (более частная). Имманентный подход подразумевает отношение к тексту как к автономной реальности, нацеленность на выявление его внутренней структуры. Репрезентативный — рассмотрение текста как особой формы представления информации о внешней тексту действительности.'
input_ids=torch.tensor([tokenizer.encode(input_text)]).to(device)
# Beam search combined with top-p sampling, so the output varies between runs.
outputs=model.generate(input_ids,eos_token_id=tokenizer.eos_token_id,
                    num_beams=5,
                    min_new_tokens=17,
                    max_new_tokens=200,
                    do_sample=True,
                    no_repeat_ngram_size=4,
                    top_p=0.9)
# Drop the first generated token before decoding.
print(tokenizer.decode(outputs[0][1:]))