In [1]:
import os
import json

import numpy as np

from transformers import (AutoConfig, 
                          AutoTokenizer, 
                          AutoModelForSeq2SeqLM,
                          DataCollatorForSeq2Seq,
                          Seq2SeqTrainer,
                          Seq2SeqTrainingArguments
                        )

from datasets import load_dataset, DatasetDict
import evaluate


from tqdm import tqdm

In [2]:
# Checkpoint identifier shared by both from_pretrained calls below, so the
# tokenizer and the seq2seq model are loaded from the same checkpoint.
model_name = "ai-forever/ruT5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


#### Загрузим данные с Habr и поделим на train/val/test 

In [5]:
# Read the raw Habr articles; the JSON loader exposes them as a single 'train' split.
articles = load_dataset("json", data_files='data/habr_large/hf_habr_articles_large.json')

# Two-stage split with a fixed seed: hold out 40 % of the articles first,
# then cut that held-out part in half -> roughly 60 / 20 / 20.
train_data = articles['train'].train_test_split(test_size=0.4, seed=42)
val_data = train_data['test'].train_test_split(test_size=0.5, seed=42)

# Pair each final split name with the piece it comes from.
habr_dataset = DatasetDict(
    zip(
        ('train', 'validation', 'test'),
        (train_data['train'], val_data['train'], val_data['test']),
    )
)
habr_dataset

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['tags', 'title', 'text', 'category', 'article_id'],
        num_rows: 4672
    })
    validation: Dataset({
        features: ['tags', 'title', 'text', 'category', 'article_id'],
        num_rows: 1557
    })
    test: Dataset({
        features: ['tags', 'title', 'text', 'category', 'article_id'],
        num_rows: 1558
    })
})

In [4]:
# Token budgets: article bodies are cut at 512 tokens, titles at 30.
max_input_length = 512
max_target_length = 30


def preprocess_function(examples):
    """Tokenize a batch of articles and attach tokenized titles as labels.

    Args:
        examples: Batch mapping that provides the "text" (model input) and
            "title" (generation target) columns.

    Returns:
        The tokenizer output for the texts, extended with a "labels" entry
        holding the token ids of the titles.
    """
    # Inputs: article texts, truncated to the input budget.
    model_inputs = tokenizer(
        examples["text"], truncation=True, max_length=max_input_length
    )
    # Targets: titles, truncated to the (much shorter) target budget.
    title_ids = tokenizer(
        examples["title"], truncation=True, max_length=max_target_length
    )["input_ids"]

    model_inputs["labels"] = title_ids
    return model_inputs

In [11]:
# Tokenize every split in batches, then drop the original raw columns so only
# the fields produced by preprocess_function remain.
tokenized_datasets = (
    habr_dataset
    .map(preprocess_function, batched=True)
    .remove_columns(habr_dataset["train"].column_names)
)

Map:   0%|          | 0/4672 [00:00<?, ? examples/s]

Map:   0%|          | 0/1557 [00:00<?, ? examples/s]

Map:   0%|          | 0/1558 [00:00<?, ? examples/s]

In [14]:
BATCH_SIZE = 8
NUM_TRAIN_EPOCHS = 8
# A positive max_steps takes precedence over num_train_epochs, so with this cap
# training stops after 1000 optimizer steps regardless of NUM_TRAIN_EPOCHS.
# Set MAX_STEPS = -1 to train for the full NUM_TRAIN_EPOCHS instead.
MAX_STEPS = 1000
# NOTE(review): absolute path at the filesystem root; confirm it is writable
# in the environment where this notebook runs.
output_path = '/models/lab3/t5'

training_args = Seq2SeqTrainingArguments(
    output_dir=output_path,
    # do_train / do_eval / do_predict are kept as in the original configuration;
    # train() and evaluate() are invoked explicitly in later cells.
    do_train=True,
    do_eval=True,
    do_predict=True,
    # Training length: the step cap wins over the epoch count (see MAX_STEPS).
    max_steps=MAX_STEPS,
    num_train_epochs=NUM_TRAIN_EPOCHS,
    # Validation metrics are computed at the end of every epoch.
    evaluation_strategy="epoch",
    # NOTE(review): 1e-6 is a very small learning rate for fine-tuning; confirm
    # it is intentional.
    learning_rate=1e-6,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    weight_decay=0.01,
    # Keep only the most recent checkpoint on disk.
    save_total_limit=1,
    # Evaluate with generate() so ROUGE is computed on generated titles.
    predict_with_generate=True,
    # Let generation run as long as the label truncation length; otherwise the
    # model's default generation length cap is used and generated titles may
    # be cut shorter than the references they are compared against.
    generation_max_length=max_target_length,
    # The training loss is logged every 200 steps (not once per epoch).
    logging_strategy='steps',
    logging_steps=200,
)

In [None]:
rouge_score = evaluate.load("rouge")


def compute_metrics(eval_pred):
    """Decode generated and reference token ids and score them with ROUGE.

    Args:
        eval_pred: Pair ``(predictions, labels)`` handed over by the trainer.
            Both hold token ids; positions that must be ignored are marked
            with -100.

    Returns:
        dict mapping every key returned by the ROUGE metric to its F-measure
        scaled to 0-100 and rounded to 4 decimal places.
    """
    predictions, labels = eval_pred
    # Some model outputs arrive as a tuple; the token ids are its first item.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 marks ignored/padded positions and is not a valid vocabulary id, so
    # replace it with the pad token before decoding (padding is then removed
    # by skip_special_tokens).
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Compute ROUGE scores on the decoded strings.
    result = rouge_score.compute(
        predictions=decoded_preds, references=decoded_labels
    )

    # The `evaluate` ROUGE metric returns plain floats, while the older
    # `datasets.load_metric` API returned aggregate objects exposing
    # `.mid.fmeasure`. Accept both so the function works with either backend
    # instead of failing with AttributeError on a float.
    return {
        key: round(
            float(value.mid.fmeasure if hasattr(value, "mid") else value) * 100, 4
        )
        for key, value in result.items()
    }

In [None]:
# Collator that batches the tokenized examples for the seq2seq model.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Wire everything together: model, hyper-parameters, train/validation splits,
# batching and the ROUGE-based metric callback.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

In [None]:
# Fine-tune the model using the trainer configured above.
trainer.train()

In [None]:
# Run evaluation with the trainer's defaults. The trainer was constructed with
# the validation split as eval_dataset, so presumably that is the split scored
# here; the held-out test split is not evaluated in this cell.
trainer.evaluate()