In [1]:
!pip install transformers[torch] sacrebleu evaluate
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q datasets
!pip install rouge-metric

Collecting sacrebleu
  Downloading sacrebleu-2.4.2-py3-none-any.whl (106 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.7/106.7 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
Collecting accelerate>=0.21.0 (from transformers[torch])
  Downloading accelerate-0.30.1-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m21.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-2.8.2-py3-none-any.whl (17 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Collecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-2.19.2-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54

## Загружаем данные

В тренировочной выборке 4411 примера, в валидационной 111 пример, в тестовой 992 примера.

In [None]:
import json
from sklearn.model_selection import train_test_split

data = json.load(open('russian_dataset_rewrite_checked.json'))
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
val_data, test_data = train_test_split(test_data, test_size=0.1, random_state=42)

In [None]:
len(train_data), len(test_data), len(val_data)

(4411, 111, 992)

In [None]:
# with open('train_data_rewrite.json', 'w', encoding='utf-8') as file:
#     json.dump({'data': train_data}, file, ensure_ascii=False, indent=2)

# with open('test_data_rewrite.json', 'w', encoding='utf-8') as file:
#     json.dump({'data': test_data}, file, ensure_ascii=False, indent=2)

# with open('val_data_rewrite.json', 'w', encoding='utf-8') as file:
#     json.dump({'data': val_data}, file, ensure_ascii=False, indent=2)

In [2]:
from datasets import load_dataset

dataset = load_dataset('json', data_files={'train': 'train_data_rewrite.json',
                                           'test': 'test_data_rewrite.json',
                                           'val': 'val_data_rewrite.json'},
                       field='data')

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating val split: 0 examples [00:00, ? examples/s]

## Предобработка данных

In [3]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration

tokenizer = T5Tokenizer.from_pretrained('ai-forever/ruT5-base',eos_token='</s>')
model = T5ForConditionalGeneration.from_pretrained('ai-forever/ruT5-base')

def preprocess_function(examples):
    # prompt with context and real phrase
    phrases = [example for example in examples["Phrase"]]
    contexts = ['\n,'.join([f'<SC5>"{c}"</SC5>'  for c in example[-2:]]) for example in examples["History"]]
    inputs = [f'''
Перепиши данную фразу согласно данному контексту, так чтобы фраза стала контекстно независимой. Исключи из фразы эллипсис и кореференцию.
КОНТЕКСТ:
{context}
Фраза:
{phrases[i]}
Перефраз: <extra_id_0>
''' for i, context in enumerate(contexts)]
    # rewrites
    targets = [example for example in examples["Rewrite"]]
    model_inputs = tokenizer(inputs, text_target=targets, max_length=128, truncation=True)
    return model_inputs

tokenized_dataset = dataset.map(preprocess_function, batched=True)

from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer,
                                       model='ai-forever/ruT5-base',
                                       pad_to_multiple_of=8)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/20.4k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.00M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

Map:   0%|          | 0/4411 [00:00<?, ? examples/s]

Map:   0%|          | 0/992 [00:00<?, ? examples/s]

Map:   0%|          | 0/111 [00:00<?, ? examples/s]

## Метрики

In [4]:
import evaluate
from rouge_metric import PyRouge

bleu = evaluate.load("sacrebleu")
rouge = PyRouge(rouge_n=(4), skip_gap=4)

import numpy as np

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]
    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = bleu.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    rouge_score = rouge.evaluate(decoded_preds, decoded_labels)
    result['rouge_l'] = rouge_score['rouge-l']['f']
    result['rouge_2'] = rouge_score['rouge-2']['f']

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

## Train

In [5]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
# model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

from peft import LoraConfig, get_peft_model
from peft.utils.peft_types import TaskType

config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM
)

model = get_peft_model(model, config)

In [6]:
new_model = 'my_rewrite_base_t5'

training_args = Seq2SeqTrainingArguments(
    output_dir=new_model,
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    learning_rate=2e-4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=20,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=False
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()



Epoch,Training Loss,Validation Loss,Bleu,Rouge L,Rouge 2,Gen Len
0,No log,4.542775,1.0521,0.0443,0.0066,19.0
1,No log,2.953781,6.5666,0.1437,0.0585,19.0
2,No log,2.220399,8.3725,0.2338,0.1404,10.499
4,No log,1.909067,23.6396,0.4408,0.3337,13.3659
5,No log,1.811206,29.0856,0.5118,0.3982,13.6633
6,No log,1.770657,31.4541,0.5369,0.4228,13.871
8,No log,1.705003,33.9099,0.5624,0.4472,14.374
9,No log,1.7061,34.0882,0.5669,0.4509,14.3619
10,No log,1.674719,34.3811,0.5691,0.4542,14.4052
12,No log,1.648776,34.8028,0.5731,0.4576,14.5202




KeyboardInterrupt: 

In [None]:
trainer.evaluate(tokenized_dataset["val"])

{'eval_loss': nan,
 'eval_bleu': 30.9459,
 'eval_gen_len': 19.0,
 'eval_runtime': 899.0242,
 'eval_samples_per_second': 0.944,
 'eval_steps_per_second': 0.944,
 'epoch': 1.9974470257850396}

In [7]:
import torch
from tqdm.notebook import tqdm

In [8]:
raw_test_results_new = []
model = model.to('cuda')

for i, encoding in tqdm(enumerate(tokenized_dataset['test']['input_ids'])):
    input_ids=torch.tensor([encoding]).to('cuda')
    out = model.generate(inputs=input_ids,
                         eos_token_id=tokenizer.eos_token_id,
                         max_length=150,
                         num_beams=2,
                         repetition_penalty=3.,
                         length_penalty=3.0,
                         early_stopping=True)
    out = tokenizer.decode(out[0][1:],
                           skip_special_tokens=True,
                           clean_up_tokenization_spaces=True)
    raw_test_results_new.append((tokenized_dataset['test'][i]['Utt_ID_hash'],
                             tokenized_dataset['test'][i]['Dia_ID_hash'],
                             out))

0it [00:00, ?it/s]

In [9]:
import pandas as pd

raw_test_results = pd.DataFrame(raw_test_results_new, columns=['Utt_ID_hash', 'Dia_ID_hash', 'model_out_raw'])
raw_test_results.head()

Unnamed: 0,Utt_ID_hash,Dia_ID_hash,model_out_raw
0,utt_1c9499ef,dia_8b1e5abb,То обязательно обращусь к вам.
1,utt_ed502a14,dia_3db27511,"Разные сериалы, главное, чтоб не нервные, а то..."
2,utt_f9015b3f,dia_1684762c,"Круто, я умею готовить! А что ты умеешь готовить?"
3,utt_87197d1a,dia_5dd50bca,Я бы побалуюсь на велосипеде - это более полез...
4,utt_7d293283,dia_043da1df,Секретарь. Хотя готовить всегда нравилось боль...


In [10]:
raw_test_results['Phrase'] = tokenized_dataset['test']['Phrase']
raw_test_results['Rewrite'] = tokenized_dataset['test']['Rewrite']

**Bleu**

In [12]:
results = bleu.compute(predictions=raw_test_results.model_out_raw.values.tolist(), references=raw_test_results.Rewrite.values.tolist())
print(results['score'])

34.198896532456935


In [13]:
results = bleu.compute(predictions=raw_test_results.model_out_raw.values.tolist(), references=raw_test_results.Phrase.values.tolist())
print(results['score'])

44.197992352763634


**Rouge**

In [14]:
scores = rouge.evaluate(raw_test_results.model_out_raw.values.tolist(), raw_test_results.Rewrite.apply(lambda x: [x]).values.tolist())
for k in scores:
    print(k, scores[k]['f'])

rouge-1 0.5366912662619522
rouge-2 0.3951366135806867
rouge-3 0.30228331149144216
rouge-4 0.22820169880779465
rouge-l 0.5267900789514197


In [15]:
scores = rouge.evaluate(raw_test_results.model_out_raw.values.tolist(), raw_test_results.Phrase.apply(lambda x: [x]).values.tolist())
for k in scores:
    print(k, scores[k]['f'])

rouge-1 0.5713822560579963
rouge-2 0.4670942993600828
rouge-3 0.3895367676252729
rouge-4 0.30671728100990236
rouge-l 0.5662813278217507


**Restoration F1**

In [16]:
import numpy as np

class RestorationFScore:

    def __init__(self, tokenizer, n_gram: int=2):
        self.n_gram = n_gram
        self.tokenizer = tokenizer

    def preprocess(self, sents):
        for sent in sents:
            sent_tokenize = self.tokenizer(sent)['input_ids']
            yield [tuple(sent_tokenize[i:i+self.n_gram]) for i, _ in enumerate(sent_tokenize)]

    def _itereval(self):
        for i, predictions in enumerate(self.predictions):
            restored_ngrams = set(predictions).difference(self.references[i])
            ngrams_in_ref = set(self.rewrites[i]).difference(self.references[i])
            interagree = ngrams_in_ref.intersection(restored_ngrams)
            if len(restored_ngrams):
                precision = len(interagree) / len(restored_ngrams)
            else:
                precision = 0.
            if len(ngrams_in_ref):
                recall = len(interagree) / len(ngrams_in_ref)
            else:
                recall = 0.
            if precision or recall:
                yield 2 * ((precision * recall) / (precision + recall))
            else:
                yield 0.

    def evaluate(self, predictions: list,
                 references: list, rewrites: list):
        self.predictions = [p for p in self.preprocess(predictions)]
        self.references = [p for p in self.preprocess(references)]
        self.rewrites = [p for p in self.preprocess(rewrites)]
        return np.mean(list(self._itereval()))

In [17]:
rf_score = RestorationFScore(tokenizer, 1)
rf_score.evaluate(predictions=raw_test_results.model_out_raw.values.tolist(),
                 references=raw_test_results.Phrase.values.tolist(),
                 rewrites=raw_test_results.Rewrite.values.tolist())

0.19888681840290004

In [18]:
rf_score = RestorationFScore(tokenizer, 2)
rf_score.evaluate(predictions=raw_test_results.model_out_raw.values.tolist(),
                 references=raw_test_results.Phrase.values.tolist(),
                 rewrites=raw_test_results.Rewrite.values.tolist())

0.13285988048990177

In [19]:
rf_score = RestorationFScore(tokenizer, 3)
rf_score.evaluate(predictions=raw_test_results.model_out_raw.values.tolist(),
                 references=raw_test_results.Phrase.values.tolist(),
                 rewrites=raw_test_results.Rewrite.values.tolist())

0.10622029450283109

In [20]:
rf_score = RestorationFScore(tokenizer, 4)
rf_score.evaluate(predictions=raw_test_results.model_out_raw.values.tolist(),
                 references=raw_test_results.Phrase.values.tolist(),
                 rewrites=raw_test_results.Rewrite.values.tolist())

0.093716832458188

## Сохранение модели

In [21]:
model.save_pretrained(new_model)
tokenizer.save_pretrained(new_model)



('my_rewrite_base_t5/tokenizer_config.json',
 'my_rewrite_base_t5/special_tokens_map.json',
 'my_rewrite_base_t5/spiece.model',
 'my_rewrite_base_t5/added_tokens.json')

In [23]:
!zip -r /content/my_rewrite_base_t5_new.zip /content/my_rewrite_base_t5

  adding: content/my_rewrite_base_t5/ (stored 0%)
  adding: content/my_rewrite_base_t5/adapter_model.safetensors (deflated 7%)
  adding: content/my_rewrite_base_t5/added_tokens.json (deflated 83%)
  adding: content/my_rewrite_base_t5/adapter_config.json (deflated 51%)
  adding: content/my_rewrite_base_t5/special_tokens_map.json (deflated 85%)
  adding: content/my_rewrite_base_t5/spiece.model (deflated 56%)
  adding: content/my_rewrite_base_t5/README.md (deflated 66%)
  adding: content/my_rewrite_base_t5/runs/ (stored 0%)
  adding: content/my_rewrite_base_t5/runs/Jun05_17-25-08_ab1df38cde96/ (stored 0%)
  adding: content/my_rewrite_base_t5/runs/Jun05_17-25-08_ab1df38cde96/events.out.tfevents.1717608309.ab1df38cde96.271.0 (deflated 65%)
  adding: content/my_rewrite_base_t5/tokenizer_config.json (deflated 94%)
