In [1]:
import pandas as pd
import numpy as np
import re
import torch
import json
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
import ast

In [2]:
import sys

sys.path.append('../utils')
from evaluator import Evaluator
from json_format import process_json, unprocess_json

In [5]:
# Load the three corpora; in every file the first CSV column becomes the index.
# The `json` column of the distilled training set is parsed with
# `ast.literal_eval`, the two evaluation sets with `json.loads`.
train = pd.read_csv(
    '../data/distill_100k.csv',
    index_col=0,
    converters={'json': ast.literal_eval},
)
val_set = pd.read_csv(
    '../data/val_set_300_sb_valid.csv',
    index_col=0,
    converters={'json': json.loads},
)
manual_test = pd.read_csv(
    '../data/manual_test_100.csv',
    index_col=0,
    converters={'json': json.loads},
)

In [7]:
# Row counts of (train, val_set, manual_test), displayed as a tuple.
tuple(len(frame) for frame in (train, val_set, manual_test))

(100000, 500, 100)

In [8]:
# Remove every training row whose text also occurs in the validation or the
# manual test set, so neither evaluation set leaks into training.
# The held-out texts are collected into a set once and matched with the
# vectorised `Series.isin`, instead of scanning both evaluation arrays again
# for each training row.
held_out_texts = set(val_set.Text) | set(manual_test.Text)
train = train[~train.Text.isin(held_out_texts)]
len(train)

99382

In [9]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Hub identifier of the pretrained checkpoint; tokenizer and model are loaded
# from the same id so their vocabularies match.
BASE_CHECKPOINT = "ai-forever/ruT5-base"

model = AutoModelForSeq2SeqLM.from_pretrained(BASE_CHECKPOINT)
tokenizer = AutoTokenizer.from_pretrained(BASE_CHECKPOINT)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [21]:
# Begin/end marker tokens registered as whole vocabulary entries; the
# embedding matrix is then resized to the enlarged vocabulary.
extra_tokens = [
    "<BOB>", "<EOB>",
    "<BOT>", "<EOT>",
    "<BOP>", "<EOP>",
    "<BOC1>", "<EOC1>",
    "<BOC2>", "<EOC2>",
]
tokenizer.add_tokens(extra_tokens)
model.resize_token_embeddings(len(tokenizer))

# Sanity check on the first training row: serialising with `process_json` and
# parsing back with `unprocess_json` must reproduce the original structure.
first_json = train.loc[train.index[0], 'json']
assert first_json == unprocess_json(process_json(first_json))

In [11]:
from datasets import Dataset

# Wrap the two relevant columns in a `datasets.Dataset`, hold out 0.5% of the
# rows with a fixed seed, and flatten the resulting DatasetDict.
ads_dataset = (
    Dataset.from_pandas(train[["Text", "json"]])
    .train_test_split(test_size=0.005, seed=42)
    .flatten()
)
ads_dataset

DatasetDict({
    train: Dataset({
        features: ['Text', 'json', '__index_level_0__'],
        num_rows: 98885
    })
    test: Dataset({
        features: ['Text', 'json', '__index_level_0__'],
        num_rows: 497
    })
})

In [12]:
def preprocess_function(examples):
    """Tokenize one batch of examples for seq2seq training.

    Args:
        examples: batch mapping with a "Text" column (model inputs) and a
            "json" column (structures that `process_json` turns into the
            target strings).

    Returns:
        The tokenizer output for the batch: inputs encoded from "Text" and
        targets passed through `text_target`, both truncated to 128 tokens.
    """
    target_strings = list(map(process_json, examples["json"]))
    source_texts = list(examples["Text"])
    return tokenizer(
        source_texts,
        text_target=target_strings,
        max_length=128,
        truncation=True,
    )

# Tokenize both splits in batches across 4 worker processes. The columns of
# the raw dataset are removed, so only the tokenizer outputs remain; the
# result is flattened afterwards.
ads = ads_dataset.map(
    preprocess_function,
    remove_columns=ads_dataset["train"].column_names,
    batched=True,
    num_proc=4,
).flatten()

# ads_test = ads_test_dataset.map(
#     preprocess_function,
#     batched=True,
#     num_proc=4,
#     remove_columns=ads_test_dataset.column_names
# )
# ads_test = ads_test.flatten()


Map (num_proc=4):   0%|          | 0/98885 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/497 [00:00<?, ? examples/s]

In [13]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

In [14]:
from transformers import DataCollatorForSeq2Seq

# NOTE(review): this replaces the tokenizer's padding token with its
# end-of-sequence token. T5 tokenizers normally ship a dedicated pad token,
# so confirm the override is actually needed (it is also persisted when the
# tokenizer is saved later in the notebook).
tokenizer.pad_token = tokenizer.eos_token
# Collator handed to the trainer as `data_collator`; built from the tokenizer
# only (no `model=` argument).
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)

In [15]:
class MetricComputer:
    """`compute_metrics` hook that scores the current model on `val_set`.

    The hook does not use the predictions the trainer passes in. On every
    call it builds a fresh `Evaluator` from the module-level `val_set`,
    `model` and `tokenizer` (looked up at call time), so it always measures
    whatever those names are bound to at that moment.

    Attributes:
        generations: one entry per call, holding the result of
            `Evaluator.generate_samples_batched(count=20)`.
        batch_size: batch size forwarded to the `Evaluator`.
    """

    def __init__(self, batch_size=8):
        """Start with an empty generation history and remember `batch_size`."""
        self.batch_size = batch_size
        self.generations = []

    def __call__(self, eval_preds):
        """Compute metrics and record sample generations.

        Args:
            eval_preds: ignored; accepted only to match the trainer's
                `compute_metrics` call signature.

        Returns:
            Whatever `Evaluator.calc_bleu_batched()` returns.
        """
        evaluator = Evaluator(
            val_set,
            model=model,
            tokenizer=tokenizer,
            batch_size=self.batch_size,
        )
        metrics = evaluator.calc_bleu_batched()
        samples = evaluator.generate_samples_batched(count=20)
        self.generations.append(samples)
        return metrics

In [23]:
# NOTE(review): the training log recorded in this notebook stops at epoch 4.0,
# so this value appears to have been edited after that run. Confirm which
# epoch count the saved checkpoint corresponds to.
n_epochs = 8

training_args = Seq2SeqTrainingArguments(
    # Checkpointing.
    # NOTE(review): the directory name says "large" although the checkpoint
    # loaded earlier is ruT5-base. Confirm the name is intentional.
    output_dir="ruT5-large",
    save_total_limit=3,
    # Optimisation schedule.
    num_train_epochs=n_epochs,
    learning_rate=5e-5,
    lr_scheduler_type="cosine",
    warmup_steps=3,
    weight_decay=0.01,
    fp16=True,
    # Batching.
    per_device_train_batch_size=64,
    per_device_eval_batch_size=16,
    group_by_length=False,
    # Evaluation once per epoch (step-based alternative tried: eval_steps=600).
    evaluation_strategy="epoch",
    # `predict_with_generate` is deliberately not set here.
    generation_max_length=128,
)

# Dataset with the raw column names but no rows; only referenced by the
# disabled `eval_dataset=empty_dataset` alternative.
empty_dataset = Dataset.from_dict({"Text": [], "json": []})

# Metric hook; it evaluates on `val_set` itself rather than on `eval_dataset`.
mc = MetricComputer(batch_size=32)

trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    data_collator=data_collator,
    train_dataset=ads["train"],
    eval_dataset=ads["test"],
    compute_metrics=mc,
)



In [17]:
# Launch fine-tuning with the trainer configured above; the returned
# TrainOutput is displayed as the cell result.
trainer.train()



Epoch,Training Loss,Validation Loss,Bep-sb,Bep-multi,Ta-bleu-sb,Ta-chrf-sb,Ta-chrf-multi,Eb-ind,Mb-ind,Bleu-classic,Chrf-classic,Chrf-classic-multi,Bleu Old,Failed Ratio
1,0.1224,0.131697,0.710076,0.675262,41.415907,79.261798,78.990408,0.038,0.052,52.241866,76.587065,77.05151,52.241866,0.0
2,0.1083,0.124929,0.715752,0.688492,41.204374,79.162463,79.197703,0.026,0.054,52.552299,76.752391,77.471091,52.552299,0.0
3,0.1122,0.116555,0.713258,0.685291,42.114947,79.779301,79.545293,0.034,0.052,55.149066,77.475075,78.159108,55.149066,0.0
4,0.1238,0.11466,0.713279,0.68632,42.143194,79.588736,79.611439,0.03,0.05,55.279685,77.232037,78.053142,55.279685,0.0


TrainOutput(global_step=6184, training_loss=0.11798346058054698, metrics={'train_runtime': 3391.1472, 'train_samples_per_second': 116.639, 'train_steps_per_second': 1.824, 'total_flos': 6.020849444179968e+16, 'train_loss': 0.11798346058054698, 'epoch': 4.0})

In [18]:
import gc
# Run a garbage-collection pass, then ask PyTorch to release its cached CUDA
# memory.
# NOTE(review): objects still referenced from the notebook namespace (for
# example `trainer` or `model`) are presumably not freed by this; verify that
# this cell has the intended effect.
gc.collect()
torch.cuda.empty_cache()

In [18]:
# Write the model and the tokenizer into the same directory so that both can
# be reloaded from it with `from_pretrained`. The cell displays the paths
# returned by the tokenizer save.
output_dir = "ruT5-base-trained-full-distill-data"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

('ruT5-base-trained-full-distill-data/tokenizer_config.json',
 'ruT5-base-trained-full-distill-data/special_tokens_map.json',
 'ruT5-base-trained-full-distill-data/spiece.model',
 'ruT5-base-trained-full-distill-data/added_tokens.json',
 'ruT5-base-trained-full-distill-data/tokenizer.json')

In [20]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Rebind the global `model` / `tokenizer` to copies loaded from the saved
# directory; later cells therefore evaluate the exported checkpoint.
finetuned_dir = "ruT5-base-trained-full-distill-data"

model = AutoModelForSeq2SeqLM.from_pretrained(finetuned_dir)
tokenizer = AutoTokenizer.from_pretrained(finetuned_dir)

In [24]:
# Re-run the validation metrics by hand. The argument is only a placeholder:
# MetricComputer.__call__ ignores `eval_preds` and evaluates the current
# module-level `model` / `tokenizer` on `val_set`.
mc(1)



{'BEP-sb': 0.7132786024307072,
 'BEP-multi': 0.6863201648665004,
 'TA-BLEU-sb': 42.143193793256295,
 'TA-CHRF-sb': 79.5887363178885,
 'TA-CHRF-multi': 79.61143880499039,
 'EB-ind': 0.03,
 'MB-ind': 0.05,
 'BLEU-classic': 55.27968482908748,
 'CHRF-classic': 77.23203743869887,
 'CHRF-classic-multi': 78.05314245419021,
 'bleu_old': 55.27968482908748,
 'failed_ratio': 0.0}

In [32]:
# Generate outputs for the manual test set and export them, one row per item
# of the returned mapping, with each value pretty-printed as JSON
# (non-ASCII characters are kept as-is).
ev = Evaluator(manual_test, model=model, tokenizer=tokenizer, batch_size=32)
output = ev.generate_samples_batched()
df = pd.DataFrame([
    {
        'id': sample_id,
        'json': json.dumps(generated, indent=4, ensure_ascii=False),
    }
    for sample_id, generated in output.items()
])
df.to_csv('manual_test_outputs.csv', index=False)



<BOT> прищепки<EOT><BOP> 54 шт<EOC1><BOC2> GEL<EOC2>
