In [1]:
import pandas as pd
import numpy as np
import re
import torch
import json
from tqdm.notebook import tqdm
# from torch.utils.data import Dataset, DataLoader
# from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW
from sklearn.model_selection import train_test_split

In [2]:
import sys

# Extend the module search path with ../utils so that the two project-local
# imports below (evaluator, json_format) can be resolved — presumably both
# modules live in that directory; confirm if the layout changes.
sys.path.append('../utils')
from evaluator import Evaluator
from json_format import process_json, unprocess_json

In [3]:
def _read_annotated_csv(csv_path):
    """Read a CSV using its first column as the index and parsing the 'json' column with json.loads."""
    return pd.read_csv(csv_path, index_col=0, converters={'json': json.loads})


# All three splits share the same on-disk layout, so they go through one reader.
train = _read_annotated_csv('../data/train_9k_valid.csv')
val_set = _read_annotated_csv('../data/val_set_300_sb_valid.csv')
manual_test = _read_annotated_csv('../data/manual_test_100.csv')

In [4]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# One constant for the checkpoint name so the tokenizer and the model are
# always loaded from the same pretrained source.
PRETRAINED_CHECKPOINT = "ai-forever/ruT5-base"

tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(PRETRAINED_CHECKPOINT)

In [5]:
# Begin/end markup tags registered as extra vocabulary entries — presumably the
# tags produced by process_json for the serialized targets (TODO confirm).
markup_tokens = [
    "<BOB>", "<EOB>",
    "<BOT>", "<EOT>",
    "<BOP>", "<EOP>",
    "<BOC1>", "<EOC1>",
    "<BOC2>", "<EOC2>",
]
tokenizer.add_tokens(markup_tokens)
# The embedding matrix is resized to the new tokenizer length so the added ids have rows.
model.resize_token_embeddings(len(tokenizer))

# Sanity check on the first training record: process_json followed by
# unprocess_json must give back the original value.
first_record = train.loc[train.index[0], 'json']
assert first_record == unprocess_json(process_json(first_record))

In [6]:
from datasets import Dataset

# Build the dataset from the two columns used for training, hold out 5% with a
# fixed seed, and flatten the result — expressed as a single chain.
ads_dataset = (
    Dataset.from_pandas(train[["Text", "json"]])
    .train_test_split(test_size=0.05, seed=42)
    .flatten()
)
ads_dataset

DatasetDict({
    train: Dataset({
        features: ['Text', 'json', '__index_level_0__'],
        num_rows: 8370
    })
    test: Dataset({
        features: ['Text', 'json', '__index_level_0__'],
        num_rows: 441
    })
})

In [7]:
def preprocess_function(examples):
    """Tokenize one batch for seq2seq training.

    The "Text" entries are used as model inputs and every "json" entry is
    passed through process_json to obtain the target string. The tokenizer is
    called with max_length=128 and truncation enabled, and its result is
    returned as-is.
    """
    source_texts = list(examples["Text"])
    target_texts = list(map(process_json, examples["json"]))
    return tokenizer(
        source_texts,
        text_target=target_texts,
        max_length=128,
        truncation=True,
    )

# Every original column is dropped after mapping so that only the fields
# returned by preprocess_function remain in the tokenized splits.
raw_columns = ads_dataset["train"].column_names
ads = ads_dataset.map(
    preprocess_function,
    batched=True,
    num_proc=4,
    remove_columns=raw_columns,
).flatten()

# ads_test = ads_test_dataset.map(
#     preprocess_function,
#     batched=True,
#     num_proc=4,
#     remove_columns=ads_test_dataset.column_names
# )
# ads_test = ads_test.flatten()


Map (num_proc=4):   0%|          | 0/8370 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/441 [00:00<?, ? examples/s]

In [8]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

In [9]:
from transformers import DataCollatorForSeq2Seq

# T5-family tokenizers normally ship their own <pad> token, and the T5 model
# config uses that pad id as the decoder start token. Unconditionally aliasing
# pad to EOS makes tokenizer.pad_token_id disagree with the model config, and
# the alias is persisted when the tokenizer is saved later — any code that
# masks labels via tokenizer.pad_token_id would then also mask the EOS token.
# Fall back to EOS only when the tokenizer really has no pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Dynamic per-batch padding of inputs and labels for seq2seq training.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)

In [13]:
class MetricComputer:
    """Callable intended for the Trainer's ``compute_metrics`` hook.

    The predictions handed over by the Trainer are not used. Each call builds
    an Evaluator over the notebook-level ``val_set``, ``model`` and
    ``tokenizer``, returns whatever ``calc_bleu_batched`` reports, and stores
    the result of ``generate_samples_batched(count=20)`` in ``generations``.
    """

    def __init__(self, batch_size=8):
        # Batch size forwarded to the Evaluator on every call.
        self.batch_size = batch_size
        # One entry of generated samples is appended per evaluation call.
        self.generations = []

    def __call__(self, eval_preds):
        evaluator = Evaluator(
            val_set,
            model=model,
            tokenizer=tokenizer,
            batch_size=self.batch_size,
        )
        metrics = evaluator.calc_bleu_batched()
        samples = evaluator.generate_samples_batched(count=20)
        self.generations.append(samples)
        return metrics

In [16]:
n_epochs = 15

training_args = Seq2SeqTrainingArguments(
    # Checkpoint directory named after the model that is actually fine-tuned
    # here (ruT5-base). The previous "ruT5-large" label was misleading, and
    # because save_total_limit prunes old checkpoints inside output_dir, sharing
    # a directory with another model's run risks mixing or losing checkpoints.
    output_dir="ruT5-base-checkpoints",
    # Evaluate once per epoch.
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=n_epochs,
    # predict_with_generate is left off: MetricComputer ignores the Trainer's
    # predictions and runs its own Evaluator over val_set.
    generation_max_length=128,
    fp16=True,
    lr_scheduler_type="cosine",
    group_by_length=False,
    warmup_steps=3,
)

# Metrics callback; its batch size is the one passed to the Evaluator and is
# independent of per_device_eval_batch_size.
mc = MetricComputer(batch_size=32)
# Placeholder dataset kept for experiments that skip the held-out split
# (swap it in as eval_dataset below).
empty_dataset = Dataset.from_dict({"Text": [], "json": []})
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=ads["train"],
    eval_dataset=ads["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=mc,
)

In [17]:
# Run fine-tuning with the trainer configured above; evaluation (and thus the
# compute_metrics callback) is scheduled once per epoch via evaluation_strategy.
trainer.train()

Epoch,Training Loss,Validation Loss,Bep-sb,Bep-multi,Ta-bleu-sb,Ta-chrf-sb,Ta-chrf-multi,Eb-ind,Mb-ind,Bleu-classic,Chrf-classic,Chrf-classic-multi,Bleu Old,Failed Ratio
1,No log,0.567342,0.614478,0.520004,33.750218,72.877461,69.04157,0.104,0.086,45.577007,69.123407,62.757394,45.577007,0.004
2,No log,0.491572,0.645237,0.552914,33.794737,74.117045,71.099846,0.128,0.07,46.490574,69.924836,65.819957,46.490574,0.0
3,No log,0.485147,0.646975,0.575769,36.402834,75.161443,73.639301,0.08,0.09,48.131789,71.327573,69.255397,48.131789,0.0
4,0.685100,0.446876,0.677011,0.599624,38.519991,76.875164,74.599695,0.11,0.066,49.073103,73.751441,70.424292,49.073103,0.0
5,0.685100,0.439375,0.681828,0.61409,39.716093,77.177293,74.913505,0.084,0.062,51.707655,73.543853,70.20909,51.707655,0.0
6,0.685100,0.427693,0.684322,0.621358,38.340069,76.885615,75.147277,0.09,0.054,50.39135,73.32468,71.13753,50.39135,0.002
7,0.685100,0.42337,0.697549,0.622101,39.455136,77.781878,75.640969,0.116,0.054,51.696548,74.731378,72.107017,51.696548,0.004
8,0.440100,0.41483,0.686838,0.623559,38.8457,77.396554,75.701121,0.088,0.056,50.788194,74.119522,72.729457,50.788194,0.004
9,0.440100,0.411882,0.687769,0.638576,39.172384,77.484452,76.036414,0.042,0.062,51.242754,74.170801,72.817491,51.242754,0.002
10,0.440100,0.411628,0.697065,0.642427,39.651552,77.931728,76.726469,0.072,0.052,52.788225,75.300362,74.041879,52.788225,0.0


<BOT> Мешок щепа 12 литров<EOC1><BOC2> RUB<EOC2>
<BOT> Наволочки по 2 шт<EOC1><BOC2> RUB<EOC2>
<BOT> Sigma 30 1.4 +600<EOP><BOC1> 1<EOC1><BOC2> GEL<EOC2>
<BOT> Sigma 30 1.4 + 600 GEL<EOC2>
<BOT> платье в горошек для беременных m/l<EOT><BOP> m/l<EOC1><BOC2> RUB<EOC2>
<BOT> Sigma 30 1.4 +600 GEL<EOC2>
<BOT> платье в горошек для беременных m/l<EOT><BOP> m/l<EOC1><BOC2> RUB<EOC2>
<BOT> Sigma 30 1.4 +600<EOP><BOC1> 1<EOC1><BOC2> GEL<EOC2>
<BOT> Sigma 30 1.4 +600<EOP><BOC1> 1<EOC1><BOC2> GEL<EOC2>
<BOT> Sigma 30 1.4 +600 GEL<EOC2>
<BOT> Sigma 30 1.4 +600 GEL<EOC2>
<BOT> Sigma 30 1.4 +600 GEL<EOC2>
<BOT> Sigma 30 1.4 +600 GEL<EOC2>


TrainOutput(global_step=1965, training_loss=0.45945893498777435, metrics={'train_runtime': 2773.8374, 'train_samples_per_second': 45.262, 'train_steps_per_second': 0.708, 'total_flos': 1.911168667680768e+16, 'train_loss': 0.45945893498777435, 'epoch': 15.0})

In [18]:
import gc
# Run a garbage-collection pass first, then ask PyTorch to release the CUDA
# memory it is holding in its cache.
gc.collect()
torch.cuda.empty_cache()

In [19]:
# Save the model and the tokenizer side by side in one directory. The tokenizer
# is stored too because tokens were added to it earlier in the notebook.
output_dir = "ruT5-base-trained-gpt-data"
model.save_pretrained(output_dir)
# Last expression of the cell: the list of written tokenizer files is displayed.
tokenizer.save_pretrained(output_dir)

('ruT5-base-trained-gpt-data/tokenizer_config.json',
 'ruT5-base-trained-gpt-data/special_tokens_map.json',
 'ruT5-base-trained-gpt-data/spiece.model',
 'ruT5-base-trained-gpt-data/added_tokens.json',
 'ruT5-base-trained-gpt-data/tokenizer.json')