In [1]:
import pandas as pd
import numpy as np
import re
import torch
import json
from tqdm.notebook import tqdm
# from torch.utils.data import Dataset, DataLoader
# from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW
from sklearn.model_selection import train_test_split

In [3]:
import sys

# Extend the module search path with the sibling ``../utils`` directory. This
# has to run before the two imports below (presumably both modules live there).
sys.path.append('../utils')
from evaluator import Evaluator
from json_format import TextJSONProcessor

# Single shared processor; later cells read its `spec_tokens` and call
# `process_json` / `unprocess_json` on it.
json_proc = TextJSONProcessor()

In [4]:
def _load_split(csv_path):
    """Read one CSV split: first column is the index, 'json' is parsed with json.loads."""
    return pd.read_csv(csv_path, index_col=0, converters={'json': json.loads})


# The three splits are read with identical options.
train = _load_split('../data/train_9k_valid.csv')
val_set = _load_split('../data/val_set_300_sb_valid.csv')
manual_test = _load_split('../data/manual_test_100.csv')

In [5]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Tokenizer and seq2seq weights are pulled from the same hub checkpoint.
base_checkpoint = "ai-forever/ruT5-base"
tokenizer = AutoTokenizer.from_pretrained(base_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(base_checkpoint)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [6]:
# Register the processor's special tokens and grow the embedding matrix so the
# model has a row for every token id the tokenizer can now produce.
tokenizer.add_tokens(json_proc.spec_tokens)
model.resize_token_embeddings(len(tokenizer))

# Sanity check on the first training row: process_json followed by
# unprocess_json must give back the original structure.
first_json = train.loc[train.index[0], 'json']
round_tripped = json_proc.unprocess_json(json_proc.process_json(first_json))
assert first_json == round_tripped

In [7]:
from datasets import Dataset

# Wrap the two relevant columns in a HF dataset, hold out 5% with a fixed seed,
# then flatten the resulting splits.
ads_dataset = (
    Dataset.from_pandas(train[["Text", "json"]])
    .train_test_split(test_size=0.05, seed=42)
    .flatten()
)
ads_dataset

DatasetDict({
    train: Dataset({
        features: ['Text', 'json', '__index_level_0__'],
        num_rows: 8370
    })
    test: Dataset({
        features: ['Text', 'json', '__index_level_0__'],
        num_rows: 441
    })
})

In [8]:
def preprocess_function(examples):
    """Tokenize one batch of examples for seq2seq training.

    Args:
        examples: Batch mapping with a "Text" column (used as model input) and
            a "json" column (each entry is passed through
            ``json_proc.process_json`` to obtain the target).

    Returns:
        The tokenizer's output for the batch, produced with ``max_length=128``
        and truncation enabled.
    """
    source_texts = list(examples["Text"])
    target_texts = list(map(json_proc.process_json, examples["json"]))
    return tokenizer(source_texts, text_target=target_texts, max_length=128, truncation=True)

# Tokenize every split with 4 worker processes; the raw columns are dropped so
# only the tokenizer's output remains, and the result is flattened.
raw_columns = ads_dataset["train"].column_names
ads = ads_dataset.map(
    preprocess_function,
    batched=True,
    num_proc=4,
    remove_columns=raw_columns,
).flatten()

# ads_test = ads_test_dataset.map(
#     preprocess_function,
#     batched=True,
#     num_proc=4,
#     remove_columns=ads_test_dataset.column_names
# )
# ads_test = ads_test.flatten()


Map (num_proc=4):   0%|          | 0/8370 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/441 [00:00<?, ? examples/s]

In [9]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

In [10]:
from transformers import DataCollatorForSeq2Seq

# T5-style tokenizers define their own <pad> token, so it must not be aliased
# to EOS unconditionally. Doing so pads inputs with the end-of-sequence id and
# the alias is written out by `tokenizer.save_pretrained`, leaving the saved
# tokenizer's pad id out of sync with the model config. Fall back to EOS only
# when the tokenizer really has no pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Collates tokenized examples into padded batches for the trainer.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)

In [11]:
class MetricComputer:
    """Callable handed to the trainer as ``compute_metrics``.

    Each call ignores the predictions it receives and instead runs an
    ``Evaluator`` over the notebook-level ``val_set`` using the notebook-level
    ``model``, ``tokenizer`` and ``json_proc``. A batch of generated samples is
    stored on every call so they can be inspected afterwards.
    """

    def __init__(self, batch_size=8):
        """Create a computer with an empty generation history.

        Args:
            batch_size: Batch size forwarded to the ``Evaluator``.
        """
        self.batch_size = batch_size
        self.generations = []  # one entry is appended per evaluation call

    def __call__(self, eval_preds):
        """Evaluate on ``val_set`` and return the resulting metrics.

        Args:
            eval_preds: Unused; accepted so the object can be called the way
                the trainer calls ``compute_metrics``.

        Returns:
            The value returned by ``Evaluator.calc_bleu_batched()``.
        """
        evaluator = Evaluator(
            val_set,
            model=model,
            tokenizer=tokenizer,
            batch_size=self.batch_size,
            json_processor=json_proc,
        )
        metrics = evaluator.calc_bleu_batched()
        samples = evaluator.generate_samples_batched(count=20)
        self.generations.append(samples)
        return metrics

In [12]:
n_epochs = 15

training_args = Seq2SeqTrainingArguments(
    # --- checkpoints ---
    # NOTE(review): the directory says "large" although the checkpoint loaded
    # earlier is ruT5-base -- confirm the name is intentional.
    output_dir="ruT5-large",
    save_total_limit=3,
    # --- evaluation cadence ---
    evaluation_strategy="steps",
    eval_steps=350,
    per_device_eval_batch_size=16,
    # predict_with_generate stays disabled; presumably generation_max_length
    # has no effect without it -- TODO confirm.
    generation_max_length=128,
    # --- optimisation ---
    num_train_epochs=n_epochs,
    per_device_train_batch_size=64,
    learning_rate=5e-5,
    weight_decay=0.01,
    lr_scheduler_type="cosine",
    warmup_steps=3,
    group_by_length=False,
    # --- precision ---
    fp16=True,
)

# Placeholder with the raw column layout; only referenced by the disabled
# `eval_dataset=empty_dataset` alternative.
empty_dataset = Dataset.from_dict({"Text": [], "json": []})

# Metrics come from MetricComputer, which evaluates on `val_set` itself.
mc = MetricComputer(batch_size=32)

trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    data_collator=data_collator,
    train_dataset=ads["train"],
    eval_dataset=ads["test"],
    compute_metrics=mc,
)



In [13]:
# Start fine-tuning. The training arguments request evaluation every 350 steps,
# which is when the metric callable runs and appends to `mc.generations`.
trainer.train()

Step,Training Loss,Validation Loss,Bep-sb,Bep-multi,Ta-bleu-sb,Ta-chrf-sb,Ta-chrf-multi,Eb-ind,Mb-ind,Bleu-classic,Chrf-classic,Chrf-classic-multi,Bleu Old,Failed Ratio
350,No log,0.51762,0.640803,0.549075,35.87083,74.111133,69.130526,0.088,0.092,46.824849,69.697365,65.484622,46.824849,0.04
700,1.044800,0.47905,0.680357,0.609939,39.765578,77.201041,74.779221,0.092,0.068,48.311692,73.203884,71.970504,48.311692,0.02
1050,0.496100,0.456884,0.690033,0.635098,40.201262,77.263306,75.564854,0.074,0.054,51.613084,74.150365,72.951086,51.613084,0.012
1400,0.496100,0.448651,0.703422,0.647954,40.50068,78.26184,76.860553,0.078,0.044,52.152377,75.036528,73.795777,52.152377,0.008
1750,0.414100,0.442184,0.693576,0.641601,39.080469,77.589081,76.026903,0.07,0.048,50.503031,74.441197,73.389587,50.503031,0.01




Scrabble дорожный в количестве 1, цена 700 (RUB
умберто эко «растительная память» в количестве 1, цена 25 (
Пла
Шорт
чехлы для iphone 7. 3 силиконовых, один пластиковый, один пластиковый
sigma 30 1.4 +600 gel с Sigma 30 1.4 +600 gel с Sigma 30 1.4 +600 gel с Sigma 30 1.4 +600 gel с Sigma 30 1.4 +600 gel с Sigma 30 1.4 +600 gel с Sigma 30 1.4 + 600 gel с Sigma 30 1.4 + 600 gel с Sigma 30 1.4 + 600 gel с Sigma 30 1.4 + 600 gel с Sigma 30 1.4 + 150 gel с Sigma 30 1.4 + 150
худи с лампасами размер L в количестве 1
Одеяло верблюжья шерсть LUC в количестве
платье с коротким рукавом в
Шапка-шлем, 54 размер в количестве 1, цена
Джин
Пеги на велик b
монитор onn 24" model: onn 24" model: onn 24" model: onn 24" model: onn 24" model: onn 24" model: onn
штаны на клепках ellesse 5/5 s-m в количестве 1, цена 65 (GEL
игрушка ручной работы пикачу, рост до головы 22 см, рост до головы 22 см, рост до головы 22 см, до ушей 27 см, до ушей 27 см, одежда съёмная в количестве 1, цена 35 (ла
майка karl kani ра



морд
настольная игра клуэдо/cluedo harry po
Sigma 30 1.4 + 600 (GEL)
худи с лампас
Подсвечники, 19 штук,
Чемодан на колесах, ручная кладь, 55х35х20, б/у, б/у, б/у, б/у, б/у, б/у, б/у, б/у, б/у, б/у, б/у, б/у, б/у, б/у, б/у, б/у, б/у, б/у, б/у, б/у, б/у, б/у, б/у, б/у, б/у, б/у, б/у, б/у, б/у, б/у, б/у, б/у,
Audi A4 2006 года 2.0l автомат, 2.0l автомат, 2.0l автомат, 2.0l автомат, 2.0l автомат, 2.0l автомат, 2.0l автомат, 2.0l автомат, 2.0l автомат, 2.0l автомат, 2.0l автомат, 2.0l автомат, 2.0l автомат, 2.0l автомат, 2.0l автомат, 2.0l автомат, 2.0l автомат, 2.0l автомат, 2.0l автомат, 2.0l автомат, 2.0l автомат, япония, импортная, комплектация, ксенон, електро сиденья, с регулировками, с регулировками, 7 динамиков, сабвуфер
майка karl kani в количестве 1, цена:30 (GEL)
зеленые классические брюки в количестве 1,
Простыня на резинке, ткань Трикотаж, ткань Трикот
морд
настольная игра клуэдо/cluedo harry po




braddon «the law
Платье (Турция) в количестве 1, цена 400 (RUB
расческа
худи с лампас
майка karl kani в количестве 1, цена:30 (GEL)
Простыня на резинке, 100% хлопок в количестве 1, цена 300 (
braddon «the law
Платье (Турция) в количестве 1, цена 400 (RUB
расческа
braddon «the lawyers secret
расчес
худи с лампасами Off-W
Просты
braddon «the lawyers secret
расчес





Джинсы в количестве 1, цена 400
расчес
Sigma 30 1.4 +600 (GEL)
Простыня на резинке 140х200см в количестве 1,

Костюм велюровый в
расчес


TrainOutput(global_step=1965, training_loss=0.5892136280166587, metrics={'train_runtime': 1876.6636, 'train_samples_per_second': 66.901, 'train_steps_per_second': 1.047, 'total_flos': 1.911039264129024e+16, 'train_loss': 0.5892136280166587, 'epoch': 15.0})

In [13]:
import gc
# Run Python's garbage collector first, then ask PyTorch to release the CUDA
# memory it is holding in its cache.
gc.collect()
torch.cuda.empty_cache()

In [14]:
# Write the model and the tokenizer into the same directory so they can be
# reloaded together.
output_dir = "ruT5-base-trained-gpt-data"
model.save_pretrained(output_dir)
# Last expression of the cell, so the returned tuple of written file paths is
# what the cell displays.
tokenizer.save_pretrained(output_dir)

('ruT5-base-trained-gpt-data/tokenizer_config.json',
 'ruT5-base-trained-gpt-data/special_tokens_map.json',
 'ruT5-base-trained-gpt-data/spiece.model',
 'ruT5-base-trained-gpt-data/added_tokens.json',
 'ruT5-base-trained-gpt-data/tokenizer.json')