In [1]:
import pandas as pd
import numpy as np
import re
import torch
import json
from tqdm.notebook import tqdm
# from torch.utils.data import Dataset, DataLoader
# from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW
from sklearn.model_selection import train_test_split

In [2]:
import sys

# Extend the import path so the two modules imported below can be resolved;
# presumably evaluator.py and json_format.py live in ../utils (relative to the
# notebook's working directory) — verify if the notebook is moved.
sys.path.append('../utils')
from evaluator import Evaluator
from json_format import TextJSONProcessor

# Shared processor instance: later cells read its `spec_tokens` attribute and call
# its `process_json` / `unprocess_json` methods.
json_proc = TextJSONProcessor()

In [3]:
def _load_shuffled(csv_path):
    """Read a CSV (first column as index), JSON-decode its 'json' column and shuffle the rows."""
    frame = pd.read_csv(csv_path, index_col=0, converters={'json': json.loads})
    # frac=1 keeps every row; the fixed random_state makes the permutation reproducible.
    return frame.sample(frac=1, random_state=42)


train = _load_shuffled('../data/train_9k_valid.csv')
val_set = _load_shuffled('../data/val_set_300_sb_valid.csv')
manual_test = _load_shuffled('../data/manual_test_100.csv')

In [4]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# The tokenizer and the seq2seq model are loaded from the same pretrained checkpoint.
checkpoint = "ai-forever/ruT5-large"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
# Add the processor's special tokens to the tokenizer, then resize the model's token
# embeddings to the new tokenizer length.
tokenizer.add_tokens(json_proc.spec_tokens)
model.resize_token_embeddings(len(tokenizer))

# Sanity check on the first (shuffled) training row: converting its JSON to text and
# back must reproduce the original object.
first_json = train.loc[train.index[0], 'json']
round_tripped = json_proc.unprocess_json(json_proc.process_json(first_json))
assert first_json == round_tripped

In [6]:
from datasets import Dataset

# Build a dataset from the "Text" and "json" columns, hold out 2% of the rows as a
# test split (fixed seed), and flatten the resulting DatasetDict.
ads_dataset = (
    Dataset.from_pandas(train[["Text", "json"]])
    .train_test_split(test_size=0.02, seed=42)
    .flatten()
)
ads_dataset

DatasetDict({
    train: Dataset({
        features: ['Text', 'json', '__index_level_0__'],
        num_rows: 8634
    })
    test: Dataset({
        features: ['Text', 'json', '__index_level_0__'],
        num_rows: 177
    })
})

In [7]:
def preprocess_function(examples):
    """Tokenize one batch of examples for seq2seq training.

    Args:
        examples: Batch mapping with a "Text" column (source strings) and a
            "json" column (objects passed to ``json_proc.process_json``).

    Returns:
        The tokenizer's output for the batch. The texts are the model inputs and
        the processed JSON strings are supplied as ``text_target``; the call uses
        ``max_length=128`` with truncation enabled.
    """
    source_texts = list(examples["Text"])
    target_texts = list(map(json_proc.process_json, examples["json"]))
    return tokenizer(source_texts, text_target=target_texts, max_length=128, truncation=True)

# Tokenize every split in batches using 4 worker processes. The original columns are
# removed so that only the tokenizer outputs remain, and the result is flattened.
raw_columns = ads_dataset["train"].column_names
ads = ads_dataset.map(
    preprocess_function,
    batched=True,
    num_proc=4,
    remove_columns=raw_columns,
).flatten()

# ads_test = ads_test_dataset.map(
#     preprocess_function,
#     batched=True,
#     num_proc=4,
#     remove_columns=ads_test_dataset.column_names
# )
# ads_test = ads_test.flatten()


Map (num_proc=4):   0%|          | 0/8634 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/177 [00:00<?, ? examples/s]

In [8]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

In [9]:
from transformers import DataCollatorForSeq2Seq

# The T5 tokenizer already ships a dedicated <pad> token (decoded generations in this
# notebook start with it, i.e. it doubles as the decoder start token). Overwriting it
# with EOS unconditionally makes padding indistinguishable from end-of-sequence for any
# code that masks on tokenizer.pad_token_id, and the override would be persisted when
# the tokenizer is saved. Only fall back to EOS if the tokenizer has no pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# Collator that pads each batch dynamically for seq2seq training.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)

In [10]:
from IPython.display import clear_output

class MetricComputer:
    """Callable used as the trainer's ``compute_metrics`` hook.

    Every call builds a fresh ``Evaluator`` over ``val_set`` using the notebook-level
    ``model``, ``tokenizer`` and ``json_proc``, returns its BLEU statistics and keeps
    a batch of sample generations for later inspection.
    """

    def __init__(self):
        # One entry per call, in call order.
        self.generations = []

    def __call__(self, eval_preds):
        """Return the evaluator's statistics; ``eval_preds`` itself is not used."""
        evaluator = Evaluator(
            val_set,
            model=model,
            tokenizer=tokenizer,
            json_processor=json_proc,
            batch_size=24,
        )
        bleu_stats = evaluator.calc_bleu_batched()
        samples = evaluator.generate_samples_batched(count=20)
        self.generations.append(samples)
        return bleu_stats

In [11]:
# Number of full passes over the training split.
n_epochs = 11

# Hyperparameters for the fine-tuning run. Evaluation is triggered every 750 steps
# (evaluation_strategy="steps" + eval_steps), which matches the step column of the
# metrics table logged by the training cell.
training_args = Seq2SeqTrainingArguments(
    output_dir="ruT5-large",
    # overwrite_output_dir=True,
    evaluation_strategy="steps",
    eval_steps=750,
    learning_rate=5e-5,
    per_device_train_batch_size=24,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=n_epochs,
    # NOTE(review): predict_with_generate is disabled and the metric callable ignores
    # its eval_preds argument, so generation_max_length presumably has no effect on
    # the reported metrics — confirm before relying on it.
    # predict_with_generate=True,
    generation_max_length=128,
    fp16=True,
    lr_scheduler_type="cosine",
    group_by_length=False,
    # NOTE(review): only 3 warmup steps — confirm this is intentional.
    warmup_steps=3,
)

# Metric callable handed to the trainer below; it also accumulates sample generations.
mc = MetricComputer()
# Only referenced by the commented-out eval_dataset alternative below; otherwise unused
# in the visible notebook.
empty_dataset = Dataset.from_dict({"Text": [], "json": []})
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=ads["train"],
    eval_dataset=ads["test"],
    # eval_dataset=empty_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=mc,
)



In [12]:
# Start fine-tuning; the value returned by train() is displayed as the cell output.
trainer.train()

Step,Training Loss,Validation Loss,Bep-sb,Bep-multi,Ta-bleu-sb,Ta-chrf-sb,Ta-chrf-multi,Eb-ind,Mb-ind,Bleu-classic,Chrf-classic,Chrf-classic-multi,Bleu Old,Failed Ratio
750,0.7257,0.25767,0.696885,0.633601,39.173793,77.41499,74.208251,0.048,0.074,52.301046,74.323402,74.128226,52.301046,0.036
1500,0.3122,0.246288,0.691719,0.651598,38.840839,77.280413,75.830699,0.036,0.06,50.040077,74.347446,74.792677,50.040077,0.018
2250,0.2481,0.245166,0.705087,0.659222,39.37882,77.721149,75.396857,0.046,0.056,50.586034,74.443197,74.527288,50.586034,0.026
3000,0.1749,0.245014,0.710756,0.670902,39.51217,78.366216,76.36689,0.03,0.064,52.211191,75.18228,74.75903,52.211191,0.018
3750,0.1604,0.246883,0.710776,0.673747,39.896283,78.447571,76.696014,0.03,0.062,51.972877,75.494117,75.293795,51.972877,0.016




Простыня на резинке
Стёганая сумка Reserved размер 33
зеленый свитер
Часы Luxury, новые в количестве 1, цена 800
Сапоги от 40-45


Boomerang US - Eng / Fr в количестве 1, цена
сапоги 39р (25см)
Фигурки агентов для игры Lodrs of Waterdeep в количестве 36, цена
Школьная рубашка размер 140 белая и голубая в количестве 1

Сапоги резиновые, размер 22, новые в количестве 19 штук
Платье (Турция)
Ticket to ride: The Heart of Africa
сериал «с
гель-лаки в количестве 12 штук, база и топ global fashion fashion fashion fashion fashion fashion fashion fashion fashion fashion fashion fashion fashion fashion fashion fashion fashion fashion fashion fashion fashion fashion fashion fashion fashion
Сапоги Демар, размер 26/27 в количестве 1,




Простыня на резинке, Трикотаж 100% хлоп
Стёганая сумка Reserved
зеленая юбка incity в количестве 1,



Машинки деревянные, щенячий патруль






Простыня на резинке 90Х200 см в количестве 1, цена 320 (

зеленая юбка incity в количестве 1, цена 25 (лари
Часы Luxury(новые), 800 (RUB)
Сапоги размер 40-45
ремешки apple watch 38/41 mm, uag
Scrabble дорожный в количестве 1, цена 700 (
футболка Dolce Gabbana размер М в количестве 1, цена 500 (RUB
Подсвечники в количестве 19 штук
Костюм велюровый в количестве 1, цена
Ticket to ride:
белльвиль ван стоун «скетчи» в количестве 1, цена
Сапоги Демар, размер 26/27 в количестве 1, цена 500




Простыня на резинке Трикотаж 100% хлоп


Камера Nikon 3200, 24MP CMOS, 24MP CMOS, 24MP CMOS, 24MP CMOS, 24MP CMOS, 24MP CMOS, 24MP CMOS, 24MP CMOS, 24MP CMOS, 24MP CMOS, 24MP CMOS, 24MP CMOS, 24MP CMOS, 24MP CMOS, 24MP CMOS, 24MP CMOS, 24MP CMOS, 24MP CMOS, 24MP CMOS, 24
футболка Dolce Gabbana размер М в количестве 1, цена 500 (RUB
Конструктор sluban M38-B0373 в
Платье (Турция) в количестве 1, цена 400 (
Ticket to ride: The Heart of Africa (настольная игра, настольная игра,
braddon «the lawyer




Простыня на резинке Трикотаж 100% хлоп
лодочки calipso в количестве 1,


Камера Nikon 3200, 24MP CMOS, 24MP CMOS, 24MP CMOS, 24MP CMOS, 24MP CMOS, 24MP CMOS, 24MP CMOS, 24MP CMOS, 24MP CMOS, 24MP CMOS, 24MP CMOS, 24MP CMOS, 24MP CMOS, 24MP CMOS, 24MP CMOS, 24MP CMOS, 24MP CMOS, 24MP CMOS, 24MP CMOS, 24
Машинки деревянные, щенячий патруль
Костюм
braddon «the lawyer


TrainOutput(global_step=3960, training_loss=0.3004930245755899, metrics={'train_runtime': 3746.9228, 'train_samples_per_second': 25.347, 'train_steps_per_second': 1.057, 'total_flos': 5.0743397486592e+16, 'train_loss': 0.3004930245755899, 'epoch': 11.0})

In [13]:
# Smoke test: generate the structured description for a single hand-written ad.
s = 'продам зонт за 300 рублей'
# After trainer.train() the model is still in training mode; switch to eval mode so
# dropout is disabled and generation is deterministic.
model.eval()
# Put the inputs on whatever device the model lives on instead of hard-coding 'cuda'.
inputs = tokenizer(s, return_tensors='pt').to(model.device)
# Without an explicit limit generate() uses the generation config's default length
# (20 tokens unless configured otherwise), which can cut longer outputs short;
# 128 matches the max_length used when tokenizing the training targets.
out = model.generate(**inputs, max_length=128)
# Special tokens are kept in the decoded string so <pad>/</s> remain visible.
tokenizer.decode(out[0])



'<pad> Продается: зонт в количестве 1, цена 300 (RUB)</s>'

In [14]:
# Decode the input ids of the first tokenized training example back to text for inspection.
tokenizer.decode(ads['train'][0]['input_ids'])

'стол офисный не разбирается с тумбой и стулом 100 пафос</s>'

In [15]:
# Decode the label ids of the same example to inspect the target text the model is trained on.
tokenizer.decode(ads['train'][0]['labels'])

'Продается: стол офисный с тумбой и стулом в количестве 1, цена 100 (RUB)</s>'

In [16]:
# Encode and immediately decode the processed JSON of the first row of `train` to see
# how the target text looks after a tokenizer round trip.
tokenizer.decode(tokenizer.encode(json_proc.process_json(train.loc[train.index[0], 'json'])))

'Продается: Свёкла кормовая в количестве 1 мешок, цена 250 (RUB)</s>'

In [17]:
import gc
# Run a garbage-collection pass, then ask PyTorch to release its cached CUDA memory.
gc.collect()
torch.cuda.empty_cache()

In [18]:
# Directory that receives the fine-tuned weights and the tokenizer files
# (the tokenizer had the extra special tokens added earlier in the notebook).
output_dir = "ruT5-large-trained-text-json"
model.save_pretrained(output_dir)
# Last expression of the cell: the tuple of written tokenizer file paths is displayed.
tokenizer.save_pretrained(output_dir)

('ruT5-large-trained-text-json/tokenizer_config.json',
 'ruT5-large-trained-text-json/special_tokens_map.json',
 'ruT5-large-trained-text-json/spiece.model',
 'ruT5-large-trained-text-json/added_tokens.json',
 'ruT5-large-trained-text-json/tokenizer.json')

In [19]:
# from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# tokenizer = AutoTokenizer.from_pretrained(output_dir)
# model = AutoModelForSeq2SeqLM.from_pretrained(output_dir).to('cuda')

In [20]:
# distill_data = pd.read_csv('../data/distill_data.csv', index_col=0)
# distill_data.head()

In [21]:
# ev = Evaluator(distill_data, model, tokenizer)
# output = ev.generate_samples_batched(batch_size=256)

In [22]:
# distill_data['json'] = pd.Series(output)

In [23]:
# distill_data.to_csv('../data/distill_100k.csv')