In [1]:
import pandas as pd
import numpy as np
import re
import torch
import json
from tqdm.notebook import tqdm
# from torch.utils.data import Dataset, DataLoader
# from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW
from sklearn.model_selection import train_test_split

In [2]:
import sys

# Make the project-local helper modules in ../utils importable from this notebook.
sys.path.append('../utils')
from evaluator import Evaluator
from json_format import TextJSONProcessor

# Shared processor instance: later cells read its special tokens and use it to
# convert the `json` annotations to and from the form passed to the tokenizer.
json_proc = TextJSONProcessor()

In [3]:
def _read_shuffled(csv_path):
    """Load one annotated CSV and return its rows in a fixed shuffled order.

    The first CSV column is used as the index, the `json` column is parsed
    with ``json.loads``, and all rows are shuffled with ``random_state=42``.
    """
    frame = pd.read_csv(csv_path, index_col=0, converters={'json': json.loads})
    return frame.sample(frac=1, random_state=42)


train = _read_shuffled('../data/train_9k_valid.csv')
val_set = _read_shuffled('../data/val_set_300_sb_valid.csv')
manual_test = _read_shuffled('../data/manual_test_100.csv')

In [4]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load the tokenizer and the seq2seq model from the same pretrained checkpoint id.
checkpoint_name = "ai-forever/ruT5-large"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_name)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
# Register the processor's special tokens with the tokenizer, then resize the
# model's token embeddings to the new vocabulary size.
tokenizer.add_tokens(json_proc.spec_tokens)
model.resize_token_embeddings(len(tokenizer))

# Round-trip check on the first training row: processing the annotation and
# un-processing the result must give back the original annotation.
first_annotation = train.loc[train.index[0], 'json']
round_tripped = json_proc.unprocess_json(
    json_proc.process_json(train.loc[train.index[0], 'json'])
)
assert first_annotation == round_tripped

In [6]:
from datasets import Dataset

# Wrap the text/annotation columns in a Dataset, hold out 2% of the rows as a
# seeded test split, and flatten the resulting splits.
ads_dataset = (
    Dataset.from_pandas(train[["Text", "json"]])
    .train_test_split(test_size=0.02, seed=42)
    .flatten()
)
ads_dataset

DatasetDict({
    train: Dataset({
        features: ['Text', 'json', '__index_level_0__'],
        num_rows: 8634
    })
    test: Dataset({
        features: ['Text', 'json', '__index_level_0__'],
        num_rows: 177
    })
})

In [7]:
def preprocess_function(examples):
    """Tokenize a batch of texts together with their processed JSON targets.

    ``examples`` is a batch mapping with a ``"Text"`` column and a ``"json"``
    column. Each ``"json"`` entry is converted with ``json_proc.process_json``
    and passed to the tokenizer as ``text_target``; the tokenizer is called
    with ``max_length=128`` and ``truncation=True``.

    Returns the tokenizer's output unchanged.
    """
    source_texts = list(examples["Text"])
    target_texts = list(map(json_proc.process_json, examples["json"]))
    return tokenizer(
        source_texts,
        text_target=target_texts,
        max_length=128,
        truncation=True,
    )

# Tokenize every split in batches across 4 worker processes, dropping all of
# the original columns so only the tokenizer outputs remain, then flatten.
raw_columns = ads_dataset["train"].column_names
ads = ads_dataset.map(
    preprocess_function,
    batched=True,
    num_proc=4,
    remove_columns=raw_columns,
).flatten()

# ads_test = ads_test_dataset.map(
#     preprocess_function,
#     batched=True,
#     num_proc=4,
#     remove_columns=ads_test_dataset.column_names
# )
# ads_test = ads_test.flatten()


Map (num_proc=4):   0%|          | 0/8634 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/177 [00:00<?, ? examples/s]

In [8]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

In [9]:
from transformers import DataCollatorForSeq2Seq

# Keep the tokenizer's own padding token when it has one. Unconditionally
# aliasing it to EOS makes padding and end-of-sequence share a single id, and
# that alias is written out with the tokenizer when it is saved later on.
# EOS is used only as a fallback for tokenizers with no padding token at all.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Batch collator handed to the trainer below.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)

In [10]:
from IPython.display import clear_output

class MetricComputer:
    """Metric callback that scores the current model on the global ``val_set``.

    Each call builds a fresh ``Evaluator`` around the module-level ``model``,
    ``tokenizer`` and ``json_proc``, computes its BLEU statistics, and stores a
    batch of 20 sample generations so they can be inspected afterwards.
    """

    def __init__(self):
        # One entry per evaluation call, in call order.
        self.generations = []

    def __call__(self, eval_preds):
        """Return the evaluator's statistics; ``eval_preds`` is not used."""
        evaluator = Evaluator(
            val_set,
            model=model,
            tokenizer=tokenizer,
            json_processor=json_proc,
            batch_size=24,
        )
        metrics = evaluator.calc_bleu_batched()
        samples = evaluator.generate_samples_batched(count=20)
        self.generations.append(samples)
        return metrics

In [11]:
n_epochs = 11

training_args = Seq2SeqTrainingArguments(
    # Checkpointing
    output_dir="ruT5-large",
    save_total_limit=3,
    # Schedule and optimisation
    num_train_epochs=n_epochs,
    learning_rate=5e-5,
    lr_scheduler_type="cosine",
    warmup_steps=3,
    weight_decay=0.01,
    # Batching and precision
    per_device_train_batch_size=24,
    per_device_eval_batch_size=8,
    group_by_length=False,
    fp16=True,
    # Evaluation cadence
    evaluation_strategy="steps",
    eval_steps=750,
    # NOTE(review): predict_with_generate is left at its default here, so this
    # limit presumably has no effect during evaluation — confirm.
    generation_max_length=128,
)

# Metric callback passed to the trainer; it also accumulates sample
# generations in `mc.generations`.
mc = MetricComputer()
# Zero-row dataset with the raw column names (not passed to the trainer below).
empty_dataset = Dataset.from_dict({"Text": [], "json": []})

trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    data_collator=data_collator,
    train_dataset=ads["train"],
    eval_dataset=ads["test"],
    compute_metrics=mc,
)



In [12]:
# Run the fine-tuning loop; as the cell's last expression, the returned
# training summary is displayed once training finishes.
trainer.train()

Step,Training Loss,Validation Loss,Bep-sb,Bep-multi,Ta-bleu-sb,Ta-chrf-sb,Ta-chrf-multi,Eb-ind,Mb-ind,Bleu-classic,Chrf-classic,Chrf-classic-multi,Bleu Old,Failed Ratio
750,0.7257,0.25767,0.696885,0.633601,39.173793,77.41499,74.208251,0.048,0.074,52.301046,74.323402,74.128226,52.301046,0.036
1500,0.3122,0.246288,0.691719,0.651598,38.840839,77.280413,75.830699,0.036,0.06,50.040077,74.347446,74.792677,50.040077,0.018
2250,0.2481,0.245166,0.705087,0.659222,39.37882,77.721149,75.396857,0.046,0.056,50.586034,74.443197,74.527288,50.586034,0.026
3000,0.1749,0.245014,0.710756,0.670902,39.51217,78.366216,76.36689,0.03,0.064,52.211191,75.18228,74.75903,52.211191,0.018
3750,0.1604,0.246883,0.710776,0.673747,39.896283,78.447571,76.696014,0.03,0.062,51.972877,75.494117,75.293795,51.972877,0.016




–ü—Ä–æ—Å—Ç—ã–Ω—è –Ω–∞ —Ä–µ–∑–∏–Ω–∫–µ
–°—Ç—ë–≥–∞–Ω–∞—è —Å—É–º–∫–∞ Reserved —Ä–∞–∑–º–µ—Ä 33
–∑–µ–ª–µ–Ω—ã–π —Å–≤–∏—Ç–µ—Ä
–ß–∞—Å—ã Luxury, –Ω–æ–≤—ã–µ –≤ –∫–æ–ª–∏—á–µ—Å—Ç–≤–µ 1, —Ü–µ–Ω–∞ 800
–°–∞–ø–æ–≥–∏ –æ—Ç 40-45


Boomerang US - Eng / Fr –≤ –∫–æ–ª–∏—á–µ—Å—Ç–≤–µ 1, —Ü–µ–Ω–∞
—Å–∞–ø–æ–≥–∏ 39—Ä (25—Å–º)
–§–∏–≥—É—Ä–∫–∏ –∞–≥–µ–Ω—Ç–æ–≤ –¥–ª—è –∏–≥—Ä—ã Lodrs of Waterdeep –≤ –∫–æ–ª–∏—á–µ—Å—Ç–≤–µ 36, —Ü–µ–Ω–∞
–®–∫–æ–ª—å–Ω–∞—è —Ä—É–±–∞—à–∫–∞ —Ä–∞–∑–º–µ—Ä 140 –±–µ–ª–∞—è –∏ –≥–æ–ª—É–±–∞—è –≤ –∫–æ–ª–∏—á–µ—Å—Ç–≤–µ 1

–°–∞–ø–æ–≥–∏ —Ä–µ–∑–∏–Ω–æ–≤—ã–µ, —Ä–∞–∑–º–µ—Ä 22, –Ω–æ–≤—ã–µ –≤ –∫–æ–ª–∏—á–µ—Å—Ç–≤–µ 19 —à—Ç—É–∫
–ü–ª–∞—Ç—å–µ (–¢—É—Ä—Ü–∏—è)
Ticket to ride: The Heart of Africa
—Å–µ—Ä–∏–∞–ª ¬´—Å
–≥–µ–ª—å-–ª–∞–∫–∏ –≤ –∫–æ–ª–∏—á–µ—Å—Ç–≤–µ 12 —à—Ç—É–∫, –±–∞–∑–∞ –∏ —Ç–æ–ø global fashion fashion fashion fashion fashion fashion fashion fashion fashion fashion fashion fashion fashion fashion fashion fashion fashion fashion fashion fashion fashion fashion fashion fashion fashion
–°–∞–ø–æ–≥–∏ –î–µ–º–∞—Ä, —Ä–∞–∑–º–



–ü—Ä–æ—Å—Ç—ã–Ω—è –Ω–∞ —Ä–µ–∑–∏–Ω–∫–µ, –¢—Ä–∏–∫–æ—Ç–∞–∂ 100% —Ö–ª–æ–ø
–°—Ç—ë–≥–∞–Ω–∞—è —Å—É–º–∫–∞ Reserved
–∑–µ–ª–µ–Ω–∞—è —é–±–∫–∞ incity –≤ –∫–æ–ª–∏—á–µ—Å—Ç–≤–µ 1,



–ú–∞—à–∏–Ω–∫–∏ –¥–µ—Ä–µ–≤—è–Ω–Ω—ã–µ, —â–µ–Ω—è—á–∏–π –ø–∞—Ç—Ä—É–ª—å






–ü—Ä–æ—Å—Ç—ã–Ω—è –Ω–∞ —Ä–µ–∑–∏–Ω–∫–µ 90–•200 —Å–º –≤ –∫–æ–ª–∏—á–µ—Å—Ç–≤–µ 1, —Ü–µ–Ω–∞ 320 (

–∑–µ–ª–µ–Ω–∞—è —é–±–∫–∞ incity –≤ –∫–æ–ª–∏—á–µ—Å—Ç–≤–µ 1, —Ü–µ–Ω–∞ 25 (–ª–∞—Ä–∏
–ß–∞—Å—ã Luxury(–Ω–æ–≤—ã–µ), 800 (RUB)
–°–∞–ø–æ–≥–∏ —Ä–∞–∑–º–µ—Ä 40-45
—Ä–µ–º–µ—à–∫–∏ apple watch 38/41 mm, uag
Scrabble –¥–æ—Ä–æ–∂–Ω—ã–π –≤ –∫–æ–ª–∏—á–µ—Å—Ç–≤–µ 1, —Ü–µ–Ω–∞ 700 (
—Ñ—É—Ç–±–æ–ª–∫–∞ Dolce Gabbana —Ä–∞–∑–º–µ—Ä –ú –≤ –∫–æ–ª–∏—á–µ—Å—Ç–≤–µ 1, —Ü–µ–Ω–∞ 500 (RUB
–ü–æ–¥—Å–≤–µ—á–Ω–∏–∫–∏ –≤ –∫–æ–ª–∏—á–µ—Å—Ç–≤–µ 19 —à—Ç—É–∫
–ö–æ—Å—Ç—é–º –≤–µ–ª—é—Ä–æ–≤—ã–π –≤ –∫–æ–ª–∏—á–µ—Å—Ç–≤–µ 1, —Ü–µ–Ω–∞
Ticket to ride:
–±–µ–ª–ª—å–≤–∏–ª—å –≤–∞–Ω —Å—Ç–æ—É–Ω ¬´—Å–∫–µ—Ç—á–∏¬ª –≤ –∫–æ–ª–∏—á–µ—Å—Ç–≤–µ 1, —Ü–µ–Ω–∞
–°–∞–ø–æ–≥–∏ –î–µ–º–∞—Ä, —Ä–∞–∑–º–µ—Ä 26/27 –≤ –∫–æ–ª–∏—á–µ—Å—Ç–≤–µ 1, —Ü–µ–Ω–∞ 500




–ü—Ä–æ—Å—Ç—ã–Ω—è –Ω–∞ —Ä–µ–∑–∏–Ω–∫–µ –¢—Ä–∏–∫–æ—Ç–∞–∂ 100% —Ö–ª–æ–ø


–ö–∞–º–µ—Ä–∞ Nikon 3200, 24MP CMOS, 24MP CMOS, 24MP CMOS, 24MP CMOS, 24MP CMOS, 24MP CMOS, 24MP CMOS, 24MP CMOS, 24MP CMOS, 24MP CMOS, 24MP CMOS, 24MP CMOS, 24MP CMOS, 24MP CMOS, 24MP CMOS, 24MP CMOS, 24MP CMOS, 24MP CMOS, 24MP CMOS, 24
—Ñ—É—Ç–±–æ–ª–∫–∞ Dolce Gabbana —Ä–∞–∑–º–µ—Ä –ú –≤ –∫–æ–ª–∏—á–µ—Å—Ç–≤–µ 1, —Ü–µ–Ω–∞ 500 (RUB
–ö–æ–Ω—Å—Ç—Ä—É–∫—Ç–æ—Ä sluban M38-B0373 –≤
–ü–ª–∞—Ç—å–µ (–¢—É—Ä—Ü–∏—è) –≤ –∫–æ–ª–∏—á–µ—Å—Ç–≤–µ 1, —Ü–µ–Ω–∞ 400 (
Ticket to ride: The Heart of Africa (–Ω–∞—Å—Ç–æ–ª—å–Ω–∞—è –∏–≥—Ä–∞, –Ω–∞—Å—Ç–æ–ª—å–Ω–∞—è –∏–≥—Ä–∞,
braddon ¬´the lawyer




–ü—Ä–æ—Å—Ç—ã–Ω—è –Ω–∞ —Ä–µ–∑–∏–Ω–∫–µ –¢—Ä–∏–∫–æ—Ç–∞–∂ 100% —Ö–ª–æ–ø
–ª–æ–¥–æ—á–∫–∏ calipso –≤ –∫–æ–ª–∏—á–µ—Å—Ç–≤–µ 1,


–ö–∞–º–µ—Ä–∞ Nikon 3200, 24MP CMOS, 24MP CMOS, 24MP CMOS, 24MP CMOS, 24MP CMOS, 24MP CMOS, 24MP CMOS, 24MP CMOS, 24MP CMOS, 24MP CMOS, 24MP CMOS, 24MP CMOS, 24MP CMOS, 24MP CMOS, 24MP CMOS, 24MP CMOS, 24MP CMOS, 24MP CMOS, 24MP CMOS, 24
–ú–∞—à–∏–Ω–∫–∏ –¥–µ—Ä–µ–≤—è–Ω–Ω—ã–µ, —â–µ–Ω—è—á–∏–π –ø–∞—Ç—Ä—É–ª—å
–ö–æ—Å—Ç—é–º
braddon ¬´the lawyer


TrainOutput(global_step=3960, training_loss=0.3004930245755899, metrics={'train_runtime': 3746.9228, 'train_samples_per_second': 25.347, 'train_steps_per_second': 1.057, 'total_flos': 5.0743397486592e+16, 'train_loss': 0.3004930245755899, 'epoch': 11.0})

In [13]:
s = '–ø—Ä–æ–¥–∞–º –∑–æ–Ω—Ç –∑–∞ 300 —Ä—É–±–ª–µ–π'
# Move the encoded prompt to whichever device the model currently lives on
# rather than assuming a CUDA device is present.
inputs = tokenizer(s, return_tensors='pt').to(model.device)
out = model.generate(**inputs)
# Decode the first generated sequence; special tokens are left in the text.
tokenizer.decode(out[0])



'<pad> –ü—Ä–æ–¥–∞–µ—Ç—Å—è: –∑–æ–Ω—Ç –≤ –∫–æ–ª–∏—á–µ—Å—Ç–≤–µ 1, —Ü–µ–Ω–∞ 300 (RUB)</s>'

In [14]:
# Decode the input ids of the first tokenized training example back to text
# for a visual check of the encoder input.
tokenizer.decode(ads['train'][0]['input_ids'])

'—Å—Ç–æ–ª –æ—Ñ–∏—Å–Ω—ã–π –Ω–µ —Ä–∞–∑–±–∏—Ä–∞–µ—Ç—Å—è —Å —Ç—É–º–±–æ–π –∏ —Å—Ç—É–ª–æ–º 100 –ø–∞—Ñ–æ—Å</s>'

In [15]:
# Decode the label ids of the first tokenized training example to inspect the
# target text used for training.
tokenizer.decode(ads['train'][0]['labels'])

'–ü—Ä–æ–¥–∞–µ—Ç—Å—è: —Å—Ç–æ–ª –æ—Ñ–∏—Å–Ω—ã–π —Å —Ç—É–º–±–æ–π –∏ —Å—Ç—É–ª–æ–º –≤ –∫–æ–ª–∏—á–µ—Å—Ç–≤–µ 1, —Ü–µ–Ω–∞ 100 (RUB)</s>'

In [16]:
# Encode and immediately decode the processed annotation of the first row of
# `train` to see how the target text survives tokenization.
tokenizer.decode(tokenizer.encode(json_proc.process_json(train.loc[train.index[0], 'json'])))

'–ü—Ä–æ–¥–∞–µ—Ç—Å—è: –°–≤—ë–∫–ª–∞ –∫–æ—Ä–º–æ–≤–∞—è –≤ –∫–æ–ª–∏—á–µ—Å—Ç–≤–µ 1 –º–µ—à–æ–∫, —Ü–µ–Ω–∞ 250 (RUB)</s>'

In [17]:
import gc
# Run a garbage-collection pass first so unreachable Python objects are freed,
# then ask PyTorch to release its cached, currently unused CUDA memory.
gc.collect()
torch.cuda.empty_cache()

In [18]:
# Save the trained model and the tokenizer side by side in one directory so
# the pair can be reloaded together from `output_dir`.
output_dir = "ruT5-large-trained-text-json"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

('ruT5-large-trained-text-json/tokenizer_config.json',
 'ruT5-large-trained-text-json/special_tokens_map.json',
 'ruT5-large-trained-text-json/spiece.model',
 'ruT5-large-trained-text-json/added_tokens.json',
 'ruT5-large-trained-text-json/tokenizer.json')

In [19]:
# from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# tokenizer = AutoTokenizer.from_pretrained(output_dir)
# model = AutoModelForSeq2SeqLM.from_pretrained(output_dir).to('cuda')

In [20]:
# distill_data = pd.read_csv('../data/distill_data.csv', index_col=0)
# distill_data.head()

In [21]:
# ev = Evaluator(distill_data, model, tokenizer)
# output = ev.generate_samples_batched(batch_size=256)

In [22]:
# distill_data['json'] = pd.Series(output)

In [23]:
# distill_data.to_csv('../data/distill_100k.csv')