In [1]:
import pandas as pd
import numpy as np
import re
import torch
import json
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
import ast

In [2]:
import sys

sys.path.append('../utils')
from evaluator import Evaluator
from json_format import process_json, unprocess_json

In [3]:
def _read_shuffled(csv_path, json_parser, seed=42):
    """Load a CSV whose first column is the index and return its rows shuffled.

    The `json` column is decoded with `json_parser` while the file is read;
    `sample(frac=1, random_state=seed)` yields a reproducible permutation.
    """
    frame = pd.read_csv(csv_path, index_col=0, converters={'json': json_parser})
    return frame.sample(frac=1, random_state=seed)


# The distillation dump is decoded with `ast.literal_eval`, the two evaluation
# files with `json.loads`.
train = _read_shuffled('../data/distill_100k.csv', ast.literal_eval)
val_set = _read_shuffled('../data/val_set_300_sb_valid.csv', json.loads)
manual_test = _read_shuffled('../data/manual_test_100.csv', json.loads)

In [4]:
# Row counts of the three splits, in (train, val_set, manual_test) order.
tuple(len(split) for split in (train, val_set, manual_test))

(100000, 500, 100)

In [5]:
# Remove every training row whose text also occurs in one of the held-out
# sets, so validation / manual-test ads cannot leak into fine-tuning.
# `Series.isin` against a set is a hashed lookup; the previous per-row
# `txt not in <ndarray>` test rescanned both arrays for each training row.
held_out_texts = set(val_set.Text) | set(manual_test.Text)
train = train[~train.Text.isin(held_out_texts)]
len(train)

99382

In [6]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Tokenizer and seq2seq weights are loaded from the same checkpoint name.
BASE_CHECKPOINT = "cointegrated/rut5-small"
tokenizer = AutoTokenizer.from_pretrained(BASE_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(BASE_CHECKPOINT)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [7]:
# Bracketing tags (they show up in the decoded generations further down).
# They are added to the tokenizer's vocabulary, and the embedding matrix is
# resized to the new vocabulary size.
MARKUP_TOKENS = [
    "<BOB>", "<EOB>",
    "<BOT>", "<EOT>",
    "<BOP>", "<EOP>",
    "<BOC1>", "<EOC1>",
    "<BOC2>", "<EOC2>",
]
tokenizer.add_tokens(MARKUP_TOKENS)
model.resize_token_embeddings(len(tokenizer))

# Sanity check on the first training row: encoding with `process_json` and
# decoding with `unprocess_json` must give back the original value.
first_bundles = train.loc[train.index[0], 'json']
assert first_bundles == unprocess_json(process_json(first_bundles))

In [8]:
from datasets import Dataset

# Wrap the two relevant columns in a Hugging Face dataset, hold out 0.5 % of
# the rows as the trainer's "test" split (fixed seed), then flatten.
ads_dataset = (
    Dataset.from_pandas(train[["Text", "json"]])
    .train_test_split(test_size=0.005, seed=42)
    .flatten()
)
ads_dataset

DatasetDict({
    train: Dataset({
        features: ['Text', 'json', '__index_level_0__'],
        num_rows: 98885
    })
    test: Dataset({
        features: ['Text', 'json', '__index_level_0__'],
        num_rows: 497
    })
})

In [9]:
def preprocess_function(examples):
    """Tokenize one batch of ads for seq2seq training.

    `examples["Text"]` supplies the encoder inputs; every `examples["json"]`
    entry is passed through `process_json` to build the matching target.
    The tokenizer is called with `max_length=128` and truncation enabled,
    and its output is returned unchanged.
    """
    source_texts = list(examples["Text"])
    target_texts = list(map(process_json, examples["json"]))
    return tokenizer(source_texts, text_target=target_texts, max_length=128, truncation=True)

# Tokenize both splits with four worker processes. The original columns are
# removed, so only what `preprocess_function` returns is kept.
raw_columns = ads_dataset["train"].column_names
ads = ads_dataset.map(
    preprocess_function,
    batched=True,
    num_proc=4,
    remove_columns=raw_columns,
).flatten()

Map (num_proc=4):   0%|          | 0/98885 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/497 [00:00<?, ? examples/s]

In [10]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

In [11]:
from transformers import DataCollatorForSeq2Seq

# Use the end-of-sequence token as the padding token.
# NOTE(review): the generation decoded further down starts with `<pad>`, so the
# vocabulary appears to already contain a dedicated pad token — confirm that
# this override is intentional.
tokenizer.pad_token = tokenizer.eos_token
# Collator handed to the trainer below to assemble training/eval batches.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)

In [12]:
class MetricComputer:
    """`compute_metrics` hook that scores the current model on `val_set`.

    Reads the notebook-level `val_set`, `model` and `tokenizer`; the
    predictions handed over by the trainer are not used.
    """

    def __init__(self, batch_size=8):
        # Batch size forwarded to every Evaluator built in __call__.
        self.batch_size = batch_size
        # One entry per evaluation round: the samples generated in that round.
        self.generations = []

    def __call__(self, eval_preds):
        """Run one evaluation round and return what `calc_bleu_batched` yields."""
        evaluator = Evaluator(
            val_set,
            model=model,
            tokenizer=tokenizer,
            batch_size=self.batch_size,
        )
        metrics = evaluator.calc_bleu_batched()
        samples = evaluator.generate_samples_batched(count=20)
        self.generations.append(samples)
        return metrics

In [13]:
# Number of passes over the training split.
n_epochs = 6

# Hyper-parameters for the fine-tuning run; checkpoints go to "ruT5-small".
training_args = Seq2SeqTrainingArguments(
    output_dir="ruT5-small",
    # overwrite_output_dir=True,
    # Evaluate once at the end of every epoch.
    evaluation_strategy="epoch",
    # eval_steps=600,
    learning_rate=5e-5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=n_epochs,
    # predict_with_generate=True,
    # NOTE(review): with `predict_with_generate` commented out above, this
    # setting presumably has no effect on the trainer's own evaluation — confirm.
    generation_max_length=128,
    # fp16=True,
    lr_scheduler_type="cosine",
    group_by_length=False,
    # NOTE(review): 3 warm-up steps is negligible next to the 9276 total steps
    # reported by the run below — confirm this value is intended.
    warmup_steps=3,
)

# Metric hook: MetricComputer ignores the trainer's predictions and evaluates
# the model on `val_set` with batches of 32.
mc = MetricComputer(batch_size=32)
# Only referenced by the commented-out `eval_dataset` alternative below.
empty_dataset = Dataset.from_dict({"Text": [], "json": []})
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=ads["train"],
    # Held-out 0.5 % split of the training data.
    eval_dataset=ads["test"],
    # eval_dataset=empty_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=mc,
)



In [16]:
# Run the fine-tuning loop configured above; the returned TrainOutput is the
# cell's displayed value.
trainer.train()

Epoch,Training Loss,Validation Loss,Bep-sb,Bep-multi,Ta-bleu-sb,Ta-chrf-sb,Ta-chrf-multi,Eb-ind,Mb-ind,Bleu-classic,Chrf-classic,Chrf-classic-multi,Bleu Old,Failed Ratio
1,0.5253,0.384668,0.63709,0.562437,37.357678,75.57049,73.296776,0.13,0.05,51.8724,72.889997,69.394716,51.8724,0.002
2,0.4316,0.332382,0.665245,0.597401,38.013116,76.773762,74.672946,0.102,0.056,53.053844,73.825586,71.243057,53.053844,0.0
3,0.3906,0.311431,0.67057,0.608645,38.341458,77.242664,75.347931,0.088,0.054,51.569675,74.432465,72.281754,51.569675,0.0
4,0.3689,0.301968,0.671776,0.614677,37.59861,76.7862,75.477162,0.084,0.058,51.53632,73.868858,72.793992,51.53632,0.0
5,0.3577,0.299432,0.672986,0.611404,37.974979,76.694806,75.428674,0.094,0.056,51.3926,73.931551,72.470517,51.3926,0.0
6,0.3577,0.298156,0.675385,0.613226,37.809312,76.934711,75.775212,0.096,0.056,51.286876,73.970323,72.730799,51.286876,0.0




<BOT> Наволочки по 2 шт<EOC1><BOC2> RUB<EOC2>




TrainOutput(global_step=9276, training_loss=0.43154376722091536, metrics={'train_runtime': 4720.7454, 'train_samples_per_second': 125.681, 'train_steps_per_second': 1.965, 'total_flos': 2.535882638304461e+16, 'train_loss': 0.43154376722091536, 'epoch': 6.0})

In [15]:
# Smoke test: run the fine-tuned model on a single hand-written ad.
# `preprocess_function` tokenized raw text without a literal "</s>", and the
# T5 tokenizer appends the end-of-sequence token itself, so none is written
# into the prompt here.
s = 'продам зонт за 300 рублей'

# Follow the model's device instead of hard-coding 'cuda', so the cell also
# runs on a CPU-only machine.
inputs = tokenizer.encode(s, return_tensors='pt').to(model.device)
# Allow outputs as long as the 128-token limit used for the training targets
# rather than the short default generation length.
out = model.generate(inputs, max_length=128)
tokenizer.decode(out[0])



'<pad><BOB><BOT> зонт<BOP> 300<EOP><BOC1> 1<EOC1><BOC2> RUB<EOC2><EOB></s>'

In [None]:
import gc
# Collect unreachable Python objects first, then ask PyTorch to release the
# GPU memory its caching allocator is holding but no longer using.
gc.collect()
torch.cuda.empty_cache()

In [17]:
# Directory that receives the fine-tuned weights and the extended tokenizer.
output_dir = "ruT5-small-trained-full-distill-data"
model.save_pretrained(output_dir)
# Last expression of the cell: the tuple of written tokenizer files is displayed.
tokenizer.save_pretrained(output_dir)

('ruT5-small-trained-full-distill-data/tokenizer_config.json',
 'ruT5-small-trained-full-distill-data/special_tokens_map.json',
 'ruT5-small-trained-full-distill-data/spiece.model',
 'ruT5-small-trained-full-distill-data/added_tokens.json',
 'ruT5-small-trained-full-distill-data/tokenizer.json')

In [28]:
# Generate predictions for the manual test set and export them to CSV, one
# row per key of the returned mapping, with the value pretty-printed as JSON
# (non-ASCII characters are written as-is).
ev = Evaluator(manual_test, model=model, tokenizer=tokenizer, batch_size=32)
output = ev.generate_samples_batched()

records = [
    {
        'id': sample_id,
        'json': json.dumps(prediction, indent=4, ensure_ascii=False),
    }
    for sample_id, prediction in output.items()
]
df = pd.DataFrame(records)
df.to_csv('manual_test_outputs.csv', index=False)



<BOT> Версия польская, игра языконезависимая<EOT><BOP> 2 партии<EOC1><BOC2> RUB<EOC2>
