In [1]:
import pandas as pd
import numpy as np
import re
import torch
import json
from tqdm.notebook import tqdm
# from torch.utils.data import Dataset, DataLoader
# from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW
from sklearn.model_selection import train_test_split

In [2]:
import sys

sys.path.append('../utils')
from evaluator import Evaluator
from json_format import process_json, unprocess_json

In [3]:
def _load_shuffled(csv_path):
    """Read one split, decoding its "json" column, and return it row-shuffled with seed 42."""
    frame = pd.read_csv(csv_path, index_col=0, converters={'json': json.loads})
    return frame.sample(frac=1, random_state=42)


train = _load_shuffled('../data/train_9k_valid.csv')
val_set = _load_shuffled('../data/val_set_300_sb_valid.csv')
manual_test = _load_shuffled('../data/manual_test_100.csv')

In [4]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Checkpoint identifier shared by the tokenizer and the model.
base_checkpoint = "ai-forever/ruT5-large"

tokenizer = AutoTokenizer.from_pretrained(base_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(base_checkpoint)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [7]:
# Delimiter tokens added to the tokenizer vocabulary.
markup_tokens = [
    "<BOB>", "<EOB>",
    "<BOT>", "<EOT>",
    "<BOP>", "<EOP>",
    "<BOC1>", "<EOC1>",
    "<BOC2>", "<EOC2>",
]
tokenizer.add_tokens(markup_tokens)
# Resize the embedding matrix to the new vocabulary size.
model.resize_token_embeddings(len(tokenizer))

# Sanity check: process_json followed by unprocess_json must give back the
# first training target unchanged.
first_target = train.loc[train.index[0], 'json']
assert first_target == unprocess_json(process_json(first_target))

In [8]:
from datasets import Dataset

# Wrap the text/target columns in a Dataset, hold out 5% (seeded) as the
# "test" split, then flatten; the trailing expression displays the result.
ads_dataset = (
    Dataset.from_pandas(train[["Text", "json"]])
    .train_test_split(test_size=0.05, seed=42)
    .flatten()
)
ads_dataset

DatasetDict({
    train: Dataset({
        features: ['Text', 'json', '__index_level_0__'],
        num_rows: 8370
    })
    test: Dataset({
        features: ['Text', 'json', '__index_level_0__'],
        num_rows: 441
    })
})

In [9]:
def preprocess_function(examples, max_input_length=128, max_target_length=128):
    """Tokenize a batch of source texts and their serialized targets.

    Uses the module-level ``tokenizer`` and ``process_json``.

    Args:
        examples: Batch mapping with a "Text" column (source strings) and a
            "json" column (target structures passed through ``process_json``).
        max_input_length: Maximum number of tokens kept for each source text;
            longer inputs are truncated.
        max_target_length: Maximum number of tokens kept for each serialized
            target; longer targets are truncated.

    Returns:
        The tokenizer output for the source texts with an extra "labels"
        entry holding the token ids of the targets.
    """
    inputs = list(examples["Text"])
    targets = [process_json(bundles) for bundles in examples["json"]]

    # Sources and targets are encoded separately so each side can have its
    # own length cap; with the defaults this matches a single call that
    # passes both `text` and `text_target` with max_length=128.
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)
    target_encodings = tokenizer(text_target=targets, max_length=max_target_length, truncation=True)
    model_inputs["labels"] = target_encodings["input_ids"]
    return model_inputs

# Tokenize every split in batches across 4 worker processes, dropping the raw
# columns so only the outputs of preprocess_function remain, then flatten.
raw_columns = ads_dataset["train"].column_names
ads = ads_dataset.map(
    preprocess_function,
    batched=True,
    num_proc=4,
    remove_columns=raw_columns,
).flatten()

# ads_test = ads_test_dataset.map(
#     preprocess_function,
#     batched=True,
#     num_proc=4,
#     remove_columns=ads_test_dataset.column_names
# )
# ads_test = ads_test.flatten()


Map (num_proc=4):   0%|          | 0/8370 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/441 [00:00<?, ? examples/s]

In [10]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

In [11]:
from transformers import DataCollatorForSeq2Seq

# Reuse the end-of-sequence token as the padding token.
# NOTE(review): T5 tokenizers usually ship their own pad token — confirm this
# override is intentional.
tokenizer.pad_token = tokenizer.eos_token
# Seq2seq batch collator built from the tokenizer, all other settings default.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)

In [12]:
from IPython.display import clear_output

class MetricComputer:
  """Callable metric hook that scores a held-out frame with ``Evaluator``.

  The predictions handed over by the caller are ignored: every call builds a
  fresh ``Evaluator`` and lets it compute the metrics itself. ``model`` and
  ``tokenizer`` are looked up as module globals at call time, so rebinding
  those names (e.g. after reloading a checkpoint) changes what is evaluated.

  Args:
    eval_frame: Frame passed to ``Evaluator``. When ``None`` (default) the
      module-level ``val_set`` is used, resolved at call time.
    sample_count: Value forwarded as ``count`` to
      ``Evaluator.generate_samples_batched`` (default 20).

  Attributes:
    generations: One entry per call, holding the samples returned by
      ``generate_samples_batched`` for that call.
  """

  def __init__(self, eval_frame=None, sample_count=20):
    self.eval_frame = eval_frame
    self.sample_count = sample_count
    self.generations = []

  def __call__(self, eval_preds):
    """Return the metrics from ``Evaluator.calc_bleu_batched``; ``eval_preds`` is unused."""
    # Resolve the default lazily so a later rebinding of `val_set` is honoured.
    frame = val_set if self.eval_frame is None else self.eval_frame
    ev = Evaluator(frame, model=model, tokenizer=tokenizer)
    stats = ev.calc_bleu_batched()
    # Keep a history of generated samples, one entry per evaluation call.
    self.generations.append(ev.generate_samples_batched(count=self.sample_count))
    return stats

In [13]:
# Number of passes over the training split.
n_epochs = 13

# Fine-tuning configuration; evaluation is triggered every 1000 steps.
training_args = Seq2SeqTrainingArguments(
    output_dir="ruT5-large",
    # overwrite_output_dir=True,
    evaluation_strategy="steps",
    eval_steps=1000,
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=n_epochs,
    # predict_with_generate=True,
    generation_max_length=128,
    fp16=True,
    lr_scheduler_type="cosine",
    group_by_length=False,
    warmup_steps=3,
)

# Metric hook; it also collects generated samples in `mc.generations`.
mc = MetricComputer()
# Only referenced by the commented-out `eval_dataset=empty_dataset` line below.
empty_dataset = Dataset.from_dict({"Text": [], "json": []})
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=ads["train"],
    eval_dataset=ads["test"],
    # eval_dataset=empty_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=mc,  # ignores the predictions it receives and scores `val_set` via Evaluator
)



In [12]:
# Run fine-tuning; the returned TrainOutput is shown as the cell result.
trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Bep-sb,Bep-multi,Ta-bleu-sb,Ta-chrf-sb,Ta-chrf-multi,Eb-ind,Mb-ind,Bleu-classic,Chrf-classic,Chrf-classic-multi,Bleu Old,Failed Ratio
1000,0.42,0.341139,0.682549,0.618635,39.589126,76.811225,75.118051,0.078,0.064,52.357236,74.038565,70.480968,52.357236,0.0
2000,0.2737,0.328908,0.711432,0.671214,40.893856,78.456784,77.394638,0.022,0.066,54.156738,75.551359,74.278085,54.156738,0.002
3000,0.1825,0.3425,0.730814,0.682197,43.354566,80.039947,78.573414,0.042,0.06,54.569057,76.896123,76.702511,54.569057,0.006
4000,0.1326,0.358613,0.716935,0.685848,41.906696,79.158118,78.874741,0.026,0.058,50.678412,76.47287,76.784072,50.678412,0.0
5000,0.104,0.379249,0.728599,0.697764,43.056374,79.591233,79.34339,0.026,0.066,53.315289,76.865857,77.566729,53.315289,0.0
6000,0.0881,0.382238,0.724529,0.696209,42.52332,79.458857,79.256893,0.018,0.064,54.221383,76.261452,77.20745,54.221383,0.0


<BOT> Sony Sigma 30 1.4 + 600<EOP><BOC1> 1<EOC1><BOC2> GEL<EOC2>
<BOT> ремешки apple watch 38/41 mm, uag 20<EOP><BOC1> 1<EOC1><BOC2> лари<EOC2>
<BOT> sony nex-5n with sigma 30 1.4 +600<EOP><BOC1> 1<EOC1><BOC2> GEL<EOC2>
<BOT> рюкзак, 40 литров, идеальное состояние, 100<EOP><BOC1> 1<EOC1><BOC2> GEL<EOC2>


TrainOutput(global_step=6812, training_loss=0.2345005036520384, metrics={'train_runtime': 5353.8745, 'train_samples_per_second': 20.324, 'train_steps_per_second': 1.272, 'total_flos': 5.6719849860096e+16, 'train_loss': 0.2345005036520384, 'epoch': 13.0})

In [13]:
import gc
# Collect unreachable Python objects first, then ask PyTorch to release the
# unused memory held by its CUDA caching allocator.
gc.collect()
torch.cuda.empty_cache()

In [4]:
import os

# Directory holding the fine-tuned checkpoint that the next cell reloads.
output_dir = "ruT5-large-trained"

# Write the checkpoint when it is not on disk yet, so the reload below also
# works after "Restart & Run All" on a machine without a checkpoint left by an
# earlier session. An existing directory is left untouched (no overwrite), so
# sessions that skip training and only reload keep working.
if not os.path.isdir(output_dir):
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

In [6]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Reload tokenizer and model from `output_dir`, rebinding the global names.
# Code that looks up `model`/`tokenizer` at call time (e.g. MetricComputer)
# uses these reloaded objects from here on.
tokenizer = AutoTokenizer.from_pretrained(output_dir)
model = AutoModelForSeq2SeqLM.from_pretrained(output_dir).to('cuda')  # move the model to the GPU

In [15]:
# The argument is a placeholder: MetricComputer.__call__ never reads it, so
# this re-scores `val_set` with the current global model and tokenizer.
mc(1)



{'BEP-sb': 0.7225385295939926,
 'BEP-multi': 0.6950607267170497,
 'TA-BLEU-sb': 42.27549211585097,
 'TA-CHRF-sb': 79.44316171521454,
 'TA-CHRF-multi': 79.1766937539493,
 'EB-ind': 0.014,
 'MB-ind': 0.064,
 'BLEU-classic': 52.55155492028843,
 'CHRF-classic': 76.29997669403694,
 'CHRF-classic-multi': 77.08981173389313,
 'bleu_old': 52.55155492028843,
 'failed_ratio': 0.0}

In [27]:
# Single-example check: take the text of manual-test row 6550.
s = manual_test.loc[6550, 'Text']

# Encode to input ids only (no attention mask) and move them to the GPU.
inputs = tokenizer.encode(s, return_tensors='pt').to('cuda')
# NOTE(review): `early_stopping` is documented for beam-based generation —
# confirm it has any effect with the generation settings used here.
out = model.generate(inputs, max_length=150, early_stopping=True)
# Decode without skipping special tokens, so the raw markup stays visible.
tokenizer.decode(out[0])



'<pad><BOB><BOT> большой контейнер для хранения<EOT><BOP> 30<EOP><BOC1> 1<EOC1><BOC2> GEL<EOC2><EOB><BOB><BOT> вешалки металлические, прорезиненные<EOT><BOP> 9<EOP><BOC1> 9 шт.<EOC1><BOC2> GEL<EOC2><EOB><BOB><BOT> прищепки<EOT><BOP> 54 шт.<EOC1><BOC2> GEL<EOC2><EOB><BOB><BOT> скотч металлический, новый<EOT><BOP> 30<EOP><BOC1> 54 шт.<EOC1><BOC2> GEL<EOC2><EOB></s>'

In [34]:
# Appears to be a hand-edited copy of the generation for this ad: the third
# bundle carries a full price/count markup here, so the string can be parsed.
s = '<BOB><BOT> большой контейнер для хранения<EOT><BOP> 30<EOP><BOC1> 1<EOC1><BOC2> GEL<EOC2><EOB><BOB><BOT> вешалки металлические, прорезиненные<EOT><BOP> 9<EOP><BOC1> 9 шт.<EOC1><BOC2> GEL<EOC2><EOB><BOB><BOT> прищепки<EOT><BOP> 30<EOP><BOC1>54 шт.<EOC1><BOC2> GEL<EOC2><EOB><BOB><BOT> скотч металлический, новый<EOT><BOP> 30<EOP><BOC1> 54 шт.<EOC1><BOC2> GEL<EOC2><EOB>'
# Parse the markup back into structured data and pretty-print it, keeping
# non-ASCII characters readable.
print(json.dumps(unprocess_json(s), indent=4, ensure_ascii=False))

[
    {
        "Title": " большой контейнер для хранения",
        "Price": " 30",
        "Count": " 1",
        "Currency": " GEL"
    },
    {
        "Title": " вешалки металлические, прорезиненные",
        "Price": " 9",
        "Count": " 9 шт.",
        "Currency": " GEL"
    },
    {
        "Title": " прищепки",
        "Price": " 30",
        "Count": "54 шт.",
        "Currency": " GEL"
    },
    {
        "Title": " скотч металлический, новый",
        "Price": " 30",
        "Count": " 54 шт.",
        "Currency": " GEL"
    }
]


In [19]:
ev = Evaluator(manual_test, model=model, tokenizer=tokenizer, batch_size=32)
output = ev.generate_samples_batched([6550])

# One row per returned item: its key plus the value pretty-printed as JSON text.
records = []
for sample_id, generated in output.items():
    pretty = json.dumps(generated, indent=4, ensure_ascii=False)
    records.append({'id': sample_id, 'json': pretty})
df = pd.DataFrame(records)
# df.to_csv('manual_test_outputs_default.csv', index=False)
df



<BOT> прищепки<EOT><BOP> 54 шт.<EOC1><BOC2> GEL<EOC2>


Unnamed: 0,id,json
0,6550,[]


In [16]:
# distill_data = pd.read_csv('../data/distill_data.csv', index_col=0)
# distill_data.head()

In [17]:
# ev = Evaluator(distill_data, model, tokenizer)
# output = ev.generate_samples_batched(batch_size=256)

In [18]:
# distill_data['json'] = pd.Series(output)

In [19]:
# distill_data.to_csv('../data/distill_100k.csv')