In [1]:
import pandas as pd
import torch
import json
import re

In [2]:
import sys

sys.path.append('../utils')
from json_format import process_json, unprocess_json
from evaluator import Evaluator

In [3]:
# from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# tokenizer = AutoTokenizer.from_pretrained("ai-forever/FRED-T5-large")
# model = AutoModelForSeq2SeqLM.from_pretrained("ai-forever/FRED-T5-large")

from transformers import GPT2Tokenizer, T5ForConditionalGeneration

# Tokenizer and model are loaded from the same Hub checkpoint; the tokenizer is
# told explicitly that '</s>' is its end-of-sequence token.
BASE_CHECKPOINT = 'ai-forever/FRED-T5-large'
tokenizer = GPT2Tokenizer.from_pretrained(BASE_CHECKPOINT, eos_token='</s>')
model = T5ForConditionalGeneration.from_pretrained(BASE_CHECKPOINT)
# tokenizer = AutoTokenizer.from_pretrained('./fredT5-large-trained')
# model = AutoModelForSeq2SeqLM.from_pretrained('./fredT5-large-trained').to('cuda')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
def _read_shuffled(csv_path):
    """Read one CSV and return its rows in a fixed shuffled order.

    The first CSV column becomes the index, the 'json' column is parsed through
    the module-level `converters` mapping, and the rows are permuted with
    `sample(frac=1, random_state=42)`.
    """
    frame = pd.read_csv(csv_path, index_col=0, converters=converters)
    return frame.sample(frac=1, random_state=42)


# Column-wise parsers applied by pandas while reading: 'json' cells go through json.loads.
converters = {'json': json.loads}
train = _read_shuffled('../data/train_9k_valid.csv')
val_set = _read_shuffled('../data/val_set_300_sb_valid.csv')
manual_test = _read_shuffled('../data/manual_test_100.csv')

In [5]:
# Ten paired <B..>/<E..> marker tokens are added to the tokenizer's vocabulary.
extra_tokens = [
    "<BOB>", "<EOB>",
    "<BOT>", "<EOT>",
    "<BOP>", "<EOP>",
    "<BOC1>", "<EOC1>",
    "<BOC2>", "<EOC2>",
]
tokenizer.add_tokens(extra_tokens)
# The embedding matrix is resized to the tokenizer's new length so the added ids have rows.
model.resize_token_embeddings(len(tokenizer))


# Round-trip check on the first row of the shuffled training frame: passing its
# 'json' value through process_json and then unprocess_json must give it back.
first_label = train.loc[train.index[0], 'json']
assert first_label == unprocess_json(process_json(first_label))

In [7]:
from datasets import Dataset

# Build a dataset from the two columns used for training, hold out 0.5% of the
# rows as the "test" split (seeded for repeatability), then call flatten() on
# the resulting DatasetDict.
ads_dataset = (
    Dataset.from_pandas(train[["Text", "json"]])
    .train_test_split(test_size=0.005, seed=42)
    .flatten()
)
ads_dataset

DatasetDict({
    train: Dataset({
        features: ['Text', 'json', '__index_level_0__'],
        num_rows: 8766
    })
    test: Dataset({
        features: ['Text', 'json', '__index_level_0__'],
        num_rows: 45
    })
})

In [8]:
# Display the tokenizer's current special-token mapping (bos/eos/unk/pad/mask).
tokenizer.special_tokens_map

{'bos_token': '<s>',
 'eos_token': '</s>',
 'unk_token': '<unk>',
 'pad_token': '<pad>',
 'mask_token': '<mask>'}

In [9]:
def preprocess_function(examples):
    """Tokenize a batch of examples into model inputs and labels.

    Args:
        examples: a batch mapping with a "Text" column (source strings) and a
            "json" column (objects accepted by ``process_json``).

    Returns:
        The tokenizer's encoding of the prefixed source texts, with a
        ``labels`` entry holding the token ids of the prefixed, serialized
        targets.
    """
    # Both source and target strings are prefixed with the literal '<LM>'.
    inputs = ['<LM>' + text for text in examples["Text"]]
    targets = ['<LM>' + process_json(bundles) for bundles in examples["json"]]

    # Source and target are encoded in a single call through `text_target`,
    # which stores the target ids under 'labels'. This replaces the deprecated
    # `as_target_tokenizer()` context manager; both sides keep the same
    # 128-token truncation limit as before.
    # NOTE(review): nothing here appends an end-of-sequence token to the
    # targets — confirm that the tokenizer or process_json adds one.
    model_inputs = tokenizer(inputs, text_target=targets, max_length=128, truncation=True)
    return model_inputs

# Tokenize every split in batches using 4 worker processes. The columns present
# before mapping are removed so only the tokenizer outputs remain; the result
# is then flattened.
original_columns = ads_dataset["train"].column_names
ads = ads_dataset.map(
    preprocess_function,
    batched=True,
    num_proc=4,
    remove_columns=original_columns,
).flatten()

Map (num_proc=4):   0%|          | 0/8766 [00:00<?, ? examples/s]



Map (num_proc=4):   0%|          | 0/45 [00:00<?, ? examples/s]



In [11]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

In [12]:
from transformers import DataCollatorForSeq2Seq

# Use the end-of-sequence token as the padding token.
# NOTE(review): the special_tokens_map displayed earlier already lists '<pad>'
# as the pad token — confirm that replacing it with the eos token is intended.
tokenizer.pad_token = tokenizer.eos_token
# Seq2seq batch collator, constructed with the tokenizer only (no model is passed).
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)

In [13]:
class MetricComputer:
  """Callable metric hook that scores the current model with an Evaluator.

  Each call builds an ``Evaluator`` over the module-level ``val_set``,
  ``model`` and ``tokenizer``, returns the statistics from
  ``calc_bleu_batched()``, and appends a batch of generated samples to
  ``self.generations`` so they can be inspected after training.

  Args:
    batch_size: batch size handed to the Evaluator.
    sample_count: value passed as ``count`` to ``generate_samples_batched``
      on every call (previously hard-coded to 20).
  """

  def __init__(self, batch_size=8, sample_count=20):
    self.batch_size = batch_size
    self.sample_count = sample_count
    # One entry per call, in call order.
    self.generations = []

  def __call__(self, eval_preds=None):
    """Compute the metrics; ``eval_preds`` is accepted for the trainer API but not read.

    The argument is optional so the hook can also be invoked by hand without
    a placeholder value.

    Returns:
      Whatever ``Evaluator.calc_bleu_batched()`` returns.
    """
    ev = Evaluator(val_set, model=model, tokenizer=tokenizer, batch_size=self.batch_size, seq_tokens=True)
    stats = ev.calc_bleu_batched()
    self.generations.append(ev.generate_samples_batched(count=self.sample_count))
    return stats

In [14]:
# Number of passes over the training split; forwarded as num_train_epochs.
n_epochs = 13

# Training configuration: evaluation runs every 1500 steps, the learning rate
# follows a cosine schedule after 3 warmup steps, and at most one checkpoint is
# kept under output_dir.
# NOTE(review): the logged run further down reports about 8 samples per step
# (train_samples_per_second / train_steps_per_second), which does not match
# per_device_train_batch_size=12 — confirm the saved output belongs to this
# configuration.
training_args = Seq2SeqTrainingArguments(
  output_dir="fredT5-large-checkpoints",
  # overwrite_output_dir=True,
  evaluation_strategy="steps",
  eval_steps=1500,
  learning_rate=4e-5,
  per_device_train_batch_size=12,
  per_device_eval_batch_size=2,
  weight_decay=0.01,
  save_total_limit=1,
  num_train_epochs=n_epochs,
  # predict_with_generate=True,
  # generation_max_length=128,
  lr_scheduler_type="cosine",
  group_by_length=False,
  warmup_steps=3,
  # fp16=True,
)


# Metric hook passed to the trainer as compute_metrics.
mc = MetricComputer(batch_size=8)
# NOTE(review): empty_dataset is only referenced by the commented-out
# eval_dataset line below; it is otherwise unused in this cell.
empty_dataset = Dataset.from_dict({"Text": [], "json": []})
trainer = Seq2SeqTrainer(
  model=model,
  args=training_args,
  train_dataset=ads["train"],
  eval_dataset=ads["test"],
  # eval_dataset=empty_dataset,
  tokenizer=tokenizer,
  data_collator=data_collator,
  compute_metrics=mc,
)



In [15]:
# Run the fine-tuning loop; the returned TrainOutput is displayed as the cell result.
trainer.train()

Step,Training Loss,Validation Loss,Bep-sb,Bep-multi,Ta-bleu-sb,Ta-chrf-sb,Ta-chrf-multi,Eb-ind,Mb-ind,Bleu-classic,Chrf-classic,Chrf-classic-multi,Bleu Old,Failed Ratio
1500,0.5006,0.269909,0.597537,0.144344,14.456966,59.753728,67.489076,0.876,0.102,8.578952,56.811897,69.183275,8.578952,0.012
3000,0.3424,0.225722,0.0,0.1421,45.703755,75.722859,69.333907,0.908,0.076,48.700187,75.82601,69.714772,48.700187,0.0
4500,0.2954,0.203776,0.0,0.149213,36.047425,63.140979,70.734532,0.902,0.08,36.082985,57.11413,70.422954,36.082985,0.0
6000,0.2594,0.199087,0.489459,0.15336,28.171807,67.65342,70.728134,0.902,0.078,22.185013,68.736545,71.097599,22.185013,0.004
7500,0.2499,0.192591,0.275156,0.151185,7.533618,41.189925,69.701109,0.904,0.078,8.357715,39.685428,69.743792,8.357715,0.0
9000,0.2341,0.194644,0.47289,0.155455,27.472111,61.184358,68.286267,0.882,0.098,23.787855,59.259756,70.265873,23.787855,0.004
10500,0.218,0.192579,0.184651,0.1557,21.860332,57.987036,69.645914,0.892,0.092,15.363595,51.643677,71.362265,15.363595,0.0
12000,0.2159,0.191576,0.240481,0.154793,10.667611,45.019882,69.97943,0.896,0.088,12.690747,43.410908,71.278226,12.690747,0.0
13500,0.2102,0.19097,0.184651,0.154845,40.950765,65.693755,69.956881,0.894,0.086,21.328198,60.348279,71.28812,21.328198,0.0




 <BOT> Samsung 40inch, не smart, не smart, цена €100 <EOP> <BOC1> 1 <EOC1> <BOC2> EUR <EOC2> 
 <BOT> Мафон пионер с фишкой <EOT> <BOP> 1 <EOC1> <BOC2> RUB <EOC2> 
 <BOT> большая поясная сумка с бутылками <EOT> <BOP> 135 <EOP> <BOC1> 270 гр <EOC2> 
 <BOT> картина маслом 40 на 30 см <EOT> <BOP> 20 <EOP> <BOC1> 40 на 30 см <EOC2> 
 <BOT> кухонные весы кухонные 4 евро <EOT> <BOP> 4 <EOP> <BOC1> кухонные весы кухонные 4 евро <EOT> <BOP> 4 <EOP> <BOC1> кухонные весы кухонные 4 евро <EOC2> 
 <BOT> EUR 2 EUR 2 шт Limaxollar <EOT> <BOP> 2 шт <EOC1> <BOC2> EUR <EOC2> 




 <BOT> KUMHO R15 4 колёса лето 4.000₽ <EOP> <BOC1> 1 <EOC1> <BOC2> ₽ <EOC2> 
 <BOT> витамины запечатанные orthomol mental ортомол ментал, срок до 20.02.2024 35eur <EOT> <BOP> витамины запечатанные orthomol mental ортомол ментал <EOT> <BOP> 35EUR <EOC2> 




 <BOT> KUMHO R15 4 колёса лето 4.000₽ <EOP> <BOC1> 1 <EOC1> <BOC2> RUB <EOC2> 
 <BOT> ортачала <EOT> <BOP> 1 <EOC1> <BOC2> лари <EOC2> 




TrainOutput(global_step=14248, training_loss=0.45023905514733703, metrics={'train_runtime': 15995.2609, 'train_samples_per_second': 7.124, 'train_steps_per_second': 0.891, 'total_flos': 6.023907124995686e+16, 'train_loss': 0.45023905514733703, 'epoch': 13.0})

In [13]:
# Write the model and the tokenizer into the same directory. save_pretrained on
# the tokenizer returns the paths of the files it wrote, which are displayed as
# the cell result.
output_dir = "fredT5-large-trained"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

('fredT5-large-trained/tokenizer_config.json',
 'fredT5-large-trained/special_tokens_map.json',
 'fredT5-large-trained/vocab.json',
 'fredT5-large-trained/merges.txt',
 'fredT5-large-trained/added_tokens.json',
 'fredT5-large-trained/tokenizer.json')

In [12]:
# Invoke the metric hook by hand. The argument is only a placeholder:
# MetricComputer.__call__ never reads eval_preds.
mc(1)



{'BEP-sb': 0.7100557283942733,
 'BEP-multi': 0.6778037162156486,
 'TA-BLEU-sb': 40.88007731124459,
 'TA-CHRF-sb': 79.54746004661867,
 'TA-CHRF-multi': 79.01507359036316,
 'EB-ind': 0.018,
 'MB-ind': 0.068,
 'BLEU-classic': 57.110613019167744,
 'CHRF-classic': 76.90979283859272,
 'CHRF-classic-multi': 76.59727547801717,
 'bleu_old': 57.110613019167744,
 'failed_ratio': 0.0}

In [13]:
# Generate outputs for the manual test set and write them to CSV, one row per
# sample id, with each generated object stored as pretty-printed JSON text
# (non-ASCII characters are kept as-is).
ev = Evaluator(manual_test, model=model, tokenizer=tokenizer, batch_size=4)
output = ev.generate_samples_batched()
rows = []
for sample_id, generated in output.items():
    rows.append({'id': sample_id, 'json': json.dumps(generated, indent=4, ensure_ascii=False)})
df = pd.DataFrame(rows)
df.to_csv('manual_test_outputs.csv', index=False)

In [14]:
# from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# tokenizer = AutoTokenizer.from_pretrained(output_dir)
# model = AutoModelForSeq2SeqLM.from_pretrained(output_dir).to('cuda')

In [15]:
# distill_data = pd.read_csv('../data/distill_data.csv', index_col=0)
# distill_data.head()

In [16]:
# ev = Evaluator(distill_data)
# output = ev.generate_samples_batched(count=20000, batch_size=256)