In [1]:
import pandas as pd
import torch
import json
import re

In [2]:
import sys

# Make the project's local helper modules importable. The relative path is resolved
# against the kernel's working directory, and it must be added before the two
# imports below can succeed.
sys.path.append('../utils')
from evaluator import Evaluator

from json_format import SepTokenJSONProcessor

# Shared processor instance: its `spec_tokens` are registered with the tokenizer, and
# `process_json` / `unprocess_json` are used when building and checking targets.
json_proc = SepTokenJSONProcessor()

In [3]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Start from the pretrained "ai-forever/FRED-T5-large" tokenizer and seq2seq model.
tokenizer = AutoTokenizer.from_pretrained("ai-forever/FRED-T5-large")
model = AutoModelForSeq2SeqLM.from_pretrained("ai-forever/FRED-T5-large")
# Alternative: load from a local training checkpoint instead.
# NOTE(review): the two lines below point at different checkpoint directories
# (checkpoint-000 vs checkpoint-2000) — confirm which one is intended before enabling.
# tokenizer = AutoTokenizer.from_pretrained('fredT5-large-checkpoints/checkpoint-000')
# model = AutoModelForSeq2SeqLM.from_pretrained('fredT5-large-checkpoints/checkpoint-2000').to('cuda')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
# The 'json' column is stored as a JSON string in the CSVs; parse it while reading.
converters = {'json': json.loads}


def _read_shuffled(csv_path):
    """Read a CSV (first column as index, 'json' parsed) and return its rows in a fixed shuffled order."""
    frame = pd.read_csv(csv_path, index_col=0, converters=converters)
    return frame.sample(frac=1, random_state=42)


train = _read_shuffled('../data/train_9k_valid.csv')
val_set = _read_shuffled('../data/val_set_300_sb_valid.csv')
manual_test = _read_shuffled('../data/manual_test_100.csv')

In [5]:
# Register the processor's separator tokens and grow the embedding matrix to match
# the enlarged vocabulary.
tokenizer.add_tokens(json_proc.spec_tokens)
model.resize_token_embeddings(len(tokenizer))

# Sanity check: encoding the first training record and decoding it again must
# reproduce the original structure.
first_record = train.loc[train.index[0], 'json']
round_tripped = json_proc.unprocess_json(json_proc.process_json(train.loc[train.index[0], 'json']))
assert first_record == round_tripped

In [6]:
from datasets import Dataset

# Wrap the training frame as a Hugging Face dataset, hold out 0.5% of the rows as a
# test split (fixed seed), and flatten the resulting splits.
ads_dataset = (
    Dataset.from_pandas(train[["Text", "json"]])
    .train_test_split(test_size=0.005, seed=42)
    .flatten()
)
ads_dataset

DatasetDict({
    train: Dataset({
        features: ['Text', 'json', '__index_level_0__'],
        num_rows: 8766
    })
    test: Dataset({
        features: ['Text', 'json', '__index_level_0__'],
        num_rows: 45
    })
})

In [7]:
def preprocess_function(examples):
    """Tokenize a batch of ads into encoder inputs and decoder labels.

    Args:
        examples: Batch mapping with a "Text" column (raw ad texts) and a "json"
            column (structures accepted by ``json_proc.process_json``).

    Returns:
        The tokenizer's encoding of the inputs, with an added ``labels`` entry
        holding the token ids of the processed targets.
    """
    # BOS/EOS are concatenated as text on both sides before tokenization.
    # NOTE(review): with truncation at 128 tokens an over-long sequence presumably
    # loses its trailing EOS — confirm this is acceptable.
    inputs = [tokenizer.bos_token + text + tokenizer.eos_token for text in examples["Text"]]
    targets = [
        tokenizer.bos_token + json_proc.process_json(bundles) + tokenizer.eos_token
        for bundles in examples["json"]
    ]

    # `text_target` replaces the deprecated `tokenizer.as_target_tokenizer()` context
    # manager: the targets are tokenized with the same max_length/truncation settings
    # and their input ids are returned under the "labels" key.
    return tokenizer(inputs, text_target=targets, max_length=128, truncation=True)

# Tokenize every split in batches across 4 worker processes, dropping the raw
# columns so only the tokenizer outputs remain, then flatten the result.
raw_columns = ads_dataset["train"].column_names
ads = ads_dataset.map(
    preprocess_function,
    batched=True,
    num_proc=4,
    remove_columns=raw_columns,
).flatten()

Map (num_proc=4):   0%|          | 0/8766 [00:00<?, ? examples/s]



Map (num_proc=4):   0%|          | 0/45 [00:00<?, ? examples/s]



In [8]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

In [9]:
from transformers import DataCollatorForSeq2Seq

# Use the end-of-sequence token as the padding token for batching.
# NOTE(review): this overrides whatever pad token the pretrained tokenizer shipped
# with — confirm that is intended.
tokenizer.pad_token = tokenizer.eos_token
# Collator that batches the tokenized examples for the seq2seq trainer.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)

In [10]:
class MetricComputer:
    """Callable metric hook that scores the current model on `val_set`.

    Every call builds a fresh `Evaluator` around the notebook-level `model`,
    `tokenizer` and `json_proc`, computes its batched BLEU statistics and stores a
    batch of generated samples in `self.generations`.
    """

    def __init__(self, batch_size=8):
        """Remember the evaluation batch size and start with no stored generations."""
        self.generations = []
        self.batch_size = batch_size

    def __call__(self, eval_preds):
        """Return the evaluator's statistics; `eval_preds` is accepted but not used."""
        evaluator = Evaluator(
            val_set,
            model=model,
            tokenizer=tokenizer,
            batch_size=self.batch_size,
            seq_tokens=True,
            json_processor=json_proc,
        )
        # Metrics are computed first, then samples are generated, in that order.
        metrics = evaluator.calc_bleu_batched()
        samples = evaluator.generate_samples_batched(count=20)
        self.generations.append(samples)
        return metrics

In [11]:
n_epochs = 13

# Metric hook and an empty placeholder dataset (the latter is only referenced by the
# commented-out eval_dataset option below).
mc = MetricComputer(batch_size=8)
empty_dataset = Dataset.from_dict({"Text": [], "json": []})

training_args = Seq2SeqTrainingArguments(
    # Checkpointing
    output_dir="fredT5-large-checkpoints",
    save_total_limit=1,
    # Evaluation schedule
    evaluation_strategy="steps",
    eval_steps=2500,
    # Batching
    per_device_train_batch_size=8,
    per_device_eval_batch_size=2,
    group_by_length=False,
    # Optimisation
    num_train_epochs=n_epochs,
    learning_rate=4e-5,
    weight_decay=0.01,
    lr_scheduler_type="cosine",
    warmup_steps=3,
    # Currently disabled options:
    # overwrite_output_dir=True,
    # predict_with_generate=True,
    # generation_max_length=128,
    # fp16=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    data_collator=data_collator,
    train_dataset=ads["train"],
    eval_dataset=ads["test"],  # alternative: eval_dataset=empty_dataset
    compute_metrics=mc,
)



In [12]:
# Launch fine-tuning with the trainer configured above.
trainer.train()

Step,Training Loss,Validation Loss,Bep-sb,Bep-multi,Ta-bleu-sb,Ta-chrf-sb,Ta-chrf-multi,Eb-ind,Mb-ind,Bleu-classic,Chrf-classic,Chrf-classic-multi,Bleu Old,Failed Ratio
2500,0.4656,0.305376,0.666744,0.608176,37.365754,75.885887,74.001542,0.06,0.072,51.35389,72.11328,71.962766,51.35389,0.02
5000,0.3576,0.263086,0.692156,0.654763,39.265658,77.851847,76.759315,0.034,0.07,54.352233,74.494435,73.870299,54.352233,0.006
7500,0.3239,0.249824,0.704964,0.663203,40.805743,79.265672,77.601435,0.042,0.056,56.507576,75.987409,75.185944,56.507576,0.014
10000,0.2914,0.247466,0.699307,0.661789,39.337795,78.831964,77.397676,0.028,0.062,55.498698,75.32833,74.607341,55.498698,0.012
12500,0.2901,0.241164,0.704194,0.665725,39.674455,78.999617,77.5584,0.022,0.068,55.456883,75.523943,75.343507,55.456883,0.012




<T>форма для льда<P>1<C2>EUR
<T>белый пиджак с воздушными шара
<T>настольная игра клуэдо/cluedo harry potter
<T>Scrabble дорожный<P>
<T>ноут
<T>футболка Gucci<P>
<T>Машинки, сортер
<T>Плать
<T>braddon «the lawyer’s lawyer’s lawyer’s lawyer’s lawyer’s lawyer
<T>транс




<T>Voip-—Ç–µ–ª–µ—Ñ–æ–Ω —Å –ø–æ–¥–¥–µ—Ä–∂–∫–æ–π SIP –∏ H.323, Wi-Fi, Wi-Fi+, Bluetooth, Wi-Fi+, Wi-Fi+, Wi-Fi+, Wi-Fi+, Wi-Fi+, Wi-Fi+, Wi-Fi+, Wi-Fi+, Wi-Fi+, Wi-Fi+, Wi-Fi+, Wi-Fi+, Wi-Fi+, Wi-Fi+, Wi-Fi+, Wi-Fi+, Wi-Fi+, Wi-Fi+, Wi-Fi+, Wi-Fi+, Wi-Fi+, Wi-Fi+, Wi-Fi+, Wi-
<T>настольная игра клуэдо/cluedo harry potter
<T>braddon «the lawyer’




<T>Толстовка с капюшоном
<T>настольная игра клуэдо/cluedo harry potter
<T>Rick
<T>футболка Gucci размер М<P>500
<T>Книги, раскраски, карандаши и тд

<T>bradd




<T>Кожаная сумка с карманом
<T>настольная игра клуэдо/cluedo harry potter
<T>футболка Gucci<P>

<T>Платье новое (Турция) Размер 110-116 см<P>400<C1>
<T>braddon «the lawyer’




<T>К
<T>Voip-—Ç–µ–ª–µ—Ñ–æ–Ω —Å –ø–æ–¥–¥–µ—Ä–∂–∫–æ–π SIP –∏ H.323, Wi-Fi, Bluetooth, Wi-Fi Direct, Wi-Fi Bluetooth, –≤—Å—Ç—Ä–æ–µ–Ω–Ω–∞—è —Ç–µ–ª–µ—Ñ–æ–Ω–Ω–∞—è –∫–Ω–∏–≥–∞, –ø–æ–¥–¥–µ—Ä–∂–∫–∞ PoE, –æ–ø—Ä–µ–¥–µ–ª–∏—Ç–µ–ª—å –Ω–æ–º–µ—Ä–∞, –∫–æ–Ω—Ñ–µ—Ä–µ–Ω—Ü-—Å–≤—è–∑—å, –≥—Ä–æ–º–∫–∞—è —Å–≤—è–∑—å, –≤–µ–±-–∏–Ω—Ç–µ—Ä—Ñ–µ–π—Å, LCD-–¥–∏—Å–ø–ª–µ–π, –ø–æ–¥–¥–µ—Ä–∂–∫–∞ Wi-Fi, –ø–æ–¥–¥–µ—Ä–∂–∫–∞ Bluetooth, –ø–æ–¥–¥–µ—Ä–∂–∫–∞ Wi-Fi Direct, Wi-Fi Bluetooth, –ø–æ–¥–¥–µ—Ä–∂–∫–∞ Wi-Fi Direct, –ø–æ–¥–¥–µ—Ä–∂–∫–∞ Wi-Fi Bluetooth, –ø–æ–¥–¥–µ—Ä–∂–∫–∞ Wi-Fi Direct, –ø–æ–¥–¥–µ—Ä–∂–∫–∞ Wi-Fi Direct, –ø–æ–¥–¥–µ—Ä–∂–∫–∞ Wi-Fi Direct, –ø–æ–¥–¥–µ—Ä–∂–∫–∞ Wi-Fi Direct, –ø–æ–¥–¥–µ—Ä–∂–∫–∞ Wi-Fi Direct, –ø–æ–¥–¥–µ—Ä–∂–∫–∞ Wi-Fi Direct, –ø–æ–¥–¥–µ—Ä–∂–∫–∞
<T>настольная игра клуэдо/cluedo harry potter
<T>футболка Dolce & Gabbana

<T>braddon «the lawyer’


KeyboardInterrupt: 

In [None]:
# Persist the in-memory model and tokenizer side by side in one directory so they
# can be reloaded together with `from_pretrained(output_dir)`.
# NOTE(review): the recorded cell output lists files under 'fredT5-large-trained/',
# which does not match this output_dir — the saved output looks stale.
output_dir = "fredT5-large-trained-sep-token"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

('fredT5-large-trained/tokenizer_config.json',
 'fredT5-large-trained/special_tokens_map.json',
 'fredT5-large-trained/vocab.json',
 'fredT5-large-trained/merges.txt',
 'fredT5-large-trained/added_tokens.json',
 'fredT5-large-trained/tokenizer.json')

In [None]:
# from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# tokenizer = AutoTokenizer.from_pretrained(output_dir)
# model = AutoModelForSeq2SeqLM.from_pretrained(output_dir).to('cuda')

In [None]:
# distill_data = pd.read_csv('../data/distill_data.csv', index_col=0)
# distill_data.head()

In [None]:
# ev = Evaluator(distill_data)
# output = ev.generate_samples_batched(count=20000, batch_size=256)