In [2]:
import pandas as pd
import torch
import json
import re

In [3]:
import sys

# Extend the import search path with ../utils (relative to the notebook's
# working directory) before the two imports below run; presumably that is
# where json_format and evaluator live -- confirm.
sys.path.append('../utils')
from json_format import process_json, unprocess_json
from evaluator import Evaluator

In [4]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Alternative (hub checkpoint), kept for reference:
# tokenizer = AutoTokenizer.from_pretrained("ai-forever/sage-fredt5-distilled-95m")
# model = AutoModelForSeq2SeqLM.from_pretrained("ai-forever/sage-fredt5-distilled-95m")

# Tokenizer and model must be loaded from the same checkpoint, so the local
# directory is named once instead of being spelled two different ways.
# NOTE(review): this directory is the one the save cell near the end of this
# notebook writes to, so on a fresh checkout it does not exist yet -- switch
# to the hub checkpoint above for the first run.
checkpoint_dir = "./fredT5-distill-trained"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_dir)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
import ast 
# Training pairs. The 'json' column is parsed with ast.literal_eval here
# (presumably it was written as a Python literal rather than strict JSON -- confirm).
train = pd.read_csv(
    '../data/distill_100k.csv',
    index_col=0,
    converters={'json': ast.literal_eval},
)

# Held-out sets. Their 'json' column is parsed with json.loads.
val_set = pd.read_csv(
    '../data/val_set_300_sb_valid.csv',
    index_col=0,
    converters={'json': json.loads},
)
manual_test = pd.read_csv(
    '../data/manual_test_100.csv',
    index_col=0,
    converters={'json': json.loads},
)

In [6]:
# Row counts of (train, val_set, manual_test), displayed as a tuple.
tuple(len(frame) for frame in (train, val_set, manual_test))

(100000, 500, 100)

In [7]:
# Guard against leakage: drop every training row whose Text also occurs in
# the validation set or the manual test set.
# A vectorised, hash-based `isin` replaces the per-row lambda that scanned
# both held-out arrays linearly for each training row (O(n * m)).
held_out_texts = pd.concat([val_set.Text, manual_test.Text])
train = train[~train.Text.isin(held_out_texts)]
len(train)

99382

In [8]:
# Structural marker tokens (presumably the begin/end tags that process_json
# emits -- confirm). They are added to the vocabulary first, then the
# embedding matrix is resized to the new vocabulary size.
markup_tokens = [
    "<BOB>", "<EOB>",
    "<BOT>", "<EOT>",
    "<BOP>", "<EOP>",
    "<BOC1>", "<EOC1>",
    "<BOC2>", "<EOC2>",
]
tokenizer.add_tokens(markup_tokens)
model.resize_token_embeddings(len(tokenizer))


# Sanity check on the first training row: serialising and then parsing the
# target must give back the original structure.
first_json = train.loc[train.index[0], 'json']
assert first_json == unprocess_json(process_json(first_json))

In [9]:
from datasets import Dataset

# Wrap the two relevant columns in a `datasets.Dataset`, split off a small
# test split (fraction 0.0005, fixed seed), and flatten the result.
ads_dataset = (
    Dataset.from_pandas(train[["Text", "json"]])
    .train_test_split(test_size=0.0005, seed=42)
    .flatten()
)
ads_dataset

DatasetDict({
    train: Dataset({
        features: ['Text', 'json', '__index_level_0__'],
        num_rows: 99332
    })
    test: Dataset({
        features: ['Text', 'json', '__index_level_0__'],
        num_rows: 50
    })
})

In [10]:
def preprocess_function(examples):
    """Tokenize a batch of ad texts and their serialized JSON targets.

    Args:
        examples: batch mapping with a "Text" column (source strings) and a
            "json" column (structures that `process_json` turns into the
            target string).

    Returns:
        The tokenizer output for the source texts, with an extra "labels"
        entry holding the token ids of the targets. Sources and targets are
        both wrapped in the tokenizer's BOS/EOS tokens and truncated to
        128 tokens.
    """
    bos, eos = tokenizer.bos_token, tokenizer.eos_token
    inputs = [bos + text + eos for text in examples["Text"]]
    targets = [bos + process_json(bundles) + eos for bundles in examples["json"]]

    # `text_target=` replaces the deprecated `as_target_tokenizer()` context
    # manager: the tokenizer encodes the targets in target mode and stores
    # their input ids under "labels" itself.
    return tokenizer(inputs, text_target=targets, max_length=128, truncation=True)

# Tokenize both splits in batches using 4 worker processes. The raw columns
# are removed so that only what preprocess_function returns remains.
raw_columns = ads_dataset["train"].column_names
ads = ads_dataset.map(
    preprocess_function,
    batched=True,
    num_proc=4,
    remove_columns=raw_columns,
).flatten()

Map (num_proc=4):   0%|          | 0/99332 [00:00<?, ? examples/s]



Map (num_proc=4):   0%|          | 0/50 [00:00<?, ? examples/s]



In [11]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

In [12]:
from transformers import DataCollatorForSeq2Seq

# Reuse the EOS token as the padding token; this must happen before the
# collator is built because the collator receives this tokenizer.
tokenizer.pad_token = tokenizer.eos_token
# Collator handed to the trainer as `data_collator`; no `model` is passed.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)

In [13]:
class MetricComputer:
  """Callable used as the trainer's ``compute_metrics`` hook.

  Every call builds a fresh ``Evaluator`` over the notebook-level ``val_set``,
  ``model`` and ``tokenizer``, returns whatever ``calc_bleu_batched()``
  produces, and stores a batch of 20 generated samples in ``generations``.
  """

  def __init__(self, batch_size=8):
    """Remember the evaluation batch size and start an empty sample history."""
    self.generations = []
    self.batch_size = batch_size

  def __call__(self, eval_preds):
    """Evaluate the current model; ``eval_preds`` is accepted but not used."""
    evaluator = Evaluator(
      val_set,
      model=model,
      tokenizer=tokenizer,
      batch_size=self.batch_size,
      seq_tokens=True,
    )
    metrics = evaluator.calc_bleu_batched()
    samples = evaluator.generate_samples_batched(count=20)
    self.generations.append(samples)
    return metrics

In [14]:
# Number of passes over the training split.
n_epochs = 6

# Hyperparameters for the fine-tuning run.
training_args = Seq2SeqTrainingArguments(
  # NOTE(review): the directory name says "large" while the checkpoint loaded
  # earlier in this notebook is named "distill" -- confirm this is intended.
  output_dir="fredT5-large-checkpoints",
  # overwrite_output_dir=True,
  # Evaluate every 1500 optimizer steps.
  evaluation_strategy="steps",
  eval_steps=1500,
  learning_rate=4e-5,
  per_device_train_batch_size=64,
  per_device_eval_batch_size=32,
  weight_decay=0.01,
  # Keep at most two checkpoints on disk.
  save_total_limit=2,
  num_train_epochs=n_epochs,
  # Generation-based evaluation is left disabled here; MetricComputer runs its
  # own Evaluator on val_set instead.
  # predict_with_generate=True,
  # generation_max_length=128,
  lr_scheduler_type="cosine",
  group_by_length=False,
  # NOTE(review): only 3 warmup steps -- confirm this is not a typo.
  warmup_steps=3,
  # fp16=True,
)


# Metric hook: ignores the predictions it is given and evaluates on val_set.
mc = MetricComputer(batch_size=32)
# Only referenced by the commented-out `eval_dataset` alternative below.
empty_dataset = Dataset.from_dict({"Text": [], "json": []})
trainer = Seq2SeqTrainer(
  model=model,
  args=training_args,
  train_dataset=ads["train"],
  # The small split carved out of the training data; the reported metrics
  # come from `mc`, which uses val_set rather than this split.
  eval_dataset=ads["test"],
  # eval_dataset=empty_dataset,
  tokenizer=tokenizer,
  data_collator=data_collator,
  compute_metrics=mc,
)



In [14]:
# Run fine-tuning with the trainer configured above; the returned value is
# displayed as the cell output.
trainer.train()

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Bep-sb,Bep-multi,Ta-bleu-sb,Ta-chrf-sb,Ta-chrf-multi,Eb-ind,Mb-ind,Bleu-classic,Chrf-classic,Chrf-classic-multi,Bleu Old,Failed Ratio
1500,0.5672,0.476776,0.610843,0.556171,32.618497,71.888206,71.155171,0.082,0.066,45.601557,68.185342,68.463856,45.601557,0.0
3000,0.3955,0.388242,0.642899,0.59287,36.902114,74.957201,74.281,0.066,0.068,49.911577,72.02268,72.138166,49.911577,0.0
4500,0.35,0.357655,0.666168,0.619379,37.644112,75.806263,74.857479,0.064,0.06,51.315076,72.924358,72.435426,51.315076,0.0
6000,0.3249,0.341082,0.66584,0.619857,38.202311,76.328949,75.518574,0.066,0.06,51.953642,73.554389,73.165919,51.953642,0.0
7500,0.3184,0.333237,0.665482,0.620925,38.492631,76.563687,75.790874,0.06,0.06,52.266655,73.639834,73.466396,52.266655,0.0
9000,0.3154,0.333146,0.667872,0.622325,38.609402,76.552543,75.81938,0.058,0.06,52.288566,73.657978,73.537189,52.288566,0.0


TrainOutput(global_step=9318, training_loss=0.6017264075278212, metrics={'train_runtime': 4851.5017, 'train_samples_per_second': 122.847, 'train_steps_per_second': 1.921, 'total_flos': 3.1972292965564416e+16, 'train_loss': 0.6017264075278212, 'epoch': 6.0})

In [19]:
# Persist the fine-tuned weights and the tokenizer (including the added
# marker tokens) side by side so they can be reloaded together.
# NOTE(review): this is the same directory the loading cell at the top of the
# notebook reads from.
output_dir = "fredT5-distill-trained"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

('fredT5-distill-trained/tokenizer_config.json',
 'fredT5-distill-trained/special_tokens_map.json',
 'fredT5-distill-trained/vocab.json',
 'fredT5-distill-trained/merges.txt',
 'fredT5-distill-trained/added_tokens.json',
 'fredT5-distill-trained/tokenizer.json')

: 

In [15]:
# Recompute the validation metrics for the current model. The argument is a
# placeholder: MetricComputer.__call__ never reads it. Each call also appends
# another batch of generated samples to mc.generations.
mc(1)



{'BEP-sb': 0.6678720755504919,
 'BEP-multi': 0.6227291023268076,
 'TA-BLEU-sb': 38.609401672252865,
 'TA-CHRF-sb': 76.55254266005984,
 'TA-CHRF-multi': 75.81771929397691,
 'EB-ind': 0.06,
 'MB-ind': 0.06,
 'BLEU-classic': 52.28856553393526,
 'CHRF-classic': 73.65797810366747,
 'CHRF-classic-multi': 73.61236898220828,
 'bleu_old': 52.28856553393526,
 'failed_ratio': 0.0}

In [16]:
# Generate outputs for the manual test set and dump them to CSV, one row per
# sample, with the generated structure pretty-printed as JSON text.
# NOTE(review): unlike MetricComputer, this Evaluator is built without
# seq_tokens=True -- confirm the difference is intentional.
ev = Evaluator(manual_test, model=model, tokenizer=tokenizer, batch_size=4)
output = ev.generate_samples_batched()
df = pd.DataFrame(
    [
        {
            'id': sample_id,
            'json': json.dumps(generated, indent=4, ensure_ascii=False),
        }
        for sample_id, generated in output.items()
    ]
)
df.to_csv('manual_test_outputs.csv', index=False)

In [16]:
# from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# tokenizer = AutoTokenizer.from_pretrained(output_dir)
# model = AutoModelForSeq2SeqLM.from_pretrained(output_dir).to('cuda')

In [17]:
# distill_data = pd.read_csv('../data/distill_data.csv', index_col=0)
# distill_data.head()

In [18]:
# ev = Evaluator(distill_data)
# output = ev.generate_samples_batched(count=20000, batch_size=256)