In [1]:
import pandas as pd
import torch
import json
import re

In [2]:
import sys

# Put the sibling ``utils`` directory on the import path so the project-local
# modules imported right after this line (``json_format``, ``evaluator``) can
# be found. The path is relative, so it depends on the kernel's working
# directory.
sys.path.append('../utils')
from json_format import process_json, unprocess_json
from evaluator import Evaluator

In [3]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Base checkpoint to fine-tune. Both the tokenizer and the seq2seq model are
# loaded from this same identifier.
model_name = 'ai-forever/FRED-T5-1.7B'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
# The ``json`` column holds serialized JSON; decode it while reading.
converters = {'json': json.loads}


def _read_shuffled(csv_path):
    """Read one CSV split and return its rows in a fixed, seeded random order.

    The first CSV column is used as the index and the ``json`` column is
    decoded through ``converters``.
    """
    frame = pd.read_csv(csv_path, index_col=0, converters=converters)
    return frame.sample(frac=1, random_state=42)


train = _read_shuffled('../data/train_9k_valid.csv')
val_set = _read_shuffled('../data/val_set_300_sb_valid.csv')
manual_test = _read_shuffled('../data/manual_test_100.csv')

In [5]:
# Extra vocabulary entries (presumably the delimiters emitted by
# ``process_json`` -- confirm in utils/json_format).
markup_tokens = [
    "<BOB>", "<EOB>",
    "<BOT>", "<EOT>",
    "<BOP>", "<EOP>",
    "<BOC1>", "<EOC1>",
    "<BOC2>", "<EOC2>",
]
tokenizer.add_tokens(markup_tokens)
# The embedding matrix has to grow to cover the enlarged vocabulary.
model.resize_token_embeddings(len(tokenizer))

# Sanity check: serializing and parsing back the first training row must be
# lossless.
first_row_json = train.loc[train.index[0], 'json']
assert first_row_json == unprocess_json(process_json(first_row_json))

In [6]:
from datasets import Dataset

# Wrap the two relevant columns as a HuggingFace dataset, hold out 0.5% of the
# rows as the "test" split (seeded), and flatten the result.
ads_dataset = (
    Dataset.from_pandas(train[["Text", "json"]])
    .train_test_split(test_size=0.005, seed=42)
    .flatten()
)
ads_dataset

DatasetDict({
    train: Dataset({
        features: ['Text', 'json', '__index_level_0__'],
        num_rows: 8766
    })
    test: Dataset({
        features: ['Text', 'json', '__index_level_0__'],
        num_rows: 45
    })
})

In [7]:
def preprocess_function(examples, max_length=128):
    """Tokenize a batch of ad texts and their serialized JSON targets.

    Every source text and every target string is wrapped in the tokenizer's
    BOS/EOS tokens before being tokenized.

    Args:
        examples: Batch mapping as supplied by ``Dataset.map(batched=True)``;
            the ``"Text"`` and ``"json"`` columns are read.
        max_length: Maximum number of tokens kept for the inputs and for the
            labels; longer sequences are truncated. Defaults to 128, the value
            that was previously hard-coded.

    Returns:
        The tokenizer output for the inputs, with the token ids of the targets
        stored under ``"labels"``.
    """
    bos, eos = tokenizer.bos_token, tokenizer.eos_token
    inputs = [bos + text + eos for text in examples["Text"]]
    targets = [bos + process_json(bundles) + eos for bundles in examples["json"]]

    # ``text_target`` supersedes the deprecated ``as_target_tokenizer()``
    # context manager: a single call tokenizes both sides and places the
    # target ids under "labels".
    # NOTE(review): BOS/EOS are part of the strings, so a sequence longer than
    # ``max_length`` presumably loses its trailing EOS to truncation -- confirm
    # whether that matters for the longest ads.
    return tokenizer(inputs, text_target=targets, max_length=max_length, truncation=True)

# Tokenize both splits in parallel worker processes; the original columns are
# dropped so only the tokenizer outputs remain, then the result is flattened.
source_columns = ads_dataset["train"].column_names
ads = ads_dataset.map(
    preprocess_function,
    batched=True,
    num_proc=4,
    remove_columns=source_columns,
).flatten()

Map (num_proc=4):   0%|          | 0/8766 [00:00<?, ? examples/s]



Map (num_proc=4):   0%|          | 0/45 [00:00<?, ? examples/s]



In [8]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

In [9]:
from transformers import DataCollatorForSeq2Seq

# Use the EOS token as the padding token.
# NOTE(review): a decoded sample further down in this notebook starts with
# '<pad>', which suggests the checkpoint already ships its own pad token --
# confirm that aliasing pad to EOS is intended.
tokenizer.pad_token = tokenizer.eos_token
# Collator used by the trainer to assemble batches (padding is done by
# DataCollatorForSeq2Seq at collation time; see the transformers docs for the
# defaults it applies).
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)

In [10]:
class MetricComputer:
    """Callable used as the trainer's ``compute_metrics`` hook.

    Each call builds a fresh ``Evaluator`` over the notebook-level ``val_set``
    using whatever ``model`` and ``tokenizer`` are bound globally at call
    time; the predictions handed over by the trainer are not used.
    """

    def __init__(self, batch_size=8):
        """Remember the evaluation batch size and start an empty sample log."""
        # One entry is appended per call, holding that call's generated samples.
        self.generations = []
        self.batch_size = batch_size

    def __call__(self, eval_preds):
        """Evaluate the current global model on ``val_set``.

        Args:
            eval_preds: Ignored; present only to satisfy the hook signature.

        Returns:
            Whatever ``Evaluator.calc_bleu_batched()`` returns.
        """
        evaluator = Evaluator(
            val_set,
            model=model,
            tokenizer=tokenizer,
            batch_size=self.batch_size,
            seq_tokens=True,
        )
        metrics = evaluator.calc_bleu_batched()
        samples = evaluator.generate_samples_batched(count=20)
        self.generations.append(samples)
        return metrics

In [11]:
n_epochs = 8

training_args = Seq2SeqTrainingArguments(
    output_dir="fredT5-1.7B-checkpoints",
    num_train_epochs=n_epochs,
    # Batching
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    group_by_length=False,
    # Optimisation
    learning_rate=4e-5,
    weight_decay=0.01,
    lr_scheduler_type="cosine",
    warmup_steps=3,
    # Evaluate and checkpoint on the same 6000-step cadence; only one
    # checkpoint is kept on disk.
    evaluation_strategy="steps",
    eval_steps=6000,
    save_steps=6000,
    save_total_limit=1,
    # load_best_model_at_end=True,
)

# Metric hook; it collects generated samples on every evaluation.
mc = MetricComputer(batch_size=2)
# Alternative evaluation set kept from experimentation; it is not passed to
# the trainer below.
empty_dataset = Dataset.from_dict({"Text": [], "json": []})

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=ads["train"],
    eval_dataset=ads["test"],  # or: eval_dataset=empty_dataset
    compute_metrics=mc,
)



In [12]:
# Run fine-tuning with the configuration built above; evaluation metrics are
# reported every `eval_steps` steps.
trainer.train()

Step,Training Loss,Validation Loss,Bep-sb,Bep-multi,Ta-bleu-sb,Ta-chrf-sb,Ta-chrf-multi,Eb-ind,Mb-ind,Bleu-classic,Chrf-classic,Chrf-classic-multi,Bleu Old,Failed Ratio
6000,0.2848,0.247434,0.709373,0.672864,40.696003,79.140267,78.811325,0.048,0.04,56.647225,76.165868,75.451996,56.647225,0.0
12000,0.2112,0.224835,0.688866,0.664838,37.400845,76.927791,77.063078,0.026,0.048,52.830884,73.503712,74.160753,52.830884,0.002
18000,0.1566,0.196096,0.72439,0.700108,41.650682,79.76809,80.311817,0.016,0.056,57.357349,77.024426,78.672094,57.357349,0.0




<BOT>чешский бисер<EOT><BOP>1<EOC1><BOC2>EUR<EOC2>




In [12]:
# Continue an interrupted run from a saved checkpoint instead of starting
# over (with `True`, the trainer picks the checkpoint itself; see the
# transformers `Trainer.train` docs).
trainer.train(resume_from_checkpoint=True)

There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].


Step,Training Loss,Validation Loss,Bep-sb,Bep-multi,Ta-bleu-sb,Ta-chrf-sb,Ta-chrf-multi,Eb-ind,Mb-ind,Bleu-classic,Chrf-classic,Chrf-classic-multi,Bleu Old,Failed Ratio
30000,0.1462,0.205346,0.728746,0.70694,41.445821,79.507208,79.609243,0.016,0.046,57.040565,76.961041,77.369716,57.040565,0.0


Could not locate the best model at fredT5-1.7B-checkpoints/checkpoint-24000/pytorch_model.bin, if you are running a distributed training on multiple nodes, you should activate `--save_on_each_node`.


FileNotFoundError: [Errno 2] No such file or directory: 'fredT5-1.7B-checkpoints/checkpoint-24000'

In [20]:
# Reload the model and tokenizer from the `checkpoint-30000` directory,
# rebinding the global `model` / `tokenizer` that later cells (and
# MetricComputer) read.
# NOTE(review): this path is relative to the working directory and differs
# from output_dir="fredT5-1.7B-checkpoints" used for training -- confirm where
# the checkpoint actually lives.
# The model is moved to 'cuda', so this cell needs a GPU.
model_name = './checkpoint-30000'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to('cuda')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [21]:
# Recompute the validation metrics for the reloaded model. The argument is a
# placeholder: MetricComputer.__call__ does not use `eval_preds`.
mc(1)



{'BEP-sb': 0.7287457256519236,
 'BEP-multi': 0.7069398307496324,
 'TA-BLEU-sb': 41.44582147695938,
 'TA-CHRF-sb': 79.50720754462834,
 'TA-CHRF-multi': 79.60924297907755,
 'EB-ind': 0.016,
 'MB-ind': 0.046,
 'BLEU-classic': 57.040564511240156,
 'CHRF-classic': 76.96104108112667,
 'CHRF-classic-multi': 77.36971623947342,
 'bleu_old': 57.040564511240156,
 'failed_ratio': 0.0}

In [22]:
# Generate outputs for the manual test set and store them as pretty-printed,
# non-ASCII-preserving JSON strings, one row per sample id.
ev = Evaluator(manual_test, model=model, tokenizer=tokenizer, batch_size=2)
output = ev.generate_samples_batched()

records = []
for sample_id, generated in output.items():
    pretty_json = json.dumps(generated, indent=4, ensure_ascii=False)
    records.append({'id': sample_id, 'json': pretty_json})

df = pd.DataFrame(records)
df.to_csv('manual_test_outputs.csv', index=False)

In [23]:
# Export the currently loaded model and tokenizer to a standalone directory.
# The tokenizer call is the cell's last expression, so its return value is
# what the cell displays.
output_dir = "fredT5-1.7B-trained"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

('fredT5-1.7B-trained/tokenizer_config.json',
 'fredT5-1.7B-trained/special_tokens_map.json',
 'fredT5-1.7B-trained/vocab.json',
 'fredT5-1.7B-trained/merges.txt',
 'fredT5-1.7B-trained/added_tokens.json',
 'fredT5-1.7B-trained/tokenizer.json')

: 

In [None]:
# Ends the session at this point (presumably to release the model from
# memory -- confirm). The cells after this one contain only commented-out
# scratch code.
exit()

In [13]:
# s = '<s> продам зонт за 400 рублей</s>'

# inputs = tokenizer(s, return_tensors='pt').to('cuda')
# out = model.generate(**inputs)
# tokenizer.decode(out[0])



'<pad><s> продам<s> продам<s> продам<s> продам<s> продам<s> продам<s>'

In [None]:
# tokenizer.decode(ads['train'][0]['input_ids'])

'<s>кровать икея односпальная растущая, отличное состояние 70€ лимассол</s>'

In [None]:
# exit()

In [None]:
# from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# tokenizer = AutoTokenizer.from_pretrained(output_dir)
# model = AutoModelForSeq2SeqLM.from_pretrained(output_dir).to('cuda')

In [None]:
# distill_data = pd.read_csv('../data/distill_data.csv', index_col=0)
# distill_data.head()

In [None]:
# ev = Evaluator(distill_data)
# output = ev.generate_samples_batched(count=20000, batch_size=256)