In [1]:
import pandas as pd
import numpy as np
import re
import torch
# import json
from tqdm.notebook import tqdm
# from torch.utils.data import Dataset, DataLoader
# from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('../labeled_15k.csv')
df = df[df.Label_gpt == 'valid']
df = df[['Text', 'Title', 'Price', 'Currency', 'Count']].reset_index(drop=True)
df['json'] = pd.Series([[{'Title': row.Title, 'Price': row.Price, 'Currency': row.Currency, 'Count': row.Count}] for row in df.itertuples()])
text_df = df[['Text', 'json']].groupby('Text').sum().reset_index()
text_df['n_bundle'] = text_df.json.apply(len)

single_bundle_mask = text_df.n_bundle == 1
np.random.seed(42)
test_set_index = np.random.choice(np.where(single_bundle_mask)[0], size=300, replace=False)
test_mask = np.zeros(single_bundle_mask.shape, dtype=bool)
test_mask[test_set_index] = True

text_df, text_df_test = text_df[~test_mask], text_df[test_mask]

# text_df, text_df_test = train_test_split(text_df, random_state=42, test_size=0.1)
print(len(text_df))
text_df.head()

9603


Unnamed: 0,Text,json,n_bundle
0,! новая. хлопковая рубашка оверсайз с приятной...,"[{'Title': 'хлопковая рубашка оверсайз', 'Pric...",1
1,!!! ПОМИДОРКИ !!! в 3л Банке.... по 250 руб. К...,"[{'Title': 'Икра КРАСНАЯ 140 гр. банка', 'Pric...",3
2,"""конфетный мячик"" 🍬🏀 цена €7😻 бесплатная доста...","[{'Title': 'конфетный мячик', 'Price': '7', 'C...",1
3,# ТРИКО Распродажа без выбораМОДЕЛЬ ТКАНЬ ТРИК...,"[{'Title': 'Трико, двух нитки качества супер',...",1
4,# ТРИКО Распродажа без выбораМОДЕЛЬ ТКАНЬ ТРИК...,[{'Title': 'Суперкачественный трикотаж из двух...,1


In [3]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("ai-forever/ruT5-large")
model = AutoModelForSeq2SeqLM.from_pretrained("ai-forever/ruT5-large")

In [4]:
tokenizer.add_tokens(["<BOB>", "<EOB>", "<BOT>", "<EOT>", "<BOP>", "<EOP>", "<BOC1>", "<EOC1>", "<BOC2>", "<EOC2>"])
model.resize_token_embeddings(len(tokenizer))

def process_json(json):
  return ''.join([f"<BOB><BOT>{d['Title']}<EOT><BOP>{d['Price']}<EOP><BOC1>{d['Count']}<EOC1><BOC2>{d['Currency']}<EOC2><EOB>" for d in json])

def unprocess_json(s):
  json = []
  for t in re.findall(r'<BOB>(.*?)<EOB>', s):
    try:
      json.append({
        'Title': re.findall(r'<BOT>(.*?)<EOT>', t)[0],
        'Price': re.findall(r'<BOP>(.*?)<EOP>', t)[0],
        'Count': re.findall(r'<BOC1>(.*?)<EOC1>', t)[0],
        'Currency': re.findall(r'<BOC2>(.*?)<EOC2>', t)[0]
      })
    except Exception as e:
      print(t)
      raise e
  return json
assert text_df.loc[text_df.index[0], 'json'] == unprocess_json(process_json(text_df.loc[text_df.index[0], 'json']))

In [5]:
from datasets import Dataset

ads_test_dataset = Dataset.from_pandas(text_df_test[["Text", "json"]]).flatten()

ads_dataset = Dataset.from_pandas(text_df[["Text", "json"]])
ads_dataset = ads_dataset.train_test_split(test_size=0.05, seed=42)
ads_dataset = ads_dataset.flatten()
ads_dataset

DatasetDict({
    train: Dataset({
        features: ['Text', 'json', '__index_level_0__'],
        num_rows: 9122
    })
    test: Dataset({
        features: ['Text', 'json', '__index_level_0__'],
        num_rows: 481
    })
})

In [6]:
def preprocess_function(examples):
    inputs = [text for text in examples["Text"]]
    # targets = ['' for bundles in examples["json"]]
    targets = [process_json(bundles) for bundles in examples["json"]]
    model_inputs = tokenizer(inputs, text_target=targets, max_length=128, truncation=True)
    return model_inputs

ads = ads_dataset.map(
    preprocess_function,
    batched=True,
    num_proc=4,
    remove_columns=ads_dataset["train"].column_names
)
ads = ads.flatten()

# ads_test = ads_test_dataset.map(
#     preprocess_function,
#     batched=True,
#     num_proc=4,
#     remove_columns=ads_test_dataset.column_names
# )
# ads_test = ads_test.flatten()


Map (num_proc=4):   0%|          | 0/9122 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/481 [00:00<?, ? examples/s]

In [7]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

In [8]:
from transformers import DataCollatorForSeq2Seq

tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)

In [9]:
import evaluate
bleu_metric = evaluate.load("sacrebleu")

class Evaluator:
  def __init__(self, df, use_cache=False, batch_size=16):
    self.batch_size=batch_size
    self.df = df
    self.failed_generations = 0
    self.use_cache = use_cache
    if use_cache:
      self.cache = {}

  def generate_text(self, input_text, device='cuda'):
    if self.use_cache and input_text in self.cache:
      return self.cache[input_text]
    input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)
    # model.eval()
    # input_ids = torch.IntTensor([input_ids['input_ids']]).to(device)
    with torch.no_grad():
      outputs = model.generate(input_ids, max_length=128, early_stopping=True)
      decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
      try:
        json = unprocess_json(decoded_output)
      except:
        json = []
        # self.failed_generations += 1
      # generated_ids = model.generate(input_ids, max_length=512)
    if self.use_cache:
      self.cache[input_text] = json
    return json

  def generate_text_batch(self, input_texts, device='cuda'):
    if not isinstance(input_texts, list):
        raise ValueError("input_texts should be a list of strings")
    batch_size = len(input_texts)
    input_ids = tokenizer(input_texts, return_tensors="pt", padding=True, truncation=True, max_length=128).input_ids.to(device)

    with torch.no_grad():
        outputs = model.generate(input_ids, max_length=128, early_stopping=True)
        decoded_outputs = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
        results = []
        for decoded_output in decoded_outputs:
            try:
                json_output = unprocess_json(decoded_output)
            except Exception as e:
                json_output = []
                # self.failed_generations += 1 if you want to track failed generations
            results.append(json_output)
            if self.use_cache:
                for input_text, result in zip(input_texts, results):
                    self.cache[input_text] = result

    return results
  
  def generate_samples(self, ids=None, count=None):
    if count is not None:
      ids = self.df.index[:count]
    if ids is None:
      ids = self.df.index

    results = {}
    for i in ids:
      input_text = self.df.loc[i, 'Text']
      pred = self.generate_text(input_text)
      if len(pred) == 0 or 'Title' not in pred[0]:
        self.failed_generations += 1
        results[i] = []
      else:
        results[i] = pred
    return results
  
  def generate_samples_batched(self, ids=None, count=None, batch_size=8):
    if count is not None:
        ids = self.df.index[:count]
    if ids is None:
        ids = self.df.index

    results = {}
    all_texts = [self.df.loc[i, 'Text'] for i in ids]
    all_ids = list(ids)
    
    for start in range(0, len(all_texts), batch_size):
        end = start + batch_size
        batch_texts = all_texts[start:end]
        batch_ids = all_ids[start:end]
        
        predictions = self.generate_text_batch(batch_texts)
        
        for i, pred in zip(batch_ids, predictions):
            if len(pred) == 0 or 'Title' not in pred[0]:
                self.failed_generations += 1
                results[i] = []
            else:
                results[i] = pred
                
    return results
  
  def calc_bleu(self):
    refs = []
    predictions = []
    full_predictions = self.generate_samples()
    for i, pred in full_predictions:
      if len(pred) == 1 and 'Title' in pred[0]:
        refs.append([self.df.loc[i, 'Title']])
        predictions.append(pred[0]['Title'])
    bleu_value = bleu_metric.compute(predictions=predictions, references=refs)
    return {'bleu': bleu_value, 'failed_ratio': 1 - len(predictions) / len(self.df)}
  
  def calc_bleu_batched(self, batch_size=8):
    refs = []
    predictions = []
    full_predictions = self.generate_samples_batched(batch_size=batch_size)
    
    for i, pred in full_predictions.items():
        if len(pred) == 1 and 'Title' in pred[0]:
            refs.append([self.df.loc[i, 'json'][0]['Title']])
            predictions.append(pred[0]['Title'])
    
    if len(refs) != 0:
      bleu_value = bleu_metric.compute(predictions=predictions, references=refs)['score']
    else:
      bleu_value = np.nan

    return {'bleu': bleu_value, 'failed_ratio': 1 - len(predictions) / len(self.df)}


In [10]:
from IPython.display import clear_output

class MetricComputer:
  def __init__(self):
    self.generations = []

  def __call__(self, eval_preds):
    ev = Evaluator(text_df_test)
    stats = ev.calc_bleu_batched()
    self.generations.append(ev.generate_samples_batched(count=20))
    # clear_output()
    return stats

In [11]:
n_epochs = 7

training_args = Seq2SeqTrainingArguments(
    output_dir="ruT5-large",
    # overwrite_output_dir=True,
    evaluation_strategy="steps",
    eval_steps=300,
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=n_epochs,
    # predict_with_generate=True,
    generation_max_length=128,
    fp16=True,
    lr_scheduler_type="cosine",
    group_by_length=False,
    warmup_steps=3,
)

mc = MetricComputer()
empty_dataset = Dataset.from_dict({"Text": [], "json": []})
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=ads["train"],
    eval_dataset=ads["test"],
    # eval_dataset=empty_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=mc,
)

In [12]:
trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Bleu,Failed Ratio
300,No log,0.470256,35.631757,0.026667
600,0.967500,0.417016,28.207157,0.013333
900,0.967500,0.397189,27.745244,0.026667
1200,0.431000,0.390581,30.747078,0.03
1500,0.347100,0.380656,33.97894,0.02
1800,0.347100,0.382794,30.384596,0.493333
2100,0.293300,0.390927,28.965381,0.353333
2400,0.293300,0.388672,24.471471,0.48
2700,0.243100,0.382368,23.794168,0.286667
3000,0.217300,0.389495,25.34499,0.043333


<BOT> книга Pro Core and EUR<EOC2>
<BOT> набор из 5<EOC1><BOC2> RUB<EOC2>
<BOT> кроссовки<EOT><BOP> 1<EOC1><BOC2> GEL<EOC2>


TrainOutput(global_step=3997, training_loss=0.3616401284880897, metrics={'train_runtime': 3494.9365, 'train_samples_per_second': 18.27, 'train_steps_per_second': 1.144, 'total_flos': 3.3469863638016e+16, 'train_loss': 0.3616401284880897, 'epoch': 7.0})

In [16]:
mc.generations[-1]

{56: [{'Title': ' Starship Samurai + Starship Shattered Alliances',
   'Price': ' 10000',
   'Count': ' 1',
   'Currency': ' RUB'}],
 125: [{'Title': ' Спящие боги (полный предзаказ кроме монет + орги)',
   'Price': ' 9000',
   'Count': ' 1',
   'Currency': ' RUB'},
  {'Title': ' База + Приливы в руинах + Подземелья + промо-карты+ металлический корабль + плеймат + орги от meeplehouse',
   'Price': ' 9000',
   'Count': ' 1',
   'Currency': ' RUB'}],
 257: [{'Title': ' Плеймат и набор исправлений',
   'Price': ' 31000',
   'Count': ' 1',
   'Currency': ' RUB'}],
 335: [{'Title': ' электрический чайник',
   'Price': ' 90',
   'Count': ' 1',
   'Currency': ' RUB'}],
 347: [{'Title': ' Bluetooth-гарнитура Baofeng (Kenwood) с модулем передачи для рации и инструкцией',
   'Price': ' 35',
   'Count': ' 1',
   'Currency': ' GEL'}],
 476: [{'Title': ' вкусняшки для собак',
   'Price': ' 1500',
   'Count': ' 1',
   'Currency': ' RUB'}],
 723: [{'Title': ' набор из 14 предметов',
   'Price': ' 110

In [13]:
import gc
gc.collect()
torch.cuda.empty_cache()

In [14]:
# output_dir = "ruT5-large-too-good-try-3"
# model.save_pretrained(output_dir)
# tokenizer.save_pretrained(output_dir)