In [1]:
import torch

from src.data import load_main_dataset, preprocess_dataset
from models import gpt35_turbo_model, t5_model, bart_base_model
from datasets import load_metric
from tqdm import tqdm

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [5]:
# Load the dataset and the sacreBLEU metric that the cells below share.
dataset = load_main_dataset()
metric = load_metric("sacrebleu")

# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [6]:
def measure(__preds, __labels):
    """Score predictions against references with the module-level `metric`.

    Args:
        __preds: Model outputs, forwarded to `metric.compute` as `predictions`.
        __labels: Reference outputs, forwarded to `metric.compute` as `references`.

    Returns:
        Whatever `metric.compute` returns for these inputs.
    """
    # `metric` is only read here, so it resolves from module scope without
    # needing a `global` declaration.
    scores = metric.compute(predictions=__preds, references=__labels)
    return scores

Let's test GPT-3.5. I apologise that there are only 10 inputs for this model, but the API is not free and I want to keep some of my quota for my own use :)

In [13]:
# GPT-3.5: build the input texts and the reference outputs they are scored against.
gpt_texts, test_trans = gpt35_turbo_model.preprocess_dataset(dataset)
# Convert the texts with `.to_list()` before they are fed to the model one by one.
test_texts = gpt_texts.to_list()

In [14]:
pred = []

# Query the model once per input text; tqdm sizes the bar from the list length.
for text in tqdm(test_texts):
    detoxified = gpt35_turbo_model.gpt_detox(text)
    pred.append(detoxified)

# Last expression of the cell: display the score for the GPT-3.5 outputs.
measure(pred, test_trans)

100%|██████████| 10/10 [00:16<00:00,  1.65s/it]


{'score': 7.1582382262575,
 'counts': [60, 17, 6, 3],
 'totals': [178, 168, 158, 148],
 'precisions': [33.70786516853933,
  10.119047619047619,
  3.7974683544303796,
  2.027027027027027],
 'bp': 1.0,
 'sys_len': 178,
 'ref_len': 150}

Now it's time for the T5 model.

In [5]:
# Build the T5 inputs and their reference outputs from the dataset, passing the
# T5 tokenizer and the selected device to the shared preprocessing helper.
# NOTE(review): the exact preprocessing lives in src.data.preprocess_dataset and
# is not visible here — confirm what form the returned pair takes.
t5_input, t5_output = preprocess_dataset(dataset, t5_model.tokenizer, device)

In [9]:
# Only the first 10000 samples are used for measuring this model.
t5_eval_window = slice(10000)
t5_input, t5_output = t5_input[t5_eval_window], t5_output[t5_eval_window]

In [10]:
pred = []

# Run T5 on every prepared sample; the explicit total sizes the progress bar.
for sample in tqdm(t5_input, total=len(t5_input)):
    pred.append(t5_model.t5_detox(sample))

100%|██████████| 10000/10000 [59:40<00:00,  2.79it/s] 


In [11]:
# Score the T5 predictions against the T5 reference outputs with the sacreBLEU metric.
measure(pred, t5_output)

{'score': 8.799947696556991,
 'counts': [43667, 17456, 8902, 4631],
 'totals': [166665, 156668, 146689, 136810],
 'precisions': [26.200462004620046,
  11.142032833763118,
  6.068621369018809,
  3.3849864775966667],
 'bp': 1.0,
 'sys_len': 166665,
 'ref_len': 120861}

Time for the next model: bart-base-detox.

In [14]:
# Prepare the BART inputs and references, then keep only the first 10000 samples.
bart_eval_window = slice(10000)
bart_input, bart_output = preprocess_dataset(dataset, bart_base_model.tokenizer, device)
bart_input, bart_output = bart_input[bart_eval_window], bart_output[bart_eval_window]

In [15]:
pred = []

# Run bart-base-detox on every prepared sample.
# The progress-bar total is taken from `bart_input` itself, so this cell does not
# depend on the T5 cells having been run and stays correct if the two sample
# counts ever differ.
for inp in tqdm(bart_input, total=len(bart_input)):
    pred.append(bart_base_model.bart_detox(inp))

100%|██████████| 10000/10000 [44:27<00:00,  3.75it/s] 


In [16]:
# Score the BART predictions against the BART reference outputs with the sacreBLEU metric.
measure(pred, bart_output)

{'score': 19.64251856481778,
 'counts': [63342, 31154, 16738, 8987],
 'totals': [134353, 124353, 114353, 104372],
 'precisions': [47.14595133714915,
  25.052873674137334,
  14.637132388306384,
  8.61054688997049],
 'bp': 1.0,
 'sys_len': 134353,
 'ref_len': 120861}

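The last model (bart-base-detox) has the best sacreBLEU score of the three: about 19.6, versus about 8.8 for T5 and about 7.2 for GPT-3.5 (the latter measured on only 10 inputs). So I will use it as my final model.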