# Training a model T5-small

## Imports

In [1]:
import pandas as pd
import numpy as np
from datasets import load_metric
from datasets import Dataset
from transformers import (AutoTokenizer, DataCollatorForSeq2Seq, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer)
from torch.utils.data import Dataset as D
import pickle

## Class MyDataset (I have problems with imports)

In [2]:
class MyDataset(D):
    def __init__(self, data_path):
        self.raw_data = pd.read_csv(data_path, sep='\t', index_col=0)
        
        data = pd.DataFrame()
        data['toxic'] = pd.concat([self.raw_data[self.raw_data['ref_tox'] > self.raw_data['trn_tox']]['reference'], self.raw_data[self.raw_data['ref_tox'] < self.raw_data['trn_tox']]['translation']])
        data['normal'] = pd.concat([self.raw_data[self.raw_data['ref_tox'] > self.raw_data['trn_tox']]['translation'], self.raw_data[self.raw_data['ref_tox'] < self.raw_data['trn_tox']]['reference']])
        data['toxic_reduction'] = abs(self.raw_data['ref_tox'] - self.raw_data['trn_tox'])
        self.data = data


    def __len__(self):
        return len(self.raw_data)

    def __getitem__(self, idx):
        return self.data.iloc[idx]

In [3]:
df = pickle.load(open('../data/interim/text_dataset.pkl', 'rb')).data
df.head()

Unnamed: 0,toxic,normal,toxic_reduction
5,"[gon, na, child, genetic, disorder, gon, na, d...","[going, breed, kid, genetic, disorder, make, die]",0.915109
6,"[laughing, u, kick, as]","[laughing, u, show]",0.999361
7,"[maine, short, black, people, back]","[much, black, maine]",0.814971
11,"[spirit, cursed, walking, back, road, waterway...","[soul, cursed, guard, path, say, encounter, un...",0.698517
13,"[come, cal, leave, shit, alone]","[come, cal, put]",0.999357


In [4]:
df = df[["toxic", "normal"]]
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 577777 entries, 5 to 577776
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   toxic   577777 non-null  object
 1   normal  577777 non-null  object
dtypes: object(2)
memory usage: 13.2+ MB


In [5]:
dataset = Dataset.from_pandas(df[:1000]).train_test_split(test_size=0.2) # 1000 for simplicity of checking

In [6]:
checkpoint = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def preprocess_function(data):
    inputs = [l for l in data["toxic"]]
    targets = [l for l in data["normal"]]
    model_inputs = tokenizer(inputs, text_target=targets, max_length=128, truncation=True, is_split_into_words=True)
    return model_inputs

tokenized_dataset = dataset.map(preprocess_function, batched=True)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [7]:
folder = '../models/T5-small' 

training_args = Seq2SeqTrainingArguments(
    output_dir=folder,
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    save_steps=10,
    save_total_limit=3,
    num_train_epochs=10,
    predict_with_generate=True,
    generation_max_length = 133,
    weight_decay=1e-6,
)


trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,No log,4.253178
2,No log,4.143383
3,No log,4.066164
4,No log,4.007196
5,No log,3.964003
6,No log,3.930886
7,No log,3.906989
8,No log,3.890918
9,No log,3.881775
10,No log,3.878522


TrainOutput(global_step=70, training_loss=4.426967075892857, metrics={'train_runtime': 1006.322, 'train_samples_per_second': 7.95, 'train_steps_per_second': 0.07, 'total_flos': 60996858150912.0, 'train_loss': 4.426967075892857, 'epoch': 10.0})

In [8]:
trainer.save_model('../models/saved')

### Load checkpoint and generate detoxified sentence

In [9]:
test_model = AutoModelForSeq2SeqLM.from_pretrained("../models/saved/")

In [10]:
input_ids = tokenizer("Now you're getting nasty.".lower().split(), max_length=128, truncation=True, is_split_into_words=True, return_tensors="pt").input_ids
outputs = test_model.generate(input_ids, max_new_tokens=133)

In [11]:
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Jetzt erhitzt man sich.


### Haha, idk, but on my machine it translates the sentence to another language :)