In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import AutoTokenizer
from torch import cuda

from transformers import set_seed

# Restrict the visible GPUs BEFORE anything queries CUDA: the variable is only
# honoured if it is set before the CUDA runtime is first initialised, so it has
# to precede the `cuda.is_available()` check below.
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

# Fix every random number generator used by the notebook for reproducibility.
SEED = 42
set_seed(SEED)
torch.manual_seed(SEED)
np.random.seed(SEED)

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = 'cuda' if cuda.is_available() else 'cpu'

# Input data directory and output directory for checkpoints.
PATH = "../data/inheritim/"
SAVE_PATH = "../models/t5-detoxification/"

In [5]:
# Read the filtered parallel corpus, using the first CSV column as the index,
# and preview the first rows.
filtered_csv_path = PATH + 'filtered.csv'
df = pd.read_csv(filtered_csv_path, index_col=0)
df.head()

Unnamed: 0,reference,translation,similarity,lenght_diff,ref_tox,trn_tox,avg_word_ref,avg_word_trans
0,"If Alkar is flooding her with psychic waste, t...","if Alkar floods her with her mental waste, it ...",0.785171,0.010309,0.981983,0.014195,15,16
1,Now you're getting nasty.,you're becoming disgusting.,0.749687,0.071429,0.999039,0.065473,4,3
2,"Well, we could spare your life, for one.","well, we can spare your life.",0.919051,0.268293,0.985068,0.213313,8,6
3,"Ah! Monkey, you've got to snap out of it.","monkey, you have to wake up.",0.664333,0.309524,0.994215,0.053362,9,6
4,I've got orders to put her down.,I have orders to kill her.,0.726639,0.181818,0.999348,0.009402,7,6


Split the data into train and test sets.

In [6]:
# Hold out 20% of the rows for evaluation; the split is seeded for reproducibility.
df_train, df_test = train_test_split(df, random_state=SEED, test_size=0.2)
print(len(df_train), len(df_test))

462221 115556


# Creating Dataset

In [7]:
# Name of the pretrained checkpoint; the model loaded later reuses it.
model_checkpoint = "t5-small"
# Tokenizer matching that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=model_checkpoint,
)

In [8]:
class Dataset(torch.utils.data.Dataset):
    """Pre-tokenized parallel corpus for sequence-to-sequence fine-tuning.

    The base class is referenced by its fully qualified name on purpose: this
    class is itself called ``Dataset``, so inheriting from the bare name would
    make a re-run of the cell subclass the previous definition of this class
    instead of the torch base class.

    Args:
        df: Frame with a ``reference`` column (source sentences) and a
            ``translation`` column (target sentences).
        tokenizer: Callable that takes a list of strings and returns a mapping
            with ``"input_ids"`` and ``"attention_mask"`` lists.
        max_length: Optional cap on the tokenized length, forwarded to the
            tokenizer together with ``truncation=True``. ``None`` (default)
            leaves the limit to the tokenizer.
    """

    def __init__(self, df: pd.DataFrame, tokenizer, max_length=None):
        # Both sides are tokenized eagerly, once, with identical settings.
        # No padding is applied here; batches must be padded by a collator.
        encode_kwargs = dict(truncation=True, max_length=max_length)
        self.x = tokenizer(df.reference.tolist(), **encode_kwargs)
        self.y = tokenizer(df.translation.tolist(), **encode_kwargs)

    def __getitem__(self, idx: int):
        """Return the encoded source and target ids of example ``idx``."""
        return {
            "input_ids": self.x["input_ids"][idx],
            "attention_mask": self.x["attention_mask"][idx],
            "labels": self.y['input_ids'][idx],
        }

    def __len__(self):
        """Return the number of encoded source sentences."""
        return len(self.x['input_ids'])

In [9]:
# Tokenize both splits up front with the shared tokenizer (train first, then test).
train_dataset, test_dataset = (
    Dataset(split_df, tokenizer) for split_df in (df_train, df_test)
)

In [10]:
# Sanity check: number of examples in the (train, test) datasets.
tuple(map(len, (train_dataset, test_dataset)))

(462221, 115556)

# Load Dataset into Dataloader

In [11]:
# Batch size; also reused by the training arguments further down.
batch_size = 32

# The dataset items are un-padded token-id lists of varying length, which the
# default collate function cannot stack into a batch. A seq2seq collator pads
# inputs and labels per batch so the loaders can actually be iterated.
_padding_collator = DataCollatorForSeq2Seq(tokenizer)

train_dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=_padding_collator,
)
test_dataloader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    collate_fn=_padding_collator,
)

# Trainer

In [12]:
# Load the pretrained seq2seq weights, then move them to the selected device.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
model = model.to(device)

In [13]:
# Training configuration for a single fine-tuning epoch.
args = Seq2SeqTrainingArguments(
    output_dir=SAVE_PATH,
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    warmup_steps=300,
    weight_decay=0.01,
    learning_rate=3e-5,
    # Log, evaluate and checkpoint on the same 1000-step cadence.
    logging_steps=1000,
    eval_steps=1000,
    evaluation_strategy='steps',
    # Keep only the most recent checkpoint on disk.
    save_total_limit=1,
    save_steps=1000,
    # Tie the trainer's seed to the notebook-wide SEED; otherwise the trainer
    # uses its own default and changing SEED would not affect training.
    seed=SEED,
)

In [14]:
# Collator used by the trainer to assemble batches for the seq2seq model.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [15]:
# Collect everything the trainer needs, then build it in one call.
trainer_kwargs = {
    "model": model,
    "args": args,
    "train_dataset": train_dataset,
    "eval_dataset": test_dataset,
    "data_collator": data_collator,
    "tokenizer": tokenizer,
}
trainer = Seq2SeqTrainer(**trainer_kwargs)

In [16]:
# Run fine-tuning and display the returned training summary.
train_output = trainer.train()
train_output

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
1000,2.1423,1.737445
2000,1.8541,1.672046
3000,1.8027,1.638613
4000,1.7914,1.617661
5000,1.7595,1.603462
6000,1.7441,1.590576
7000,1.7393,1.58083
8000,1.7203,1.573294
9000,1.7224,1.568296
10000,1.7041,1.563477


TrainOutput(global_step=14445, training_loss=1.7689432167272412, metrics={'train_runtime': 3949.9181, 'train_samples_per_second': 117.02, 'train_steps_per_second': 3.657, 'total_flos': 5962041088278528.0, 'train_loss': 1.7689432167272412, 'epoch': 1.0})

# Saving

In [17]:
# Save the final model into a dedicated sub-directory of the output directory.
final_checkpoint_dir = f"{SAVE_PATH}checkpoint-final/"
trainer.save_model(final_checkpoint_dir)