In [1]:
import pandas as pd


In [2]:
dataset = "../data/raw/filtered.tsv"

In [3]:
df = pd.read_csv(dataset, sep="\t")
df.head(10)

Unnamed: 0.1,Unnamed: 0,reference,translation,similarity,lenght_diff,ref_tox,trn_tox
0,0,"If Alkar is flooding her with psychic waste, t...","if Alkar floods her with her mental waste, it ...",0.785171,0.010309,0.014195,0.981983
1,1,Now you're getting nasty.,you're becoming disgusting.,0.749687,0.071429,0.065473,0.999039
2,2,"Well, we could spare your life, for one.","well, we can spare your life.",0.919051,0.268293,0.213313,0.985068
3,3,"Ah! Monkey, you've got to snap out of it.","monkey, you have to wake up.",0.664333,0.309524,0.053362,0.994215
4,4,I've got orders to put her down.,I have orders to kill her.,0.726639,0.181818,0.009402,0.999348
5,5,I'm not gonna have a child... ...with the same...,I'm not going to breed kids with a genetic dis...,0.703185,0.206522,0.950956,0.035846
6,6,"They're all laughing at us, so we'll kick your...",they're laughing at us. We'll show you.,0.618866,0.230769,0.999492,0.000131
7,7,Maine was very short on black people back then.,there wasn't much black in Maine then.,0.720482,0.1875,0.96368,0.14871
8,8,"Briggs, what the hell's happening?","Briggs, what the hell is going on?",0.920373,0.0,0.159096,0.841071
9,9,"Another one simply had no clue what to do, so ...","another simply didn't know what to do, so when...",0.87754,0.101695,0.055371,0.930472


In [4]:
references = [row.reference if row.ref_tox > row.trn_tox else row.translation for i, row in df.iterrows()]
translations = [row.translation if row.ref_tox > row.trn_tox else row.reference for i, row in df.iterrows()]
inference_df = pd.DataFrame({'references': references, 'translations': translations})

In [5]:
tokenizer_name = "ceshine/t5-paraphrase-paws-msrp-opinosis"

In [6]:
from transformers import T5Tokenizer, T5TokenizerFast

tokenizer = T5TokenizerFast.from_pretrained(tokenizer_name)

In [7]:
from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(inference_df, test_size=0.1)

In [8]:
train_references = tokenizer(df_train.references.tolist(), truncation=True)
train_translations = tokenizer(df_train.translations.tolist(), truncation=True)
test_references = tokenizer(df_test.references.tolist(), truncation=True)
test_translations = tokenizer(df_test.translations.tolist(), truncation=True)

In [9]:
from torch.utils.data import Dataset


class DetoxDataset(Dataset):
    def __init__(self, references, translations):
        self.references = references
        self.translations = translations

    def __getitem__(self, idx):
        assert idx < len(self.references['input_ids'])
        item = {key: val[idx] for key, val in self.references.items()}
        item['decoder_attention_mask'] = self.translations['attention_mask'][idx]
        item['labels'] = self.translations['input_ids'][idx]
        return item

    @property
    def n(self):
        return len(self.references['input_ids'])

    def __len__(self):
        return self.n

train_dataset = DetoxDataset(train_references, train_translations)
test_dataset = DetoxDataset(test_references, test_translations)
len(train_dataset), len(test_dataset)

(519999, 57778)

In [10]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(train_dataset, batch_size=4, drop_last=True, shuffle=True, num_workers=1)
test_dataloader = DataLoader(test_dataset, batch_size=4, drop_last=True, shuffle=False, num_workers=1)

In [11]:
language_model_name = 'SkolkovoInstitute/t5-paraphrase-paws-msrp-opinosis-paranmt'

In [12]:
from transformers import T5ForConditionalGeneration

model = T5ForConditionalGeneration.from_pretrained(language_model_name)

In [13]:
import torch

device = torch.device('cuda:0')
model.to(device)

In [14]:
from typing import List, Dict, Union

class DataCollatorWithPadding:
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        batch = self.tokenizer.pad(
            features,
            padding=True,
        )
        ybatch = self.tokenizer.pad(
            {'input_ids': batch['labels'], 'attention_mask': batch['decoder_attention_mask']},
            padding=True,
        )
        batch['labels'] = ybatch['input_ids']
        batch['decoder_attention_mask'] = ybatch['attention_mask']

        return {k: torch.tensor(v) for k, v in batch.items()}

In [15]:
save_name = 'models/t5-detox'

In [16]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir=save_name,   # output directory
    overwrite_output_dir=True,
    num_train_epochs=3,             # total # of training epochs
    per_device_train_batch_size=4,  # batch size per device during training
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=8,    # batch size for evaluation
    warmup_steps=300,               # number of warmup steps for learning rate scheduler
    weight_decay=0,                  # strength of weight decay
    learning_rate=3e-5,
    logging_dir='./logs',           # directory for storing logs
    logging_steps=100,
    eval_steps=100,

    evaluation_strategy='steps',
    save_total_limit=1,
    save_steps=5000,
)

In [17]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [18]:
from transformers import Trainer

trainer = Trainer(
    model=model,                         # the instantiated ðŸ¤— Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=test_dataset,           # evaluation dataset
    data_collator=data_collator,
    tokenizer=tokenizer
)

In [19]:
import gc
gc.collect()
torch.cuda.empty_cache();

In [20]:
trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss


KeyboardInterrupt: 

In [21]:
trainer.evaluate()

{'eval_loss': 0.8115463852882385}

In [24]:
trainer.save_model(save_name)

In [22]:
model.eval()

In [23]:
inputs = tokenizer('The internal policy of the fucking Trump is stupid.', return_tensors='pt')
inputs = {k: v.to(device) for k, v in inputs.items()}
for t in model.generate(**inputs, num_return_sequences=10, do_sample=False, num_beams=10):
    print(tokenizer.decode(t, skip_special_tokens=True))



the Trump administration's internal policy is stupid.
the internal policy of Trump is stupid.
the internal policy of Donald Trump is stupid.
the internal policy of President Trump is stupid.
the Trump administration's internal policy is nonsense.
the internal policy of Donald Trump is nonsense.
the Trump administration's internal policy is bad.
the internal policy of Mr. Trump is stupid.
the internal policy of Trump is nonsense.
the Trump internal policy is stupid.
