In [1]:
import pandas as pd
import json
import os
import re
import numpy as np
from transformers import AutoTokenizer, AutoModel, Seq2SeqTrainer, Seq2SeqTrainingArguments, T5ForConditionalGeneration, GPT2Tokenizer
import torch.nn.functional as F
import torch.nn as nn
import torch
from datasets import Dataset, load_dataset, DatasetDict
import random
from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split
from collections import Counter
from copy import deepcopy
from sklearn.metrics import precision_recall_curve, auc, roc_auc_score, f1_score, accuracy_score
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts, CosineAnnealingLR
from transformers.models.bert.modeling_bert import BertPreTrainedModel
from transformers.models.deberta_v2.modeling_deberta_v2 import DebertaV2PreTrainedModel
from transformers.modeling_outputs import SequenceClassifierOutput
from sklearn.metrics.pairwise import cosine_similarity
from torch.utils.data import Dataset as BaseDataset
from transformers import Adafactor

In [2]:
def set_seed(seed: int = 42) -> None:
    """Seed every random number generator this notebook relies on.

    Seeds NumPy, Python's ``random`` module and PyTorch (the default
    generator and the CUDA generator), forces cuDNN into deterministic mode
    with benchmarking turned off, stores the seed in the ``PYTHONHASHSEED``
    environment variable and prints a confirmation message.

    Args:
        seed: Value passed to each seeding function. Defaults to 42.
    """
    # Every seeding function accepts the seed as its only argument.
    seeders = (
        np.random.seed,
        random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # The cuDNN backend needs both switches set in addition to the seeds.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # Record a fixed hash seed in the environment.
    os.environ["PYTHONHASHSEED"] = str(seed)

    print(f"Random seed set as {seed}")


set_seed()

Random seed set as 42


In [3]:
# Load the train and dev splits; both files are tab-separated, which is the
# default delimiter of `pd.read_table`.
train_df = pd.read_table('detox_data/train.tsv')
val_df = pd.read_table('detox_data/dev.tsv')

In [4]:
# Wrap the (toxic, neutral) text pairs of each split in a `Dataset` and
# collect them under the split names used by the rest of the notebook.
dataset = DatasetDict({
    'train': Dataset.from_pandas(
        train_df[['toxic_comment', 'neutral_comment1']]
    ),
    'validation': Dataset.from_pandas(
        val_df[['toxic_comment', 'neutral_comment1']]
    ),
})

In [5]:
# Checkpoint name shared by the tokenizer and the model loaded further down.
model_name = 'ai-forever/FRED-T5-large'

# `truncation_side='left'` makes truncation drop tokens from the start of a
# sequence rather than from its end.
tokenizer = GPT2Tokenizer.from_pretrained(
    model_name,
    truncation_side='left',
)

In [6]:
# Display the DatasetDict to check the split names, columns and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['toxic_comment', 'neutral_comment1'],
        num_rows: 6948
    })
    validation: Dataset({
        features: ['toxic_comment', 'neutral_comment1'],
        num_rows: 800
    })
})

In [7]:
def prepare_features(examples):
    """Tokenize a batch of toxic/neutral pairs into model inputs and labels.

    Each toxic comment is prefixed with ``<LM>`` and each neutral rewrite is
    suffixed with ``</s>``. Both sides are padded/truncated to 512 tokens and
    returned as NumPy arrays. Label positions equal to the tokenizer's pad id
    are replaced with -100.

    Args:
        examples: Batch mapping with ``"toxic_comment"`` and
            ``"neutral_comment1"`` entries, each an iterable of strings.

    Returns:
        The encoding produced by the module-level ``tokenizer`` for the
        prefixed toxic comments, with an added ``"labels"`` entry.
    """
    # Identical tokenizer options are used for sources and targets.
    encode_options = dict(
        padding='max_length',
        max_length=512,
        truncation=True,
        return_tensors='np',
    )

    sources = ['<LM>' + comment for comment in examples["toxic_comment"]]
    model_inputs = tokenizer(sources, **encode_options)

    targets = [f'{comment}</s>' for comment in examples['neutral_comment1']]
    label_ids = tokenizer(targets, **encode_options).input_ids

    # Overwrite padded label positions with -100.
    is_padding = label_ids == tokenizer.pad_token_id
    label_ids[is_padding] = -100

    model_inputs['labels'] = label_ids
    return model_inputs

def prepare_features_test(examples):
    """Tokenize a batch of toxic comments for inference (no labels).

    Each comment is prefixed with ``<LM>``, then padded/truncated to 512
    tokens and returned as NumPy arrays.

    Args:
        examples: Batch mapping with a ``"toxic_comment"`` entry holding an
            iterable of strings.

    Returns:
        The encoding produced by the module-level ``tokenizer``.
    """
    prompts = []
    for comment in examples["toxic_comment"]:
        prompts.append('<LM>' + comment)

    return tokenizer(
        prompts,
        truncation=True,
        max_length=512,
        padding="max_length",
        return_tensors='np',
    )

In [8]:
# Run `prepare_features` over every split in batched mode; the tokenized
# columns are added alongside the original text columns.
dataset = dataset.map(
    prepare_features,
    batched=True,
)

Map:   0%|          | 0/6948 [00:00<?, ? examples/s]

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

In [11]:
# NOTE(review): no visible cell defines DEVICE_MAP (execution counts jump from
# In [8] to In [11]), so a fresh "Restart & Run All" raised NameError on this
# line. Keep any value already present in the kernel; otherwise fall back to
# None, which is the `from_pretrained` default for `device_map`.
# TODO: confirm the originally intended device map and move it to a config cell.
DEVICE_MAP = globals().get('DEVICE_MAP')

# Load the pretrained FRED-T5 weights for conditional generation.
model = T5ForConditionalGeneration.from_pretrained(model_name, device_map=DEVICE_MAP)

In [12]:
# Fine-tuning configuration. Evaluation, checkpointing and logging all use
# the 'epoch' strategy.
# NOTE(review): newer transformers releases appear to rename
# `evaluation_strategy` to `eval_strategy` — confirm against the installed version.
training_args = Seq2SeqTrainingArguments(
    output_dir='models/fredt5large',
    # Optimisation
    optim='adafactor',
    learning_rate=1e-4,
    weight_decay=0.001,
    num_train_epochs=5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=1,
    # Evaluation
    evaluation_strategy='epoch',
    predict_with_generate=True,
    # Checkpoints and logging
    save_strategy='epoch',
    save_total_limit=1,
    logging_strategy='epoch',
    report_to='none',
)

In [13]:
# Bind the model, the training configuration and the tokenized splits.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['validation'],
)

In [None]:
# Start fine-tuning with the settings in `training_args`.
trainer.train()