In [1]:
!pip install transformers



In [15]:
import pandas as pd
import numpy as np
import warnings

file_path = "/kaggle/input/filtered/filtered.tsv"
raw_df = pd.read_csv(file_path, delimiter="\t")
df = pd.DataFrame(raw_df)
warnings.filterwarnings('ignore')

In [11]:
from datasets import Dataset
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
import numpy as np
import pandas as pd

# Necessary inputs
TOKEN_PREFIX = "Make this text non-toxic:"
MAX_INPUT_LENGTH = 128
MAX_TARGET_LENGTH = 128

def prepare_model_inputs(examples):
    input_texts = [TOKEN_PREFIX + ref for ref in examples["reference"]]
    target_texts = [tsn for tsn in examples["translation"]]
    
    model_inputs = tokenizer(input_texts, max_length=MAX_INPUT_LENGTH, truncation=True, return_overflowing_tokens=False)

    # Setup the tokenizer for targets
    labels = tokenizer(target_texts, max_length=MAX_TARGET_LENGTH, truncation=True, return_overflowing_tokens=False)
    model_inputs["labels"] = labels["input_ids"]
    
    return model_inputs

# Load and preprocess data
df = pd.read_csv("/kaggle/input/filtered/filtered.tsv", sep='\t', index_col=0)
dataset = Dataset.from_pandas(df).remove_columns('__index_level_0__')

# Split dataset
split_dict = dataset.train_test_split(test_size=0.1, seed=42)

# Crop dataset
batch_size = 256
cropped_datasets = split_dict
cropped_datasets['train'] = split_dict['train'].select(range(1000))
cropped_datasets['test'] = split_dict['test'].select(range(100))

# Tokenize datasets
tokenized_datasets = cropped_datasets.map(prepare_model_inputs, batched=True, batch_size=batch_size, remove_columns=split_dict["train"].column_names)
tokenized_datasets['train'][0]

# Create model
model_name = 't5-base'
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Training arguments
training_args = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned-detoxification",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=10,
    predict_with_generate=True,
    fp16=True,
    report_to='tensorboard',
)

# Data collator
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Postprocessing function
def post_process_predictions(predictions, labels):
    predictions = [pred.strip() for pred in predictions]
    labels = [[label.strip()] for label in labels]
    return predictions, labels

# Metrics function
def compute_custom_metrics(eval_predictions):
    predictions, labels = eval_predictions
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = post_process_predictions(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

# Trainer
trainer = Seq2SeqTrainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_custom_metrics
)

# Train the model
trainer.train()

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,No log,1.810852,9.6669,10.8
2,No log,1.554081,2.7522,6.4
3,No log,1.410892,2.4709,6.4
4,No log,1.375681,0.2389,4.8
5,No log,1.387443,9.7643,9.2
6,No log,1.344263,41.316,10.6
7,No log,1.348004,41.316,10.6
8,No log,1.318513,41.316,10.6
9,No log,1.3025,41.316,10.6
10,No log,1.30129,41.316,10.6


TrainOutput(global_step=250, training_loss=1.7819320068359374, metrics={'train_runtime': 110.242, 'train_samples_per_second': 4.535, 'train_steps_per_second': 2.268, 'total_flos': 16537012715520.0, 'train_loss': 1.7819320068359374, 'epoch': 10.0})

In [12]:
# ... (Previous code remains the same)

# Test the model
test_results = trainer.evaluate(tokenized_datasets["test"])
print("Test Results:", test_results)

# Save the fine-tuned model
model.save_pretrained('fine_tuned_detox_model')
tokenizer.save_pretrained('fine_tuned_detox_model')

Test Results: {'eval_loss': 1.3012897968292236, 'eval_bleu': 41.316, 'eval_gen_len': 10.6, 'eval_runtime': 1.008, 'eval_samples_per_second': 4.96, 'eval_steps_per_second': 2.976, 'epoch': 10.0}


('fine_tuned_detox_model/tokenizer_config.json',
 'fine_tuned_detox_model/special_tokens_map.json',
 'fine_tuned_detox_model/spiece.model',
 'fine_tuned_detox_model/added_tokens.json',
 'fine_tuned_detox_model/tokenizer.json')