In [2]:

from transformers import T5Config, T5ForConditionalGeneration, T5Tokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments, AdamW, DataCollatorForSeq2Seq,  TrainingArguments, Trainer, DataCollatorForLanguageModeling
import sacrebleu
import numpy as np
import tensorboard
import evaluate
from safetensors.torch import load_file
from adapters import AdapterConfig, T5AdapterModel
import torch
import pandas as pd
from datasets import Dataset

In [3]:
def preprocess_function(examples, tokenizer, input_length=512):
    # Access 'profanity' and 'non_profanity' as lists
    inputs = ['Translate from profanity to non-profanity: ' + text for text in examples['profanity']]
    targets = examples['non_profanity']
    
    # Tokenize inputs and targets
    model_inputs = tokenizer(inputs, max_length=input_length, truncation=True)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=input_length, truncation=True)
        
    model_inputs['labels'] = labels['input_ids']
    return model_inputs


In [4]:
with open('profanity.csv', 'r') as file:
    stripped_lines = [line.strip() for line in file]
    print(len(stripped_lines))
dataset1 = pd.DataFrame(stripped_lines, columns= ['profanity'])

1598


In [5]:
with open('non_profanity.csv', 'r') as file:
    stripped_lines = [line.strip() for line in file]
    print(len(stripped_lines))
dataset2 = pd.DataFrame(stripped_lines, columns= ['non_profanity'])

1598


In [6]:
dataset1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1598 entries, 0 to 1597
Data columns (total 1 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   profanity  1598 non-null   object
dtypes: object(1)
memory usage: 12.6+ KB


In [7]:
dataset2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1598 entries, 0 to 1597
Data columns (total 1 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   non_profanity  1598 non-null   object
dtypes: object(1)
memory usage: 12.6+ KB


In [8]:

dataset1["non_profanity"] = dataset2["non_profanity"]


In [9]:
print(dataset1.head(10))

   profanity         non_profanity
0         69   mutual satisfaction
1        @55                bottom
2   @ssfcker     unpleasant person
3  @ssfucker  offensive individual
4  @ssfvcker  disrespectful person
5    @sshole                  jerk
6  0ral seks         oral activity
7   0ral sex         oral intimacy
8     0rg@sm                climax
9    0rgasms              climaxes


In [10]:
dataset = Dataset.from_pandas(dataset1)

In [11]:
print(dataset)
train_dataset = dataset.train_test_split(test_size=0.2, shuffle= True)

Dataset({
    features: ['profanity', 'non_profanity'],
    num_rows: 1598
})


In [12]:
print(train_dataset.shape)

{'train': (1278, 2), 'test': (320, 2)}


In [13]:
config = T5Config.from_pretrained('t5-base')
model = T5ForConditionalGeneration(config)
tokenizer = T5Tokenizer.from_pretrained('T5-base')
data_collator = DataCollatorForSeq2Seq(tokenizer= tokenizer, model = model)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [14]:
for ex in train_dataset['train']:
    print(ex)

{'profanity': 'f0ckers', 'non_profanity': 'bothersome individuals'}
{'profanity': 'mofuccers', 'non_profanity': 'bothersome individuals'}
{'profanity': 'fuck y0u', 'non_profanity': 'expression of frustration'}
{'profanity': 'timber nigger', 'non_profanity': 'informal term for female anatomy'}
{'profanity': 'shitfacefuck', 'non_profanity': 'bothersome individual'}
{'profanity': 'scumfucker', 'non_profanity': 'informal term for a bothersome individual'}
{'profanity': 'cunt', 'non_profanity': 'offensive term for a woman'}
{'profanity': 'shemal3', 'non_profanity': 'informal term for a transgender woman'}
{'profanity': 'blacky', 'non_profanity': 'informal term for oral stimulation'}
{'profanity': 'motherfuckingshit', 'non_profanity': 'inappropriate behavior'}
{'profanity': 'jigger', 'non_profanity': 'informal term for a Black person'}
{'profanity': 'analplug', 'non_profanity': 'anal toy'}
{'profanity': 'fvckbunny', 'non_profanity': 'playful term for a troublesome person'}
{'profanity': 'fag

In [15]:
tokenized_train_dataset = train_dataset['train'].map(preprocess_function,
            batch_size =10000,
            batched=True,
            fn_kwargs={
                'tokenizer': tokenizer,
                'input_length': 512,
            },
            remove_columns= ['profanity', 'non_profanity'],
            writer_batch_size = 128)
print(tokenized_train_dataset['input_ids'][0],
      tokenized_train_dataset['labels'][0])

tokenized_test_dataset = train_dataset['test'].map(preprocess_function,
            batch_size =10000,
            batched=True,
            fn_kwargs={
                'tokenizer': tokenizer,
                'input_length': 512,
            },
            remove_columns= ['profanity', 'non_profanity'],
            writer_batch_size = 128)


Map:   0%|          | 0/1278 [00:00<?, ? examples/s]

[30355, 15, 45, 7108, 152, 485, 12, 529, 18, 1409, 12351, 485, 10, 3, 89, 632, 3383, 7, 1] [13965, 5529, 1742, 1]




Map:   0%|          | 0/320 [00:00<?, ? examples/s]

In [16]:
bleu_metric = evaluate.load("bleu")
rouge_metric = evaluate.load("rouge")


def compute_metrics(eval_preds):
    preds, labels = eval_preds

    # Decode labels, replacing -100 with the pad token
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    print(decoded_labels)
    # Strip unnecessary whitespaces
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]  # BLEU expects a list of references for each prediction

    # Compute BLEU score
    result = bleu_metric.compute(predictions=decoded_preds, references=decoded_labels)
    rouge_result = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels)

    return {
    "bleu": result["bleu"],
    "rouge1": rouge_result["rouge1"],
    "rouge2": rouge_result["rouge2"],
    "rougeL": rouge_result["rougeL"],
    }

In [17]:
training_args = Seq2SeqTrainingArguments(
    output_dir='Users/anthai/Library/CloudStorage/OneDrive-UTS/SPR2024/Advanced NLP/Assignments/Assignment 3/results',
    eval_strategy="steps",
    eval_steps=10,
    per_device_train_batch_size=32,
    per_device_eval_batch_size =32,
    learning_rate=0.00005,
    weight_decay=0.001,
    gradient_accumulation_steps=8,
    predict_with_generate=True,
    logging_dir='Users/anthai/Library/CloudStorage/OneDrive-UTS/SPR2024/Advanced NLP/Assignments/Assignment 3/logs',
    logging_steps=10,
    max_steps= 200,
    save_steps=10,
    load_best_model_at_end=True,
    greater_is_better=False,
    report_to="tensorboard",
    generation_max_length= 512,
)

# Initialize the Trainer with AdaFactor optimizer
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=0.00005,
    betas=(0.9, 0.999),
    eps=1e-8,
    weight_decay=0.001
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    optimizers=(optimizer, None),
    # Provide optimizer in the tuple format
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Start training
trainer.train()

max_steps is given, it will override any value given in num_train_epochs


  0%|          | 0/200 [00:00<?, ?it/s]

{'loss': 6.7254, 'grad_norm': 4.353481292724609, 'learning_rate': 4.75e-05, 'epoch': 2.0}


  0%|          | 0/10 [00:00<?, ?it/s]

['informal term for bodily fluid', 'individuals who engage in self-pleasure', 'rude person', 'troublesome individual', 'informal term for a sexual position', 'informal term for an individual', 'rude individual', 'bothersome individual', 'offensive term for a violent act', 'humorous phrase for engaging in sexual activity', 'foolish individual', 'informal term for a Black individuals', 'bothersome individual', 'informal term for a Black person', 'male genitalia', 'informal term for being intoxicated or altered', 'bothersome individuals', 'foolish individual', 'humorous situation involving the toilet', 'Black individual', 'confused or chaotic', 'informal term for a boy with feminine traits', 'derogatory term for individuals', 'foolish individuals', 'informal term for male genitalia', 'punishment (historical context)', 'derogatory term for a woman', 'flamboyant individual', 'bothersome individual', 'derogatory term for a sexually active person', 'informal term for someone who is irritating

  0%|          | 0/10 [00:00<?, ?it/s]

['informal term for bodily fluid', 'individuals who engage in self-pleasure', 'rude person', 'troublesome individual', 'informal term for a sexual position', 'informal term for an individual', 'rude individual', 'bothersome individual', 'offensive term for a violent act', 'humorous phrase for engaging in sexual activity', 'foolish individual', 'informal term for a Black individuals', 'bothersome individual', 'informal term for a Black person', 'male genitalia', 'informal term for being intoxicated or altered', 'bothersome individuals', 'foolish individual', 'humorous situation involving the toilet', 'Black individual', 'confused or chaotic', 'informal term for a boy with feminine traits', 'derogatory term for individuals', 'foolish individuals', 'informal term for male genitalia', 'punishment (historical context)', 'derogatory term for a woman', 'flamboyant individual', 'bothersome individual', 'derogatory term for a sexually active person', 'informal term for someone who is irritating

  0%|          | 0/10 [00:00<?, ?it/s]

['informal term for bodily fluid', 'individuals who engage in self-pleasure', 'rude person', 'troublesome individual', 'informal term for a sexual position', 'informal term for an individual', 'rude individual', 'bothersome individual', 'offensive term for a violent act', 'humorous phrase for engaging in sexual activity', 'foolish individual', 'informal term for a Black individuals', 'bothersome individual', 'informal term for a Black person', 'male genitalia', 'informal term for being intoxicated or altered', 'bothersome individuals', 'foolish individual', 'humorous situation involving the toilet', 'Black individual', 'confused or chaotic', 'informal term for a boy with feminine traits', 'derogatory term for individuals', 'foolish individuals', 'informal term for male genitalia', 'punishment (historical context)', 'derogatory term for a woman', 'flamboyant individual', 'bothersome individual', 'derogatory term for a sexually active person', 'informal term for someone who is irritating

  0%|          | 0/10 [00:00<?, ?it/s]

['informal term for bodily fluid', 'individuals who engage in self-pleasure', 'rude person', 'troublesome individual', 'informal term for a sexual position', 'informal term for an individual', 'rude individual', 'bothersome individual', 'offensive term for a violent act', 'humorous phrase for engaging in sexual activity', 'foolish individual', 'informal term for a Black individuals', 'bothersome individual', 'informal term for a Black person', 'male genitalia', 'informal term for being intoxicated or altered', 'bothersome individuals', 'foolish individual', 'humorous situation involving the toilet', 'Black individual', 'confused or chaotic', 'informal term for a boy with feminine traits', 'derogatory term for individuals', 'foolish individuals', 'informal term for male genitalia', 'punishment (historical context)', 'derogatory term for a woman', 'flamboyant individual', 'bothersome individual', 'derogatory term for a sexually active person', 'informal term for someone who is irritating

  0%|          | 0/10 [00:00<?, ?it/s]

['informal term for bodily fluid', 'individuals who engage in self-pleasure', 'rude person', 'troublesome individual', 'informal term for a sexual position', 'informal term for an individual', 'rude individual', 'bothersome individual', 'offensive term for a violent act', 'humorous phrase for engaging in sexual activity', 'foolish individual', 'informal term for a Black individuals', 'bothersome individual', 'informal term for a Black person', 'male genitalia', 'informal term for being intoxicated or altered', 'bothersome individuals', 'foolish individual', 'humorous situation involving the toilet', 'Black individual', 'confused or chaotic', 'informal term for a boy with feminine traits', 'derogatory term for individuals', 'foolish individuals', 'informal term for male genitalia', 'punishment (historical context)', 'derogatory term for a woman', 'flamboyant individual', 'bothersome individual', 'derogatory term for a sexually active person', 'informal term for someone who is irritating

  0%|          | 0/10 [00:00<?, ?it/s]

['informal term for bodily fluid', 'individuals who engage in self-pleasure', 'rude person', 'troublesome individual', 'informal term for a sexual position', 'informal term for an individual', 'rude individual', 'bothersome individual', 'offensive term for a violent act', 'humorous phrase for engaging in sexual activity', 'foolish individual', 'informal term for a Black individuals', 'bothersome individual', 'informal term for a Black person', 'male genitalia', 'informal term for being intoxicated or altered', 'bothersome individuals', 'foolish individual', 'humorous situation involving the toilet', 'Black individual', 'confused or chaotic', 'informal term for a boy with feminine traits', 'derogatory term for individuals', 'foolish individuals', 'informal term for male genitalia', 'punishment (historical context)', 'derogatory term for a woman', 'flamboyant individual', 'bothersome individual', 'derogatory term for a sexually active person', 'informal term for someone who is irritating

  0%|          | 0/10 [00:00<?, ?it/s]

['informal term for bodily fluid', 'individuals who engage in self-pleasure', 'rude person', 'troublesome individual', 'informal term for a sexual position', 'informal term for an individual', 'rude individual', 'bothersome individual', 'offensive term for a violent act', 'humorous phrase for engaging in sexual activity', 'foolish individual', 'informal term for a Black individuals', 'bothersome individual', 'informal term for a Black person', 'male genitalia', 'informal term for being intoxicated or altered', 'bothersome individuals', 'foolish individual', 'humorous situation involving the toilet', 'Black individual', 'confused or chaotic', 'informal term for a boy with feminine traits', 'derogatory term for individuals', 'foolish individuals', 'informal term for male genitalia', 'punishment (historical context)', 'derogatory term for a woman', 'flamboyant individual', 'bothersome individual', 'derogatory term for a sexually active person', 'informal term for someone who is irritating

  0%|          | 0/10 [00:00<?, ?it/s]

['informal term for bodily fluid', 'individuals who engage in self-pleasure', 'rude person', 'troublesome individual', 'informal term for a sexual position', 'informal term for an individual', 'rude individual', 'bothersome individual', 'offensive term for a violent act', 'humorous phrase for engaging in sexual activity', 'foolish individual', 'informal term for a Black individuals', 'bothersome individual', 'informal term for a Black person', 'male genitalia', 'informal term for being intoxicated or altered', 'bothersome individuals', 'foolish individual', 'humorous situation involving the toilet', 'Black individual', 'confused or chaotic', 'informal term for a boy with feminine traits', 'derogatory term for individuals', 'foolish individuals', 'informal term for male genitalia', 'punishment (historical context)', 'derogatory term for a woman', 'flamboyant individual', 'bothersome individual', 'derogatory term for a sexually active person', 'informal term for someone who is irritating

  0%|          | 0/10 [00:00<?, ?it/s]

['informal term for bodily fluid', 'individuals who engage in self-pleasure', 'rude person', 'troublesome individual', 'informal term for a sexual position', 'informal term for an individual', 'rude individual', 'bothersome individual', 'offensive term for a violent act', 'humorous phrase for engaging in sexual activity', 'foolish individual', 'informal term for a Black individuals', 'bothersome individual', 'informal term for a Black person', 'male genitalia', 'informal term for being intoxicated or altered', 'bothersome individuals', 'foolish individual', 'humorous situation involving the toilet', 'Black individual', 'confused or chaotic', 'informal term for a boy with feminine traits', 'derogatory term for individuals', 'foolish individuals', 'informal term for male genitalia', 'punishment (historical context)', 'derogatory term for a woman', 'flamboyant individual', 'bothersome individual', 'derogatory term for a sexually active person', 'informal term for someone who is irritating

  0%|          | 0/10 [00:00<?, ?it/s]

['informal term for bodily fluid', 'individuals who engage in self-pleasure', 'rude person', 'troublesome individual', 'informal term for a sexual position', 'informal term for an individual', 'rude individual', 'bothersome individual', 'offensive term for a violent act', 'humorous phrase for engaging in sexual activity', 'foolish individual', 'informal term for a Black individuals', 'bothersome individual', 'informal term for a Black person', 'male genitalia', 'informal term for being intoxicated or altered', 'bothersome individuals', 'foolish individual', 'humorous situation involving the toilet', 'Black individual', 'confused or chaotic', 'informal term for a boy with feminine traits', 'derogatory term for individuals', 'foolish individuals', 'informal term for male genitalia', 'punishment (historical context)', 'derogatory term for a woman', 'flamboyant individual', 'bothersome individual', 'derogatory term for a sexually active person', 'informal term for someone who is irritating

  0%|          | 0/10 [00:00<?, ?it/s]

['informal term for bodily fluid', 'individuals who engage in self-pleasure', 'rude person', 'troublesome individual', 'informal term for a sexual position', 'informal term for an individual', 'rude individual', 'bothersome individual', 'offensive term for a violent act', 'humorous phrase for engaging in sexual activity', 'foolish individual', 'informal term for a Black individuals', 'bothersome individual', 'informal term for a Black person', 'male genitalia', 'informal term for being intoxicated or altered', 'bothersome individuals', 'foolish individual', 'humorous situation involving the toilet', 'Black individual', 'confused or chaotic', 'informal term for a boy with feminine traits', 'derogatory term for individuals', 'foolish individuals', 'informal term for male genitalia', 'punishment (historical context)', 'derogatory term for a woman', 'flamboyant individual', 'bothersome individual', 'derogatory term for a sexually active person', 'informal term for someone who is irritating

  0%|          | 0/10 [00:00<?, ?it/s]

['informal term for bodily fluid', 'individuals who engage in self-pleasure', 'rude person', 'troublesome individual', 'informal term for a sexual position', 'informal term for an individual', 'rude individual', 'bothersome individual', 'offensive term for a violent act', 'humorous phrase for engaging in sexual activity', 'foolish individual', 'informal term for a Black individuals', 'bothersome individual', 'informal term for a Black person', 'male genitalia', 'informal term for being intoxicated or altered', 'bothersome individuals', 'foolish individual', 'humorous situation involving the toilet', 'Black individual', 'confused or chaotic', 'informal term for a boy with feminine traits', 'derogatory term for individuals', 'foolish individuals', 'informal term for male genitalia', 'punishment (historical context)', 'derogatory term for a woman', 'flamboyant individual', 'bothersome individual', 'derogatory term for a sexually active person', 'informal term for someone who is irritating

  0%|          | 0/10 [00:00<?, ?it/s]

['informal term for bodily fluid', 'individuals who engage in self-pleasure', 'rude person', 'troublesome individual', 'informal term for a sexual position', 'informal term for an individual', 'rude individual', 'bothersome individual', 'offensive term for a violent act', 'humorous phrase for engaging in sexual activity', 'foolish individual', 'informal term for a Black individuals', 'bothersome individual', 'informal term for a Black person', 'male genitalia', 'informal term for being intoxicated or altered', 'bothersome individuals', 'foolish individual', 'humorous situation involving the toilet', 'Black individual', 'confused or chaotic', 'informal term for a boy with feminine traits', 'derogatory term for individuals', 'foolish individuals', 'informal term for male genitalia', 'punishment (historical context)', 'derogatory term for a woman', 'flamboyant individual', 'bothersome individual', 'derogatory term for a sexually active person', 'informal term for someone who is irritating

  0%|          | 0/10 [00:00<?, ?it/s]

['informal term for bodily fluid', 'individuals who engage in self-pleasure', 'rude person', 'troublesome individual', 'informal term for a sexual position', 'informal term for an individual', 'rude individual', 'bothersome individual', 'offensive term for a violent act', 'humorous phrase for engaging in sexual activity', 'foolish individual', 'informal term for a Black individuals', 'bothersome individual', 'informal term for a Black person', 'male genitalia', 'informal term for being intoxicated or altered', 'bothersome individuals', 'foolish individual', 'humorous situation involving the toilet', 'Black individual', 'confused or chaotic', 'informal term for a boy with feminine traits', 'derogatory term for individuals', 'foolish individuals', 'informal term for male genitalia', 'punishment (historical context)', 'derogatory term for a woman', 'flamboyant individual', 'bothersome individual', 'derogatory term for a sexually active person', 'informal term for someone who is irritating

  0%|          | 0/10 [00:00<?, ?it/s]

['informal term for bodily fluid', 'individuals who engage in self-pleasure', 'rude person', 'troublesome individual', 'informal term for a sexual position', 'informal term for an individual', 'rude individual', 'bothersome individual', 'offensive term for a violent act', 'humorous phrase for engaging in sexual activity', 'foolish individual', 'informal term for a Black individuals', 'bothersome individual', 'informal term for a Black person', 'male genitalia', 'informal term for being intoxicated or altered', 'bothersome individuals', 'foolish individual', 'humorous situation involving the toilet', 'Black individual', 'confused or chaotic', 'informal term for a boy with feminine traits', 'derogatory term for individuals', 'foolish individuals', 'informal term for male genitalia', 'punishment (historical context)', 'derogatory term for a woman', 'flamboyant individual', 'bothersome individual', 'derogatory term for a sexually active person', 'informal term for someone who is irritating

  0%|          | 0/10 [00:00<?, ?it/s]

['informal term for bodily fluid', 'individuals who engage in self-pleasure', 'rude person', 'troublesome individual', 'informal term for a sexual position', 'informal term for an individual', 'rude individual', 'bothersome individual', 'offensive term for a violent act', 'humorous phrase for engaging in sexual activity', 'foolish individual', 'informal term for a Black individuals', 'bothersome individual', 'informal term for a Black person', 'male genitalia', 'informal term for being intoxicated or altered', 'bothersome individuals', 'foolish individual', 'humorous situation involving the toilet', 'Black individual', 'confused or chaotic', 'informal term for a boy with feminine traits', 'derogatory term for individuals', 'foolish individuals', 'informal term for male genitalia', 'punishment (historical context)', 'derogatory term for a woman', 'flamboyant individual', 'bothersome individual', 'derogatory term for a sexually active person', 'informal term for someone who is irritating

  0%|          | 0/10 [00:00<?, ?it/s]

['informal term for bodily fluid', 'individuals who engage in self-pleasure', 'rude person', 'troublesome individual', 'informal term for a sexual position', 'informal term for an individual', 'rude individual', 'bothersome individual', 'offensive term for a violent act', 'humorous phrase for engaging in sexual activity', 'foolish individual', 'informal term for a Black individuals', 'bothersome individual', 'informal term for a Black person', 'male genitalia', 'informal term for being intoxicated or altered', 'bothersome individuals', 'foolish individual', 'humorous situation involving the toilet', 'Black individual', 'confused or chaotic', 'informal term for a boy with feminine traits', 'derogatory term for individuals', 'foolish individuals', 'informal term for male genitalia', 'punishment (historical context)', 'derogatory term for a woman', 'flamboyant individual', 'bothersome individual', 'derogatory term for a sexually active person', 'informal term for someone who is irritating

  0%|          | 0/10 [00:00<?, ?it/s]

['informal term for bodily fluid', 'individuals who engage in self-pleasure', 'rude person', 'troublesome individual', 'informal term for a sexual position', 'informal term for an individual', 'rude individual', 'bothersome individual', 'offensive term for a violent act', 'humorous phrase for engaging in sexual activity', 'foolish individual', 'informal term for a Black individuals', 'bothersome individual', 'informal term for a Black person', 'male genitalia', 'informal term for being intoxicated or altered', 'bothersome individuals', 'foolish individual', 'humorous situation involving the toilet', 'Black individual', 'confused or chaotic', 'informal term for a boy with feminine traits', 'derogatory term for individuals', 'foolish individuals', 'informal term for male genitalia', 'punishment (historical context)', 'derogatory term for a woman', 'flamboyant individual', 'bothersome individual', 'derogatory term for a sexually active person', 'informal term for someone who is irritating

  0%|          | 0/10 [00:00<?, ?it/s]

['informal term for bodily fluid', 'individuals who engage in self-pleasure', 'rude person', 'troublesome individual', 'informal term for a sexual position', 'informal term for an individual', 'rude individual', 'bothersome individual', 'offensive term for a violent act', 'humorous phrase for engaging in sexual activity', 'foolish individual', 'informal term for a Black individuals', 'bothersome individual', 'informal term for a Black person', 'male genitalia', 'informal term for being intoxicated or altered', 'bothersome individuals', 'foolish individual', 'humorous situation involving the toilet', 'Black individual', 'confused or chaotic', 'informal term for a boy with feminine traits', 'derogatory term for individuals', 'foolish individuals', 'informal term for male genitalia', 'punishment (historical context)', 'derogatory term for a woman', 'flamboyant individual', 'bothersome individual', 'derogatory term for a sexually active person', 'informal term for someone who is irritating

  0%|          | 0/10 [00:00<?, ?it/s]

['informal term for bodily fluid', 'individuals who engage in self-pleasure', 'rude person', 'troublesome individual', 'informal term for a sexual position', 'informal term for an individual', 'rude individual', 'bothersome individual', 'offensive term for a violent act', 'humorous phrase for engaging in sexual activity', 'foolish individual', 'informal term for a Black individuals', 'bothersome individual', 'informal term for a Black person', 'male genitalia', 'informal term for being intoxicated or altered', 'bothersome individuals', 'foolish individual', 'humorous situation involving the toilet', 'Black individual', 'confused or chaotic', 'informal term for a boy with feminine traits', 'derogatory term for individuals', 'foolish individuals', 'informal term for male genitalia', 'punishment (historical context)', 'derogatory term for a woman', 'flamboyant individual', 'bothersome individual', 'derogatory term for a sexually active person', 'informal term for someone who is irritating

There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


{'train_runtime': 1387.5817, 'train_samples_per_second': 36.899, 'train_steps_per_second': 0.144, 'train_loss': 2.077594141960144, 'epoch': 40.0}


TrainOutput(global_step=200, training_loss=2.077594141960144, metrics={'train_runtime': 1387.5817, 'train_samples_per_second': 36.899, 'train_steps_per_second': 0.144, 'total_flos': 1348979694612480.0, 'train_loss': 2.077594141960144, 'epoch': 40.0})

In [32]:
prefix = 'Translate from profanity to non profanity: '
input_text = "nigga"
prefixed_input_text = prefix + input_text
# Tokenize the input text
encoding = tokenizer(input_text, max_length=128, truncation=True, padding='max_length', return_tensors='pt')

# Get the input tensor
input_tensor = encoding['input_ids'].to('cpu')
attention_mask = encoding['attention_mask']. to('cpu')# Use the 'input_ids' field for the input tensor
model.eval().to('cpu')
# Get model output
with torch.no_grad():
    outputs = model.generate(input_tensor, attention_mask = attention_mask, max_length = 128)
# Decoding the model output (assuming outputs are logits)
decoded_sentence = tokenizer.batch_decode(outputs, skip_special_tokens=True)
print( f"profanity sentence: {prefixed_input_text}" , '\n'
    f"Non-profanity sentence: {decoded_sentence}")

profanity sentence: Translate from profanity to non profanity: nigga 
Non-profanity sentence: ['informal term for Black individuals']


In [19]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import evaluate

# Initialize metrics
bleu_metric = evaluate.load("bleu")
rouge_metric = evaluate.load("rouge")


# Prepare data lists
predictions = []
references = []

def translate_sentence(sentence, max_length=128):
    prefix = 'Translate from profanity to non profanity: '
    prefixed_input_text = prefix + sentence
    
    # Tokenize the input text
    encoding = tokenizer(prefixed_input_text, max_length=max_length, truncation=True, padding='max_length', return_tensors='pt')
    input_tensor = encoding['input_ids'].to('cpu')
    attention_mask = encoding['attention_mask'].to('cpu')
    
    model.eval()
    # Generate translation
    with torch.no_grad():
        outputs = model.generate(input_tensor, attention_mask=attention_mask, max_length=max_length, pad_token_id=tokenizer.eos_token_id)
    
    # Decode the generated tokens
    decoded_sentence = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return decoded_sentence

# Evaluate on test set
for example in train_dataset['test']:
    source_sentence = example['profanity']  # replace with source language
    reference_sentence = example['non_profanity']  # replace with target language
    
    predicted_sentence = translate_sentence(source_sentence)
    predictions.append(predicted_sentence)
    references.append([reference_sentence])  # Wrap in list for BLEU compatibility

# Calculate BLEU
result = bleu_metric.compute(predictions=predictions, references=references)
print("BLEU score:", result['bleu'])

# Calculate ROUGE
rouge = rouge_metric.compute(predictions=predictions, references=[ref[0] for ref in references])
print("ROUGE scores:", rouge)


BLEU score: 0.285825929650852
ROUGE scores: {'rouge1': 0.4489743900911183, 'rouge2': 0.3499548021423022, 'rougeL': 0.4458872255195785, 'rougeLsum': 0.44696960719019563}


In [20]:

from transformers import GPT2Tokenizer, GPT2LMHeadModel 
gpt_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
gpt_model = GPT2LMHeadModel.from_pretrained('gpt2')
gpt_tokenizer.pad_token = gpt_tokenizer.eos_token
separator_token = "<|translate|>"
tokenizer.add_tokens([separator_token])
model.resize_token_embeddings(len(tokenizer))

Embedding(32101, 768)

In [21]:
def gpt_preprocess_function(examples, tokenizer, input_length=128):
    # Access 'profanity' and 'non_profanity' as lists
    inputs = [text for text in examples['profanity']]
    targets = examples['non_profanity']
    translation = [f"{input} {separator_token} {target}" for input, target in zip(inputs, targets)]
    # Tokenize inputs and targets
    model_inputs = tokenizer(translation, max_length=128, truncation=True, padding="max_length")
    model_inputs["labels"] = model_inputs["input_ids"].copy()  # Copy inputs to labels for language modeling
    return model_inputs

In [22]:


bleu_metric = evaluate.load("bleu")
rouge_metric = evaluate.load("rouge")


def gpt_compute_metrics(eval_preds):
    preds, labels = eval_preds


    # Replace -100 in labels with pad_token_id for decoding
    labels = np.where(labels != -100, labels, gpt_tokenizer.pad_token_id)

    # Decode predictions and labels
    decoded_preds = gpt_tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = gpt_tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Ensure each label is a list of one reference for BLEU
    decoded_labels = [label.strip() for label in decoded_labels]

    # Strip whitespace from decoded predictions
    decoded_preds = [pred.strip() for pred in decoded_preds]
    # Compute BLEU score
    result = bleu_metric.compute(predictions=decoded_preds, references=decoded_labels)
    rouge_result = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels)

    return {
    "bleu": result["bleu"],
    "rouge1": rouge_result["rouge1"],
    "rouge2": rouge_result["rouge2"],
    "rougeL": rouge_result["rougeL"],
    }

In [23]:
tokenized_gpt_train_dataset = train_dataset['train'].map(gpt_preprocess_function,
            batch_size =10000,
            batched=True,
            fn_kwargs={
                'tokenizer': gpt_tokenizer,
                'input_length': 128,
            },
            remove_columns= ['profanity', 'non_profanity'],
            writer_batch_size = 128)
tokenized_gpt_test_dataset = train_dataset['test'].map(gpt_preprocess_function,
            batch_size =10000,
            batched=True,
            fn_kwargs={
                'tokenizer': gpt_tokenizer,
                'input_length': 128,
            },
            remove_columns= ['profanity', 'non_profanity'],
            writer_batch_size = 128)


Map:   0%|          | 0/1278 [00:00<?, ? examples/s]

Map:   0%|          | 0/320 [00:00<?, ? examples/s]

In [24]:
gpt_training_args = TrainingArguments(
    output_dir='Users/anthai/Library/CloudStorage/OneDrive-UTS/SPR2024/Advanced NLP/Assignments/Assignment 3/gpt_results',
    eval_strategy="steps",
    eval_steps=10,
    per_device_train_batch_size=32,
    per_device_eval_batch_size =32,
    learning_rate=0.00005,
    weight_decay=0.001,
    gradient_accumulation_steps=8,

    logging_dir='Users/anthai/Library/CloudStorage/OneDrive-UTS/SPR2024/Advanced NLP/Assignments/Assignment 3/gpt_logs',
    logging_steps=10,
    max_steps= 200,
    save_steps=10,
    load_best_model_at_end=True,
    greater_is_better=False,
    report_to="tensorboard",

)

# Initialize the Trainer with AdaFactor optimizer
gpt_optimizer = torch.optim.AdamW(
    gpt_model.parameters(),
    lr=0.00005,
    betas=(0.9, 0.999),
    eps=1e-8,
    weight_decay=0.01
)

gpt_trainer = Trainer(
    model=gpt_model,
    args=gpt_training_args,
    train_dataset=tokenized_gpt_train_dataset,
    eval_dataset=tokenized_gpt_test_dataset,
    optimizers=(gpt_optimizer, None),
    # Provide optimizer in the tuple format
    data_collator=DataCollatorForLanguageModeling(tokenizer=gpt_tokenizer, mlm=False),
    #compute_metrics=gpt_compute_metrics
)

# Start training
gpt_trainer.train()

max_steps is given, it will override any value given in num_train_epochs


  0%|          | 0/200 [00:00<?, ?it/s]

{'loss': 3.5989, 'grad_norm': 7.136495113372803, 'learning_rate': 4.75e-05, 'epoch': 2.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 2.108588695526123, 'eval_runtime': 5.5487, 'eval_samples_per_second': 57.672, 'eval_steps_per_second': 1.802, 'epoch': 2.0}
{'loss': 1.9882, 'grad_norm': 5.222328186035156, 'learning_rate': 4.5e-05, 'epoch': 4.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 1.7756900787353516, 'eval_runtime': 5.6218, 'eval_samples_per_second': 56.921, 'eval_steps_per_second': 1.779, 'epoch': 4.0}
{'loss': 1.6804, 'grad_norm': 3.449796199798584, 'learning_rate': 4.25e-05, 'epoch': 6.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 1.6166467666625977, 'eval_runtime': 5.4985, 'eval_samples_per_second': 58.198, 'eval_steps_per_second': 1.819, 'epoch': 6.0}
{'loss': 1.4921, 'grad_norm': 3.134739875793457, 'learning_rate': 4e-05, 'epoch': 8.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 1.5180598497390747, 'eval_runtime': 5.5438, 'eval_samples_per_second': 57.722, 'eval_steps_per_second': 1.804, 'epoch': 8.0}
{'loss': 1.3668, 'grad_norm': 3.504802942276001, 'learning_rate': 3.7500000000000003e-05, 'epoch': 10.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 1.462376594543457, 'eval_runtime': 5.5231, 'eval_samples_per_second': 57.938, 'eval_steps_per_second': 1.811, 'epoch': 10.0}
{'loss': 1.2796, 'grad_norm': 3.1429784297943115, 'learning_rate': 3.5e-05, 'epoch': 12.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 1.4223439693450928, 'eval_runtime': 5.5692, 'eval_samples_per_second': 57.459, 'eval_steps_per_second': 1.796, 'epoch': 12.0}
{'loss': 1.1961, 'grad_norm': 3.0848982334136963, 'learning_rate': 3.2500000000000004e-05, 'epoch': 14.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 1.397424340248108, 'eval_runtime': 5.4785, 'eval_samples_per_second': 58.41, 'eval_steps_per_second': 1.825, 'epoch': 14.0}
{'loss': 1.1264, 'grad_norm': 3.2065956592559814, 'learning_rate': 3e-05, 'epoch': 16.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 1.3807834386825562, 'eval_runtime': 5.5913, 'eval_samples_per_second': 57.232, 'eval_steps_per_second': 1.789, 'epoch': 16.0}
{'loss': 1.085, 'grad_norm': 4.277224540710449, 'learning_rate': 2.7500000000000004e-05, 'epoch': 18.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 1.3712090253829956, 'eval_runtime': 5.5849, 'eval_samples_per_second': 57.298, 'eval_steps_per_second': 1.791, 'epoch': 18.0}
{'loss': 1.0335, 'grad_norm': 2.9435415267944336, 'learning_rate': 2.5e-05, 'epoch': 20.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 1.3625977039337158, 'eval_runtime': 5.4935, 'eval_samples_per_second': 58.25, 'eval_steps_per_second': 1.82, 'epoch': 20.0}
{'loss': 0.9894, 'grad_norm': 2.8632659912109375, 'learning_rate': 2.25e-05, 'epoch': 22.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 1.355841875076294, 'eval_runtime': 5.5113, 'eval_samples_per_second': 58.062, 'eval_steps_per_second': 1.814, 'epoch': 22.0}
{'loss': 0.9539, 'grad_norm': 3.068986415863037, 'learning_rate': 2e-05, 'epoch': 24.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 1.3560172319412231, 'eval_runtime': 5.5363, 'eval_samples_per_second': 57.8, 'eval_steps_per_second': 1.806, 'epoch': 24.0}
{'loss': 0.9299, 'grad_norm': 3.0194973945617676, 'learning_rate': 1.75e-05, 'epoch': 26.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 1.356813669204712, 'eval_runtime': 5.5077, 'eval_samples_per_second': 58.1, 'eval_steps_per_second': 1.816, 'epoch': 26.0}
{'loss': 0.903, 'grad_norm': 2.7766149044036865, 'learning_rate': 1.5e-05, 'epoch': 28.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 1.3575448989868164, 'eval_runtime': 5.5211, 'eval_samples_per_second': 57.96, 'eval_steps_per_second': 1.811, 'epoch': 28.0}
{'loss': 0.8839, 'grad_norm': 3.090280055999756, 'learning_rate': 1.25e-05, 'epoch': 30.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 1.3600956201553345, 'eval_runtime': 5.5423, 'eval_samples_per_second': 57.737, 'eval_steps_per_second': 1.804, 'epoch': 30.0}
{'loss': 0.8621, 'grad_norm': 2.9484426975250244, 'learning_rate': 1e-05, 'epoch': 32.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 1.3574953079223633, 'eval_runtime': 5.4803, 'eval_samples_per_second': 58.391, 'eval_steps_per_second': 1.825, 'epoch': 32.0}
{'loss': 0.8532, 'grad_norm': 2.4967153072357178, 'learning_rate': 7.5e-06, 'epoch': 34.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 1.3555437326431274, 'eval_runtime': 5.5268, 'eval_samples_per_second': 57.9, 'eval_steps_per_second': 1.809, 'epoch': 34.0}
{'loss': 0.8507, 'grad_norm': 3.3690590858459473, 'learning_rate': 5e-06, 'epoch': 36.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 1.3569778203964233, 'eval_runtime': 5.5229, 'eval_samples_per_second': 57.941, 'eval_steps_per_second': 1.811, 'epoch': 36.0}
{'loss': 0.837, 'grad_norm': 2.5620856285095215, 'learning_rate': 2.5e-06, 'epoch': 38.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 1.3571916818618774, 'eval_runtime': 5.5078, 'eval_samples_per_second': 58.1, 'eval_steps_per_second': 1.816, 'epoch': 38.0}
{'loss': 0.8325, 'grad_norm': 2.328972578048706, 'learning_rate': 0.0, 'epoch': 40.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 1.3575884103775024, 'eval_runtime': 5.587, 'eval_samples_per_second': 57.276, 'eval_steps_per_second': 1.79, 'epoch': 40.0}


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


{'train_runtime': 3304.9122, 'train_samples_per_second': 15.492, 'train_steps_per_second': 0.061, 'train_loss': 1.2371238231658936, 'epoch': 40.0}


TrainOutput(global_step=200, training_loss=1.2371238231658936, metrics={'train_runtime': 3304.9122, 'train_samples_per_second': 15.492, 'train_steps_per_second': 0.061, 'total_flos': 3339312168960000.0, 'train_loss': 1.2371238231658936, 'epoch': 40.0})

In [28]:
import torch

# Prefix and input text
pad_token_id = tokenizer.eos_token_id 
input_text = "you mothafucka"
def translate_sentence(sentence, max_length=128):
    # Prepare the input sequence with the separator token
    input_text = f"{sentence} {separator_token}"
    inputs = gpt_tokenizer(input_text, return_tensors="pt")
    attention_mask = inputs["attention_mask"]
    # Generate the output sequence
    outputs = gpt_model.to('cpu').generate(
        inputs["input_ids"],
        attention_mask = inputs["attention_mask"],
        pad_token_id = pad_token_id,
        no_repeat_ngram_size=2,
        temperature = 0.8,
    )

    # Decode the generated tokens to get the translated text
    translation = gpt_tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    # Extract only the target text, removing the source and separator
    translated_text = translation.split(separator_token)[-1].strip()
    return translated_text

# Test the translation function
translated_text = translate_sentence(input_text)
print("Source:", input_text)
print("Translation:", translated_text)

Source: you mothafucka
Translation: bothersome individual (not recommended) who exploits


In [27]:
import evaluate

bleu_metric = evaluate.load("bleu")
rouge_metric = evaluate.load("rouge")
predictions = []
references = []

for example in train_dataset['test']:
    source_sentence = example['profanity']  # replace with source language
    reference_sentence = example['non_profanity']  # replace with target language
    predicted_sentence = translate_sentence(source_sentence)
    
    predictions.append(predicted_sentence)
    references.append([reference_sentence])  # Wrap in list for BLEU compatibility

# Calculate BLEU
result = bleu_metric.compute(predictions=predictions, references=references)

print("BLEU score:", result['bleu'])

# Calculate ROUGE

rouge = rouge_metric.compute(predictions=predictions, references=[ref[0] for ref in references])
print("ROUGE scores:", rouge)

BLEU score: 0.10528400688146479
ROUGE scores: {'rouge1': 0.33736451310718474, 'rouge2': 0.22351533398178627, 'rougeL': 0.3324618942530856, 'rougeLsum': 0.33244531277921896}
