################################

  ## PREP: install packages ##
    
################################

In [None]:
!conda create --prefix="/local_path_to_save_env/"  python=3.10
!conda activate "/local_path_to_save_env/"
!pip install trl
!pip install transformer
!pip install torch torchvision torchaudio
!pip install peft

################################

  ## Load Modules ##
    
################################

In [None]:
from dataclasses import dataclass, field
from typing import Optional

import torch
from datasets import load_dataset
from torch.optim import Adam
from tqdm import tqdm
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    HfArgumentParser,
    RobertaForSequenceClassification,
    RobertaTokenizer,
)

from trl import AutoModelForCausalLMWithValueHead, PPOConfig, PPOTrainer, create_reference_model, set_seed
from trl.core import LengthSampler

In [2]:
tqdm.pandas()
set_seed(42)

################################

  ## 1. funcs for Data Preparation ##
    
################################

In [4]:
def load_toxic_dataset(dataset_name: str, toxicity_threshold=0.3, cache_dir=None):
    """
    Load a dataset from huggingface by using `load_dataset`
    

    Args:
        dataset_name (`str`): a dataset to be loaded from huggingface.
        toxicity_threshold (`float32`): threshold to determine if an input is toxic.
        cache_dir (`str`): path to cache pretrained LLM.
    
    Returns:
        a dataset
    """
    if dataset_name == "allenai/real-toxicity-prompts":
        ds = load_dataset(dataset_name, split="train")

        def select_toxic(sample):
            toxicity = sample["prompt"]["toxicity"]
            return toxicity is not None and toxicity > toxicity_threshold
    elif dataset_name == "jigsaw_unintended_bias":
        dataset_name = "./dataset/test_public_expanded.csv"
        ds = load_dataset("csv", data_files=dataset_name)['train']
        def select_toxic(sample):
            toxicity = sample["toxicity"]
            return toxicity is not None and toxicity > toxicity_threshold
    else:
        raise ValueError("No such dataset used in the experiment.")

    ds = ds.filter(select_toxic, batched=False)
    return ds

def get_tokenized(tokenizer, dataset, dataset_name, min_text_length, max_text_length):
    """
    Build dataset for training. This builds the dataset from `load_dataset`, one should
    customize this function to train the model on its own dataset.

    Args:
        tokenizer : a huggingface tokenizer for text tokenization.
        dataset (`dataset.Dataset`): A huggingface dataset to be loaded.
        dataset_name (`str`): dataset name which helps to determine the way to tokenize.
        min_text_length (`int`): minimal length of input.
        max_text_length (`int`): maximal length of input.
        
    Returns:
        dataloader (`torch.utils.data.DataLoader`): a dataloader for the dataset used in the training loop.
    """
    
    # sample by input length
    input_size = LengthSampler(min_text_length, max_text_length)

    def tokenize_real_toxicity_prompts(sample):
        prompt = sample["prompt"]["text"]
        continuation = sample["continuation"]["text"]

        sample["input_ids"] = tokenizer.encode(prompt + continuation)[: input_size()]
        sample["query"] = tokenizer.decode(sample["input_ids"])
        return sample
    
    def tokenize_jigsaw_unintended_bias(sample):
        prompt = sample["comment_text"]

        sample["input_ids"] = tokenizer.encode(prompt)[: input_size()]
        sample["query"] = tokenizer.decode(sample["input_ids"])
        return sample
    
    if dataset_name == "allenai/real-toxicity-prompts":
        dataset = dataset.map(tokenize_real_toxicity_prompts, batched=False)
    else:
        dataset = dataset.map(tokenize_jigsaw_unintended_bias, batched=False)
    
    dataset.set_format(type="torch")

    return dataset

########################################

  ## 2. funcs for Load Model and Tokenizer ##
    
########################################

In [5]:
def load_pretrained_model_tokenizer(model_name_or_path, device="cpu", cache_dir=None):
    """Loads a trained model from the given model name or path."""
    tokenizer = get_tokenizer(model_name_or_path, cache_dir=cache_dir)
    model = AutoModelForCausalLM.from_pretrained(
        model_name_or_path,
        cache_dir=cache_dir,  # change to location you want to store the pretrained-model
        pad_token_id=tokenizer.eos_token_id,
        # torch_dtype=torch.bfloat16 ## use torch.bfloat16 to save memory
    )
    model = model.to(device)
    return model, tokenizer
def get_tokenizer(model_name_or_path, cache_dir):
    if any(k in model_name_or_path for k in ("gpt", "opt", "bloom")):
        padding_side = "left"
    else:
        padding_side = "right"
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, padding_side=padding_side, cache_dir=cache_dir)
    tokenizer.pad_token = tokenizer.eos_token
    return tokenizer

########################################

  ## 3. Prepara data and model for experiments ##
    
########################################

In [7]:
# experiment config 
min_text_length = 20
max_text_length = 40
cache_dir = "./cache"
model_name_or_path = "EleutherAI/gpt-neo-125m"
device = "cuda"

# load model and tokenizer
model, tokenizer = load_pretrained_model_tokenizer(model_name_or_path, device=device, cache_dir=cache_dir)

# load dataloader for dataset: "allenai/real-toxicity-prompts" or "jigsaw_unintended_bias"
# dataset_name = "allenai/real-toxicity-prompts" # either "jigsaw_unintended_bias" or "allenai/real-toxicity-prompts"
dataset_name = "jigsaw_unintended_bias"
toxicity_threshold = 0.3
test_ratio = 0.2
dataset = load_toxic_dataset(dataset_name=dataset_name, toxicity_threshold=toxicity_threshold, cache_dir=cache_dir)
dataset = dataset.train_test_split(test_size=test_ratio, shuffle=False)
train_dataset = get_tokenized(tokenizer, dataset['train'], dataset_name, min_text_length, max_text_length)
test_dataset = dataset['test']

Map: 100%|██████████| 8781/8781 [00:07<00:00, 1239.91 examples/s]


In [8]:
train_dataset

Dataset({
    features: ['id', 'comment_text', 'created_date', 'publication_id', 'parent_id', 'article_id', 'rating', 'funny', 'wow', 'sad', 'likes', 'disagree', 'toxicity', 'severe_toxicity', 'obscene', 'sexual_explicit', 'identity_attack', 'insult', 'threat', 'identity_annotator_count', 'toxicity_annotator_count', 'male', 'female', 'transgender', 'other_gender', 'heterosexual', 'homosexual_gay_or_lesbian', 'bisexual', 'other_sexual_orientation', 'christian', 'jewish', 'muslim', 'hindu', 'buddhist', 'atheist', 'other_religion', 'black', 'white', 'asian', 'latino', 'other_race_or_ethnicity', 'physical_disability', 'intellectual_or_learning_disability', 'psychiatric_or_mental_illness', 'other_disability', 'input_ids', 'query'],
    num_rows: 8781
})

In [9]:
test_dataset

Dataset({
    features: ['id', 'comment_text', 'created_date', 'publication_id', 'parent_id', 'article_id', 'rating', 'funny', 'wow', 'sad', 'likes', 'disagree', 'toxicity', 'severe_toxicity', 'obscene', 'sexual_explicit', 'identity_attack', 'insult', 'threat', 'identity_annotator_count', 'toxicity_annotator_count', 'male', 'female', 'transgender', 'other_gender', 'heterosexual', 'homosexual_gay_or_lesbian', 'bisexual', 'other_sexual_orientation', 'christian', 'jewish', 'muslim', 'hindu', 'buddhist', 'atheist', 'other_religion', 'black', 'white', 'asian', 'latino', 'other_race_or_ethnicity', 'physical_disability', 'intellectual_or_learning_disability', 'psychiatric_or_mental_illness', 'other_disability'],
    num_rows: 2196
})

In [10]:
## 1. create reference model using traditional fine tuning
model = AutoModelForCausalLMWithValueHead.from_pretrained(model)

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [11]:
num_layers = len(model.pretrained_model.transformer.h)
print(f"number of layers in total: {num_layers}")
num_shared_layers = num_layers - 8 ## tuning on last two layers and freeze other layers
reference_model = create_reference_model(model, num_shared_layers=num_shared_layers)

number of layers in total: 12


In [12]:
## 2. create reference model using peft
from peft import LoraConfig

lora_config = LoraConfig(
    r=128,
    lora_alpha=64,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

model = AutoModelForCausalLMWithValueHead.from_pretrained(model, peft_config=lora_config)

In [13]:
total_params = sum(p.numel() for p in model.parameters())
total_params_trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"#trainable parameters: {total_params_trainable}")
print(f"ratio of trainable parameters: {total_params_trainable/total_params}")

#trainable parameters: 56686849
ratio of trainable parameters: 0.45277267030140833


########################################

  ## 4. Proximal Policy Optimizer ##
    
########################################

In [14]:
learning_rate = (1.47e-5) * 2
mini_batch_size = 64
ppo_epochs = 100
batch_size = 64

In [15]:
optimizer = Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=learning_rate)

In [16]:
ppo_config = PPOConfig(
    model_name=model_name_or_path,
    learning_rate=learning_rate,
    ppo_epochs=ppo_epochs,
    mini_batch_size=batch_size,
    batch_size=batch_size,
    gradient_accumulation_steps=1,
)

In [17]:
ppo_trainer = PPOTrainer(
    ppo_config,
    model,
    ref_model=reference_model,
    tokenizer=tokenizer,
    dataset=train_dataset,
    data_collator=lambda data: dict((key, [d[key] for d in data]) for key in data[0]),
    optimizer=optimizer,
)

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


########################################

  ## 5. Reward Model and Scoring ##
  
  #### use a pretrained classifier to score the toxicity of an input.
    
########################################

In [18]:
toxicity_model_name_or_path = "facebook/roberta-hate-speech-dynabench-r4-target"
toxicity_tokenizer = RobertaTokenizer.from_pretrained(toxicity_model_name_or_path, cache_dir=cache_dir)
toxicity_model = RobertaForSequenceClassification.from_pretrained(toxicity_model_name_or_path, 
                                                                  cache_dir=cache_dir).to(ppo_trainer.accelerator.device)

########################################

  ## 6. Training the model to detoxify base model ##
  
    
########################################

In [None]:
max_new_tokens = 32
generation_kwargs = {
    "min_length": -1,
    "top_k": 0.0,
    "top_p": 1.0,
    "do_sample": True,
    "pad_token_id": tokenizer.eos_token_id,
    "max_new_tokens": max_new_tokens,
}

model_save_path = f"./saved_model/{model_name_or_path}_detoxified_{dataset_name.split('/')[-1]}"

for epoch, batch in tqdm(enumerate(ppo_trainer.dataloader)):
    query_inputs = batch["input_ids"]

    # Get response from the policy model
    outputs_tensors = []
    for query in query_inputs:
        response = ppo_trainer.generate(query, **generation_kwargs)
        outputs_tensors.append(response.squeeze()[-max_new_tokens:])
    batch["output"] = [tokenizer.decode(r.squeeze()) for r in outputs_tensors]

    # Compute toxicity score for each output
    toxicity_inputs = toxicity_tokenizer(batch["output"], padding=True, truncation=True, return_tensors="pt")
    toxicity_inputs = toxicity_inputs.to(ppo_trainer.accelerator.device)
    logits = toxicity_model(**toxicity_inputs).logits.float()
    toxicity_labels = (logits[:, 0]).tolist()

    rewards = [torch.tensor(output) for output in toxicity_labels]

    # Run PPO optimization step
    stats = ppo_trainer.step(query_inputs, outputs_tensors, rewards)
    ppo_trainer.log_stats(stats, batch, rewards)

    # Save model every 20 epochs
    if epoch and epoch % 20 == 0:
        if ppo_trainer.accelerator.is_main_process:
            ppo_trainer.save_pretrained(model_save_path)

########################################

  ## 7. Evaluate and Compare Results ##
    
########################################

In [22]:
import evaluate
import csv
import numpy as np
from torch.utils.data import DataLoader
from transformers import default_data_collator

In [23]:
## load toxicity from evaluation to score each model output
toxicity = evaluate.load("ybelkada/toxicity", "DaNLP/da-electra-hatespeech-detection", module_type="measurement")

In [67]:
NUM_SAMPLES_TO_TEST = 500
BATCH_SIZE = 64
context_length = 500

In [68]:
from typing import List, Union, Dict, Any
from datasets import Dataset

def tokenize_data(
    tokenizer, input_data: Union[List[str], Dataset], config: Optional[Dict[str, Any]] = None):
    """Tokenize text data.

    Args:
        input_data: The text to be tokenized.
        tokenizer: a tokenizer to tokenize.
        config: parameters for setting up the tokenization. Defaults to None.

    Returns:
        tokenized data Dict[str, Tensor]: tokenized data with input_ids, attention_masks and labels.
    """
    if not isinstance(input_data, List):
        input_data = input_data["text"]

    encoded: Dict[str, torch.Tensor] = tokenizer(
        input_data,
        padding=True,
        # truncation=True,
        return_tensors="pt",
    )
    return encoded

def tokenize_on_dataset(tokenizer, dataset: Dataset, config: Optional[Dict[str, Any]] = None):
    """main function to perform tokenization over a dataset object

    Args:
        tokenizer (PreTrainedTokenizer): a tokenizer
        dataset (Dataset): a dataset object to be tokenized, the feature *text* will be tokenized.
        config (Dict, optional): parameters for setting up the tokenization. Defaults to None.

    Returns:
        tokenized data Dict[str, Tensor]: tokenized data with input_ids, attention_masks and labels.
    """
    tokenized_dataset = dataset.map(lambda x: tokenize_data(tokenizer, x, None), batched=True, num_proc=4)
    return tokenized_dataset

In [60]:
in_dist_ds

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 1000
})

In [69]:
# in-distribution test - coming from the same dataset that we used to train our detoxified model
in_dist_ds = test_dataset
def get_text(x):
    if "real" in dataset_name:
        x['text'] = x['prompt']["text"][:context_length]
    else:
        x['text'] = x["comment_text"][:context_length]
    return x

in_dist_ds = in_dist_ds.map(get_text, batched=False)
in_dist_ds = tokenize_on_dataset(tokenizer, in_dist_ds)
in_dist_ds = in_dist_ds.remove_columns([x for x in in_dist_ds.column_names if x not in ['input_ids', "attention_mask"]])
in_dist_ds = in_dist_ds.select(range(NUM_SAMPLES_TO_TEST))
in_dist_test_dataloader = DataLoader(
        in_dist_ds, shuffle=False, collate_fn=default_data_collator, batch_size=BATCH_SIZE, pin_memory=True
)

In [70]:
#out-distribution test - toxic data (not used in training our detoxified model)
test_dataset_name = "OxAISH-AL-LLM/wiki_toxic"
out_dist_ds = load_dataset(test_dataset_name, split="test")
out_dist_ds = out_dist_ds.filter(lambda x: x["label"] == 1)
out_dist_ds = out_dist_ds.rename_columns({"comment_text":"text"})
def get_text2(x):
    x['text'] = x["text"][:context_length]
    return x
out_dist_ds = out_dist_ds.map(get_text2, batched=False)
out_dist_ds = tokenize_on_dataset(tokenizer, out_dist_ds)
out_dist_ds = out_dist_ds.remove_columns([x for x in out_dist_ds.column_names if x not in ['input_ids', "attention_mask"]])
out_dist_ds = out_dist_ds.select(range(NUM_SAMPLES_TO_TEST))
out_dist_test_dataloader = DataLoader(
        out_dist_ds, shuffle=False, collate_fn=default_data_collator, batch_size=BATCH_SIZE, pin_memory=True
)

In [71]:
#out-distribution test - benign data
benign_dataset_name = "wikitext"
benign_ds = load_dataset(benign_dataset_name, "wikitext-2-v1", split="test")
def get_text2(x):
    x['text'] = x["text"][:context_length]
    return x
benign_ds = benign_ds.map(get_text2, batched=False)
benign_ds = tokenize_on_dataset(tokenizer, benign_ds)
benign_ds = benign_ds.remove_columns([x for x in benign_ds.column_names if x not in ['input_ids', "attention_mask"]])
benign_ds = benign_ds.select(range(NUM_SAMPLES_TO_TEST))
benign_test_dataloader = DataLoader(
        benign_ds, shuffle=False, collate_fn=default_data_collator, batch_size=BATCH_SIZE, pin_memory=True
)

In [72]:
# evaluation setup
models_to_be_tested = [
    "./saved_model/EleutherAI/gpt-neo-125m_detoxified_jigsaw_unintended_bias",
    # "./saved_model/EleutherAI/gpt-neo-125m_detoxified",
    "EleutherAI/gpt-neo-125m",
    # "./saved_model/EleutherAI/gpt-neo-1.3b_detoxified",
    # "EleutherAI/gpt-neo-1.3b",
]
datasets_to_be_tested = {
    dataset_name: in_dist_test_dataloader,
    test_dataset_name: out_dist_test_dataloader,
    benign_dataset_name: benign_test_dataloader
}
output_file = f"./results/toxicity_{dataset_name.split('/')[-1]}.csv"
max_new_tokens = 50

In [73]:
# open a csv file for writing results
file = open(f"{output_file}", "a", newline="")
writer = csv.writer(file)
# add first rows
writer.writerow(["model_id", "dataset_id", "mean_toxicity", "std_toxicity"])

for model_id in tqdm(models_to_be_tested):
    model = None
    torch.cuda.empty_cache()
    toxicities = {}

    if "saved_model" in model_id: # detoxified model
        model = AutoModelForCausalLM.from_pretrained(model_id, 
                                                     device_map={"": device}, 
                                                    )
        tokenizer = AutoTokenizer.from_pretrained(model_id)
    else: # base model (before detoxifying)
        model, tokenizer = load_pretrained_model_tokenizer(model_id, device=device, cache_dir=cache_dir)

    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "left"
    
    for dataset_test in datasets_to_be_tested:
        ds_data_loader = datasets_to_be_tested[dataset_test]
        for inputs in ds_data_loader:
            inputs['input_ids'] = torch.tensor(inputs['input_ids']).to(device)
            inputs['attention_mask'] = torch.tensor(inputs['attention_mask']).to(device)
            seq_length = inputs['input_ids'].size(1)
            outputs = model.generate(**inputs, do_sample=True, max_new_tokens=max_new_tokens, use_cache=True)
            generated_texts = tokenizer.batch_decode(outputs[:, seq_length:], skip_special_tokens=True)
            toxicity_score = toxicity.compute(predictions=generated_texts)

            if dataset_test not in toxicities:
                toxicities[dataset_test] = []
            toxicities[dataset_test].extend(toxicity_score["toxicity"])

        # compute mean & std using np
        mean = np.mean(toxicities[dataset_test])
        std = np.std(toxicities[dataset_test])

        # save to file
        writer.writerow([model_id, dataset_test, mean, std])

        # print
        print(f"Model: {model_id} - Dataset: {dataset_test} - Mean: {mean} - Std: {std}")

# close file
file.close()

  0%|          | 0/2 [00:00<?, ?it/s]Some weights of the model checkpoint at ./saved_model/EleutherAI/gpt-neo-125m_detoxified_jigsaw_unintended_bias were not used when initializing GPTNeoForCausalLM: ['v_head.summary.weight', 'v_head.summary.bias']
- This IS expected if you are initializing GPTNeoForCausalLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing GPTNeoForCausalLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  inputs['input_ids'] = torch.tensor(inputs['input_ids']).to(device)
  inputs['attention_mask'] = torch.tensor(inputs['attention_mask']).to(device)


Model: ./saved_model/EleutherAI/gpt-neo-125m_detoxified_jigsaw_unintended_bias - Dataset: jigsaw_unintended_bias - Mean: 0.029199765555502383 - Std: 0.12521923485779027


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Model: ./saved_model/EleutherAI/gpt-neo-125m_detoxified_jigsaw_unintended_bias - Dataset: OxAISH-AL-LLM/wiki_toxic - Mean: 0.029696088243625126 - Std: 0.10492667473797328


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='le

Model: ./saved_model/EleutherAI/gpt-neo-125m_detoxified_jigsaw_unintended_bias - Dataset: wikitext - Mean: 0.018494784566166347 - Std: 0.09972834850533008
Model: EleutherAI/gpt-neo-125m - Dataset: jigsaw_unintended_bias - Mean: 0.3117448237745557 - Std: 0.3931699463907727


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Model: EleutherAI/gpt-neo-125m - Dataset: OxAISH-AL-LLM/wiki_toxic - Mean: 0.3602497966245282 - Std: 0.4126089161176549


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='le

Model: EleutherAI/gpt-neo-125m - Dataset: wikitext - Mean: 0.10746023948129732 - Std: 0.2360730800055794



