In [3]:
import torch
import random
import torch.nn as nn
import torch.nn.functional as F
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from torchtext.datasets import WikiText103
from tqdm import tqdm
import spacy
import matplotlib.pyplot as plt
import transformers
import numpy as np
import pickle
import pandas as pd
from spacy.cli.download import download
import time
from typing import List, Optional, Tuple, Dict

from io_utils import *
from toxic_suppression_wrapper import GPT2Wrapper
from toxicity_scoring import toxicity_scoring
from perplexity import compute_ppl

import os

# Key forwarded to toxicity_scoring(api_key=...) in the scoring cells below.
# It is read from the environment so the secret never has to be saved inside
# the notebook; when TOXICITY_API_KEY is unset this falls back to the original
# empty string.
API_KEY = os.environ.get("TOXICITY_API_KEY", "")

# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Single source of truth for the checkpoint name: both the tokenizer and the
# wrapped model are built from MODEL_NAME so they cannot drift apart.
MODEL_NAME = "gpt2-medium"
tokenizer = GPT2Tokenizer.from_pretrained(MODEL_NAME)
wrapper = GPT2Wrapper(model_name = MODEL_NAME, use_cuda = True)

In [25]:
# Selected value vectors: layer number -> list of indices within that layer.
# Each (layer, index) pair is handed to wrapper.project_value_to_vocab below.
non_toxic_values = {
    13: [1852],
    14: [72, 1394],
    15: [215],
    16: [461, 3208, 4060],
    17: [2920],
    18: [1890],
    22: [3769],
}

# Print the vocabulary projection of every selected value vector.
for layer, indices in non_toxic_values.items():
    for idx in indices:
        print(f"VALUE LAYER {layer} IDX {idx}")
        print(wrapper.project_value_to_vocab(layer, idx))
        print("\n")

# Persist the selection; later cells load it back from this file.
with open("non_toxic_values.pickle", "wb") as pickle_file:
    pickle.dump(non_toxic_values, pickle_file, protocol = pickle.HIGHEST_PROTOCOL)

VALUE LAYER 13 IDX 1852
[(' transparency', 0.7804232), (' disclosure', 0.08604208), (' clearer', 0.031260245), (' humility', 0.018021714), ('parency', 0.01392095), ('iquette', 0.013061815), (' better', 0.006321151), (' modesty', 0.0062694233), (' transparent', 0.004169269), (' safer', 0.0038504757)]


VALUE LAYER 14 IDX 72
[(' reconc', 0.9182025), (' respectful', 0.029803287), (' healthy', 0.008752354), (' taxp', 0.007006365), (' gracious', 0.0039775595), (' decent', 0.003406394), (' fair', 0.0031924762), (' modesty', 0.0018224006), (' peacefully', 0.0016074623), (' peaceful', 0.0014397277)]


VALUE LAYER 14 IDX 1394
[('safe', 0.55344427), ('cart', 0.11201968), ('course', 0.064264335), (' Compact', 0.02360836), ('respect', 0.02322706), (' COUR', 0.019045586), ('safety', 0.01884989), (' neither', 0.016840337), (' Safe', 0.011104363), (' apologize', 0.010102618)]


VALUE LAYER 15 IDX 215
[(' acceptance', 0.17209195), (' refere', 0.12844867), ('Accept', 0.07810877), ('Relations', 0.065709

In [None]:
# Load the layer -> indices selection written to disk by the earlier cell.
values = pd.read_pickle("non_toxic_values.pickle")
# Build a new wrapper instance for this run.
wrapper = GPT2Wrapper(model_name = "gpt2-medium", use_cuda = True)
# compute_ppl returns two values, unpacked here as (ppl_debiased, ppl_regular).
ppl_debiased, ppl_regular = compute_ppl(tokenizer, wrapper, values_per_layer = values, 
                                        coef_value = 3, 
                                        use_cuda = True)

# Above lines equivalent to:
#
# python3 perplexity.py --model_name gpt2-medium \
#                       --values_filepath non_toxic_values.pickle \
#                       --coef_value 3 \
#                       --use_cuda
#
# (Kept as comments rather than a bare string literal: a trailing string is
# echoed as the cell's output and its backslash-newlines are swallowed.)

In [None]:
# Load the layer -> indices selection written to disk by the earlier cell.
values = pd.read_pickle("non_toxic_values.pickle")
# Build a new wrapper instance for this run.
wrapper = GPT2Wrapper(model_name = "gpt2-medium", use_cuda = True)

# Score prompts in "toxic-suppr" mode using the loaded value selection.
toxicity_scoring(prompts_filename = "prompts.jsonl", 
                 output_dir = "toxicity-suppression-results",
                 api_key = API_KEY,
                 wrapper = wrapper, 
                 values_per_layer = values,
                 challenging_only = True,
                 coef_value = 3,
                 mode = "toxic-suppr",
                 max_prompts = 100)

# Above lines equivalent to:
#
# python3 toxicity_scoring.py --prompts_filename prompts.jsonl \
#                             --output_dir toxicity-suppression-results \
#                             --api_key <API_KEY> \
#                             --model_name gpt2-medium \
#                             --values_filepath non_toxic_values.pickle \
#                             --challenging_only \
#                             --coef_value 3 \
#                             --mode toxic-suppr \
#                             --max_prompts 100 \
#                             --use_cuda
#
# (Kept as comments rather than a bare string literal: a trailing string is
# echoed as the cell's output and its backslash-newlines are swallowed.)

In [None]:
# Build a new wrapper instance for this run.
wrapper = GPT2Wrapper(model_name = "gpt2-medium", use_cuda = True)

# Score prompts in "word-filter" mode; no value selection or coefficient is
# passed in this mode.
toxicity_scoring(prompts_filename = "prompts.jsonl", 
                 output_dir = "toxicity-suppression-results",
                 api_key = API_KEY,
                 wrapper = wrapper, 
                 challenging_only = True,
                 mode = "word-filter",
                 max_prompts = 100)

# Above lines equivalent to:
#
# python3 toxicity_scoring.py --prompts_filename prompts.jsonl \
#                             --output_dir toxicity-suppression-results \
#                             --api_key <API_KEY> \
#                             --model_name gpt2-medium \
#                             --challenging_only \
#                             --mode word-filter \
#                             --max_prompts 100 \
#                             --use_cuda
#
# (Kept as comments rather than a bare string literal: a trailing string is
# echoed as the cell's output and its backslash-newlines are swallowed.)