## Install requirments

In [1]:
!pip install jailbreakbench
!pip install datasets

Collecting jailbreakbench
  Downloading jailbreakbench-0.1.3-py3-none-any.whl (77 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.7/77.7 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting litellm~=1.20.0 (from jailbreakbench)
  Downloading litellm-1.20.9-py3-none-any.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pandas<3.0.0,>=2.2.0 (from jailbreakbench)
  Downloading pandas-2.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.0/13.0 MB[0m [31m45.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting protobuf<5.0.0,>=4.25.3 (from jailbreakbench)
  Downloading protobuf-4.25.3-cp37-abi3-manylinux2014_x86_64.whl (294 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.6/294.6 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece<1.0.

## Import

In [2]:
import pandas as pd
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import jailbreakbench as jbb
from datasets import load_dataset
import pandas as pd
import json
import os

## Constants

In [None]:
DATA_DIR="/content/drive/MyDrive/prompt_security_code"

## Functions

In [3]:
def calculate_perplexity(sentence, model_name='gpt2'):
    # Load pre-trained model and tokenizer
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    model = GPT2LMHeadModel.from_pretrained(model_name)

    # Tokenize the input sentence
    inputs = tokenizer(sentence, return_tensors='pt')
    input_ids = inputs['input_ids']

    # Get the logits from the model
    with torch.no_grad():
        outputs = model(input_ids, labels=input_ids)

    # Calculate the loss
    loss = outputs.loss
    perplexity = torch.exp(loss)

    return perplexity.item()


In [4]:
def calculate_sequence_length(sentence, model_name='gpt2'):
    # Load pre-trained model and tokenizer
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    model = GPT2LMHeadModel.from_pretrained(model_name)
    inputs = tokenizer(sentence, return_tensors='pt')
    return inputs['input_ids'].shape[1]


In [5]:
def sent_array_to_string(sent_array):
    prompt=""
    for sent in sent_array:
        prompt=prompt+" ".join(sent)
    return prompt


In [6]:
def preprocess_prompts(prompts,output_path):
  result={'idx':[],'Prompt':[],'Length':[],'Perplexity':[],  }


  if os.path.exists(output_path):
    with open(output_path,'r') as f:
      result=json.load(f)

  from tqdm import tqdm

  for i in tqdm(range(len(prompts))):
    if i in result["idx"]:
      continue

    prompt=prompts[i]
    try:
        per=calculate_perplexity(prompt)
    except:
        per=None
    try:
        p_len=calculate_sequence_length(prompt)
    except:
        p_len=None

    result['idx'].append(i)
    result['Prompt'].append(prompt)
    result['Length'].append(p_len)
    result['Perplexity'].append(per)



    with open(output_path,'w') as f:
      f.write(json.dumps(result))

  return pd.DataFrame.from_dict(result)


## Preproccess the datasets

### 1. DocRED
This dataset can be found in the Huggingface hub under name [docred](https://huggingface.co/datasets/thunlp/docred) . We use the
validation split, containing 998 multi-sentence passages designed for the development of entity and relation extraction from long documents.

In [7]:
docred_df = load_dataset("thunlp/docred", split='validation').to_pandas()
docred_df["Prompt"]=[sent_array_to_string(sent_array) for sent_array in docred_df["sents"]]
docred_df=docred_df[["Prompt"]]
docred_df

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/5.32k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/8.50k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/929k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/91.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.81M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/836k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.04k [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/998 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating train_annotated split:   0%|          | 0/3053 [00:00<?, ? examples/s]

Generating train_distant split:   0%|          | 0/101873 [00:00<?, ? examples/s]

                              title  \
0                           Skai TV   
1  Washington Place (West Virginia)   
2             IBM Research – Brazil   
3                        Lookin Ass   
4                 Conrad O. Johnson   

                                               sents  \
0  [[Skai, TV, is, a, Greek, free, -, to, -, air,...   
1  [[Washington, Place, (, William, Washington, H...   
2  [[IBM, Research, –, Brazil, is, one, of, twelv...   
3  [[", Lookin, Ass, ", (, originally, titled, ",...   
4  [[Conrad, Oberon, Johnson, (, November, 15, ,,...   

                                           vertexSet  \
0  [[{'name': 'Skai TV', 'sent_id': 4, 'pos': [0 ...   
1  [[{'name': 'Washington Place', 'sent_id': 0, '...   
2  [[{'name': 'IBM Research – Brazil', 'sent_id':...   
3  [[{'name': 'Lookin Ass', 'sent_id': 6, 'pos': ...   
4  [[{'name': 'Conrad Oberon Johnson', 'sent_id':...   

                                              labels  
0  {'head': [2, 3, 5, 0, 0, 0, 0], '

In [None]:
preprocess_prompts(list(docred_df["Prompt"]),f"{DATA_DIR}/loading/docRED.json").to_csv(f"{DATA_DIR}/output/docRED.csv")

  0%|          | 0/998 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

 33%|███▎      | 325/998 [16:43<33:22,  2.98s/it]

### 2. SuperGLUE
This dataset can be found in the Huggingface hub under the name super glue. We
use the validation split of the subset named boolq, containing 3270 passages for
answering Yes/No questions. We formulated prompts by combining the fixed instruction “Read the following passage and answer the question:”, followed by the question field in the dataset example, and on a new line we write the passage field of the example.

In [None]:
boolq_df = load_dataset("aps/super_glue",'boolq',split="validation").to_pandas()
boolq_df["Prompt"]="Read the following passage and answer the question:"+super_glue_df["question"]+"\n"+super_glue_df["passage"]
boolq_df=boolq_df[["Prompt"]]

In [None]:
super_glue_df

In [None]:
preprocess_prompts(list(boolq_df["Prompt"]),f"{DATA_DIR}/loading/boolq.json").to_csv(f"{DATA_DIR}/output/boolq.csv")

### 3. Squad V2
The Stanford Question-Answering Dataset is a well-known span-based question-answering dataset
that can be found in the Huggingface hub under the name squad v2. We use
the validation split containing 11873 examples. We formulated prompts by combining three fields
from each example, the title, the context and the question using the following form: We start with
an instruction “Given a context passage from a document titled [title field goes here], followed by
a question, try to answer the question with a span of words from the context:”. Then after a new
line the prompt continues with “The context follows:” followed by the context field, and then after
another new line “The question is:” followed by the question field.

In [None]:
super_glue_squad_v2_df = load_dataset("rajpurkar/squad_v2",split="validation").to_pandas()
super_glue_squad_v2_df["Prompt"]="Given a context passage from a document titled "+super_glue_squad_v2_df["title"]+"\nThe context follows:"+super_glue_squad_v2_df["context"]+"\nThe question is:"+super_glue_squad_v2_df["question"]
super_glue_squad_v2_df=super_glue_squad_v2_df[["Prompt"]]

In [None]:
super_glue_squad_v2_df

In [None]:
preprocess_prompts(list(super_glue_squad_v2_df["Prompt"]),f"{DATA_DIR}/loading/super_glue_squad_v2.json").to_csv(f"{DATA_DIR}/output/super_glue_squad_v2.csv")

### 4. Open Playtpus
The Open-Platypus Dataset is associated with the Platypus project. We use the Huggingface
dataset garage-bAInd/Open-Platypus containing 24926 prompts with instructions from the Platypus dataset’s training split, as they appear, without any additional prefix or suffix. This dataset is
focused on improving LLM logical reasoning skills and was used to train the Platypus2 models

In [None]:
platypus_df = load_dataset("garage-bAInd/Open-Platypus",split= "train").to_pandas()
platypus_df=platypus_df.rename(columns={'instruction':'Prompt'})[["Prompt"]]

In [None]:
platypus_df

In [None]:
preprocess_prompts(list(platypus_df["Prompt"]),f"{DATA_DIR}/loading/platypus.json").to_csv(f"{DATA_DIR}/output/platypus.csv")

### 5. Puffin
This dataset can be found in the Huggingface hub under the name LDJnr/Puffin. Puffin contains
3000 conversations with GPT-4, each being a sequence of interactions that start with the human’s
query. We constructed two samples from this dataset. One is the set of all 6994 prompts produced
by the human side of the conversation. The other contains only the initial utterance that starts each
of the 3000 conversations

In [None]:
puffin_df = load_dataset("LDJnr/Puffin",split="train").to_pandas()
puffin_df=puffin_df.explode(column="conversations").drop(columns=["id"])
puffin_df["Source"]=puffin_df['conversations'].apply(lambda s:s['from'])
puffin_df["prompt"]=puffin_df['conversations'].apply(lambda s:s['value'])
puffin_df=puffin_df.drop(columns=["conversations"])
puffin_df=puffin_df[puffin_df["Source"]=="human"]

In [None]:
puffin_df

In [None]:
preprocess_prompts(list(puffin_df["Prompt"]),f"{DATA_DIR}/loading/puffin.json").to_csv(f"{DATA_DIR}/output/puffin.csv")

### 6. Tapir
This is a large dataset containing examples intended for instruction-following training. We use the
Huggingface dataset MattiaL/tapir-cleaned-116k (Mattia Limone, 2023) containing 116862 exam-
ples. We construct prompts by concatenating the instruction field and the input field from each
example.

In [None]:
tapir_df = load_dataset("MattiaL/tapir-cleaned-116k",split="train").to_pandas()
tapir_df["Prompt"]=tapir_df["instruction"]+"\n"+tapir_df["input"]
tapir_df=tapir_df[["Prompt"]]

In [None]:
tapir_df

In [None]:
preprocess_prompts(list(tapir_df["Prompt"]),f"{DATA_DIR}/loading/tapir.json").to_csv(f"{DATA_DIR}/output/tapir.csv")

### 7. INSTRUCTIONAL C ODE S EARCH
This is a large dataset containing instructional examples for coding in Python. We use the Hug-
gingface dataset Nan-Do/instructional code-search-net-python. because the data set is very large
we only include the first 10,000 examples.

In [None]:
code_df = load_dataset("Nan-Do/instructional_code-search-net-python",split="train").to_pandas()
code_df=code_df.rename(columns={"INSTRUCTION":"Prompt"})
code_df['idx']=list(range(len(code_df)))
code_df=code_df[code_df['idx']<=10000]
code_df=code_df[["Prompt"]]

In [None]:
code_df

In [None]:
preprocess_prompts(list(code_df["Prompt"]),f"{DATA_DIR}/loading/code.json").to_csv(f"{DATA_DIR}/output/code.csv")

## 8. Adverserial workbench

In [None]:
names=['vicuna-13b-v1.5', 'llama-2-7b-chat-hf', 'gpt-3.5-turbo-1106', 'gpt-4-0125-preview']
df_list=[]
for name in names:
  artifact = jbb.read_artifact(method="GCG", model_name=name)
  data=[dict(j) for j in artifact.jailbreaks]
  df = pd.DataFrame(data)
  df=df[["index","category","behavior","prompt","jailbroken"]]
  df["model_name"]=name
  df_list.append(df)
adv_df=pd.concat(df_list)

In [None]:
adv_df

In [None]:
preprocess_prompts(list(adv_df["prompt"]),f"{DATA_DIR}/loading/adv_prompts.json").to_csv(f"{DATA_DIR}/output/adv_prompts.csv")