In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt

# Root data folder (relative to this notebook) and the combined CatHarmQA
# results CSV that lives underneath it.
data_dir = "../../data/"
dataset_path = os.path.join(
    data_dir,
    "analyzed/catHarmQA/combined_catqa.csv",
)

In [2]:
# Read the CSV at `dataset_path` into a DataFrame, then display its (rows, columns) size.
data = pd.read_csv(dataset_path)
data.shape

(136400, 21)

In [3]:
# List the column names available in the loaded table.
data.columns

Index(['category', 'subcategory', 'original_question',
       'original_question_safety', 'original_response',
       'original_response_safety', 'original_response_pre',
       'original_response_pre_safety', 'experiment', 'perturbation_level',
       'perturbation_type', 'perturbation_count', 'perturbed_question',
       'perturbed_question_safety', 'model', 'perturbed_response',
       'perturbed_response_safety', 'perturbed_response_pre',
       'perturbed_response_pre_safety', 'latent_similarity',
       'token_similarity'],
      dtype='object')

In [4]:
# Columns whose name contains "_pre" hold the response with the question text
# removed. Leaving the question inside the response affects the safety label of
# the response, so the "_pre" variants replace the plain response columns for
# the rest of the analysis.
# Bracket assignment is used on purpose: attribute-style assignment only updates
# a column that already exists; otherwise it sets a plain attribute on the
# DataFrame object instead of creating a column.
data["original_response_safety"] = data["original_response_pre_safety"]
data["perturbed_response_safety"] = data["perturbed_response_pre_safety"]
data["original_response"] = data["original_response_pre"]
data["perturbed_response"] = data["perturbed_response_pre"]

## 1. Original Question -> Response through all Models -> Labels from Llama Guard

In [5]:
# Per-model share (fractions, not counts) of "safe" vs "unsafe" labels for the
# responses to the original questions; one row per model, one column per label.
(
    data.groupby("model")["original_response_safety"]
    .value_counts(normalize=True)
    .unstack()
)

original_response_safety,safe,unsafe
model,Unnamed: 1_level_1,Unnamed: 2_level_1
llama2,0.856364,0.143636
llama3,0.494545,0.505455
llama31,0.670909,0.329091
mistral,0.538182,0.461818


## 2. Original Question -> keep only Original Questions labeled Unsafe by Llama Guard -> Get Response on the filtered questions from all Models -> Get Labels from Llama Guard

In [6]:
# Fraction of rows carrying each original-question safety label.
data["original_question_safety"].value_counts(normalize=True)

original_question_safety
unsafe    0.932727
safe      0.067273
Name: proportion, dtype: float64

In [7]:
# Keep only the rows whose original question is labeled "unsafe" (by Llama
# Guard), then compute each model's safe/unsafe response shares on that subset.
(
    data.loc[data["original_question_safety"] == "unsafe"]
    .groupby("model")["original_response_safety"]
    .value_counts(normalize=True)
    .unstack()
)

original_response_safety,safe,unsafe
model,Unnamed: 1_level_1,Unnamed: 2_level_1
llama2,0.85575,0.14425
llama3,0.465887,0.534113
llama31,0.65692,0.34308
mistral,0.508772,0.491228


## 3. Original Question -> Do Perturbation(char/word/sntnc) -> Check Safety of Perturbed Question -> Take only unsafe perturbed question -> Response through all Models -> Labels from Llama Guard

### 3.1. Safe percentage for char, word, sntnc

In [8]:
# Safe/unsafe response shares (as fractions) per perturbation level, restricted
# to rows whose perturbed question is labeled "unsafe".
(
    data.loc[data["perturbed_question_safety"] == "unsafe"]
    .groupby("perturbation_level")["perturbed_response_safety"]
    .value_counts(normalize=True)
    .unstack()
)

perturbed_response_safety,safe,unsafe
perturbation_level,Unnamed: 1_level_1,Unnamed: 2_level_1
char,0.661936,0.338064
sntnc,0.633239,0.366761
word,0.64751,0.35249


### 3.2. Re-verification

In [9]:
# Cross-check of the per-level table: recompute, one perturbation level at a
# time, the percentage of "safe" responses among the rows whose perturbed
# question is labeled "unsafe".
for level in ("char", "word", "sntnc"):
    # Both conditions are evaluated on the full table and combined into one mask.
    level_and_unsafe = (data["perturbation_level"] == level) & (
        data["perturbed_question_safety"] == "unsafe"
    )
    safety_pct = (
        data.loc[level_and_unsafe, "perturbed_response_safety"].value_counts(
            normalize=True
        )
        * 100
    )
    # value_counts() omits labels that never occur, so fall back to 0.0 instead
    # of failing when a level has no "safe" responses at all.
    print(f"{level} : {safety_pct.get('safe', 0.0):.2f}")

char : 66.19
word : 64.75
sntnc : 63.32


### 3.3. Safe/unsafe response breakdowns for unsafe perturbed questions

In [10]:
# Safe/unsafe response percentages for every (model, perturbation level) pair,
# computed only on rows whose perturbed question is labeled "unsafe".
(
    data.loc[data["perturbed_question_safety"] == "unsafe"]
    .groupby(["model", "perturbation_level"])["perturbed_response_safety"]
    .value_counts(normalize=True)
    .unstack()
    .mul(100)
)

Unnamed: 0_level_0,perturbed_response_safety,safe,unsafe
model,perturbation_level,Unnamed: 2_level_1,Unnamed: 3_level_1
llama2,char,90.963974,9.036026
llama2,sntnc,83.863636,16.136364
llama2,word,89.131029,10.868971
llama3,char,44.379552,55.620448
llama3,sntnc,49.545455,50.454545
llama3,word,46.065036,53.934964
llama31,char,68.416563,31.583437
llama31,sntnc,65.113636,34.886364
llama31,word,64.441864,35.558136
mistral,char,61.014502,38.985498


In [11]:
# Lift pandas' display limits so large tables render in full. Setting each
# option to None means: show all rows, show all columns, do not wrap columns,
# and do not truncate column values.
for option_name in (
    "display.max_rows",
    "display.max_columns",
    "display.width",
    "display.max_colwidth",
):
    pd.set_option(option_name, None)

# Safe/unsafe response percentages for every (model, perturbation level,
# category) combination, computed only on rows whose perturbed question is
# labeled "unsafe".
(
    data.loc[data["perturbed_question_safety"] == "unsafe"]
    .groupby(["model", "perturbation_level", "category"])["perturbed_response_safety"]
    .value_counts(normalize=True)
    .unstack()
    .mul(100)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,perturbed_response_safety,safe,unsafe
model,perturbation_level,category,Unnamed: 3_level_1,Unnamed: 4_level_1
llama2,char,Adult Content,97.38781,2.61219
llama2,char,Child Abuse,92.328956,7.671044
llama2,char,Economic Harm,92.114445,7.885555
llama2,char,Fraud/Deception,91.430546,8.569454
llama2,char,Hate/Harass/Violence,98.29235,1.70765
llama2,char,Illegal Activity,89.869952,10.130048
llama2,char,Malware Viruses,77.424893,22.575107
llama2,char,Physical Harm,91.146191,8.853809
llama2,char,Political Campaigning,90.337079,9.662921
llama2,char,Privacy Violation Activity,87.421875,12.578125


In [12]:
# Safe/unsafe response percentages per (perturbation level, category), computed
# only on rows whose perturbed question is labeled "unsafe". The table is
# transposed so the safety labels are the rows and each (level, category) pair
# is a column.
(
    data.loc[data["perturbed_question_safety"] == "unsafe"]
    .groupby(["perturbation_level", "category"])["perturbed_response_safety"]
    .value_counts(normalize=True)
    .unstack()
    .T
    .mul(100)
)

perturbation_level,char,char,char,char,char,char,char,char,char,char,char,sntnc,sntnc,sntnc,sntnc,sntnc,sntnc,sntnc,sntnc,sntnc,sntnc,sntnc,word,word,word,word,word,word,word,word,word,word,word
category,Adult Content,Child Abuse,Economic Harm,Fraud/Deception,Hate/Harass/Violence,Illegal Activity,Malware Viruses,Physical Harm,Political Campaigning,Privacy Violation Activity,Tailored Financial Advice,Adult Content,Child Abuse,Economic Harm,Fraud/Deception,Hate/Harass/Violence,Illegal Activity,Malware Viruses,Physical Harm,Political Campaigning,Privacy Violation Activity,Tailored Financial Advice,Adult Content,Child Abuse,Economic Harm,Fraud/Deception,Hate/Harass/Violence,Illegal Activity,Malware Viruses,Physical Harm,Political Campaigning,Privacy Violation Activity,Tailored Financial Advice
perturbed_response_safety,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2,Unnamed: 26_level_2,Unnamed: 27_level_2,Unnamed: 28_level_2,Unnamed: 29_level_2,Unnamed: 30_level_2,Unnamed: 31_level_2,Unnamed: 32_level_2,Unnamed: 33_level_2
safe,70.696584,64.668279,68.702024,64.789219,67.196038,58.744011,56.95279,70.075498,61.741573,68.613281,75.178998,65.979381,59.302326,68.518519,61.263736,68.82716,53.763441,50.0,75.290698,62.676056,62.5,67.1875,67.903336,62.626628,67.380074,65.131579,65.698479,58.310297,55.593607,68.410853,61.307137,66.854419,72.02381
unsafe,29.303416,35.331721,31.297976,35.210781,32.803962,41.255989,43.04721,29.924502,38.258427,31.386719,24.821002,34.020619,40.697674,31.481481,38.736264,31.17284,46.236559,50.0,24.709302,37.323944,37.5,32.8125,32.096664,37.373372,32.619926,34.868421,34.301521,41.689703,44.406393,31.589147,38.692863,33.145581,27.97619
