In [20]:
import os
import pandas as pd
import matplotlib.pyplot as plt

# Root folder for all datasets, relative to this notebook
data_dir = "../../data/"

# Location of the combined CatHarmQA analysis CSV underneath data_dir
dataset_path = os.path.join(
    data_dir,
    "analyzed/catHarmQA/combined_catqa.csv",
)

In [21]:
# Read the combined CSV into the frame that the cells below analyse
data = pd.read_csv(filepath_or_buffer=dataset_path)

## 1. Original Question -> Response through all Models -> Labels from Llama Guard

In [22]:
# Per-model share (0-1) of safe vs. unsafe responses to the original questions
(
    data
    .groupby("model")["original_response_safety"]
    .value_counts(normalize=True)
    .unstack()
)

original_response_safety,safe,unsafe
model,Unnamed: 1_level_1,Unnamed: 2_level_1
llama2,0.32,0.68
llama3,0.18,0.82
llama31,0.216364,0.783636
mistral,0.176364,0.823636


### 1.1 Cross verification

In [23]:
# Helper that reduces a raw safety label to its leading token
def filter_safety_response(label):
    """Return the first whitespace-separated token of ``label``, lower-cased.

    Leading/trailing whitespace is ignored and everything after the first
    token is discarded, e.g. ``"Unsafe\\nS1"`` becomes ``"unsafe"``.

    Raises:
        IndexError: if ``label`` is empty or contains only whitespace.
    """
    tokens = label.split()  # no-arg split() already ignores surrounding whitespace
    return tokens[0].lower()

# Read the per-model safety CSVs for responses to the original questions
llama2_org_ques_safety_df = pd.read_csv(
    os.path.join(data_dir, "safety/catHarmQA/response/catqa_llama2_Question_safety.csv")
)
llama3_org_ques_safety_df = pd.read_csv(
    os.path.join(data_dir, "safety/catHarmQA/response/catqa_llama3_Question_safety.csv")
)
llama31_org_ques_safety_df = pd.read_csv(
    os.path.join(data_dir, "safety/catHarmQA/response/catqa_llama31_Question_safety.csv")
)
mistral_org_ques_safety_df = pd.read_csv(
    os.path.join(data_dir, "safety/catHarmQA/response/catqa_mistral_Question_safety.csv")
)

# Reduce every raw label to its leading token and compute the normalised
# label share per model; these should match the combined-CSV numbers above
org_ques_label_shares = {
    "llama2": (
        llama2_org_ques_safety_df["Question_llama2_safety"]
        .apply(filter_safety_response)
        .value_counts(normalize=True)
    ),
    "llama3": (
        llama3_org_ques_safety_df["Question_llama3_safety"]
        .apply(filter_safety_response)
        .value_counts(normalize=True)
    ),
    "llama31": (
        llama31_org_ques_safety_df["Question_llama31_safety"]
        .apply(filter_safety_response)
        .value_counts(normalize=True)
    ),
    "mistral": (
        mistral_org_ques_safety_df["Question_mistral_safety"]
        .apply(filter_safety_response)
        .value_counts(normalize=True)
    ),
}

# One row per model; a label missing for a model becomes 0 rather than NaN
pd.DataFrame(org_ques_label_shares).fillna(0).T

Unnamed: 0,unsafe,safe
llama2,0.68,0.32
llama3,0.82,0.18
llama31,0.783636,0.216364
mistral,0.823636,0.176364


## 2. Original Question -> Get Unsafe Original Question using Llama Guard -> Response through all Models -> Labels from Llama Guard

In [24]:
# Share (0-1) of original questions labelled safe vs. unsafe
data["original_question_safety"].value_counts(normalize=True)

original_question_safety
unsafe    0.932727
safe      0.067273
Name: proportion, dtype: float64

In [25]:
# Keep only rows whose original question is labelled "unsafe", then compute
# each model's share (0-1) of safe vs. unsafe responses
(
    data.loc[data["original_question_safety"] == "unsafe"]
    .groupby("model")["original_response_safety"]
    .value_counts(normalize=True)
    .unstack()
)

original_response_safety,safe,unsafe
model,Unnamed: 1_level_1,Unnamed: 2_level_1
llama2,0.280702,0.719298
llama3,0.130604,0.869396
llama31,0.169591,0.830409
mistral,0.126706,0.873294


## 3. Original Question -> Do Perturbation(char/word/sntnc) -> Check Safety of Perturbed Question -> Take only unsafe perturbed question -> Response through all Models -> Labels from Llama Guard

### 3.1. Safe percentage for char, word, and sentence perturbations

In [26]:
# Share (0-1) of safe/unsafe `perturbed_response_safety` labels per
# perturbation level, restricted to perturbed questions labelled "unsafe"
(
    data.loc[data["perturbed_question_safety"] == "unsafe"]
    .groupby("perturbation_level")["perturbed_response_safety"]
    .value_counts(normalize=True)
    .unstack()
)

perturbed_response_safety,safe,unsafe
perturbation_level,Unnamed: 1_level_1,Unnamed: 2_level_1
char,0.181836,0.818164
sntnc,0.223295,0.776705
word,0.1844,0.8156


In [27]:
# Same breakdown as the previous cell, but for the
# `perturbed_response_pre_safety` column (unsafe perturbed questions only)
(
    data.loc[data["perturbed_question_safety"] == "unsafe"]
    .groupby("perturbation_level")["perturbed_response_pre_safety"]
    .value_counts(normalize=True)
    .unstack()
)

perturbed_response_pre_safety,safe,unsafe
perturbation_level,Unnamed: 1_level_1,Unnamed: 2_level_1
char,0.661362,0.338638
sntnc,0.63267,0.36733
word,0.6471,0.3529


### 3.2. Re-verification

In [28]:
# Recompute the safe percentage per perturbation level with plain boolean
# filtering (no groupby) to cross-check the table in section 3.1
for level in ("char", "word", "sntnc"):
    unsafe_at_level = data[
        (data["perturbation_level"] == level)
        & (data["perturbed_question_safety"] == "unsafe")
    ]
    label_pct = unsafe_at_level["perturbed_response_safety"].value_counts(normalize=True) * 100
    print(f"{level} : {label_pct.safe:.2f}")

char : 18.18
word : 18.44
sntnc : 22.33


In [29]:
# Same cross-check as the previous cell, for `perturbed_response_pre_safety`
for level in ("char", "word", "sntnc"):
    unsafe_at_level = data[
        (data["perturbation_level"] == level)
        & (data["perturbed_question_safety"] == "unsafe")
    ]
    label_pct = unsafe_at_level["perturbed_response_pre_safety"].value_counts(normalize=True) * 100
    print(f"{level} : {label_pct.safe:.2f}")

char : 66.14
word : 64.71
sntnc : 63.27


### 3.3. Unsafe perturbed questions: breakdown by model, perturbation level, and category

In [30]:
# Percentage of safe/unsafe perturbed responses per (model, perturbation level),
# restricted to perturbed questions labelled "unsafe"
(
    data.loc[data["perturbed_question_safety"] == "unsafe"]
    .groupby(["model", "perturbation_level"])["perturbed_response_safety"]
    .value_counts(normalize=True)
    .unstack()
    .mul(100)
)

Unnamed: 0_level_0,perturbed_response_safety,safe,unsafe
model,perturbation_level,Unnamed: 2_level_1,Unnamed: 3_level_1
llama2,char,28.177702,71.822298
llama2,sntnc,35.113636,64.886364
llama2,word,29.150157,70.849843
llama3,char,12.448323,87.551677
llama3,sntnc,18.295455,81.704545
llama3,word,12.973084,87.026916
llama31,char,16.444649,83.555351
llama31,sntnc,19.659091,80.340909
llama31,word,16.0746,83.9254
mistral,char,15.663757,84.336243


In [31]:
# Percentage of safe/unsafe `perturbed_response_pre_safety` labels per
# (model, perturbation level), restricted to unsafe perturbed questions
(
    data.loc[data["perturbed_question_safety"] == "unsafe"]
    .groupby(["model", "perturbation_level"])["perturbed_response_pre_safety"]
    .value_counts(normalize=True)
    .unstack()
    .mul(100)
)

Unnamed: 0_level_0,perturbed_response_pre_safety,safe,unsafe
model,perturbation_level,Unnamed: 2_level_1,Unnamed: 3_level_1
llama2,char,90.957412,9.042588
llama2,sntnc,83.75,16.25
llama2,word,89.124197,10.875803
llama3,char,44.300807,55.699193
llama3,sntnc,49.545455,50.454545
llama3,word,46.010384,53.989616
llama31,char,68.34438,31.65562
llama31,sntnc,65.0,35.0
llama31,word,64.366717,35.633283
mistral,char,60.942319,39.057681


In [None]:
# Set display options so the full (model, level, category) table is rendered
pd.set_option("display.max_rows", None)  # Show all rows
pd.set_option("display.max_columns", None)  # Show all columns
pd.set_option("display.width", None)  # Do not wrap columns
pd.set_option("display.max_colwidth", None)  # Do not truncate column values

# Percentage of safe/unsafe perturbed responses per (model, perturbation level,
# category), restricted to perturbed questions labelled "unsafe".
# The column selected here must be "perturbed_response_safety": that is the
# column shown in this cell's stored output, and the next cell repeats the
# same breakdown for "perturbed_response_pre_safety".
data[data["perturbed_question_safety"] == "unsafe"].groupby(
    ["model", "perturbation_level", "category"]
)["perturbed_response_safety"].value_counts(normalize=True).unstack() * 100

Unnamed: 0_level_0,Unnamed: 1_level_0,perturbed_response_safety,safe,unsafe
model,perturbation_level,category,Unnamed: 3_level_1,Unnamed: 4_level_1
llama2,char,Adult Content,22.237106,77.762894
llama2,char,Child Abuse,16.862474,83.137526
llama2,char,Economic Harm,27.983252,72.016748
llama2,char,Fraud/Deception,24.602626,75.397374
llama2,char,Hate/Harass/Violence,25.409836,74.590164
llama2,char,Illegal Activity,16.221766,83.778234
llama2,char,Malware Viruses,21.716738,78.283262
llama2,char,Physical Harm,39.670556,60.329444
llama2,char,Political Campaigning,45.617978,54.382022
llama2,char,Privacy Violation Activity,34.0625,65.9375


In [33]:
# Lift every pandas display limit (rows, columns, total width, cell width)
# so the complete table is rendered without truncation or wrapping
for display_option in (
    "display.max_rows",
    "display.max_columns",
    "display.width",
    "display.max_colwidth",
):
    pd.set_option(display_option, None)

# Percentage of safe/unsafe `perturbed_response_pre_safety` labels per
# (model, perturbation level, category), unsafe perturbed questions only
(
    data.loc[data["perturbed_question_safety"] == "unsafe"]
    .groupby(["model", "perturbation_level", "category"])["perturbed_response_pre_safety"]
    .value_counts(normalize=True)
    .unstack()
    .mul(100)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,perturbed_response_pre_safety,safe,unsafe
model,perturbation_level,category,Unnamed: 3_level_1,Unnamed: 4_level_1
llama2,char,Adult Content,97.320831,2.679169
llama2,char,Child Abuse,92.328956,7.671044
llama2,char,Economic Harm,92.114445,7.885555
llama2,char,Fraud/Deception,91.430546,8.569454
llama2,char,Hate/Harass/Violence,98.29235,1.70765
llama2,char,Illegal Activity,89.869952,10.130048
llama2,char,Malware Viruses,77.424893,22.575107
llama2,char,Physical Harm,91.146191,8.853809
llama2,char,Political Campaigning,90.337079,9.662921
llama2,char,Privacy Violation Activity,87.421875,12.578125


In [34]:
# Percentage of safe/unsafe perturbed responses per (perturbation level,
# category) across all models, transposed so the labels become the rows
(
    data.loc[data["perturbed_question_safety"] == "unsafe"]
    .groupby(["perturbation_level", "category"])["perturbed_response_safety"]
    .value_counts(normalize=True)
    .unstack()
    .T
    .mul(100)
)

perturbation_level,char,char,char,char,char,char,char,char,char,char,char,sntnc,sntnc,sntnc,sntnc,sntnc,sntnc,sntnc,sntnc,sntnc,sntnc,sntnc,word,word,word,word,word,word,word,word,word,word,word
category,Adult Content,Child Abuse,Economic Harm,Fraud/Deception,Hate/Harass/Violence,Illegal Activity,Malware Viruses,Physical Harm,Political Campaigning,Privacy Violation Activity,Tailored Financial Advice,Adult Content,Child Abuse,Economic Harm,Fraud/Deception,Hate/Harass/Violence,Illegal Activity,Malware Viruses,Physical Harm,Political Campaigning,Privacy Violation Activity,Tailored Financial Advice,Adult Content,Child Abuse,Economic Harm,Fraud/Deception,Hate/Harass/Violence,Illegal Activity,Malware Viruses,Physical Harm,Political Campaigning,Privacy Violation Activity,Tailored Financial Advice
perturbed_response_safety,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2,Unnamed: 26_level_2,Unnamed: 27_level_2,Unnamed: 28_level_2,Unnamed: 29_level_2,Unnamed: 30_level_2,Unnamed: 31_level_2,Unnamed: 32_level_2,Unnamed: 33_level_2
safe,14.651708,12.629578,17.00977,15.739461,13.592896,11.601643,15.472103,27.350721,22.565543,21.054688,30.230708,20.618557,13.372093,23.148148,22.527473,21.91358,14.516129,14.015152,30.813953,28.873239,30.46875,29.296875,15.180395,14.652677,17.398524,15.647226,14.298064,11.281963,16.09589,25.863284,22.373697,23.830156,28.797209
unsafe,85.348292,87.370422,82.99023,84.260539,86.407104,88.398357,84.527897,72.649279,77.434457,78.945312,69.769292,79.381443,86.627907,76.851852,77.472527,78.08642,85.483871,85.984848,69.186047,71.126761,69.53125,70.703125,84.819605,85.347323,82.601476,84.352774,85.701936,88.718037,83.90411,74.136716,77.626303,76.169844,71.202791


In [35]:
# Same transposed (perturbation level, category) breakdown as the previous
# cell, for the `perturbed_response_pre_safety` column
(
    data.loc[data["perturbed_question_safety"] == "unsafe"]
    .groupby(["perturbation_level", "category"])["perturbed_response_pre_safety"]
    .value_counts(normalize=True)
    .unstack()
    .T
    .mul(100)
)

perturbation_level,char,char,char,char,char,char,char,char,char,char,char,sntnc,sntnc,sntnc,sntnc,sntnc,sntnc,sntnc,sntnc,sntnc,sntnc,sntnc,word,word,word,word,word,word,word,word,word,word,word
category,Adult Content,Child Abuse,Economic Harm,Fraud/Deception,Hate/Harass/Violence,Illegal Activity,Malware Viruses,Physical Harm,Political Campaigning,Privacy Violation Activity,Tailored Financial Advice,Adult Content,Child Abuse,Economic Harm,Fraud/Deception,Hate/Harass/Violence,Illegal Activity,Malware Viruses,Physical Harm,Political Campaigning,Privacy Violation Activity,Tailored Financial Advice,Adult Content,Child Abuse,Economic Harm,Fraud/Deception,Hate/Harass/Violence,Illegal Activity,Malware Viruses,Physical Harm,Political Campaigning,Privacy Violation Activity,Tailored Financial Advice
perturbed_response_pre_safety,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2,Unnamed: 26_level_2,Unnamed: 27_level_2,Unnamed: 28_level_2,Unnamed: 29_level_2,Unnamed: 30_level_2,Unnamed: 31_level_2,Unnamed: 32_level_2,Unnamed: 33_level_2
safe,70.562626,64.581894,68.684578,64.754665,67.127732,58.675565,56.909871,70.041181,61.70412,68.554688,75.13922,65.979381,59.302326,68.518519,61.263736,68.82716,53.763441,49.242424,75.290698,62.676056,62.5,67.1875,67.903336,62.608538,67.306273,65.078236,65.6639,58.258466,55.502283,68.410853,61.267041,66.832756,71.941708
unsafe,29.437374,35.418106,31.315422,35.245335,32.872268,41.324435,43.090129,29.958819,38.29588,31.445312,24.86078,34.020619,40.697674,31.481481,38.736264,31.17284,46.236559,50.757576,24.709302,37.323944,37.5,32.8125,32.096664,37.391462,32.693727,34.921764,34.3361,41.741534,44.497717,31.589147,38.732959,33.167244,28.058292


In [36]:
# Percentage of rows belonging to each harm category
data["category"].value_counts(normalize=True).mul(100)

category
Adult Content                 9.090909
Child Abuse                   9.090909
Economic Harm                 9.090909
Fraud/Deception               9.090909
Hate/Harass/Violence          9.090909
Illegal Activity              9.090909
Malware Viruses               9.090909
Physical Harm                 9.090909
Political Campaigning         9.090909
Privacy Violation Activity    9.090909
Tailored Financial Advice     9.090909
Name: proportion, dtype: float64