In [2]:
import os
import pandas as pd
import matplotlib.pyplot as plt

# Root folder for every dataset this notebook reads (relative path)
data_dir = "../../data/"
# Location of the combined catHarmQA CSV loaded in the next cell
dataset_path = os.path.join(
    data_dir,
    "analyzed/catHarmQA/combined_catqa.csv",
)

In [3]:
# Read the combined CSV at `dataset_path` into the `data` DataFrame that the
# analysis cells below filter and group.
data = pd.read_csv(dataset_path)

## 1. Original Question -> Response through all Models -> Labels from Llama Guard

In [13]:
# Per-model proportion (not raw count) of safe vs. unsafe responses to the
# original questions; one row per model, one column per safety label.
(
    data
    .groupby("model")["original_response_safety"]
    .value_counts(normalize=True)
    .unstack()
)

original_response_safety,safe,unsafe
model,Unnamed: 1_level_1,Unnamed: 2_level_1
llama2,0.32,0.68
llama3,0.18,0.82
llama31,0.216364,0.783636
mistral,0.176364,0.823636


### 1.1 Cross verification

In [None]:
# Define a helper function to filter the safety responses
def filter_safety_response(label):
    """Return the first whitespace-delimited token of ``label`` in lower case.

    Anything after the first token (further words or lines) is discarded.

    Args:
        label: Raw safety-label string.

    Returns:
        The leading token, lower-cased (e.g. ``"unsafe"`` for ``" Unsafe\\nS1"``).

    Raises:
        IndexError: If ``label`` is empty or contains only whitespace.
    """
    # split() without a separator already ignores leading/trailing whitespace,
    # so one split is enough to isolate the leading token.
    leading_token = label.split(maxsplit=1)[0]
    return leading_token.lower()

# Read each model's standalone safety-label CSV for the original questions
llama2_org_ques_safety_df = pd.read_csv(
    os.path.join(data_dir, "safety/catHarmQA/response/catqa_llama2_Question_safety.csv")
)
llama3_org_ques_safety_df = pd.read_csv(
    os.path.join(data_dir, "safety/catHarmQA/response/catqa_llama3_Question_safety.csv")
)
llama31_org_ques_safety_df = pd.read_csv(
    os.path.join(data_dir, "safety/catHarmQA/response/catqa_llama31_Question_safety.csv")
)
mistral_org_ques_safety_df = pd.read_csv(
    os.path.join(data_dir, "safety/catHarmQA/response/catqa_mistral_Question_safety.csv")
)

# Reduce every raw label to its leading token, compute the label proportions
# per model, and lay the result out with one row per model. Labels missing for
# a model show up as 0 after fillna.
pd.DataFrame(
    {
        "llama2": (
            llama2_org_ques_safety_df["Question_llama2_safety"]
            .apply(filter_safety_response)
            .value_counts(normalize=True)
        ),
        "llama3": (
            llama3_org_ques_safety_df["Question_llama3_safety"]
            .apply(filter_safety_response)
            .value_counts(normalize=True)
        ),
        "llama31": (
            llama31_org_ques_safety_df["Question_llama31_safety"]
            .apply(filter_safety_response)
            .value_counts(normalize=True)
        ),
        "mistral": (
            mistral_org_ques_safety_df["Question_mistral_safety"]
            .apply(filter_safety_response)
            .value_counts(normalize=True)
        ),
    }
).fillna(0).transpose()

Unnamed: 0,unsafe,safe
llama2,0.68,0.32
llama3,0.82,0.18
llama31,0.783636,0.216364
mistral,0.823636,0.176364


## 2. Original Question -> Keep Unsafe Original Questions per Llama Guard -> Response through all Models -> Labels from Llama Guard

In [36]:
# Proportion of original questions carrying each safety label
data["original_question_safety"].value_counts(normalize=True)

original_question_safety
unsafe    0.932727
safe      0.067273
Name: proportion, dtype: float64

In [47]:
# Restrict to rows whose original question is labelled "unsafe", then report
# each model's proportion of safe vs. unsafe responses.
(
    data.loc[data["original_question_safety"] == "unsafe"]
    .groupby("model")["original_response_safety"]
    .value_counts(normalize=True)
    .unstack()
)

original_response_safety,safe,unsafe
model,Unnamed: 1_level_1,Unnamed: 2_level_1
llama2,0.280702,0.719298
llama3,0.130604,0.869396
llama31,0.169591,0.830409
mistral,0.126706,0.873294


## 3. Original Question -> Apply Perturbation (char/word/sntnc) -> Check Safety of Perturbed Question -> Keep only unsafe perturbed questions -> Response through all Models -> Labels from Llama Guard

### 3.1. Safe percentage for char, word, and sentence perturbations

In [74]:
# Among rows whose perturbed question is labelled "unsafe", the proportion of
# safe vs. unsafe responses for each perturbation level.
(
    data.loc[data["perturbed_question_safety"] == "unsafe"]
    .groupby("perturbation_level")["perturbed_response_safety"]
    .value_counts(normalize=True)
    .unstack()
)

perturbed_response_safety,safe,unsafe
perturbation_level,Unnamed: 1_level_1,Unnamed: 2_level_1
char,0.181836,0.818164
sntnc,0.223295,0.776705
word,0.1844,0.8156


### 3.2. Re-verification

In [91]:
# Recompute the safe-response percentage one perturbation level at a time,
# without groupby, as an independent check of the table in section 3.1.
for level in ["char", "word", "sntnc"]:
    # Rows of this level whose perturbed question is labelled "unsafe"
    level_mask = (data["perturbation_level"] == level) & (
        data["perturbed_question_safety"] == "unsafe"
    )
    # Percentage share of each response label within those rows
    response_pct = (
        data.loc[level_mask, "perturbed_response_safety"].value_counts(normalize=True) * 100
    )
    # .get() prints 0.00 when a level has no "safe" responses, where attribute
    # access (`.safe`) would raise AttributeError.
    print(f"{level} : {response_pct.get('safe', 0.0):.2f}")

char : 18.18
word : 18.44
sntnc : 22.33


### 3.3. Safe/unsafe percentages per model and perturbation level

In [69]:
# Among rows whose perturbed question is labelled "unsafe", the percentage of
# safe vs. unsafe responses for every (model, perturbation level) pair.
(
    data.loc[data["perturbed_question_safety"] == "unsafe"]
    .groupby(["model", "perturbation_level"])["perturbed_response_safety"]
    .value_counts(normalize=True)
    .unstack()
    * 100
)

Unnamed: 0_level_0,perturbed_response_safety,safe,unsafe
model,perturbation_level,Unnamed: 2_level_1,Unnamed: 3_level_1
llama2,char,28.177702,71.822298
llama2,sntnc,35.113636,64.886364
llama2,word,29.150157,70.849843
llama3,char,12.448323,87.551677
llama3,sntnc,18.295455,81.704545
llama3,word,12.973084,87.026916
llama31,char,16.444649,83.555351
llama31,sntnc,19.659091,80.340909
llama31,word,16.0746,83.9254
mistral,char,15.663757,84.336243
