In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt

# Root folder of the datasets, given relative to the working directory.
data_dir = "../../data/"
# CSV file that the notebook loads below.
dataset_path = os.path.join(
    data_dir,
    "analyzed/catHarmQA/combined_catqa.csv",
)

In [None]:
# Read the CSV at dataset_path into a DataFrame.
data = pd.read_csv(filepath_or_buffer=dataset_path)

In [None]:
# Show the column labels of the loaded DataFrame.
data.columns

In [None]:
# Columns with "_pre" in their name hold the variant in which the question has
# been removed from the response text. Leaving the question inside the response
# affects the response's safety label, so the "_pre" variants replace the
# default columns for the rest of the analysis.
#
# Bracket assignment is used on purpose: attribute-style assignment
# (data.col = ...) only updates a column that already exists; for a missing
# column it sets a plain Python attribute instead of creating the column.
data["original_response_safety"] = data["original_response_pre_safety"]
data["perturbed_response_safety"] = data["perturbed_response_pre_safety"]
data["original_response"] = data["original_response_pre"]
data["perturbed_response"] = data["perturbed_response_pre"]

## 1. Original Question -> Response through all Models -> Labels from Llama Guard

In [None]:
# Per model: fraction of each safety label among the responses to the
# original questions (one row per model, one column per label).
(
    data
    .groupby("model")["original_response_safety"]
    .value_counts(normalize=True)
    .unstack()
)

## 2. Original Question -> Keep only Original Questions labelled Unsafe by Llama Guard -> Get Responses to the filtered questions from all Models -> Get Labels from Llama Guard

In [None]:
# Fraction of each safety label among the original questions.
data["original_question_safety"].value_counts(normalize=True)

In [None]:
# Keep only the rows whose original question is labelled "unsafe", then
# compute, per model, the fraction of each response safety label.
(
    data
    .loc[data["original_question_safety"] == "unsafe"]
    .groupby("model")["original_response_safety"]
    .value_counts(normalize=True)
    .unstack()
)

## 3. Original Question -> Apply Perturbation (char/word/sntnc) -> Check Safety of Perturbed Question -> Keep only Unsafe Perturbed Questions -> Responses from all Models -> Labels from Llama Guard

### 3.1. Safe percentage per perturbation level (char, word, sntnc)

In [None]:
# For perturbed questions labelled "unsafe": fraction of each response safety
# label, broken down by perturbation level.
(
    data
    .loc[data["perturbed_question_safety"] == "unsafe"]
    .groupby("perturbation_level")["perturbed_response_safety"]
    .value_counts(normalize=True)
    .unstack()
)

### 3.2. re-verification

In [None]:
# Recompute the "safe" percentage for each perturbation level directly, as a
# cross-check of the grouped table.
for level in ["char", "word", "sntnc"]:
    # Rows of this perturbation level whose perturbed question is "unsafe".
    is_level = data["perturbation_level"] == level
    is_unsafe_question = data["perturbed_question_safety"] == "unsafe"
    # Percentage of each response safety label within that subset.
    dd = (
        data.loc[is_level & is_unsafe_question, "perturbed_response_safety"]
        .value_counts(normalize=True)
        * 100
    )
    # .get() instead of attribute access: a level with no "safe" responses
    # (or no matching rows at all) reports 0.00 instead of raising
    # AttributeError.
    print(f"{level} : {dd.get('safe', 0.0):.2f}")

### 3.3. Unsafe

In [None]:
# For perturbed questions labelled "unsafe": percentage of each response
# safety label per (model, perturbation level) pair.
(
    data
    .loc[data["perturbed_question_safety"] == "unsafe"]
    .groupby(["model", "perturbation_level"])["perturbed_response_safety"]
    .value_counts(normalize=True)
    .unstack()
    .mul(100)
)

In [None]:
# Set each of these pandas display options to None so the large table below
# is displayed in full.
for display_option in (
    "display.max_rows",      # show all rows
    "display.max_columns",   # show all columns
    "display.width",         # do not wrap columns
    "display.max_colwidth",  # do not truncate column values
):
    pd.set_option(display_option, None)

# For perturbed questions labelled "unsafe": percentage of each response
# safety label per (model, perturbation level, category) combination.
(
    data
    .loc[data["perturbed_question_safety"] == "unsafe"]
    .groupby(["model", "perturbation_level", "category"])["perturbed_response_safety"]
    .value_counts(normalize=True)
    .unstack()
    .mul(100)
)

In [None]:
# For perturbed questions labelled "unsafe": percentage of each response
# safety label per (perturbation level, category), transposed so the safety
# labels are the rows.
(
    data
    .loc[data["perturbed_question_safety"] == "unsafe"]
    .groupby(["perturbation_level", "category"])["perturbed_response_safety"]
    .value_counts(normalize=True)
    .unstack()
    .T
    .mul(100)
)