## Necessary Data Processing ###

In [7]:
import itertools

import pandas as pd

#### Processing for Experiment 3: percent biases and guardrail model performance ####

Creating dataframes that record, per model, how much bias decreased once the guardrail model flagged inputs as harmful

In [26]:
#load in the df from IBM that includes all prompts, stigmas, what is a biased answer and cluster types
#columns read from it later in this notebook: 'stigma', 'prompt', 'prompt style', 'biased answer'
#NOTE(review): the hf:// path presumably needs the huggingface_hub package and network access - confirm
df_ssqa = pd.read_csv("hf://datasets/ibm/SocialStigmaQA/yes_no/full_dataset_yes_no.csv")

In [21]:
#load in harm detection results from guardrail models
#columns read later in this notebook:
#  granite_bias_detection -> 'risk of social bias (label)?', 'probability of social bias?'
#  granite_harm_detection -> 'risk of harm (label)?', 'probability of harm?'
#  llama_guard            -> 'safe label'
#  mistral_moderation     -> 'any_true_threshold'
granite_bias_detection = pd.read_csv('/data/raw-guardrail-model-results/guardian-bias-detection-results.csv')
granite_harm_detection = pd.read_csv('/data/raw-guardrail-model-results/guardian-harm-detection-results.csv')
llama_guard = pd.read_csv('/data/raw-guardrail-model-results/llama_guard_all.csv')
mistral_moderation = pd.read_csv('/data/raw-guardrail-model-results/mistral-moderation-detection-results.csv')

#load in ssqa (socialstigmaqa) performance
#one table for all models; the 'model' column tells them apart, and 'model answer',
#'biased answer' and 'clusters' are the columns used downstream
SSQA = pd.read_csv("/data/SSQA-results.csv")

In [148]:
# One frame per answering model. The index is reset so that rows are labelled
# 0..n-1, which is how post_guardrail_bias and the llama loop look rows up.
granite_reset, mistral_reset, llama_reset = (
    SSQA.loc[SSQA["model"] == model_name].reset_index()
    for model_name in ("granite", "mistral", "llama")
)

In [124]:
#function that returns post-guardrail bias (len = 10360)

def post_guardrail_bias(table, guardrail_results_table, guardrail_results_column, bias_value):
    """Recompute the per-row bias flag after applying a guardrail's verdict.

    A row stays biased (1) only when the model's answer was biased AND the
    guardrail did not flag it; a flagged biased answer is counted as
    unbiased (0), and answers that were unbiased to begin with stay 0.

    Parameters
    ----------
    table : pandas.DataFrame
        Model results with a "biased answer" column (1 = biased). Rows are
        looked up by the labels 0..len(table)-1, so the index must have been
        reset.
    guardrail_results_table : pandas.DataFrame
        Guardrail output, looked up by the same row labels as ``table``.
    guardrail_results_column : str
        Column of ``guardrail_results_table`` holding the guardrail verdict.
    bias_value : object
        Value in that column that means "flagged" (e.g. "Yes" or True).

    Returns
    -------
    list of int
        One 0/1 entry per row of ``table``.
    """
    original_bias = table["biased answer"]
    guardrail_verdicts = guardrail_results_table[guardrail_results_column]

    post_bias = []
    # Iterate over the table that was passed in, not over a notebook global, so the
    # function works for any model's results regardless of what else is defined.
    for i in range(len(table)):
        #did the model give a biased answer?
        if original_bias[i] == 1:
            #if it did, then check if the guardrail flagged it as harmful
            if guardrail_verdicts[i] == bias_value:
                #if it did, change to unbiased
                post_bias.append(0)
            else:
                #else, keep it biased
                post_bias.append(1)
        else:
            post_bias.append(0)
    return post_bias
        

In [190]:
#for harm default:
post_granite_bias = post_guardrail_bias(
    table=granite_reset,
    guardrail_results_table=granite_harm_detection,
    guardrail_results_column="risk of harm (label)?",
    bias_value="Yes",
)

#for social bias category:
post_granite_bias_bias = post_guardrail_bias(
    table=granite_reset,
    guardrail_results_table=granite_bias_detection,
    guardrail_results_column="risk of social bias (label)?",
    bias_value="Yes",
)

In [339]:
# Combined Granite table: prompt metadata from SocialStigmaQA, Granite's answers,
# and both Granite Guardian verdicts (harm and social bias) with the bias flags
# recomputed after each verdict. Keys are listed in output column order.
granite_and_granite_guardian_data = {
    "stigma": df_ssqa['stigma'],
    "prompt": df_ssqa['prompt'],
    "prompt style": df_ssqa['prompt style'],
    "biased answer": df_ssqa['biased answer'],
    # harm verdict
    "risk of harm (label)?": granite_harm_detection["risk of harm (label)?"],
    "probability of harm?": granite_harm_detection["probability of harm?"],
    # Granite's own answer and whether it was biased
    "original granite answer": granite_reset["model answer"],
    "original answer bias": granite_reset['biased answer'],
    "post guardian answer bias": post_granite_bias,
    "clusters": granite_reset['clusters'],
    # social-bias verdict
    "risk of social bias (label)?": granite_bias_detection['risk of social bias (label)?'],
    "probability of social bias?": granite_bias_detection['probability of social bias?'],
    "post bias guardian answer bias": post_granite_bias_bias,
}
granite_and_granite_guardian = pd.DataFrame(data=granite_and_granite_guardian_data)

In [401]:
# Start from a copy of the raw moderation results and append the Mistral columns.
# NOTE(review): the cluster column is written as 'cluster names' here, while the
# granite/llama tables use 'clusters' and percent_bias_per_cluster filters on
# 'clusters' - confirm the raw moderation file already carries a 'clusters' column.
mistral_and_mistral_moderation_data = mistral_moderation.assign(**{
    'post moderation answer bias': post_guardrail_bias(
        mistral_reset, mistral_moderation, "any_true_threshold", True
    ),
    'cluster names': mistral_reset['clusters'],
    'original answer bias': mistral_reset['biased answer'],
})
mistral_and_mistral_moderation = pd.DataFrame(mistral_and_mistral_moderation_data)

In [162]:
#specifically for llama since it needs to != safe
# Llama Guard gives a label instead of a yes/no flag: every label other than
# "\n\nsafe" counts as flagged. A biased llama answer therefore stays biased (1)
# only when the guard called it safe; everything else becomes 0.
post_bias_llama = []
for row in range(len(llama_reset)):
    if llama_reset["biased answer"][row] == 1 and llama_guard['safe label'][row] == "\n\nsafe":
        post_bias_llama.append(1)
    else:
        post_bias_llama.append(0)

In [219]:
# For every Llama Guard label keep the last two characters as the unsafe category,
# or 'none' when the label is exactly "\n\nsafe".
# NOTE(review): a fixed two-character slice presumably mixes forms such as "S1" and
# "10" if category codes vary in length - confirm against the raw labels.
unsafe_category = [
    'none' if label == "\n\nsafe" else label[-2:]
    for label in llama_guard['safe label']
]

In [221]:
# Combined Llama table: prompt metadata from SocialStigmaQA, Llama's answers, the
# Llama Guard label and derived category, and the bias flag recomputed after the
# guard verdict. Keys are listed in output column order.
llama_and_llama_guard_data = {
    "stigma": df_ssqa['stigma'],
    "prompt": df_ssqa['prompt'],
    "prompt style": df_ssqa['prompt style'],
    "biased answer": df_ssqa['biased answer'],
    "original llama answer": llama_reset["model answer"],
    "original answer bias": llama_reset['biased answer'],
    "llama guard safety label": llama_guard['safe label'],
    "llama guard unsafe category": unsafe_category,
    "post llama answer bias": post_bias_llama,
    "clusters": llama_reset['clusters'],
}
llama_and_llama_guard = pd.DataFrame(data=llama_and_llama_guard_data)

In [419]:
# Persist the three combined tables. The paths are relative, so the files land in
# the current working directory. to_csv is called with its defaults, so the row
# index is written to the file as well.
granite_and_granite_guardian.to_csv('granite_and_granite_guardian.csv')
mistral_and_mistral_moderation.to_csv('mistral_and_mistral_moderation.csv')
llama_and_llama_guard.to_csv('llama_and_llama_guard.csv')

### Getting percentages of biased answers per stigma for Experiment 2 and 3 analysis ####

In [1]:
#granite = pd.read_csv("/data/SSQA-performance-and-guardrail-mitigations/granite_and_granite_guardian.csv")
#llama = pd.read_csv("/data/SSQA-performance-and-guardrail-mitigations/llama_and_llama_guard.csv")
#mistral = pd.read_csv("/data/SSQA-performance-and-guardrail-mitigations/mistral_and_mistral_moderation.csv")

In [403]:
pachankis_all = pd.read_csv('/data/results-from-pachankis-all.csv')


def _pachankis_columns_for(model):
    """Collect one model's columns of ``pachankis_all`` next to the participants' columns.

    Parameters
    ----------
    model : str
        Suffix used in the ``pachankis_all`` column names ("llama", "mistral"
        or "granite"), e.g. ``'visibility-llama'``.

    Returns
    -------
    dict
        Column name -> Series, in this order: Stigma, Cluster, the six
        ``<dimension>-<model>`` columns under their display names, then the six
        ``<dimension>-participants`` columns as ``'<dimension> human'``.
    """
    # (display name used for the model's column, prefix used in pachankis_all)
    dimensions = (
        ("Visibility", "visibility"),
        ("Persistent Course", "course"),
        ("Disrupt", "disrupt"),
        ("Unappealing Aesthetics", "aesthetics"),
        ("Controllable Origin", "origin"),
        ("Peril", "peril"),
    )
    columns = {"Stigma": pachankis_all['Stigma'], "Cluster": pachankis_all['Cluster']}
    for display_name, prefix in dimensions:
        columns[display_name] = pachankis_all[f'{prefix}-{model}']
    for _, prefix in dimensions:
        columns[f'{prefix} human'] = pachankis_all[f'{prefix}-participants']
    return columns


#save each model separately
llama_data = _pachankis_columns_for("llama")
llama_percentages = pd.DataFrame(llama_data)

mistral_data = _pachankis_columns_for("mistral")
mistral_percentages = pd.DataFrame(mistral_data)

granite_data = _pachankis_columns_for("granite")
granite_percentages = pd.DataFrame(granite_data)

In [283]:
#get averages per stigma
def get_bias_per_stigma(table, post_bias_column, stigmas=None, prompts_per_stigma=111):
    """Share of biased answers per stigma, before and after the guardrail.

    Parameters
    ----------
    table : pandas.DataFrame
        Combined model/guardrail table with a 'stigma' column, an
        'original answer bias' column and ``post_bias_column`` (0/1 flags).
    post_bias_column : str
        Name of the column holding the post-guardrail bias flag.
    stigmas : iterable of str, optional
        Stigmas to report, in output order. Defaults to
        ``pachankis_all['Stigma']`` from the notebook namespace.
    prompts_per_stigma : int, optional
        Divisor turning the count of biased answers into a share. Defaults to
        111. NOTE(review): presumably the number of prompts per stigma in the
        dataset - confirm.

    Returns
    -------
    (list, list)
        ``(percent_pre, percent_post)``: one fraction (0-1, not 0-100) per
        stigma. A stigma with no matching rows in ``table`` yields 0.0.
    """
    if stigmas is None:
        stigmas = pachankis_all['Stigma']

    percent_pre = []
    percent_post = []
    for stigma in stigmas:
        # Filter once per stigma and reuse the rows for both columns.
        stigma_rows = table[table['stigma'] == stigma]
        percent_pre.append(sum(stigma_rows['original answer bias']) / prompts_per_stigma)
        percent_post.append(sum(stigma_rows[post_bias_column]) / prompts_per_stigma)
    return percent_pre, percent_post

In [405]:
# Per-stigma share of biased answers for each model, before and after its guardrail.
pre_llama, post_llama = get_bias_per_stigma(
    llama_and_llama_guard, post_bias_column="post llama answer bias"
)
pre_mistral, post_mistral = get_bias_per_stigma(
    mistral_and_mistral_moderation, post_bias_column="post moderation answer bias"
)
pre_granite, post_granite = get_bias_per_stigma(
    granite_and_granite_guardian, post_bias_column="post guardian answer bias"
)

In [409]:
# Attach the pre/post guardrail shares to each model's per-stigma table.
for percentages, pre_share, post_share in (
    (llama_percentages, pre_llama, post_llama),
    (mistral_percentages, pre_mistral, post_mistral),
    (granite_percentages, pre_granite, post_granite),
):
    percentages['percent biased'] = pre_share
    percentages['percent biased post guardian'] = post_share

In [415]:
#llama_percentages.to_csv('llama_bias_percentages.csv')
#mistral_percentages.to_csv('mistral_bias_percentages.csv')
#granite_percentages.to_csv('granite_bias_percentages.csv')

Reformatting percent bias changes for cluster and prompt style for easier visualization in R

In [None]:
# Reload the combined tables from CSV files with the same names as the ones written
# earlier in this notebook (relative paths, resolved against the working directory).
# NOTE(review): this rebinds `llama_guard`, which earlier held the raw Llama Guard
# results with a 'safe label' column; the cells that read that column will
# presumably fail if re-run after this one - consider a distinct name.
mistral_guard = pd.read_csv("mistral_and_mistral_moderation.csv")
llama_guard = pd.read_csv("llama_and_llama_guard.csv")
granite_guard = pd.read_csv("granite_and_granite_guardian.csv")

In [198]:
def percent_bias_per_cluster(table, pre_or_post):
    """Share of biased answers within each stigma cluster.

    ``table`` needs a "clusters" column holding the string labels
    "no stigma", "1", "2", "3", "4", "5", and ``pre_or_post`` names the 0/1
    bias column to average. Returns a 6-tuple of fractions in that label
    order. A label with no rows raises ZeroDivisionError.
    """
    cluster_labels = ("no stigma", "1", "2", "3", "4", "5")
    shares = []
    for label in cluster_labels:
        members = table[table["clusters"] == label]
        shares.append(sum(members[pre_or_post]) / len(members))
    return tuple(shares)

def percent_bias_per_prompt_style(table, pre_or_post):
    """Share of biased answers within each prompt style.

    ``table`` needs a "prompt style" column with the values "base",
    "original", "positive" and "doubt", and ``pre_or_post`` names the 0/1 bias
    column to average. Returns a 4-tuple of fractions in that order. A style
    with no rows raises ZeroDivisionError.
    """
    style_labels = ("base", "original", "positive", "doubt")
    shares = []
    for label in style_labels:
        members = table[table["prompt style"] == label]
        shares.append(sum(members[pre_or_post]) / len(members))
    return tuple(shares)

Processing data to make change in bias per cluster type easier to graph:

In [None]:
# Share of biased answers per cluster before the guardrail is applied.
# Granite is computed twice on purpose: once for each of its two guardrail settings
# (harm and social bias), so every "post" series has its own "pre" series.
# NOTE(review): percent_bias_per_cluster filters on a 'clusters' column; the mistral
# table is built with 'cluster names' - confirm mistral_guard also has 'clusters'.
bias_pre_llama = percent_bias_per_cluster(llama_guard, "original answer bias")
bias_pre_mistral = percent_bias_per_cluster(mistral_guard, "original answer bias")
bias_pre_granite = percent_bias_per_cluster(granite_guard, "original answer bias")
bias_pre_granite_1 = percent_bias_per_cluster(granite_guard, "original answer bias")

# Share of biased answers per cluster after the guardrail verdict.
# The llama column is written to llama_and_llama_guard.csv as "post llama answer bias"
# (there is no "post llama guard answer bias" column), so that is the name used here.
bias_post_llama = percent_bias_per_cluster(llama_guard, "post llama answer bias")
bias_post_mistral = percent_bias_per_cluster(mistral_guard, "post moderation answer bias")
bias_post_granite = percent_bias_per_cluster(granite_guard, "post guardian answer bias")
bias_post_granite_bias_guardian = percent_bias_per_cluster(granite_guard, "post bias guardian answer bias")

# Flatten to one long list, ordered pre/post within each model block.
all_tuples = [bias_pre_llama, bias_post_llama, bias_pre_mistral, bias_post_mistral, bias_pre_granite, bias_post_granite, bias_pre_granite_1, bias_post_granite_bias_guardian]
bias_pre_and_post_all = list(itertools.chain(*all_tuples))

In [None]:
# Label vectors for the long-format cluster table: 4 model blocks x (6 pre + 6 post).
models_1 = [[model_name] * 12 for model_name in ("llama", "mistral", "granite", "granite bias guardian")]
models = [model_name for block in models_1 for model_name in block]
clusters = [
    "No stigma",
    "Awkward",
    "Threatening",
    "Sociodemographic",
    "Innocuous Persistent",
    "Unappealing Persistent",
] * 8
# "A" marks the six pre-guardrail rows of a block, "B" the six post-guardrail rows.
pre_post = (["A"] * 6 + ["B"] * 6) * 4

In [None]:
# Long-format table (one row per model x pre/post x cluster), written to the
# working directory.
data_for_graphing = {
    "cluster": clusters,
    'percent bias per cluster': bias_pre_and_post_all,
    "model": models,
    "type": pre_post,
}
rq3_table = pd.DataFrame(data=data_for_graphing)
rq3_table.to_csv("rq3_data_for_bias_change.csv")

Processing data to make change in bias per prompt style easier to graph:

In [None]:
# Share of biased answers per prompt style before the guardrail is applied.
# Granite is computed twice on purpose: once for each of its two guardrail settings
# (harm and social bias), so every "post" series has its own "pre" series.
bias_pre_llama_promptstyle = percent_bias_per_prompt_style(llama_guard, "original answer bias")
bias_pre_mistral_promptstyle = percent_bias_per_prompt_style(mistral_guard, "original answer bias")
bias_pre_granite_promptstyle = percent_bias_per_prompt_style(granite_guard, "original answer bias")
bias_pre_granite_promptstyle_1 = percent_bias_per_prompt_style(granite_guard, "original answer bias")

# Share of biased answers per prompt style after the guardrail verdict.
# The llama column is written to llama_and_llama_guard.csv as "post llama answer bias"
# (there is no "post llama guard answer bias" column), so that is the name used here.
bias_post_llama_promptstyle = percent_bias_per_prompt_style(llama_guard, "post llama answer bias")
bias_post_mistral_promptstyle = percent_bias_per_prompt_style(mistral_guard, "post moderation answer bias")
bias_post_granite_promptstyle = percent_bias_per_prompt_style(granite_guard, "post guardian answer bias")
bias_post_bias_granite_promptstyle = percent_bias_per_prompt_style(granite_guard, "post bias guardian answer bias")

# Flatten to one long list, ordered pre/post within each model block.
all_tuples_promptstyle = [bias_pre_llama_promptstyle, bias_post_llama_promptstyle, bias_pre_mistral_promptstyle, bias_post_mistral_promptstyle, bias_pre_granite_promptstyle, bias_post_granite_promptstyle, bias_pre_granite_promptstyle_1, bias_post_bias_granite_promptstyle]
bias_pre_and_post_all_promptstyle = list(itertools.chain(*all_tuples_promptstyle))

# Label vectors matching that order: 4 model blocks x (4 pre + 4 post).
models_1_promptstyle = [["llama"] * 8, ["mistral"] * 8, ["granite"] * 8 , ["granite bias guardian"] * 8]
models_promptstyle = list(itertools.chain.from_iterable(models_1_promptstyle))
pre_post_promptstyle = ["A", "A", "A", "A", "B", "B", "B", "B"] * 4
promptstyle = ["base", "original", "positive", "doubt"] * 8

In [None]:
# Long-format table (one row per model x pre/post x prompt style), written to the
# working directory.
data_for_prompt_style = {
    'prompt style': promptstyle,
    'percent bias per style': bias_pre_and_post_all_promptstyle,
    "model": models_promptstyle,
    "type": pre_post_promptstyle,
}
promptstyle_data = pd.DataFrame(data=data_for_prompt_style)
promptstyle_data.to_csv("rq3_data_for_prompt_style.csv")