## Load the Data

In [1]:
import pandas as pd
import config

# Read the SBIC test and dev splits from the CSV paths declared in config.
df_test = pd.read_csv(filepath_or_buffer=config.SBIC_TEST_PATH)
df_dev = pd.read_csv(filepath_or_buffer=config.SBIC_DEV_PATH)

## Aggregate the Data and Format the Prompts

In [2]:
from utils.helper import clean_post, create_md5_hash
from typing import Dict


def format_prompts_and_labels(df: pd.DataFrame) -> Dict[str, dict]:
    """Group annotation rows by post and collect the prompt and label votes.

    Rows are keyed by ``create_md5_hash`` of the raw ``post`` text, so every
    distinct post produces exactly one entry no matter how many rows
    (annotations) it has.

    Args:
        df: Frame providing the columns ``post``, ``sexYN``, ``offensiveYN``,
            ``intentYN``, ``targetMinority``, ``speakerMinorityYN`` and
            ``targetStereotype``.

    Returns:
        Mapping from post hash to a dict with:
          * ``prompt``: ``config.GENERATION_TEMPLATE`` filled with the
            cleaned post,
          * ``post``: the cleaned post text,
          * ``md5_hash``: the key itself,
          * ``lewd``, ``off``, ``intention``, ``grp``, ``ing``: lists with one
            0/1 vote per row of that post,
          * ``group``, ``statement``: de-duplicated lists of the string values
            seen in ``targetMinority`` / ``targetStereotype``.
    """
    lookup: Dict[str, dict] = {}

    for _, row in df.iterrows():
        md5_hash = create_md5_hash(row["post"])

        if md5_hash not in lookup:
            # Clean once and reuse for both the prompt and the stored post.
            cleaned_post = clean_post(row["post"])
            lookup[md5_hash] = {
                "prompt": config.GENERATION_TEMPLATE.format(post=cleaned_post),
                "post": cleaned_post,
                "md5_hash": md5_hash,
                "lewd": [],
                "off": [],
                "intention": [],
                "grp": [],
                "ing": [],
                "group": [],
                "statement": [],
            }

        entry = lookup[md5_hash]
        target_group = row["targetMinority"]
        target_statement = row["targetStereotype"]

        # A missing CSV cell is loaded by pandas as float NaN, and NaN is
        # truthy. Plain truthiness would therefore count every row without a
        # target group as "group targeted"; only a non-empty string counts.
        has_target_group = isinstance(target_group, str) and bool(target_group)

        # add this row's votes to the entry
        entry["lewd"].append(1 if row["sexYN"] == 1.0 else 0)
        entry["off"].append(1 if row["offensiveYN"] == 1.0 else 0)
        entry["intention"].append(1 if row["intentYN"] == 1.0 else 0)
        entry["grp"].append(1 if has_target_group else 0)
        entry["ing"].append(1 if row["speakerMinorityYN"] == 1.0 else 0)

        # keep each distinct free-text annotation once, in first-seen order
        if isinstance(target_group, str) and target_group not in entry["group"]:
            entry["group"].append(target_group)
        if (
            isinstance(target_statement, str)
            and target_statement not in entry["statement"]
        ):
            entry["statement"].append(target_statement)

    return lookup


# Collapse the row-per-annotation frames into one entry per unique post.
test_dict = format_prompts_and_labels(df=df_test)
dev_dict = format_prompts_and_labels(df=df_dev)

# Number of unique posts in each split (shown as the cell output).
(len(test_dict), len(dev_dict))

(4698, 4673)

## Re-Binarize the Categorical Variables

In [3]:
import numpy as np


def re_binarize_variables(post_dict: dict, threshold: float = 0.5) -> Dict[str, dict]:
    """Collapse each post's label votes into a single 0/1 value.

    For every entry, each field other than the text fields is replaced by
    1 when the mean of its votes is strictly greater than ``threshold`` and
    by 0 otherwise (so a mean exactly equal to the threshold yields 0).

    Args:
        post_dict: Mapping from post key to a dict of fields; modified in place.
        threshold: Cut-off that the mean vote must exceed to produce 1.

    Returns:
        The same ``post_dict`` object, after modification.
    """
    # fields holding text rather than votes; these are left untouched
    text_fields = ("post", "group", "statement", "prompt", "md5_hash")

    for entry in post_dict.values():
        label_fields = [name for name in entry if name not in text_fields]
        for name in label_fields:
            if np.mean(entry[name]) > threshold:
                entry[name] = 1
            else:
                entry[name] = 0

    return post_dict


# Reduce every label's vote list to a single 0/1 using the default threshold.
test_dict = re_binarize_variables(post_dict=test_dict)
dev_dict = re_binarize_variables(post_dict=dev_dict)

## Write Results to CSV

In [4]:
import os

# Transpose so that each post (outer dict key) becomes a row and each field a column.
# NOTE: this rebinds df_test / df_dev from the raw frames to the aggregated
# ones; re-run the loading cell before re-running the aggregation cell.
df_test = pd.DataFrame(test_dict).T
df_dev = pd.DataFrame(dev_dict).T

# Make sure the output directory exists so a fresh checkout does not fail in to_csv.
os.makedirs("tmp", exist_ok=True)

df_test.to_csv("tmp/test_eval_prompts.csv", index=False)
df_dev.to_csv("tmp/dev_eval_prompts.csv", index=False)