# Test datasets

In [3]:
import os, sys, pytest
import pandas as pd
from torch.utils.data import Dataset as PtDataset
from datasets import Dataset as HfDataset

ROOT_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) if "__file__" in globals() else os.path.abspath("..")
sys.path.insert(0, ROOT_PATH)

from FairLangProc.datasets.fairness_datasets import BiasDataLoader

In [4]:
IMPLEMENTED = [
    "BBQ",
    "BEC-Pro",
    "BOLD",
    "BUG",
    "CrowS-Pairs",
    "GAP",
    "StereoSet",
    "UnQover",
    "WinoBias+",
    "WinoBias",
    "Winogender"
]

DATASETS = [
    "BBQ",
    "BEC-Pro",
    "BOLD",
    "BUG",
    "Bias-NLI",
    "CrowS-Pairs",
    "GAP",
    "Grep-BiasIR",
    "HONEST",
    "HolisticBias",
    "PANDA",
    "RedditBias",
    "StereoSet",
    "TrustGPT",
    "UnQover",
    "WinoBias",
    "WinoBias+",
    "WinoQueer",
    "Winogender",
]

REMAINING = [dataset for dataset in DATASETS if dataset not in IMPLEMENTED]

CONFIGURATIONS = {
    "BBQ": ["Age", "Disability_Status", "Gender_identity", "Nationality", "Physical_appearance", "Race_ethnicity", "Race_x_gender", "Race_x_SES", "Religion", "SES", "Sexual_orientation", "all"],
    "BEC-Pro": ["english", "german", "all"],
    "BOLD": ["prompts", "wikipedia", "all"],
    "BUG": ["balanced", "full", "gold", "all"],
    "Bias-NLI": ["process", "load", "all"],
    "CrowS-Pairs": [""],
    "GAP": [""],
    "Grep-BiasIR": ["queries", "documents", "relevance", "all"],
    "HolisticBias": ["noun_phrases", "sentences", "all"],
    "PANDA": ["train", "test", "dev", "all"],
    "RedditBias": ["posts", "comments", "annotations", "all"],
    "StereoSet": ["word", "sentence", "all"],
    "TrustGPT": ["process", "load", "all", "benchmarks"],
    "UnQover": ["questions", "answers", "annotations"],
    "WinoBias": ["pairs", "WinoBias"],
    "WinoBias+": [""],
    "WinoQueer": ["sentences", "templates", "annotations", "all"],
    "Winogender": [""],
}

FORMATS = ["hf", "pt", "raw"]

CLASS_DICT = {
    "hf": HfDataset,
    "pt": PtDataset,
    "raw": pd.DataFrame
}

In [5]:
TEST_CASES_FORMAT = [
    (dataset, config, format)
    for dataset in CONFIGURATIONS.keys()
    for config in CONFIGURATIONS[dataset] 
    for format in FORMATS if dataset in IMPLEMENTED
]

@pytest.mark.parametrize("dataset, config, format", TEST_CASES_FORMAT)
def test_format(dataset, config, format):
    result = BiasDataLoader(dataset = dataset, config = config, format = format)
    assert isinstance(result, dict)
    for key in result:
        assert isinstance(result[key], CLASS_DICT[format])

In [6]:
def _get_columns():
    for dataset in CONFIGURATIONS.keys():
        if dataset in IMPLEMENTED:
            result = BiasDataLoader(dataset = dataset, config = 'all', format = 'raw')
            if result is None:
                result = BiasDataLoader(dataset = dataset, config = '', format = 'raw')
            try:
                print(dataset)
                print(list(result[list(result.keys())[0]].keys()))
            except:
                pass

def _get_rows():
    for dataset in CONFIGURATIONS.keys():
        if dataset in IMPLEMENTED:
            result = BiasDataLoader(dataset = dataset, config = 'all', format = 'raw')
            if result is None:
                result = BiasDataLoader(dataset = dataset, config = '', format = 'raw')
            try:
                string = f"\"{dataset}\": {{"
                for data in result.keys():
                    if data == 'templates' and dataset == 'BBQ':
                        continue
                    string += f"\"{data}\": {len(result[data].index)}, "
                string += "}, "
                print(string)
            except:
                print(dataset + ": nothing")


In [7]:
COLUMNS = {
    "BBQ": ['example_id', 'question_index', 'question_polarity', 'context_condition', 'category', 'answer_info', 'additional_metadata', 'context', 'question', 'ans0', 'ans1', 'ans2', 'label'],
    "BEC-Pro": ['Unnamed: 0', 'Sentence', 'Sent_TM', 'Sent_AM', 'Sent_TAM', 'Template', 'Person', 'Gender', 'Profession', 'Prof_Gender'],
    "BOLD": ['gender_prompt.json', 'political_ideology_prompt.json', 'profession_prompt.json', 'race_prompt.json', 'religious_ideology_prompt.json'],
    "BUG": ['Unnamed: 0', 'sentence_text', 'tokens', 'profession', 'g', 'profession_first_index', 'g_first_index', 'predicted gender', 'stereotype', 'distance', 'num_of_pronouns', 'corpus', 'data_index'],
    "CrowS-Pairs": ['Unnamed: 0', 'sent_more', 'sent_less', 'stereo_antistereo', 'bias_type', 'annotations', 'anon_writer', 'anon_annotators'],
    "GAP": ['ID', 'Text', 'Pronoun', 'Pronoun-offset', 'A', 'A-offset', 'A-coref', 'B', 'B-offset', 'B-coref', 'URL'],
    "HolisticBias": None,
    "StereoSet": ['options', 'context', 'target', 'bias_type', 'labels'],
    "WinoBias+": ['gendered', 'neutral'],
    "WinoBias": ['sentence', 'entity', 'pronoun'],
    "Winogender": ['sentid', 'sentence']
}

TEST_CASES_COLUMNS = [
    
]

In [8]:
ROWS = {
    "BBQ": {"Age.jsonl": 3680, "Disability_status.jsonl": 1556, "Gender_identity.jsonl": 5672, "Nationality.jsonl": 3080, "Physical_appearance.jsonl": 1576, "Race_ethnicity.jsonl": 6880, "Race_x_SES.jsonl": 11160, "Race_x_gender.jsonl": 15960, "Religion.jsonl": 1200, "SES.jsonl": 6864, "Sexual_orientation.jsonl": 864, "additional_metadata.csv": 58556, },
    "BEC-Pro": {"english": 5400, "german": 5400, }, 
    "BUG": {"balanced_BUG.csv": 25504, "full_BUG.csv": 105687, "gold_BUG.csv": 1717, }, 
    "CrowS-Pairs": {"data": 1508, }, 
    "GAP": {"gap-development.tsv": 2000, "gap-test.tsv": 2000, "gap-validation.tsv": 454, }, 
    "StereoSet": {"test_sentence": 6374, "test_word": 6392, "dev_sentence": 2123, "dev_word": 2106, }, 
    "WinoBias": {"anti_stereotyped_type1.txt.dev": 396, "anti_stereotyped_type1.txt.test": 396, "anti_stereotyped_type2.txt.dev": 396, "anti_stereotyped_type2.txt.test": 396, "pro_stereotyped_type1.txt.dev": 396, "pro_stereotyped_type1.txt.test": 396, "pro_stereotyped_type2.txt.dev": 396, "pro_stereotyped_type2.txt.test": 396, }, 
    "WinoBias+": {"data": 3167, }, 
    "Winogender": {"data": 720, }, 
}

TEST_CASES_ROWS = [
    
]

# Test metrics

## Test probability

In [11]:
import os
import sys

import torch
import pytest

ROOT_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) if "__file__" in globals() else os.path.abspath("..")
sys.path.insert(0, ROOT_PATH)

from FairLangProc.metrics import LPBS, CBS, CPS, AUL

In [None]:
class DummyModel:
    def __call__(self, **kwargs):
        batch_size, seq_len = kwargs["input_ids"].shape
        logits = torch.zeros(batch_size, seq_len, 30522)
        logits[:, :, 200] = 5.0
        logits[:, :, 201] = -5.0 
        logits[:, :, 202] = 15.0 
        logits[:, :, 300] = 5.0
        logits[:, :, 301] = -5.0 
        logits[:, :, 302] = 15.0 
        logits[:, :, 400] = 10.0 
        logits[:, :, 401] = -10.0 
        logits[:, :, 402] = -15.0
        return type("Output", (), {"logits": logits})

class DummyTokenizer:
    pad_token_id = 101
    cls_token_id = 102
    mask_token_id = 103
    hash_map_tokens = {
        'doctor': 200,
        'nurse': 201,
        'engineer': 202,
        'science': 300,
        'art': 301,
        'math': 302,
        'he': 400,
        'she': 401,
        'it': 402,
    }

    def __init__(self):
        return
    
    def __call__(self, sentences, padding=True, return_tensors="pt"):
        split = [sentence.split() for sentence in sentences]
        maxLen = max([len(sentence) for sentence in split])
        ids = [[self.convert_tokens_to_ids(word) for word in sentence] for sentence in split]
        if padding:
            for i in range(len(ids)):
                lenId = len(ids[i])
                if lenId < maxLen:
                    ids[i] = ids[i] + [self.pad_token_id for _ in range(maxLen - lenId)] 
        return {"input_ids": torch.tensor(ids)}

    def tokenize(self, word):
        return [word]

    def convert_tokens_to_ids(self, token):
        return self.hash_map_tokens.get(token, 100)

In [31]:
sentences = [
    "[MASK] is a [MASK]",
    "[MASK] is a [MASK]",
    "[MASK] teaches [MASK]"
]

target_words = [
    ("he", "she"),
    ("he", "she"),
    ("he", "she")
]

fill_words = [
    'engineer',
    'doctor',
    'math',
]

mask_indices = [0, 0, 0]


LPBSscore = LPBS(
    model = model,
    tokenizer = tokenizer,
    sentences = sentences,
    target_words = target_words,
    fill_words = fill_words,
    mask_indices = mask_indices
)


AttributeError: 'dict' object has no attribute 'input_ids'

In [24]:
tokenizer = DummyTokenizer()
entrada = tokenizer(['hola que haces', 'aquí con el doctor'])

In [30]:
model = DummyModel()
model(**entrada).logits

tensor([[[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        [[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]]])

In [3]:
from transformers import AutoTokenizer

print(AutoTokenizer.from_pretrained("bert-base-uncased"))

BertTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)


## Test embedding

In [None]:
import os
import sys
from math import abs

import torch
import pytest

ROOT_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) if "__file__" in globals() else os.path.abspath("..")
sys.path.insert(0, ROOT_PATH)

from FairLangProc.metrics import WEAT

In [None]:
class DummyWEAT(WEAT):
    def _get_embedding(self, outputs):
        return outputs

def test_cosine_similarity():
    tokenizer = DummyTokenizer()
    model = DummyModel()
    pass

def test_effect_size():
    pass

def test_metric():
    pass

## Test generated text

In [None]:
import os
import sys
import pytest

ROOT_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) if "__file__" in globals() else os.path.abspath("..")
sys.path.insert(0, ROOT_PATH)

from FairLangProc.metrics import DemRep, StereoAsoc, HONEST

In [11]:
GENDERED_WORDS = {
    'male': ['he', 'him', 'his'],
    'female': ['she', 'her', 'actress', 'hers']
    }

ATTRIBUTES = GENDERED_WORDS.keys()

SENTENCES = [
    'She is such a good match to him.',
    'He is trying way too hard to be an actor.',
    'Her mother is trying to make ends meet.'
    'My aunt is baking, do you want to try?'
]

def test_demographic_representation_type():
    DR = DemRep(sentences = SENTENCES, demWords = GENDERED_WORDS)
    assert isinstance(DR, dict)

def test_demographic_representation_keys():
    DR = DemRep(sentences = SENTENCES, demWords = GENDERED_WORDS)
    assert len(DR.keys()) == 2

def test_demographic_representation_values():
    DR = DemRep(sentences = SENTENCES, demWords = GENDERED_WORDS)
    assert DR['male'] == 1
    assert DR['female'] == 2

def test_demographic_representation_empty_demwords():
    DR = DemRep(sentences = SENTENCES, demWords = {})
    assert DR == {}

def test_demographic_representation_empty_sentences():
    DR = DemRep(sentences = [], demWords = GENDERED_WORDS)
    assert len(DR.keys()) == 2
    assert DR['male'] == 0
    assert DR['female'] == 0

def test_demographic_representation_empty_demwords_sentences():
    DR = DemRep(sentences = [], demWords = {})
    assert DR == {}

In [12]:
TARGET_WORDS = ['mother', 'baking']

In [13]:
def test_stereorep_type():
    ST = StereoAsoc(sentences = SENTENCES, demWords = GENDERED_WORDS, targetWords = TARGET_WORDS)
    assert isinstance(ST, dict)
    for key in ST.keys():
        assert isinstance(ST[key], dict)

def test_stereorep_keys():
    ST = StereoAsoc(sentences = SENTENCES, demWords = GENDERED_WORDS, targetWords = TARGET_WORDS)
    assert len(ST.keys()) == 2
    for key in ST.keys():
        assert len(ST[key].keys()) == 2

def test_stereorep_values():
    ST = StereoAsoc(sentences = SENTENCES, demWords = GENDERED_WORDS, targetWords = TARGET_WORDS)
    assert ST['mother']['male'] == 0
    assert ST['mother']['female'] == 1
    assert ST['baking']['male'] == 0
    assert ST['baking']['female'] == 0

def test_stereorep_empty_target():
    ST = StereoAsoc(sentences = SENTENCES, demWords = GENDERED_WORDS, targetWords = [])
    assert ST == {}

def test_stereorep_empty_dem():
    ST = StereoAsoc(sentences = SENTENCES, demWords = {}, targetWords = TARGET_WORDS)
    assert ST['mother'] == {}
    assert ST['baking'] == {}

def test_stereorep_empty_sentences():
    ST = StereoAsoc(sentences = [], demWords = GENDERED_WORDS, targetWords = TARGET_WORDS)
    assert ST['mother']['male'] == 0
    assert ST['mother']['female'] == 0
    assert ST['baking']['male'] == 0
    assert ST['baking']['female'] == 0

def test_stereorep_empty_dem_sentences():
    ST = StereoAsoc(sentences = [], demWords = {}, targetWords = TARGET_WORDS)
    assert ST['mother'] == {}
    assert ST['baking'] == {}

def test_stereorep_empty_dem_target():
    ST = StereoAsoc(sentences = SENTENCES, demWords = {}, targetWords = [])
    assert ST == {}

def test_stereorep_empty_sentences_target():
    ST = StereoAsoc(sentences = [], demWords = GENDERED_WORDS, targetWords = [])
    assert ST == {}

def test_stereorep_empty_dem_sentences_target():
    ST = StereoAsoc(sentences = [], demWords = {}, targetWords = [])
    assert ST == {}

In [14]:
COMPLETIONS = [
    ['he is so stupid', 'he is such a fool', 'he is so misunderstood'],
    ['she is so bossy', 'she is an incompetent manager', 'she does what is necessary'],
    ['they were so agreeable', 'they were so nice to us', 'they showed hospitality']
]

COMPLETIONS_DIFFERENT_LENGTH = [
    ['he is so stupid', 'he is such a fool', 'he is so misunderstood'],
    ['she is so bossy', 'she is an incompetent manager'],
    ['they were so agreeable', 'they were so nice to us', 'they showed hospitality']
]

COMPLETIONS_WITHOUT_LIST = [
    ['he is so stupid', 'he is such a fool', 'he is so misunderstood'],
    ('she is so bossy', 'she is an incompetent manager', 'she does what is necessary'),
    ['they were so agreeable', 'they were so nice to us', 'they showed hospitality']
]

HURTLEX = ['fool', 'stupid', 'incompetent']

def test_honest_type():
    honest = HONEST(completions = COMPLETIONS, hurtLex = HURTLEX)
    assert isinstance(honest, float)

def test_honest_value():
    honest = HONEST(completions = COMPLETIONS, hurtLex = HURTLEX)
    assert abs(honest - 1/3) < 1e-15

def test_honest_empty_hurt():
    honest = HONEST(completions = COMPLETIONS, hurtLex = [])
    assert abs(honest - 0.0) < 1e-15

def test_honest_empty_completions():
    with pytest.raises(AssertionError) as excinfo:
        honest = HONEST(completions = [], hurtLex = HURTLEX)
    assert "completions is empty" in excinfo

def test_honest_not_list():
    with pytest.raises(AssertionError) as excinfo:
        honest = HONEST(completions = {}, hurtLex = HURTLEX)
    assert "completions is not a list" in excinfo

def test_element_not_list():
    with pytest.raises(AssertionError) as excinfo:
        honest = HONEST(completions = COMPLETIONS_WITHOUT_LIST, hurtLex = HURTLEX)
    assert "completions is not a list of lists" in excinfo

def test_honest_different_length():
    with pytest.raises(AssertionError) as excinfo:
        honest = HONEST(completions = COMPLETIONS_DIFFERENT_LENGTH, hurtLex = HURTLEX)
    assert "Number of completions is not uniform" in excinfo

# Test models