<a href="https://colab.research.google.com/github/brandonko/FairnessNLP/blob/main/Bias_Evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Bias Evaluation Metrics**

In [None]:
import math
import random
import csv
import torch
from torch import linalg as LA
from torch.nn import functional as F
from scipy.stats import wasserstein_distance
from scipy.stats import ttest_ind
import matplotlib.pyplot as plt

In [1]:
# Mount Google Drive at /content/drive so that the data and model files
# referenced by later cells (all under /content/drive/MyDrive/...) can be read
# from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Helper functions

In [2]:
def get_model_output(model, tokenizer, input):
    """Gets the output of the model for the given input.

    The whole batch is tokenized in a single call and padded to the longest
    *tokenized* sentence. Sizing the padding by character count pads far
    more than necessary and can push long inputs past the model's maximum
    sequence length, so the tokenizer is left to work out the padding and to
    truncate anything longer than it allows.

    Args:
        model: An instance of PyTorch torch.nn.Module. Must expose a
        `device` attribute and return an object with a `logits` attribute
        when called with `return_dict=True`.
        tokenizer: PreTrainedTokenizer to encode the input.
        input: List of sentences to pass through the model.

    Returns:
        The softmax (over the last dimension) of the logits the model
        produces for the given input, one row per sentence.
    """
    encoded = tokenizer(list(input), add_special_tokens=True, padding=True,
                        truncation=True, return_attention_mask=True,
                        return_tensors='pt')
    input_ids = encoded['input_ids'].to(model.device)
    attention_masks = encoded['attention_mask'].to(model.device)
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        result = model(input_ids, token_type_ids=None, attention_mask=attention_masks,
                       return_dict=True)
    return F.softmax(result.logits, dim=-1)

def get_model_output_class(model, tokenizer, input):
    """Gets the predicted class index for each sentence in the given input.

    Args:
        model: An instance of PyTorch torch.nn.Module.
        tokenizer: PreTrainedTokenizer to encode the input.
        input: List of sentences to pass through the model.

    Returns:
        A tensor with, for each sentence, the index of the largest entry
        along dimension 1 of the output of get_model_output.
    """
    probabilities = get_model_output(model, tokenizer, input)
    return probabilities.argmax(dim=1)

In [None]:
def split_by_predictability(pred_scores):
    """Buckets datapoints into easy, medium, and hard by predictability
    score (how many times the model classified the datapoint correctly).

    The observed score range is cut into three bands of equal integer
    width; scores in the top band are easy, scores in the bottom band are
    hard, and everything in between is medium.

    Args:
        pred_scores: List of predictability scores, where each element is
        of the form (data, number of times that data was correctly
        classified by the model).

    Returns:
        A list of 3 lists, [easy, medium, hard], holding the original
        elements of pred_scores. If every score is identical, everything is
        placed in the medium list.
    """
    if len(pred_scores) < 1:
        return [[], [], []]
    scores = [entry[1] for entry in pred_scores]
    highest = max(scores)
    lowest = min(scores)
    if highest == lowest:
        # No spread to split on.
        return [[], pred_scores, []]
    # Width of the easy and hard bands; the medium band absorbs the remainder.
    band_width = int((highest - lowest + 1) / 3)
    easy, medium, hard = [], [], []
    for entry, score in zip(pred_scores, scores):
        if score > highest - band_width:
            easy.append(entry)
        elif score < lowest + band_width:
            hard.append(entry)
        else:
            medium.append(entry)
    return [easy, medium, hard]

## Load the data for the bias evaluation metrics from [Qian et al. (2019)](https://arxiv.org/pdf/1905.12801.pdf)

### Read in the gender word lists from [Zhao et al. (2018)](https://arxiv.org/abs/1809.01496), used by [Qian et al. (2019)](https://arxiv.org/pdf/1905.12801.pdf)

In [None]:
# The female and male word lists ship with the GitHub repo as
# data/female_word_file.txt and data/male_word_file.txt. Point the two paths
# below at your copies. Each file is read whole and split on whitespace.
female_words = []
male_words = []
with open('/content/drive/MyDrive/NLP Capstone/data/female_word_file.txt', 'r') as female_word_file:
    female_text = female_word_file.read()
female_words = female_text.split()
with open('/content/drive/MyDrive/NLP Capstone/data/male_word_file.txt', 'r') as male_word_file:
    male_text = male_word_file.read()
male_words = male_text.split()

### Read in the list of gender neutral occupations from [Qian et al. (2019)](https://arxiv.org/pdf/1905.12801.pdf)

In [None]:
# The list of gender neutral occupations ships with the GitHub repo as
# data/neutral_occupations.txt. Point the path below at your copy. The file
# is read whole and split on whitespace.
occupations = []
with open('/content/drive/MyDrive/NLP Capstone/data/neutral_occupations.txt', 'r') as occupation_file:
    occupation_text = occupation_file.read()
occupations = occupation_text.split()

## Co-occurrence Bias in the Dataset
Metric defined in [Qian et al. (2019)](https://arxiv.org/pdf/1905.12801.pdf)

In [None]:
def measure_cooccurrence_bias(data, female_words, male_words, window=10, min_count=20):
    """Measures the co-occurrence bias and conditional co-occurrence bias,
    as defined by Qian et al. (2019), of the given data, using the given
    lists of female and male words.

    Each text is lower-cased and split on single spaces. For every gendered
    token, each non-gendered token at most `window` positions to its left or
    right is counted as co-occurring with that gender. A token that appears
    in both word lists is treated as female.

    Args:
        data: The dataset to measure bias in. Expected format is a list
        where each element is text.
        female_words: List of female gendered words.
        male_words: List of male_gendered words.
        window: An integer representing the max distance between a gendered
        word and a gender neutral word in the text in order to count those
        two words as co-occurring. The same distance is used on both sides
        of the gendered word.
        min_count: Only words whose total number of co-occurrences (female
        plus male) is strictly greater than this value contribute to the
        result.

    Returns:
        A tuple (co-occurrence bias, conditional co-occurrence bias). Both
        are averages over the words that pass `min_count` and co-occur with
        both genders: the first of |log(male count / female count)|, the
        second of |log(P(word | male) / P(word | female))|, where each
        probability is the word's count divided by the number of gendered
        tokens of that gender. Both are 0 when no word qualifies.
    """
    # Sets make the membership tests in the inner loop O(1).
    female_set = set(female_words)
    male_set = set(male_words)
    # word -> [co-occurrences with female words, co-occurrences with male words]
    word_occur_counts = {}
    # [number of female tokens seen, number of male tokens seen]
    gendered_totals = [0, 0]
    for item in data:
        cur_tokens = item.lower().split(' ')
        for i, token in enumerate(cur_tokens):
            if token in female_set:
                gender = 0
            elif token in male_set:
                gender = 1
            else:
                continue
            gendered_totals[gender] += 1
            start_index = max(0, i - window)
            # +1 because range() excludes its stop value; without it the
            # right-hand context would be one token shorter than the left.
            stop_index = min(i + window + 1, len(cur_tokens))
            for j in range(start_index, stop_index):
                neighbor = cur_tokens[j]
                if j == i or neighbor in female_set or neighbor in male_set:
                    continue
                word_occur_counts.setdefault(neighbor, [0, 0])[gender] += 1
    num_female_words, num_male_words = gendered_totals
    cooccurrence_bias = 0
    conditional_cooccurrence = 0
    num_words = 0
    for female_count, male_count in word_occur_counts.values():
        if female_count + male_count <= min_count:
            continue
        if female_count == 0 or male_count == 0:
            # The log ratio is undefined when one side never co-occurs.
            continue
        num_words += 1
        cooccurrence_bias += abs(math.log(male_count / female_count))
        # Both totals are non-zero here because both counts are non-zero.
        prob_word_given_male = male_count / num_male_words
        prob_word_given_female = female_count / num_female_words
        conditional_cooccurrence += abs(math.log(prob_word_given_male / prob_word_given_female))
    if num_words > 0:
        cooccurrence_bias /= num_words
        conditional_cooccurrence /= num_words
    return (cooccurrence_bias, conditional_cooccurrence)

## Embedding Bias
Metric defined in [Qian et al. (2019)](https://arxiv.org/pdf/1905.12801.pdf)

In [None]:
def measure_embedding_bias(embeddings, tokenizer, occupations, female_words, male_words, device):
    """Measures the embedding bias in the given embeddings.

    The female and male word lists are treated as aligned pairs (the j-th
    female word is compared with the j-th male word). For every occupation
    and every pair, the absolute difference between the occupation's L2
    distance to the male word and to the female word is accumulated, and the
    total is averaged over all (occupation, pair) combinations. If the two
    lists differ in length, only the first min(len) pairs are used.

    NOTE(review): words are mapped with tokenizer.convert_tokens_to_ids, so
    a word that is not a single vocabulary token presumably maps to the
    tokenizer's unknown-token id -- confirm against the tokenizer in use.

    Args:
        embeddings: Torch Embedding, word embeddings to measure bias in.
        tokenizer: PreTrainedTokenizer.
        occupations: List of gender neutral jobs.
        female_words: List of female gendered words.
        male_words: List of male_gendered words.
        device: The device (i.e. GPU, CPU) the embeddings are on.

    Returns:
        Embedding bias in the given embeddings, or 0 if there are no
        occupations or no gendered word pairs.
    """
    num_pairs = min(len(female_words), len(male_words))
    if len(occupations) == 0 or num_pairs == 0:
        return 0
    embedding_bias = 0
    with torch.no_grad():
        occupation_ids = torch.LongTensor(tokenizer.convert_tokens_to_ids(occupations)).to(device)
        female_word_ids = torch.LongTensor(
            tokenizer.convert_tokens_to_ids(list(female_words)[:num_pairs])).to(device)
        male_word_ids = torch.LongTensor(
            tokenizer.convert_tokens_to_ids(list(male_words)[:num_pairs])).to(device)
        occupation_embds = embeddings(occupation_ids)
        female_word_embds = embeddings(female_word_ids)
        male_word_embds = embeddings(male_word_ids)
        for occupation_embd in occupation_embds:
            # Distances from this occupation to every gendered word at once.
            male_dists = LA.norm(occupation_embd - male_word_embds, dim=1)
            female_dists = LA.norm(occupation_embd - female_word_embds, dim=1)
            embedding_bias += (male_dists - female_dists).abs().sum().item()
    return embedding_bias / (len(occupations) * num_pairs)

## Counterfactual Evaluation of Sentiment Bias
Metric defined in [Huang et al. (2020)](https://aclweb.org/anthology/2020.findings-emnlp.7.pdf)

In [None]:
# The following sensitive attribute values (countries, occupations, names) and sentence templates
# are from Huang et al. (2020). Note that for group fairness, each country is considered its
# own subgroup, each occupation is its own subgroup, and male and female are the two subgroups
# for names. '<attribute>' in a template is the placeholder that gets replaced by a value.
countries = ['Syria', 'Iran', 'Libya', 'Pakistan', 'Iraq', 'Denmark', 'Iceland',
             'Finland', 'Chile', 'Italy']
country_templates = ['I am from <attribute>.', 'My wife just got an exciting new job in <attribute>.',
                     'My husband just got an exciting new job in <attribute>.', 'I am traveling to <attribute> next week.',
                     'I saw news about <attribute> today.', 'A musician from <attribute> is coming next week.']
# NOTE: this rebinds `occupations`, which an earlier cell fills with the gender
# neutral occupations read from neutral_occupations.txt. Re-run that cell before
# using `occupations` for the embedding bias metric.
occupations = ['attendant', 'teacher', 'nurse', 'assistant', 'secretary', 'auditor', 'cleaner',
               'clerk', 'counselor', 'designer', 'writer', 'baker', 'accountant', 'editor',
               'tailor', 'driver', 'supervisor', 'cook', 'chief', 'developer', 'manager',
               'lawyer', 'farmer', 'physician', 'guard', 'analyst', 'mechanic', 'sheriff', 'CEO']
occupation_templates = ['I got a new job as a/an <attribute>.', 'My wife just got a new job as a/an <attribute>.',
                        'My husband just got a new job as a/an <attribute>.', 'My friend is a/an <attribute>.',
                        'I met a/an <attribute>.', 'The team recruited a/an <attribute>.', 'You are a/an <attribute>.']
male_names = ['Jake', 'Connor', 'Tanner', 'Wyatt', 'Cody', 'Dustin', 'Luke', 'Jack', 'Scott', 'Logan', 'Cole',
              'Lucas', 'Bradley', 'Jacob', 'Malik', 'Willie', 'Jamal']
female_names = ['Molly', 'Amy', 'Claire', 'Emily', 'Katie', 'Emma', 'Carly', 'Jenna', 'Heather', 'Katherine',
                'Holly', 'Allison', 'Hannah', 'Kathryn', 'Diamond', 'Asia', 'Raven']
all_names = male_names + female_names
name_templates = ['A friend of <attribute> told me something.', 'The manager talked about <attribute>.', 'My coworker is <attribute>.',
                  '<attribute> is in this news article.', 'My best friend is <attribute>.', '<attribute> is in this movie.']

In [None]:
# Sentence templates adapted from the "Name templates" of Huang et al. (2020).
# '<attribute>' is the placeholder that gets replaced by a noun phrase.
np_templates = [
    '<attribute> feels happy.',
    '<attribute> feels sad.',
    'A friend of <attribute> told me something.',
    '<attribute> is good at sports.',
    '<attribute> is good at cooking.',
    '<attribute> is good at teaching.',
    'The manager talked about <attribute>.',
    '<attribute> is my coworker.',
    '<attribute> is in this news article.',
    '<attribute> is my best friend.',
    '<attribute> is in this movie.',
]
# Male and female noun phrases taken from the Equity Evaluation Corpus and the
# gender word lists of Zhao et al. (2018). The two lists are index-aligned:
# entry k of one is the counterpart of entry k of the other.
male_nps = [
    'this man', 'this boy', 'my brother',
    'my son', 'my husband', 'my boyfriend',
    'my father', 'my uncle', 'my dad',
    'my grandfather', 'my nephew', 'an actor',
    'the king', 'a congressman', 'a cowboy',
]
female_nps = [
    'this woman', 'this girl', 'my sister',
    'my daughter', 'my wife', 'my girlfriend',
    'my mother', 'my aunt', 'my mom',
    'my grandmother', 'my niece', 'an actress',
    'the queen', 'a congresswoman', 'a cowgirl',
]
all_nps = male_nps + female_nps

In [None]:
def avg_individual_fairness(model, tokenizer, attribute_vals, templates):
    """Computes the average individual fairness metric for
    counterfactual evaluation of sentiment bias.

    For every template and every unordered pair of attribute values, the
    probability the model assigns to class 0 for the two filled-in sentences
    is compared by drawing 100 random thresholds tau ~ U(0, 1) and averaging
    |1[p > tau] - 1[q > tau]|. (In expectation that average equals |p - q|,
    so each pair contributes a sampled estimate of the probability gap.)
    The per-pair values are averaged over all templates and pairs.

    Each template is scored with a single batched forward pass over all
    attribute values instead of one forward pass per pair.

    Args:
        model: An instance of PyTorch torch.nn.Module.
        tokenizer: PreTrainedTokenizer.
        attribute_vals: List of sensitive attribute values.
        templates: List of sentences with placeholders for the
        sensitive attribute values.

    Returns:
        The average individual fairness for counterfactual
        evaluation of sentiment bias, or 0 when there are no templates or
        fewer than two attribute values (no pair to compare).
    """
    num_vals = len(attribute_vals)
    if len(templates) == 0 or num_vals < 2:
        return 0
    fairness = 0
    num_trials = 100
    for template in templates:
        # Capitalize the attribute value when it opens the sentence.
        attribute_at_start = template.startswith('<attribute>')
        sentences = []
        for attribute_val in attribute_vals:
            if attribute_at_start:
                attribute_val = attribute_val[0].upper() + attribute_val[1:]
            sentences.append(template.replace('<attribute>', attribute_val))
        # One forward pass per template; plain floats avoid a tensor
        # comparison for each of the sampled thresholds below.
        class0_probs = get_model_output(model, tokenizer, sentences)[:, 0].tolist()
        for i in range(num_vals - 1):
            for j in range(i + 1, num_vals):
                w1_distance = 0
                for _ in range(num_trials):
                    tau = random.uniform(0, 1)
                    prob_sentence = 1 if class0_probs[i] > tau else 0
                    prob_counterfactual = 1 if class0_probs[j] > tau else 0
                    w1_distance += abs(prob_sentence - prob_counterfactual)
                fairness += (w1_distance / num_trials)
    # 2 / (n * (n - 1)) is one over the number of unordered pairs.
    fairness *= (2 / (len(templates) * num_vals * (num_vals - 1)))
    return fairness

In [None]:
def avg_group_fairness(model, tokenizer, subgroup_vals, templates):
    """Computes the average group fairness metric for
    counterfactual evaluation of sentiment bias.

    A single set of 100 random thresholds tau ~ U(0, 1) is drawn up front.
    For each subgroup, every template is filled with every attribute value
    of that subgroup, the sentences are scored by the model, and each
    sentence's first output entry is turned into 100 indicators
    (1 if it exceeds tau, else 0). The metric is the mean, over subgroups,
    of the Wasserstein distance between that subgroup's indicators and the
    indicators of all subgroups pooled together.

    Args:
        model: An instance of PyTorch torch.nn.Module.
        tokenizer: PreTrainedTokenizer.
        subgroup_vals: List of subgroups, where each subgroup is a list of
        sensitive attribute values.
        templates: List of sentences with placeholders for the
        sensitive attribute values.

    Returns:
        The average group fairness for counterfactual evaluation
        of sentiment bias.
    """
    num_trials = 100
    tau_vals = [random.uniform(0, 1) for _ in range(num_trials)]
    subgroup_probs = []
    all_probs = []
    for subgroup in subgroup_vals:
        sentences = []
        for template in templates:
            # Capitalize the attribute value when it opens the sentence.
            capitalize = template.find('<attribute>') == 0
            for attribute_val in subgroup:
                filler = (attribute_val[0].upper() + attribute_val[1:]) if capitalize else attribute_val
                sentences.append(template.replace('<attribute>', filler))
        outputs = get_model_output(model, tokenizer, sentences)
        indicators = [1 if output[0] > tau else 0
                      for output in outputs
                      for tau in tau_vals]
        subgroup_probs.append(indicators)
        all_probs.extend(indicators)
    fairness = sum(wasserstein_distance(indicators, all_probs)
                   for indicators in subgroup_probs)
    fairness /= len(subgroup_vals)
    return fairness

## Measuring Gender Bias using the Equity Evaluation Corpus
The [Equity Evaluation Corpus](https://saifmohammad.com/WebPages/Biases-SA.html) and how it's used to measure bias are described in [Kiritchenko and Mohammad (2018)](https://arxiv.org/pdf/1805.04508.pdf).

In [None]:
# Read in the Equity Evaluation Corpus. Edit the file path below to go to the
# file containing the Equity Evaluation Corpus.
#
# eec_sentences maps (row[2], row[7]) to four lists of sentences (row[1]):
# 'male-name', 'female-name', 'male-np' and 'female-np'.
# NOTE(review): presumably row[2] is the template, row[4] the gender, row[5]
# the race (empty for noun phrases rather than names), and row[6] / row[7]
# the emotion and emotion word -- verify against the CSV header.
eec_sentences = dict()
with open('/content/drive/MyDrive/NLP Capstone/data/Equity-Evaluation-Corpus.csv', 'r') as eec_file:
    csv_reader = csv.reader(eec_file)
    column_names = next(csv_reader)  # skip the header row
    for row in csv_reader:
        # Rows with both columns 6 and 7 empty are skipped.
        if not row[6] and not row[7]:
            continue
        cur_key = (row[2], row[7])
        groups = eec_sentences.setdefault(cur_key, {
            'male-name': [],
            'female-name': [],
            'male-np': [],
            'female-np': []
        })
        # Anything that is not explicitly 'male' is filed under female.
        gender = 'male' if row[4] == 'male' else 'female'
        phrase_kind = 'np' if len(row[5]) == 0 else 'name'
        groups[gender + '-' + phrase_kind].append(row[1])

In [None]:
def model_bias_with_eec(model, tokenizer, eec_sentences, sig_level=0.05):
    """Measures gender bias in the model by comparing the differences in
    sentiment scores when using male vs. female names or noun phrases for
    each template sentence and emotion word in the Equity Evaluation Corpus.

    For each (template, emotion word) key, the male sample is the average
    predicted class over the male-name sentences followed by the predicted
    class of each male noun-phrase sentence; the female sample is built the
    same way. The two samples are compared with an independent two-sample
    t-test (scipy.stats.ttest_ind).

    Args:
        model: An instance of PyTorch torch.nn.Module.
        tokenizer: PreTrainedTokenizer.
        eec_sentences: Dictionary where the key is (template sentence,
        emotion word) and the value is a dictionary where the keys are
        'male-name', 'female-name', 'male-np', and 'female-np', and the
        value for each of these keys is a list of sentences from the Equity
        Evaluation Corpus.
        sig_level: Significance threshold used for a t-test.

    Returns:
        A tuple (sig_vals, not_sig_vals) of lists of
        (template, emotion word, p-value). sig_vals holds the keys whose
        male/female difference is statistically significant
        (p-value < sig_level); not_sig_vals holds all other keys. Keys whose
        male and female samples are identical are reported as not
        significant with a p-value of 1.
    """
    def predicted_classes(sentences):
        # Predicted class index (argmax over the model output) per sentence.
        return get_model_output(model, tokenizer, sentences).argmax(dim=1).tolist()

    sig_vals = []
    not_sig_vals = []
    for (template, emotion), groups in eec_sentences.items():
        male_name_classes = predicted_classes(groups['male-name'])
        female_name_classes = predicted_classes(groups['female-name'])
        # All names of one gender collapse into a single averaged data point.
        avg_male_name_output = sum(male_name_classes) / len(male_name_classes)
        avg_female_name_output = sum(female_name_classes) / len(female_name_classes)
        all_male_output = [avg_male_name_output] + predicted_classes(groups['male-np'])
        all_female_output = [avg_female_name_output] + predicted_classes(groups['female-np'])
        if all_male_output == all_female_output:
            # Identical samples: no difference at all, so not significant.
            not_sig_vals.append((template, emotion, 1))
        else:
            p_val = ttest_ind(all_female_output, all_male_output)[1]
            if p_val < sig_level:
                sig_vals.append((template, emotion, p_val))
            else:
                # Also reached when the p-value is NaN.
                not_sig_vals.append((template, emotion, p_val))
    return (sig_vals, not_sig_vals)

In [None]:
def eec_np_differences(model, tokenizer, eec_sentences):
    """Finds the male/female noun phrase sentence pairs that the model
    assigns different sentiment classes to, for each template sentence and
    emotion word in the Equity Evaluation Corpus.

    The i-th 'male-np' sentence is compared with the i-th 'female-np'
    sentence of the same key, using the argmax of the model output as the
    sentiment class.

    Args:
        model: An instance of PyTorch torch.nn.Module.
        tokenizer: PreTrainedTokenizer.
        eec_sentences: Dictionary where the key is (template sentence,
        emotion word) and the value is a dictionary where the keys are
        'male-name', 'female-name', 'male-np', and 'female-np', and the
        value for each of these keys is a list of sentences from the Equity
        Evaluation Corpus.

    Returns:
        A list of (template, emotion word, differing pairs) for every key
        with at least one differing pair, where each differing pair is
        (male noun phrase sentence, female noun phrase sentence).
    """
    results = []
    for template, emotion in eec_sentences:
        groups = eec_sentences[(template, emotion)]
        male_sentences = groups['male-np']
        female_sentences = groups['female-np']
        male_nps_output = get_model_output(model, tokenizer, male_sentences)
        female_nps_output = get_model_output(model, tokenizer, female_sentences)
        sentiment_diffs = [
            (male_sentences[i], female_sentences[i])
            for i in range(len(male_nps_output))
            if male_nps_output[i].argmax().item() != female_nps_output[i].argmax().item()
        ]
        if sentiment_diffs:
            results.append((template, emotion, sentiment_diffs))
    return results

In [None]:
def eec_avg_sentiment_diff(model, tokenizer, eec_sentences):
    """Computes the average of the differences in average sentiment score
    when using the male vs. female version of a noun phrase for each template
    sentence and emotion word in the Equity Evaluation Corpus.

    The sentiment score of a sentence is the argmax of the model output.

    Args:
        model: An instance of PyTorch torch.nn.Module.
        tokenizer: PreTrainedTokenizer.
        eec_sentences: Dictionary where the key is (template sentence,
        emotion word) and the value is a dictionary where the keys are
        'male-name', 'female-name', 'male-np', and 'female-np', and the
        value for each of these keys is a list of sentences from the Equity
        Evaluation Corpus.

    Returns:
        A pair. The first item is the mean, over all keys, of the absolute
        difference between the average male and average female noun phrase
        sentiment. The second item is a list of three counts: keys where
        the male average was higher, keys where the female average was
        higher, and keys where the two averages were equal.
    """
    male_higher = 0
    female_higher = 0
    ties = 0
    total_gap = 0
    for template, emotion in eec_sentences:
        groups = eec_sentences[(template, emotion)]
        male_nps_output = get_model_output(model, tokenizer, groups['male-np'])
        female_nps_output = get_model_output(model, tokenizer, groups['female-np'])
        female_total = 0
        male_total = 0
        # Both sums run over the number of male sentences.
        for idx in range(len(male_nps_output)):
            female_total += female_nps_output[idx].argmax().item()
            male_total += male_nps_output[idx].argmax().item()
        avg_female_sentiment = female_total / len(female_nps_output)
        avg_male_sentiment = male_total / len(male_nps_output)
        total_gap += abs(avg_male_sentiment - avg_female_sentiment)
        if avg_male_sentiment > avg_female_sentiment:
            male_higher += 1
        elif avg_female_sentiment > avg_male_sentiment:
            female_higher += 1
        else:
            ties += 1
    avg_sentiment_diff = total_gap / len(eec_sentences)
    return avg_sentiment_diff, [male_higher, female_higher, ties]

## Metamorphic Testing and Certified Mitigation of Fairness Violations
Metric defined in [Ma et al. (2020)](https://www.semanticscholar.org/paper/Metamorphic-Testing-and-Certified-Mitigation-of-in-Ma-Wang/5f5e9366983b53d4a753627d1144daa8e890e02f?p2df)

In [None]:
!pip install conceptnet-lite
!pip install nltk
!pip install transformers

In [None]:
import conceptnet_lite
import nltk
import numpy as np
import torch
import pandas as pd
import math
from conceptnet_lite import Label, edges_for, edges_between
from torch.nn import functional as F
from transformers import BertForSequenceClassification, BertTokenizer

# Part-of-speech tags (as produced by nltk.pos_tag) that are treated as nouns
# when looking for words to perturb.
NOUNS = {'NN', 'NNS', 'NNP', 'PRP', 'PRP$'}

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cpu')
if torch.cuda.is_available():
    print('using CUDA')
    device = torch.device('cuda')

conceptnet_lite.connect('/content/drive/MyDrive/NLP Capstone/data/conceptnet/conceptnet.db')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# get human words from pre-processed file (one word per line); the context
# manager closes the file even if reading fails
# NOTE(review): splitting on '\n' keeps an empty-string entry if the file ends
# with a newline, which would be included in the printed count.
with open('/content/drive/MyDrive/NLP Capstone/data/conceptnet/human_words.txt', 'r') as human_words_file:
    HUMAN_WORDS = set(human_words_file.read().split('\n'))
print('num human words:', len(HUMAN_WORDS))

# get gendered words from pre-processed file (one word per line)
with open('/content/drive/MyDrive/NLP Capstone/data/conceptnet/gendered_words.txt', 'r') as gendered_words_file:
    GENDERED_WORDS = set(gendered_words_file.read().split('\n'))
print('num gendered words:', len(GENDERED_WORDS))

In [5]:
def graph_is_a_rev(word):
    """Collects the words that ConceptNet says are a kind of `word`.

    Looks up the English label `word`, walks the same-language edges of its
    concepts, and keeps the start text of every 'is_a' edge whose end text
    is `word`.

    Args:
        word: The English word to look up.

    Returns:
        A set of the start-node texts of the matching 'is_a' edges.
    """
    concepts = Label.get(text=word, language='en').concepts
    return {
        edge.start.text
        for edge in edges_for(concepts, same_language=True)
        if edge.relation.name == 'is_a' and edge.end.text == word
    }

In [6]:
def get_embedding(word_embeddings, tokenizer, token):
    """Looks up the embedding of the first token id produced for `token`.

    Args:
        word_embeddings: Tensor indexed by token id along its first
        dimension.
        tokenizer: PreTrainedTokenizer.
        token: The word to look up.

    Returns:
        The row of word_embeddings at the first id the tokenizer returns
        for `token` (encoded without special tokens).
    """
    encoding_options = dict(add_special_tokens=False, max_length=1,
                            padding='max_length', return_attention_mask=False,
                            return_tensors='pt')
    encoded_dict = tokenizer.encode_plus(token, **encoding_options)
    first_token_id = encoded_dict['input_ids'][0][0]
    return word_embeddings[first_token_id]

def find_closest_word(word_embeddings, tokenizer, embedding, word):
    """Finds the vocabulary entry whose embedding is closest (in L2
    distance) to `embedding`, excluding `word` itself.

    Args:
        word_embeddings: Tensor indexed by token id along its first
        dimension, one embedding per row.
        tokenizer: PreTrainedTokenizer.
        embedding: The query vector.
        word: The word to exclude from the search; the first token id the
        tokenizer produces for it is the row that gets skipped.

    Returns:
        The decoded text of the closest remaining token.
    """
    encoded_dict = tokenizer.encode_plus(word, add_special_tokens=False,
                                         max_length=1, padding='max_length',
                                         return_attention_mask=False, return_tensors='pt')
    token_id = int(encoded_dict['input_ids'][0][0])
    with torch.no_grad():
        # L2 distance from the query to every row of the embedding matrix.
        distances = torch.linalg.norm(word_embeddings - embedding, dim=1)
        # Exclude the current word by making it infinitely far away.
        # Overwriting its embedding with zeros does not exclude it: the zero
        # vector can still be the nearest row to the query.
        distances[token_id] = float('inf')
        closest_token = int(torch.argmin(distances))
    # decode the token
    return tokenizer.batch_decode([[closest_token]], skip_special_tokens=True)[0]

# NOTE: known issue -- right now this returns "female" for most words, which
# is clearly wrong.
def most_related_attribute(word_embedding, Sp_matrix):
    """Finds the row of Sp_matrix closest (in L2 distance) to word_embedding.

    Args:
        word_embedding: The query vector.
        Sp_matrix: Tensor with one sensitive-attribute embedding per row.

    Returns:
        The index (as a tensor) of the closest row.
    """
    offsets = Sp_matrix - word_embedding
    row_distances = torch.linalg.norm(offsets, dim = 1)
    return torch.argmin(row_distances)

In [7]:
def analogy_mutations(x, word, Sp_matrix, word_embeddings, tokenizer):
    """Swap a human-related noun token in the sentence with an analogous word w.r.t.
    the sensitive attribute.

    The attribute embedding closest to `word` is taken as the attribute the
    word currently expresses. For every other attribute embedding Wpi, the
    analogy word is the vocabulary entry closest to Wpi + Wt - Wpt, and a
    mutation of `x` is produced by substituting it for `word`.

    Every whole-word, case-insensitive occurrence of `word` in `x` is
    replaced. Matching whole words keeps the substitution from rewriting the
    inside of longer words (e.g. "he" inside "the"), and ignoring case lets
    a lower-cased `word` still match a capitalized occurrence in `x`.

    Args:
        x: The original sentence.
        word: The human-related word in `x` to replace.
        Sp_matrix: Tensor with one sensitive-attribute embedding per row.
        word_embeddings: Tensor indexed by token id, one embedding per row.
        tokenizer: PreTrainedTokenizer.

    Returns:
        A pair (mutations, mutation_words): the mutated sentences and, in
        the same order, the analogy word used for each. Both are empty when
        Sp_matrix has no row other than the one closest to `word`.
    """
    import re

    # find the gendered word closest to the token, this is the analogy for token
    Wt = get_embedding(word_embeddings, tokenizer, word)
    i_pt = int(most_related_attribute(Wt, Sp_matrix))
    Wpt = Sp_matrix[i_pt]
    # every attribute embedding except the one the word is already closest to
    other_attributes = [Sp_matrix[i] for i in range(len(Sp_matrix)) if i != i_pt]

    # Lookarounds rather than \b so words with punctuation at their edges
    # still match as whole words.
    word_pattern = re.compile(r'(?<!\w)' + re.escape(word) + r'(?!\w)', re.IGNORECASE)

    # for each remaining gendered word, find the analogy for it with vector math
    mutations = list()
    mutation_words = list()
    for Wpi in other_attributes:
        analogy_word = find_closest_word(word_embeddings, tokenizer, Wpi + Wt - Wpt, word)
        # A callable replacement keeps backslashes in analogy_word literal.
        mutations.append(word_pattern.sub(lambda _match: analogy_word, x))
        mutation_words.append(analogy_word)

    # return full set of perturbations
    return mutations, mutation_words

def active_mutations(x, word, Sp):
    """Add an adjective w.r.t. the sensitive attribute in front of the human-related noun token

    Every whole-word, case-insensitive occurrence of `word` in `x` gets the
    attribute word inserted in front of it; the matched text itself keeps
    its original casing. Matching whole words keeps the insertion from
    landing inside longer words that merely contain `word`.

    Args:
        x: The original sentence.
        word: The human-related word in `x` to put the adjective in front of.
        Sp: Iterable of sensitive-attribute words (strings).

    Returns:
        A pair (mutations, mutation_words): the mutated sentences and, in
        the same order, the attribute word used for each. Both are empty if
        `word` is itself in GENDERED_WORDS.
    """
    import re

    # if token is related to gendered word, then it is not neutral, so don't add adjective in front
    # use pre-processed file to check if word is gendered
    if word in GENDERED_WORDS:
        return list(), list()

    # Lookarounds rather than \b so words with punctuation at their edges
    # still match as whole words.
    word_pattern = re.compile(r'(?<!\w)' + re.escape(word) + r'(?!\w)', re.IGNORECASE)

    # for each gendered word, add it in front of token
    mutations = list()
    mutation_words = list()
    for pi in Sp:
        mutations.append(word_pattern.sub(lambda match: pi + ' ' + match.group(0), x))
        mutation_words.append(pi)
    return mutations, mutation_words


def perturbator(x, Sp, Sp_matrix, word_embeddings, tokenizer):
    """Generates all perturbations of a sentence w.r.t. the sensitive attribute.

    The sentence is tokenized and part-of-speech tagged with nltk. Every
    token whose tag is in NOUNS and whose lower-cased text is in HUMAN_WORDS
    contributes its analogy mutations followed by its active mutations.
    No fluency filter is applied (left out because of the nature of tweets).

    Args:
        x: The original sentence.
        Sp: Iterable of sensitive-attribute words.
        Sp_matrix: Tensor with one sensitive-attribute embedding per row.
        word_embeddings: Tensor indexed by token id, one embedding per row.
        tokenizer: PreTrainedTokenizer.

    Returns:
        A pair (perturbations, perturbation_words): the mutated sentences
        and, in the same order, the word responsible for each mutation.
    """
    tagged = nltk.pos_tag(nltk.word_tokenize(x))

    perturbations = []
    perturbation_words = []
    for token_text, tag in tagged:
        word = token_text.lower()
        # only person nouns are mutated
        if tag not in NOUNS or word not in HUMAN_WORDS:
            continue
        analogy_sents, analogy_words = analogy_mutations(x, word, Sp_matrix, word_embeddings, tokenizer)
        active_sents, active_words = active_mutations(x, word, Sp)
        perturbations += analogy_sents
        perturbations += active_sents
        perturbation_words += analogy_words
        perturbation_words += active_words

    return perturbations, perturbation_words

In [8]:
def is_metamorphic_fair(model, tokenizer, x, Sp, Sp_matrix, word_embeddings):
    """Check whether the model predicts the same class for ``x`` and all its mutations.

    Args:
        model: Model passed to ``get_model_output_class``.
        tokenizer: Tokenizer passed to ``get_model_output_class`` and ``perturbator``.
        x: Sentence under test.
        Sp, Sp_matrix, word_embeddings: Forwarded to ``perturbator``.

    Returns:
        A tuple ``(is_fair, num_mutations, violation_words)``. ``is_fair`` is
        True when there are no mutations or every mutation gets the same
        predicted class as ``x``; otherwise it is False and
        ``violation_words`` lists the mutation words whose sentences were
        classified differently.
    """
    mutations, mutation_words = perturbator(x, Sp, Sp_matrix, word_embeddings, tokenizer)
    num_mutations = len(mutations)
    if num_mutations == 0:
        return True, num_mutations, []

    # compare the class of the original sentence with the class of each mutation
    real_predicted = get_model_output_class(model, tokenizer, [x])
    mutation_outputs = get_model_output_class(model, tokenizer, mutations)
    if torch.all(real_predicted == mutation_outputs):
        return True, num_mutations, []

    mismatch_positions = torch.nonzero(real_predicted != mutation_outputs)
    violation_words = [mutation_words[int(pos)] for pos in mismatch_positions]
    return False, num_mutations, violation_words

In [9]:
# Load the fine-tuned classifier and its tokenizer from Drive.
model_dir = 'no_debias'
model_root = '/content/drive/MyDrive/NLP Capstone/models/' + model_dir
model = BertForSequenceClassification.from_pretrained(model_root + '/pytorch_model.bin',
                                                      config=model_root + '/config.json')
model = model.to(device)
print('on model', model_dir)

# The first dimension of the first parameter of the input-embedding module
# gives the number of embeddings.
input_embeddings = model.get_input_embeddings()
NUM_EMBEDDINGS = int(next(iter(input_embeddings.parameters())).size()[0])
print('NUM_EMBEDDINGS:', NUM_EMBEDDINGS)
tokenizer = BertTokenizer.from_pretrained(model_root + '/', do_lower_case = True)

# Look up the embedding of every token id and stack them into one matrix.
word_embeddings = torch.stack([
    input_embeddings(torch.IntTensor([token_id]).to(model.device))[0]
    for token_id in range(NUM_EMBEDDINGS)
])
first_word_embeddings = word_embeddings.clone() # sanity check for later to make sure it wasn't modified
print('word embeddings matrix:', word_embeddings.shape)

# pre-process some data to speed up some runtime
# get words relating to gender (male, female, etc.)
Sp = graph_is_a_rev('gender')
Sp.remove('neuter') # remove b/c gender neutral words aren't covered in our techniques
Sp_matrix = torch.stack([get_embedding(word_embeddings, tokenizer, term) for term in Sp])
first_Sp = set(Sp) # sanity check for later to make sure it wasn't modified
first_Sp_matrix = Sp_matrix.clone() # sanity check for later to make sure it wasn't modified
print('Sp:', Sp)
print('Sp_matrix:', Sp_matrix.shape)

on model no_debias
NUM_EMBEDDINGS: 30522
word embeddings matrix: torch.Size([30522, 768])
Sp: {'male', 'female', 'masculine', 'feminine'}
Sp_matrix: torch.Size([4, 768])


### Metamorphic testing

In [None]:
# Run the metamorphic bias evaluation over the Twitter training data.
# For every tweet (column 2 of the TSV) the model is checked for metamorphic
# fairness; tweets with at least one violation are written to the ids file and
# the mutation words responsible are tallied in `violations_dict`.
print('running regular bias eval on model ' + model_dir)
twitter_file = pd.read_csv('/content/drive/MyDrive/NLP Capstone/data/twitter_formatted/train_original.tsv', header=None, sep='\t')
violations_dict = dict()
sentences = 0
sentences_w_mutations = 0
violations = 0
# violations_file = open('./mt_violations/mt_violations_' + model_dir, 'w')
# `with` guarantees the output file is flushed and closed even if the
# evaluation raises part-way through the (long) loop.
with open('/content/drive/MyDrive/NLP Capstone/mt_violation_ids/mt_violation_ids_' + model_dir + '.txt', 'w') as violations_ids_file:
    for i, row in twitter_file.iterrows():
        # progress report every 1000 rows
        if i % 1000 == 0:
            print(i, sentences_w_mutations, violations)
        sentences += 1
        is_fair, num_mutations, violation_words = is_metamorphic_fair(model, tokenizer, row[2], Sp, Sp_matrix, word_embeddings)
        if num_mutations > 0:
            sentences_w_mutations += 1
        if not is_fair:
            for word in violation_words:
                violations_dict[word] = violations_dict.get(word, 0) + 1
            violations_ids_file.write(row[2] + '\n')
            # orig_output = get_model_output_class(model, tokenizer, [row[2]])
            # violations_file.write(str(int(orig_output)) + ' ' + row[2] + '\n')
            # mutations, _ = perturbator(row[2], Sp, Sp_matrix, word_embeddings, tokenizer)
            # for m in mutations:
            #     mut_output = get_model_output_class(model, tokenizer, [m])
            #     violations_file.write('\t' + str(int(mut_output)) + ' ' + m + '\n')
            violations += 1
# violations_file.close()
print('violations per word:', sorted(violations_dict.items(), key=lambda p:p[1], reverse=True))
print('final stats:')
print('\tsentences:', sentences)
print('\tsentences with mutations:', sentences_w_mutations)
print('\tsentence violations:', violations)
print('\ttotal violations:', sum(violations_dict.values()))

In [None]:
# Sanity checks: the embeddings and the sensitive-attribute set must still
# match the snapshots taken right after they were built.
assert torch.equal(first_word_embeddings, word_embeddings)
assert first_Sp == Sp
assert torch.equal(first_Sp_matrix, Sp_matrix)

### Certified Mitigation

In [10]:
def get_certified_mitigation_output(model, tokenizer, sentences, Sp, Sp_matrix, word_embeddings, epsilon):
    """Compute the certified-mitigation smoothed model output for each sentence.

    For a sentence with ``k > 0`` mutations, the model output for the original
    sentence is weighted by ``exp(epsilon) / (k + exp(epsilon))`` and the
    output for each mutation by ``1 / (k + exp(epsilon))``; the weighted
    outputs are summed. A sentence without mutations keeps its plain model
    output.

    Args:
        model: Model passed to ``get_model_output``.
        tokenizer: Tokenizer passed to ``get_model_output`` and ``perturbator``.
        sentences: Iterable of sentences to evaluate.
        Sp, Sp_matrix, word_embeddings: Forwarded to ``perturbator``.
        epsilon: Smoothing parameter used in the weights above.

    Returns:
        A tensor stacking one smoothed output row per sentence.
    """
    smoothed_rows = []
    for sentence in sentences:
        mutations, _ = perturbator(sentence, Sp, Sp_matrix, word_embeddings, tokenizer)
        smoothed = get_model_output(model, tokenizer, [sentence])
        k = len(mutations)
        if k > 0:
            # down-weight the original prediction ...
            smoothed *= (math.exp(epsilon) / (k + math.exp(epsilon)))
            # ... and add the equally weighted predictions of all mutations
            mutation_outputs = get_model_output(model, tokenizer, mutations)
            mutation_outputs *= (1 / (k + math.exp(epsilon)))
            smoothed += torch.sum(mutation_outputs, dim = 0)
        # the output for a single sentence has one row; keep just that row
        smoothed_rows.append(smoothed[0])
    return torch.stack(smoothed_rows)

def get_certified_mitigation_output_class(model, tokenizer, sentences, Sp, Sp_matrix, word_embeddings, epsilon):
    """Return, for each sentence, the index of the largest smoothed output value.

    All arguments are forwarded unchanged to ``get_certified_mitigation_output``.
    """
    smoothed = get_certified_mitigation_output(
        model, tokenizer, sentences, Sp, Sp_matrix, word_embeddings, epsilon)
    return torch.argmax(smoothed, dim = 1)

def is_metamorphic_fair_certified_mitigation(model, tokenizer, x, Sp, Sp_matrix, word_embeddings, epsilon):
    """Check metamorphic fairness of ``x`` using certified-mitigation predictions.

    Args:
        model, tokenizer: Passed to ``get_certified_mitigation_output_class``.
        x: Sentence under test.
        Sp, Sp_matrix, word_embeddings: Forwarded to ``perturbator`` and the
            certified-mitigation predictor.
        epsilon: Smoothing parameter for certified mitigation.

    Returns:
        A 6-tuple ``(is_fair, num_mutations, violation_words, mutations,
        real_predicted, mutation_outputs)``. When the sentence is fair (no
        mutations, or every mutation gets the same class as ``x``) the last
        four entries are the placeholders ``[], [], 0.0, []``.
    """
    mutations, mutation_words = perturbator(x, Sp, Sp_matrix, word_embeddings, tokenizer)
    num_mutations = len(mutations)
    fair_result = (True, num_mutations, [], [], 0.0, [])
    if num_mutations == 0:
        return fair_result

    # compare the smoothed class of the original with that of each mutation
    real_predicted = get_certified_mitigation_output_class(model, tokenizer, [x], Sp, Sp_matrix, word_embeddings, epsilon)
    mutation_outputs = get_certified_mitigation_output_class(model, tokenizer, mutations, Sp, Sp_matrix, word_embeddings, epsilon)
    if torch.all(real_predicted == mutation_outputs):
        return fair_result

    mismatch_positions = torch.nonzero(real_predicted != mutation_outputs)
    violation_words = [mutation_words[int(pos)] for pos in mismatch_positions]
    return False, num_mutations, violation_words, mutations, real_predicted, mutation_outputs

In [None]:
# Run the metamorphic bias evaluation with certified mitigation applied.
# For every tweet (column 2 of the TSV) the smoothed predictions are checked
# for metamorphic fairness; tweets with at least one violation are written to
# the ids file and the responsible mutation words are tallied.
EPSILON = 0
print('running certified mitigation bias eval on model ' + model_dir + ' with epsilon', EPSILON)
twitter_file = pd.read_csv('/content/drive/MyDrive/NLP Capstone/data/twitter_formatted/train_original.tsv', header=None, sep='\t')
violations_dict = dict()
sentences = 0
sentences_w_mutations = 0
violations = 0
# violations_file = open('./mt_violations/mt_violations_' + model_dir, 'w')
# `with` guarantees the output file is flushed and closed even if the
# evaluation raises part-way through the (long) loop.
with open('/content/drive/MyDrive/NLP Capstone/mt_violation_ids/mt_violation_ids_cm_' + model_dir + '.txt', 'w') as violations_ids_file:
    for i, row in twitter_file.iterrows():
        # progress report every 1000 rows
        if i % 1000 == 0:
            print(i, sentences_w_mutations, violations)
        sentences += 1
        is_fair, num_mutations, violation_words, _, _, _ =\
            is_metamorphic_fair_certified_mitigation(model, tokenizer, row[2], Sp, Sp_matrix, word_embeddings, EPSILON)
        if num_mutations > 0:
            sentences_w_mutations += 1
        if not is_fair:
            for word in violation_words:
                violations_dict[word] = violations_dict.get(word, 0) + 1
            violations_ids_file.write(row[2] + '\n')
            # violations_file.write(str(int(real_predicted)) + ' ' + row[2] + '\n')
            # for j in range(len(mutations)):
            #     violations_file.write('\t' + str(int(mutation_outputs[j])) + ' ' + mutations[j] + '\n')
            violations += 1
# violations_file.close()
print('violations per word:', sorted(violations_dict.items(), key=lambda p:p[1], reverse=True))
print('final stats:')
print('\tsentences:', sentences)
print('\tsentences with mutations:', sentences_w_mutations)
print('\tsentence violations:', violations)
print('\ttotal violations:', sum(violations_dict.values()))

running certified mitigation bias eval on model no_debias with epsilon 0
0 0 0
1000 352 0


In [None]:
from sklearn.metrics import f1_score

# Measure test/val accuracy and F1 with certified mitigation applied.
EPSILON = 0
dev_file = pd.read_csv('/content/drive/MyDrive/NLP Capstone/data/twitter_formatted/dev.tsv',
                       sep='\t', header=None)
test_file = pd.read_csv('/content/drive/MyDrive/NLP Capstone/data/twitter_formatted/test.tsv',
                        sep='\t', header=None)
print(f'calculating accuracy and f1 for model {model_dir} with epsilon {EPSILON}')

def calc_certified_mitigation_accuracy_and_f1(twitter_file):
    """Compute accuracy and F1 of the certified-mitigation predictions.

    Uses the notebook-level ``model``, ``tokenizer``, ``Sp``, ``Sp_matrix``,
    ``word_embeddings`` and ``EPSILON``.

    Args:
        twitter_file: DataFrame whose column 1 holds the label (the string
            'positive' maps to class 1, anything else to class 0) and whose
            column 2 holds the sentence.

    Returns:
        A pair ``(accuracy, f1)`` of floats.
    """
    sentences = []
    labels = []
    for _, row in twitter_file.iterrows():
        sentences.append(row[2])
        labels.append(1 if row[1] == 'positive' else 0)
    # word_embeddings must be passed before epsilon; omitting it shifts
    # EPSILON into the word_embeddings slot and the call fails with a
    # TypeError for the missing `epsilon` argument.
    y_pred = get_certified_mitigation_output_class(model, tokenizer, sentences, Sp, Sp_matrix,
                                                   word_embeddings, EPSILON)
    # Build the labels on whatever device the predictions live on, so this
    # also works without CUDA.
    y_real = torch.tensor(labels, dtype=torch.int32, device=y_pred.device)
    accuracy = float(torch.sum(y_real == y_pred) / y_real.shape[0])
    f1 = f1_score(y_real.cpu().numpy(), y_pred.cpu().numpy())
    return accuracy, f1

# Evaluate on the test split (dev evaluation is currently disabled).
# dev_acc, dev_f1 = calc_certified_mitigation_accuracy_and_f1(dev_file)
test_acc, test_f1 = calc_certified_mitigation_accuracy_and_f1(test_file)

# Report both metrics with four decimal places.
print('test accuracy:', f"{test_acc:.4f}")
print('test f1:', f"{test_f1:.4f}")
# print('dev accuracy:', "{:.4f}".format(dev_acc))
# print('dev f1:', "{:.4f}".format(dev_f1))

## Data Map
Create a data map using the confidence and variability values for the Twitter training data, computed according to [Swayamdipta et al. (2020)](https://arxiv.org/abs/2009.10795)

In [None]:
# Load the per-tweet confidence and variability values for the Twitter
# training data.
# Edit the file path below to go to a file containing the tweets and their
# confidence and variability values. The first column in the file contains the
# tweets, the second to last column contains the confidence values, and the last
# column contains the variability values.
twitter_data_map = []
with open('/content/drive/MyDrive/NLP Capstone/data/datamaps_no_duplicates.tsv', 'r') as data_maps_file:
    csv_reader = csv.reader(data_maps_file, delimiter='\t')
    # Skip the header row holding the column names
    next(csv_reader)
    for row in csv_reader:
        confidence_col = len(row) - 2
        confidence = float(row[confidence_col])
        variability = float(row[confidence_col + 1])
        # each entry is (tweet, confidence, variability)
        twitter_data_map.append((row[0], confidence, variability))
# Separate flat lists of the two scores, in file order, for plotting
confidence_vals = [entry[1] for entry in twitter_data_map]
variability_vals = [entry[2] for entry in twitter_data_map]

In [None]:
# Draw the data map: confidence (y) against variability (x), one small
# marker per tweet.
fig, ax = plt.subplots(figsize=(12, 10))
ax.scatter(variability_vals, confidence_vals, s=0.5)
ax.set_title('Data Map for Twitter Training Data')
ax.set_xlabel('Variability')
ax.set_ylabel('Confidence')
plt.show()

In [None]:
# Histogram of the confidence values.
# NOTE(review): hist() is called without density=True, so the y axis shows
# raw counts even though it is labelled 'Density' -- confirm which is intended.
fig, ax = plt.subplots(figsize=(8, 6))
ax.hist(confidence_vals, rwidth=0.8)
ax.set_title('Confidence vs. Density for Twitter Training Data')
ax.set_xlabel('Confidence')
ax.set_ylabel('Density')
plt.show()

In [None]:
# Histogram of the variability values.
# NOTE(review): hist() is called without density=True, so the y axis shows
# raw counts even though it is labelled 'Density' -- confirm which is intended.
fig, ax = plt.subplots(figsize=(8, 6))
ax.hist(variability_vals, rwidth=0.8)
ax.set_title('Variability vs. Density for Twitter Training Data')
ax.set_xlabel('Variability')
ax.set_ylabel('Density')
plt.show()

In [None]:
# Partition the Twitter training data by its data-map region:
#   easy      - variability < 0.1 and confidence > 0.5
#   hard      - variability < 0.1 and confidence <= 0.5
#   ambiguous - everything else (high variability)
easy_tweets = []
hard_tweets = []
ambiguous_tweets = []
for text, confidence, variability in twitter_data_map:
    if variability < 0.1:
        bucket = easy_tweets if confidence > 0.5 else hard_tweets
    else:
        bucket = ambiguous_tweets
    bucket.append(text)

# Measure co-occurrence bias and conditional co-occurrence bias on each
# partition, printing the results as each one is computed.
cooccurrence_results = []
for heading, bucket in (
    ('Easy tweets:\nCo-occurrence bias:', easy_tweets),
    ('Hard tweets:\nCo-occurrence bias:', hard_tweets),
    ('Ambiguous tweets:\nCo-occurrence bias:', ambiguous_tweets),
):
    result = measure_cooccurrence_bias(bucket, female_words, male_words)
    print(heading, result[0],
          '\nConditional co-occurrence bias:', result[1], '\n')
    cooccurrence_results.append(result)
easy_cooccurrence_bias, hard_cooccurrence_bias, ambiguous_cooccurrence_bias = cooccurrence_results