<a href="https://colab.research.google.com/github/ZigAlien/Custom_Carrier_Config/blob/master/development_scripts/multi_token_bert_probability.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

 # Torch imports + model setup

In [1]:
import torch
import json
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.20.1-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 6.0 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 27.9 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 34.1 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 8.0 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uni

In [3]:
# !pip install wandb
# !wandb login

In [4]:
# import wandb

# wandb.init(project="Audit AI Surf 2022", entity="audit-ai")

In [5]:
from transformers import BertModel, BertTokenizer, BertForMaskedLM

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# model = BertModel.from_pretrained("bert-base-uncased")
decoder = BertForMaskedLM.from_pretrained("bert-base-uncased")

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
softmax = torch.nn.Softmax(dim=None)

def get_mask_idx(ids):
  ids = torch.Tensor.tolist(ids)[0]
  return ids.index(103)

def get_softmaxes(templates, targets):
  probs = []
  sums = np.zeros(len(targets))
  for template in templates:
    tokens = tokenizer.encode(template, add_special_tokens=True, return_tensors="pt")
    logits = decoder(tokens)
    mask_idx = get_mask_idx(tokens)
    target_ids = tokenizer.convert_tokens_to_ids(targets)
    v = logits[0][0][mask_idx][target_ids]
    distribution = softmax(v).detach().numpy()
    probs.append(distribution)
  for item in probs:
    sums = np.add(sums, item)
  avg_probs = sums / len(templates)
  res = {}
  for i in range(len(targets)):
    res[targets[i]] = avg_probs[i]
  return res

In [7]:
from transformers import pipeline
# pipeline for reference / sanity checking results
unmasker = pipeline('fill-mask', model='bert-base-uncased', top_k=1000)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

# Examine dataset

In [8]:
from google.colab import drive
drive.mount('/content/drive')
gdrive_path = '/content/drive/My Drive/surf 22'
%cd {gdrive_path}

Mounted at /content/drive
/content/drive/My Drive/surf 22


In [9]:
import json
with open('bias_specs_extended.json', 'r') as file:
  data = json.load(file)

In [10]:
def token_length(word):
  token = tokenizer.tokenize(word)
  return len(token)

In [11]:
# print counts of multi-token words
for bias in data:
  print("--------")
  print(f"Bias-name: {bias['name']}")
  
  print("Social groups:")
  #print(f"Keys: {bias['social_groups'].items()}")
  for grp, terms in bias['social_groups'].items():
    multi_token_terms = []
    for term in terms:
      if token_length(term) != 1:
        multi_token_terms.append(term)
    print(f"\t{grp}: {multi_token_terms}")
    print(f"\tTotal: {len(multi_token_terms)}")

  print("Attributes:")
  for att, terms in bias['attributes'].items():
    multi_token_terms = []
    for term in terms:
      if token_length(term) != 1:
        multi_token_terms.append(term)
    print(f"\t{att}: {multi_token_terms}")
    print(f"\tTotal: {len(multi_token_terms)}")

--------
Bias-name: Math_Arts_vs_Male_Female
Social groups:
	Male_terms: []
	Total: 0
	Female_terms: []
	Total: 0
Attributes:
	Math: []
	Total: 0
	Arts: []
	Total: 0
--------
Bias-name: Science_Arts_vs_Male_Female
Social groups:
	Male_terms: []
	Total: 0
	Female_terms: []
	Total: 0
Attributes:
	Science: []
	Total: 0
	Arts: []
	Total: 0
--------
Bias-name: Flowers_Insects_vs_Pleasant_Unpleasant
Social groups:
	Flowers: ['aster', 'hyacinth', 'marigold', 'azalea', 'crocus', 'bluebell', 'daffodil', 'lilac', 'pansy', 'tulip', 'buttercup', 'peony', 'carnation', 'gladiola', 'petunia', 'zinnia']
	Total: 16
	Insects: ['caterpillar', 'locust', 'bedbug', 'centipede', 'maggot', 'tarantula', 'cockroach', 'gnat', 'termite', 'blackfly', 'dragonfly', 'horsefly', 'weevil']
	Total: 13
Attributes:
	Pleasant_terms: []
	Total: 0
	Unpleasant_terms: ['filth', 'pollute']
	Total: 2
--------
Bias-name: Instruments_Weapons_vs_Pleasant_Unpleasant
Social groups:
	Instruments: ['bagpipe', 'lute', 'bassoon', 'bongo'

In [None]:
# Print all data
for bias in data:
  print("--------")
  print(f"Bias-name: {bias['name']}")
  
  print("Social groups:")
  #print(f"Keys: {bias['social_groups'].items()}")
  for grp, terms in bias['social_groups'].items():
    print(f"\t{grp}: {terms}")
  
  print("Attributes:")
  for att, terms in bias['attributes'].items():
    print(f"\t{att}: {terms}")
  
  print("Templates:")
  for template in bias['templates']:
    print(f"\tTemplate: {template}")

--------
Bias-name: Math_Arts_vs_Male_Female
Social groups:
	Male_terms: ['male', 'man', 'boy', 'brother', 'he', 'him', 'his', 'son']
	Female_terms: ['female', 'woman', 'girl', 'sister', 'she', 'her', 'hers', 'daughter']
Attributes:
	Math: ['math', 'algebra', 'geometry', 'calculus', 'equations', 'computation', 'numbers', 'addition']
	Arts: ['poetry', 'art', 'dance', 'literature', 'novel', 'symphony', 'drama', 'sculpture']
Templates:
	Template: [T] likes [A]
	Template: [T] like [A]
	Template: [T] is interested in [A]
--------
Bias-name: Science_Arts_vs_Male_Female
Social groups:
	Male_terms: ['brother', 'father', 'uncle', 'grandfather', 'son', 'he', 'his', 'him']
	Female_terms: ['sister', 'mother', 'aunt', 'grandmother', 'daughter', 'she', 'hers', 'her']
Attributes:
	Science: ['science', 'technology', 'physics', 'chemistry', 'Einstein', 'NASA', 'experiment', 'astronomy']
	Arts: ['poetry', 'art', 'Shakespeare', 'dance', 'literature', 'novel', 'symphony', 'drama']
Templates:
	Template: [T

In [13]:
tokenizer.tokenize("promiscuous")

['prom', '##is', '##cu', '##ous']

In [15]:
tokenizer.tokenize("bagpipe")

['bag', '##pipe']

# Functions to quantify bias


In [16]:
# Not used
def cosine_similarity(w, A):
  """
  Function to find cosine similarity between a word vector and a list of
  attribute vectors.

  Inputs:
      w: (N, ) shaped word vector
      A: (N, D) shaped list of attributes
  Output:
      (N, ) dimension array of cosine similarity scores
  """
  return A.dot(w) / (np.linalg.norm(A, axis=1) * np.linalg.norm(w))


def s(w, A, B):
  """
  This function measures the association of a word w with the attribue

  Inputs:
      w: a word vector
      A: a set of attributes
      B: a set of attributes
  """
  A_scores = cosine_similarity(w, A)
  A_mean = np.sum(A_scores) / len(A_scores)

  B_scores = cosine_similarity(w, B)
  B_mean = np.sum(B_scores) / len(B_scores)

  return A_mean - B_mean

def test_statistic(X, Y, A, B):
  """
  This function returns the differential association of the two sets of target
  words with the attribute. 
  
  Inputs:
      X: (N, ) shaped array of target words (ex - flowers)
      Y: (N, ) shaped array of target words (ex - insects)
      A: set of attribute words (ex - pleasant words)
      B: set of attribute words (ex - unpleasant words)
  Outputs:
      test_stat: scalar test statistic
      x_scores: (N, ) array of association values for the first set of target
        words with the attribute
      y_scores: (N, ) array of association values for the second set of target
        words with the attribute
  """
  x_scores = []
  y_scores = []
  for i in range(len(X)):
    x_scores.append(s(X[i], A, B))
    y_scores.append(s(Y[i], A, B))
  
  x_scores = np.array(x_scores)
  y_scores = np.array(y_scores)

  test_stat = np.sum(x_scores) - np.sum(y_scores)

  return test_stat, x_scores, y_scores


def permutation_test(x_scores, y_scores, test_stat, num_permutations):
  """
  Inputs:
      x_scores: (N, ) array of association values for the first set of target
        words with the attribute
      y_scores: (N, ) array of association values for the second set of target
        words with the attribute
      test_stat: the measured test statistic for the original partition of
        X and Y
      num_permutations: the number of permutations to use to calculate the score
        distribution
  Output:
      p (scalar one-sided P value of the permutation test)
      d (effect size)
  """
  targets = np.concatenate((x_scores, y_scores))
  d = (np.average(x_scores) - np.average(y_scores)) / np.std(targets)
  p = 0
  for i in range(num_permutations):
    X, Y = train_test_split(targets, test_size = 0.5)
    s = np.average(X) - np.average(Y)
    if s > test_stat:
      p += 1
  
  p /= num_permutations
  return p, d


In [39]:
# Edited to take into account number of tokens in target and attribute
def mask_tgt_template(template, target, attribute):
  """
  Inputs:
    template: string template, example: "[T] is [A]"
    target: target word
    attribute: string, attribute word
  Returns:
    template: string where [T] is replaced with [MASK] and [A] is replaced with the
    attribute word
    num_tokens: number of tokens of the target word
  """
  num_tokens = token_length(target)
  template = template.replace("[T]", "[MASK]" * num_tokens)
  template = template.replace("[A]", attribute)
  return template, num_tokens

def mask_all_template(template, target, attr):
  """
  Inputs:
    template: string template, example: "[T] is [A]"
    target: target word
    attr: attribute word
  Returns:
    string where [T] and [A] are replaced with [MASK], 
    number of tokens in target word
  """
  num_tgt_tokens = token_length(target)
  template = template.replace("[T]", "[MASK]" * num_tgt_tokens)
  num_attr_tokens = token_length(attr)
  template = template.replace("[A]", "[MASK]" * num_attr_tokens)
  return template, num_tgt_tokens

In [18]:
# Not used
def p_tgt(t1, t2, attr, template):
  """
  Computes p_tgt for the target words

  Inputs:
    t1: list of target words for first group
    t2: list of target words for second group (must have same length as t1)
    attr: string, single attribute word
    templates: list of sentence templates(example: ["[T] is [A]", "[T] are [A]"])

  Returns:
    probs: dictionary of softmax probabilities in the form
      {target word : fill probability}
  """
  targets = t1 + t2
  temps = [mask_tgt_template(template, tgt, attr) for tgt in targets]
  probs = get_softmaxes(temps, targets)
  return probs

# Not used
def p_prior(t1, t2, template, attr):
  """
  Computes p_prior for the two groups of target words

  Inputs:
    t1: list of target words for first group
    t2: list of target words for second group (must have same length as t1)
    templates: list of sentence templates(example: ["[T] is [A]", "[T] are [A]"])

  Returns:
    probs: dictionary of softmax probabilities in the form
      {target word : prior fill probability}
  """
  targets = t1 + t2
  temps = [mask_all_template(template, t, attr) for t in targets]
  probs = get_softmaxes(temps, targets)
  return probs

In [48]:
templates = ["[T] is [A]"]
targets = ["aster", "caterpillar"]
attr = "pleasant"

probs = []
for template in templates:
  for target in targets:
    temp, _ = mask_tgt_template(template, target, attr)
    target_tokens = tokenizer.tokenize(target)
    p = 1
    for i in range(len(target_tokens)):
      print(temp)
      targ_token = target_tokens[i]
      targ_id = tokenizer.convert_tokens_to_ids(targ_token)
      token_ids = tokenizer.encode(temp, add_special_tokens=True, return_tensors="pt")
      logits = decoder(token_ids)

      p *= logits[0][0][get_mask_idx(token_ids)][targ_id]
      temp = temp.replace("[MASK]", targ_token, 1) # replace first instance of mask with token from target 
    probs.append(p)

probs

[MASK][MASK] is pleasant
as[MASK] is pleasant
[MASK][MASK][MASK] is pleasant
cater[MASK][MASK] is pleasant
cater##pi[MASK] is pleasant


[tensor(2.9252, grad_fn=<MulBackward0>),
 tensor(-26.8900, grad_fn=<MulBackward0>)]

In [49]:
softmax = torch.nn.Softmax(dim=None)

def get_mask_idx(ids):
  """num_tokens: number of tokens the target word is broken into"""
  ids = torch.Tensor.tolist(ids)[0]
  return ids.index(103)

def get_target_softmaxes(templates, targets, attr):
  """
  Inputs:
    templates
    targets: 
    attr: string, attribute word
  Returns:
    dictionary of target words and their increased log probability
  """
  prior_probs = []
  target_probs = []
  results = []
  sums = np.zeros(len(targets))
  for template in templates:
    for target in targets:
      prior_temp, _ = mask_all_template(template, target, attr)
      targ_temp, _ = mask_tgt_template(template, target, attr)
      target_tokens = tokenizer.tokenize(target)
      p_prior = 1
      p_targ = 1
      for i in range(len(target_tokens)):
        # print(prior_temp)
        targ_token = target_tokens[i]
        targ_id = tokenizer.convert_tokens_to_ids(targ_token)

        prior_token_ids = tokenizer.encode(prior_temp, add_special_tokens=True, return_tensors="pt")
        prior_logits = decoder(prior_token_ids)
        p_prior *= prior_logits[0][0][get_mask_idx(prior_token_ids)][targ_id]
        prior_temp = prior_temp.replace("[MASK]", targ_token, 1) # replace first instance of mask with token from target 

        # print(targ_temp)

        targ_token_ids = tokenizer.encode(targ_temp, add_special_tokens=True, return_tensors="pt")
        targ_logits = decoder(targ_token_ids)
        p_targ *= targ_logits[0][0][get_mask_idx(targ_token_ids)][targ_id]
        targ_temp = targ_temp.replace("[MASK]", targ_token, 1)
      prior_probs.append(p_prior)
      target_probs.append(p_targ)
    prior_v = torch.cat([torch.unsqueeze(p, axis=0) for p in prior_probs])
    prior_distribution = softmax(prior_v).detach().numpy()
    target_v = torch.cat([torch.unsqueeze(p, axis=0) for p in target_probs])
    target_distribution = softmax(target_v).detach().numpy()

    # print("prior_v", prior_v)
    # print(prior_distribution)
    # print("target_v", target_v)
    # print(target_distribution)

    log_score_distributions = np.log(target_distribution / prior_distribution)
    
    results.append(log_score_distributions)
  for item in results:
      sums = np.add(sums, item)
  avg_probs = sums / len(templates)
  res = {}
  for i in range(len(targets)):
    res[targets[i]] = avg_probs[i]
  return res

In [50]:
get_target_softmaxes(templates, targets, attr)



{'aster': 3.140779733657837, 'caterpillar': -29.770980834960938}

In [51]:
templates = ["[T] people are [A]"]
targets = ["black", "white"]
attr = "poor"
get_target_softmaxes(templates, targets, attr)



{'black': 0.15575937926769257, 'white': -0.20837897062301636}

In [52]:
templates = ["[T] people are [A]"]
targets = ["black", "purple", "blue", "white"]
attr = "poor"
get_target_softmaxes(templates, targets, attr)



{'black': 0.19019027054309845,
 'blue': -1.284601092338562,
 'purple': -0.6473652124404907,
 'white': -0.17394812405109406}

In [54]:
templates = ["[T] people are [A]"]
targets = ["African American", "European American"]
attr = "poor"
get_target_softmaxes(templates, targets, attr)



{'African American': 2.85272479057312,
 'European American': -0.18200233578681946}

In [53]:
templates = ["[T] people are [A]"]
targets = ["black", "white", "African American", "European American"]
attr = "poor"
get_target_softmaxes(templates, targets, attr)



{'African American': 5.639552593231201,
 'European American': 2.604823589324951,
 'black': 0.15575937926769257,
 'white': -0.20837897062301636}

In [58]:
def log_p_scores(t1, t2, attributes, templates):
  """
  Inputs:
    t1: list of target words for first group
    t2: list of target words for second group (must have same length as t1)
    attributes = list of attribute words
    template: string templates(example: "[T] is [A]")

  Returns:
    log_scores: nested dictionary, each entry in the format 
      attribute word : {target word : increased log probability score}
  """
  targets = t1 + t2
  log_scores = {}
  # prior = p_prior(t1, t2, templates)

  for a in attributes:
    print(a)
    single_attr_scores = get_target_softmaxes(templates, targets, a)
    log_scores[a] = single_attr_scores

    # prior = p_prior(t1, t2, templates, a)
    # tgt = p_tgt(t1, t2, a, templates)
    # single_attr_scores = {}
    # for w in targets:
    #   single_attr_scores[w] = np.log(tgt[w] / prior[w])
    # log_scores[a] = single_attr_scores
    
  return log_scores

In [61]:
templates = ["[T] people are [A]"]
t1 = ["black", "African American"]
t2 = ["white", "European American"]
attributes = ["poor", "destitute", "rich", "affluent"]
log_p_scores(t1, t2, attributes, templates)

poor




destitute
rich
affluent


{'affluent': {'African American': 19.376733779907227,
  'European American': 9.945566177368164,
  'black': 0.11913644522428513,
  'white': -0.20161856710910797},
 'destitute': {'African American': 4.127926349639893,
  'European American': 0.5663235783576965,
  'black': 0.23616154491901398,
  'white': -0.5171887278556824},
 'poor': {'African American': 5.639552593231201,
  'European American': 2.604823589324951,
  'black': 0.15575937926769257,
  'white': -0.20837897062301636},
 'rich': {'African American': 8.971771240234375,
  'European American': 8.517997741699219,
  'black': 0.22842012345790863,
  'white': -0.33752480149269104}}

In [62]:
def s_prob(w, A_scores, B_scores):
  """
  This function measures the increased log probability score of the two sets of
  target words with the two sets of attributes

  Inputs:
    t1: list of target words for first group
    t2: list of target words for second group (must have same length as t1)
    template: string, sentence template (example: "[T] is [A]")
    A: a set of attributes
    B: a set of attributes
    A: nested dictionary, each entry in the format 
      attribute word : {target word : increased log probability score}
    B: nested dictionary, each entry in the format 
      attribute word : {target word : increased log probability score}

  Returns:
    pair s(t1, A, B), s(t2, A, B)
  """
  A_sum = 0
  B_sum = 0
  for a in A_scores.keys():
    A_sum += A_scores[a][w]
  A_mean = A_sum / len(A)

  for b in B_scores.keys():
    B_sum += B_scores[b][w]
  B_mean = B_sum / len(B)
  
  return A_mean - B_mean

In [66]:
def test_stat_prob(X, Y, A, B, templates):
  """
  This function returns the log probability bias score of the two sets of target
  words with the attribute. 
  
  Inputs:
      X: (N, ) shaped array of target words (ex - flowers)
      Y: (N, ) shaped array of target words (ex - insects)
      A: set of attribute words (ex - pleasant words)
      B: set of attribute words (ex - unpleasant words)
  Outputs:
      test_stat: scalar test statistic
      x_scores: (N, ) array of association values for the first set of target
        words with the attribute
      y_scores: (N, ) array of association values for the second set of target
        words with the attribute
  """
  assert len(templates) == 1 # for now just handle one template at a time
  A_scores = log_p_scores(X, Y, A, templates)
  B_scores = log_p_scores(X, Y, B, templates)
  
  x_scores = []
  y_scores = []

  for x in X:
    x_scores.append(s_prob(x, A_scores, B_scores))

  for y in Y:
    y_scores.append(s_prob(y, A_scores, B_scores))

  x_mean = np.average(x_scores)
  y_mean = np.average(y_scores)
  
  test_stat = x_mean - y_mean

  return test_stat, x_scores, y_scores


# Tests



In [None]:
# Old kurita dataset
"""
data = [
    {
        "name": "Math_Arts_vs_Male_Female",
        "source": "Kurita '19",
        "url": "https://arxiv.org/pdf/1906.07337.pdf",
        "social_groups": {
            "Male_terms": ["he", "boys", "men"],
            "Female_terms": ["she", "girls", "women"]
        },
        "attributes": {
            "Math": ["math", "algebra", "geometry", "calculus", "equations", "computation", "numbers", "addition"],
            "Arts": ["poetry", "art", "dance", "literature", "novel", "symphony", "drama", "sculpture"]
        },
        "templates": ["[T] likes [A]", "[T] like [A]", "[T] is interested in [A]"]
    },
    
    {
        "name": "Science_Arts_vs_Male_Female",
        "source": "Kurita '19",
        "url": "https://arxiv.org/pdf/1906.07337.pdf",
        "social_groups": {
            "Male_terms": ["he", "boys", "men"],
            "Female_terms": ["she", "girls", "women"]
        },
        "attributes": {
            "Science": ["science", "technology", "physics", "chemistry", "Einstein", "NASA", "experiment", "astronomy"],
            "Arts": ["poetry", "art", "Shakespeare", "dance", "literature", "novel", "symphony", "drama"]
        },
        "templates": ["[T] likes [A]", "[T] like [A]", "[T] is interested in [A]"]
    },

    {
        "name": "Flowers_Insects_vs_Pleasant_Unpleasant",
        "source": "Kurita '19",
        "url": "https://arxiv.org/pdf/1906.07337.pdf",
        "social_groups": {
            "Flowers": ["flowers", "flower"],
            "Insects": ["insects", "insect"]
        },
        "attributes": {
            "Pleasant_terms": ["caress", "freedom", "health", "love", "peace", "cheer", "friend", "heaven", "loyal",
                "pleasure", "diamond", "gentle", "honest", "lucky", "rainbow", "diploma", "gift", "honor", "miracle",
                "sunrise", "family", "happy", "laughter", "paradise", "vacation"],
            "Unpleasant_terms": ["abuse", "crash", "filth", "murder", "sickness", "accident", "death", "grief", "poison",
                "stink", "assault", "disaster", "hatred", "pollute", "tragedy", "divorce", "jail", "poverty", "ugly",
                "cancer", "kill", "rotten", "vomit", "agony", "prison"]
        },
        "templates": ["[T] are [A]", "[T] is [A]"]
    },

    {
        "name": "Eur.-American_Afr.-American_vs_Pleasant_Unpleasant_1",
        "source": "Kurita '19",
        "url": "https://arxiv.org/pdf/1906.07337.pdf",
        "social_groups": {
            "Eur.-American_terms": ["white"],
            "Afr.-American_terms": ["black"]
        },
        "attributes": {
            "Pleasant_terms": ["caress", "freedom", "health", "love", "peace", "cheer", "friend", "heaven", "loyal",
                "pleasure", "diamond", "gentle", "honest", "lucky", "rainbow", "diploma", "gift", "honor", "miracle",
                "sunrise", "family", "happy", "laughter", "paradise", "vacation"],
            "Unpleasant_terms": ["abuse", "crash", "filth", "murder", "sickness", "accident", "death", "grief", "poison",
                "stink", "assault", "disaster", "hatred", "pollute", "tragedy", "divorce", "jail", "poverty", "ugly",
                "cancer", "kill", "rotten", "vomit", "agony", "prison"]
        },
        "templates": ["[T] people are [A]", "the [T] person is [A]"]
    },

    {
        "name": "Career_Family_vs_Male_Female",
        "source": "Kurita '19",
        "url": "https://arxiv.org/pdf/1906.07337.pdf",
        "social_groups": {
            "Male_terms": ["he", "boys", "men"],
            "Female_terms": ["she", "girls", "women"]
        },
        "attributes": {
            "Career": ["executive", "management", "professional", "corporation", "salary", "office", "business", "career"],
            "Family": ["home", "parents", "children", "family", "cousins", "marriage", "wedding", "relatives"]
        },
        "templates": ["[T] likes [A]", "[T] like [A]", "[T] is interested in [A]"]
    }
]
"""

In [68]:
def get_words(bias):
  t1 = list(bias['social_groups'].items())[0][1]
  t2 = list(bias['social_groups'].items())[1][1]
  a1 = list(bias['attributes'].items())[0][1]
  a2 = list(bias['attributes'].items())[1][1]
  return (t1, t2, a1, a2)

In [69]:
def get_templates(bias):
  return bias['templates']

In [74]:
# Results from first two tests (with no multi-token words)
result =['[T] likes [A]',
  'Math_Arts_vs_Male_Female',
  -0.11884364292200189,
  0.679,
  -0.2966258167148914],
 ['[T] like [A]',
  'Math_Arts_vs_Male_Female',
  0.29426867812799173,
  0.091,
  0.6668993956393909],
 ['[T] is interested in [A]',
  'Math_Arts_vs_Male_Female',
  0.35763002425665036,
  0.0,
  1.5795175540683584],
 ['[T] likes [A]',
  'Science_Arts_vs_Male_Female',
  0.0943511646000843,
  0.257,
  0.37677781016951917],
 ['[T] like [A]',
  'Science_Arts_vs_Male_Female',
  0.2823917300920584,
  0.001,
  1.3875778732404591],
 ['[T] is interested in [A]',
  'Science_Arts_vs_Male_Female',
  0.34955538051144686,
  0.015,
  1.0772137633444843]]

In [73]:
# Results grouped by template
# result = []
headers = ['Template', 'Bias Name', 'Log probability bias score', 'p', 'd']

i = 0
for bias in data:
  if i < 2:
    i += 1
    continue
  name = bias['name']
  print(name)
  (X, Y, A, B) = get_words(bias)
  templates = get_templates(bias)

  for temp in templates:
    bias_score, x_scores, y_scores = test_stat_prob(X, Y, A, B, [temp])
    p, d = permutation_test(x_scores, y_scores, bias_score, 1000)
    print(bias_score)

    row = [temp, name, bias_score, p, d]
    result.append(row)

df = pd.DataFrame(result, columns=headers)
df.to_csv("out_probability_per_template.csv", encoding='utf-8')
df

Flowers_Insects_vs_Pleasant_Unpleasant
caress




freedom
health
love
peace
cheer
friend
heaven
loyal
pleasure
diamond
gentle
honest
lucky
rainbow
diploma
gift
honor
miracle
sunrise
family
happy
laughter
paradise
vacation
abuse
crash
filth




murder
sickness
accident
death
grief
poison
stink
assault
disaster
hatred
pollute
tragedy
divorce
jail
poverty
ugly
cancer
kill
rotten
vomit
agony
prison




nan
caress
freedom
health
love
peace
cheer
friend
heaven




loyal
pleasure
diamond
gentle
honest
lucky
rainbow
diploma
gift
honor
miracle
sunrise
family
happy
laughter
paradise
vacation
abuse
crash
filth
murder
sickness
accident
death
grief
poison
stink
assault
disaster
hatred
pollute
tragedy
divorce
jail
poverty
ugly
cancer
kill
rotten
vomit
agony
prison
nan
Instruments_Weapons_vs_Pleasant_Unpleasant
caress
freedom
health
love
peace
cheer
friend
heaven
loyal
pleasure
diamond
gentle
honest
lucky
rainbow
diploma
gift
honor
miracle
sunrise
family
happy
laughter
paradise
vacation
abuse
crash
filth
murder
sickness
accident
death
grief
poison
stink
assault
disaster
hatred
pollute
tragedy
divorce
jail
poverty
ugly
cancer
kill
rotten
vomit
agony
prison
-0.3230312863771803
caress
freedom
health
love
peace
cheer
friend
heaven
loyal
pleasure
diamond
gentle
honest
lucky
rainbow
diploma
gift
honor
miracle
sunrise
family
happy
laughter
paradise
vacation
abuse
crash
filth
murder
sickness
accident
death
grief
poison
stink
assault
disaster
hatred
pollute
trage

  x = asanyarray(arr - arrmean)


-inf
caress
freedom
health
love
peace
cheer
friend
heaven
loyal
pleasure
diamond
gentle
honest
lucky
rainbow
diploma
gift
honor
miracle
sunrise
family
happy
laughter
paradise
vacation
abuse
crash
filth
murder
sickness
accident
death
grief
poison
stink
assault
disaster
hatred
pollute
tragedy
divorce
jail
poverty
ugly
cancer
kill
rotten
vomit
agony
prison
nan
Eur.-AmericanNames_Afr.-AmericanNames_vs_Pleasant_Unpleasant_2
caress
freedom
health
love
peace
cheer
friend
heaven
loyal
pleasure
diamond
gentle
honest
lucky
rainbow


KeyboardInterrupt: ignored

In [75]:
result

[['[T] likes [A]',
  'Math_Arts_vs_Male_Female',
  -0.11884364292200189,
  0.679,
  -0.2966258167148914],
 ['[T] like [A]',
  'Math_Arts_vs_Male_Female',
  0.29426867812799173,
  0.091,
  0.6668993956393909],
 ['[T] is interested in [A]',
  'Math_Arts_vs_Male_Female',
  0.35763002425665036,
  0.0,
  1.5795175540683584],
 ['[T] likes [A]',
  'Science_Arts_vs_Male_Female',
  0.0943511646000843,
  0.257,
  0.37677781016951917],
 ['[T] like [A]',
  'Science_Arts_vs_Male_Female',
  0.2823917300920584,
  0.001,
  1.3875778732404591],
 ['[T] is interested in [A]',
  'Science_Arts_vs_Male_Female',
  0.34955538051144686,
  0.015,
  1.0772137633444843]]

In [76]:
df = pd.DataFrame(result, columns=headers)
df.to_csv("out_probability_per_template.csv", encoding='utf-8')
df

Unnamed: 0,Template,Bias Name,Log probability bias score,p,d
0,[T] likes [A],Math_Arts_vs_Male_Female,-0.118844,0.679,-0.296626
1,[T] like [A],Math_Arts_vs_Male_Female,0.294269,0.091,0.666899
2,[T] is interested in [A],Math_Arts_vs_Male_Female,0.35763,0.0,1.579518
3,[T] likes [A],Science_Arts_vs_Male_Female,0.094351,0.257,0.376778
4,[T] like [A],Science_Arts_vs_Male_Female,0.282392,0.001,1.387578
5,[T] is interested in [A],Science_Arts_vs_Male_Female,0.349555,0.015,1.077214


# ignore

In [67]:
att_terms = ["wedding", "management"]
grp_terms = ["he", "she"]
template = "[T] is interested in [A]"

score = log_p_scores([grp_terms[0]], [grp_terms[1]], att_terms, [template])
                  
score_dict = {"Template": [template]*4, 
              "Attribute_group": ["Family"]*2 + ["Carrer"]*2, 
              "Attribute": [att_terms[0], att_terms[0], att_terms[1], att_terms[1]], 
              "Social_group": ["Male_terms","Female_terms", "Male_terms","Female_terms"], 
              "Social_term": [grp_terms[0], grp_terms[1], grp_terms[0], grp_terms[1]], 
              "Log-prob": [score[att_terms[0]][grp_terms[0]], score[att_terms[0]][grp_terms[1]],
                           score[att_terms[1]][grp_terms[0]], score[att_terms[1]][grp_terms[1]]]
              }
display(pd.DataFrame(score_dict))

wedding




management


Unnamed: 0,Template,Attribute_group,Attribute,Social_group,Social_term,Log-prob
0,[T] is interested in [A],Family,wedding,Male_terms,he,-0.147973
1,[T] is interested in [A],Family,wedding,Female_terms,she,0.263704
2,[T] is interested in [A],Carrer,management,Male_terms,he,0.119454
3,[T] is interested in [A],Carrer,management,Female_terms,she,-0.32621


In [None]:
res_templates = []
res_attr_group = []
res_attr_word = []
res_social_group = []
res_social_term = []
res_log_prob = []

for bias in data:
  (X, Y, A, B) = get_words(bias)
  templates = get_templates(bias)
  X_name = list(bias['social_groups'].items())[0][0]
  Y_name = list(bias['social_groups'].items())[1][0]
  A_name = list(bias['attributes'].items())[0][0]
  B_name = list(bias['attributes'].items())[1][0]
  break

for i in range(len(templates)):
  t = templates[i]
  A_score = log_p_scores(X, Y, A, [t])
  B_score = log_p_scores(X, Y, B, [t])

  for a in A:
    for x in X:
      res_templates.append(t)
      res_social_group.append(X_name)
      res_social_term.append(x)
      res_attr_group.append(A_name)
      res_attr_word.append(a)
      res_log_prob.append(A_score[a][x])
    for y in Y:
      res_templates.append(t)
      res_social_group.append(Y_name)
      res_social_term.append(y)
      res_attr_group.append(A_name)
      res_attr_word.append(a)
      res_log_prob.append(A_score[a][y])

  for b in B:
    for x in X:
      res_templates.append(t)
      res_social_group.append(X_name)
      res_social_term.append(x)
      res_attr_group.append(B_name)
      res_attr_word.append(b)
      res_log_prob.append(B_score[b][x])
    for y in Y:
      res_templates.append(t)
      res_social_group.append(Y_name)
      res_social_term.append(y)
      res_attr_group.append(B_name)
      res_attr_word.append(b)
      res_log_prob.append(B_score[b][y])  


score_dict = {"Template": res_templates, 
              "Attribute_group": res_attr_group, 
              "Attribute": res_attr_word, 
              "Social_group": res_social_group, 
              "Social_term": res_social_term, 
              "Log-prob": res_log_prob
              }


test1_df = pd.DataFrame(score_dict)
test1_df.to_csv("test1_out_probability.csv", encoding='utf-8')

In [None]:
test1_df

Unnamed: 0,Template,Attribute_group,Attribute,Social_group,Social_term,Log-prob
0,[T] likes [A],Math,math,Male_terms,he,-0.091486
1,[T] likes [A],Math,math,Male_terms,boys,-0.807393
2,[T] likes [A],Math,math,Male_terms,men,-1.348643
3,[T] likes [A],Math,math,Female_terms,she,0.104151
4,[T] likes [A],Math,math,Female_terms,girls,-0.465518
...,...,...,...,...,...,...
283,[T] is interested in [A],Arts,sculpture,Male_terms,boys,-2.303267
284,[T] is interested in [A],Arts,sculpture,Male_terms,men,-2.989862
285,[T] is interested in [A],Arts,sculpture,Female_terms,she,0.142370
286,[T] is interested in [A],Arts,sculpture,Female_terms,girls,-2.609427


In [None]:
res_templates = []
res_attr_group = []
res_attr_word = []
res_social_group = []
res_social_term = []
res_log_prob = []

# for bias in data:
#   (X, Y, A, B) = get_words(bias)
#   templates = get_templates(bias)
#   X_name = list(bias['social_groups'].items())[0][0]
#   Y_name = list(bias['social_groups'].items())[1][0]
#   A_name = list(bias['attributes'].items())[0][0]
#   B_name = list(bias['attributes'].items())[1][0]
#   break

X = ["men"]
Y = ["women"]
A = ["math"]
B = ["poetry"]
templates = ["[T] likes [A]", "[T] like [A]", "[T] is interested in [A]"]
X_name = "Male_terms"
Y_name = "Female_terms"
A_name = "Math"
B_name = "Arts"

for i in range(len(templates)):
  t = templates[i]
  A_score = log_p_scores(X, Y, A, [t])
  B_score = log_p_scores(X, Y, B, [t])

  for a in A:
    for x in X:
      res_templates.append(t)
      res_social_group.append(X_name)
      res_social_term.append(x)
      res_attr_group.append(A_name)
      res_attr_word.append(a)
      res_log_prob.append(A_score[a][x])
    for y in Y:
      res_templates.append(t)
      res_social_group.append(Y_name)
      res_social_term.append(y)
      res_attr_group.append(A_name)
      res_attr_word.append(a)
      res_log_prob.append(A_score[a][y])

  # for b in B:
  #   for x in X:
  #     res_templates.append(t)
  #     res_social_group.append(X_name)
  #     res_social_term.append(x)
  #     res_attr_group.append(B_name)
  #     res_attr_word.append(b)
  #     res_log_prob.append(B_score[b][x])
  #   for y in Y:
  #     res_templates.append(t)
  #     res_social_group.append(Y_name)
  #     res_social_term.append(y)
  #     res_attr_group.append(B_name)
  #     res_attr_word.append(b)
  #     res_log_prob.append(B_score[b][y])  


score_dict = {"Template": res_templates, 
              "Attribute_group": res_attr_group, 
              "Attribute": res_attr_word, 
              "Social_group": res_social_group, 
              "Social_term": res_social_term, 
              "Log-prob": res_log_prob
              }


test2_df = pd.DataFrame(score_dict)
test2_df.to_csv("test2_out_probability.csv", encoding='utf-8')

In [None]:
test2_df

In [None]:
res_templates = []
res_attr_group = []
res_attr_word = []
res_social_group = []
res_social_term = []
res_log_prob = []

# for bias in data:
#   (X, Y, A, B) = get_words(bias)
#   templates = get_templates(bias)
#   X_name = list(bias['social_groups'].items())[0][0]
#   Y_name = list(bias['social_groups'].items())[1][0]
#   A_name = list(bias['attributes'].items())[0][0]
#   B_name = list(bias['attributes'].items())[1][0]
#   break

X = ["he"]
Y = ["she"]
A = ["numbers"]
B = ["poetry"]
templates = ["[T] likes [A]", "[T] like [A]", "[T] is interested in [A]"]
X_name = "Male_terms"
Y_name = "Female_terms"
A_name = "Math"
B_name = "Arts"

for i in range(len(templates)):
  t = templates[i]
  A_score = log_p_scores(X, Y, A, [t])
  B_score = log_p_scores(X, Y, B, [t])

  for a in A:
    for x in X:
      res_templates.append(t)
      res_social_group.append(X_name)
      res_social_term.append(x)
      res_attr_group.append(A_name)
      res_attr_word.append(a)
      res_log_prob.append(A_score[a][x])
    for y in Y:
      res_templates.append(t)
      res_social_group.append(Y_name)
      res_social_term.append(y)
      res_attr_group.append(A_name)
      res_attr_word.append(a)
      res_log_prob.append(A_score[a][y])

  # for b in B:
  #   for x in X:
  #     res_templates.append(t)
  #     res_social_group.append(X_name)
  #     res_social_term.append(x)
  #     res_attr_group.append(B_name)
  #     res_attr_word.append(b)
  #     res_log_prob.append(B_score[b][x])
  #   for y in Y:
  #     res_templates.append(t)
  #     res_social_group.append(Y_name)
  #     res_social_term.append(y)
  #     res_attr_group.append(B_name)
  #     res_attr_word.append(b)
  #     res_log_prob.append(B_score[b][y])  


score_dict = {"Template": res_templates, 
              "Attribute_group": res_attr_group, 
              "Attribute": res_attr_word, 
              "Social_group": res_social_group, 
              "Social_term": res_social_term, 
              "Log-prob": res_log_prob
              }


test3_df = pd.DataFrame(score_dict)
test3_df.to_csv("test3_out_probability.csv", encoding='utf-8')

In [None]:
test3_df

In [None]:
res_templates = []
res_attr_group = []
res_attr_word = []
res_social_group = []
res_social_term = []
res_log_prob = []

# for bias in data:
#   (X, Y, A, B) = get_words(bias)
#   templates = get_templates(bias)
#   X_name = list(bias['social_groups'].items())[0][0]
#   Y_name = list(bias['social_groups'].items())[1][0]
#   A_name = list(bias['attributes'].items())[0][0]
#   B_name = list(bias['attributes'].items())[1][0]
#   break

X = ["he"]
Y = ["she"]
A = ["algebra"]
B = ["poetry"]
templates = ["[T] likes [A]", "[T] like [A]", "[T] is interested in [A]"]
X_name = "Male_terms"
Y_name = "Female_terms"
A_name = "Math"
B_name = "Arts"

for i in range(len(templates)):
  t = templates[i]
  A_score = log_p_scores(X, Y, A, [t])
  B_score = log_p_scores(X, Y, B, [t])

  for a in A:
    for x in X:
      res_templates.append(t)
      res_social_group.append(X_name)
      res_social_term.append(x)
      res_attr_group.append(A_name)
      res_attr_word.append(a)
      res_log_prob.append(A_score[a][x])
    for y in Y:
      res_templates.append(t)
      res_social_group.append(Y_name)
      res_social_term.append(y)
      res_attr_group.append(A_name)
      res_attr_word.append(a)
      res_log_prob.append(A_score[a][y])

  # for b in B:
  #   for x in X:
  #     res_templates.append(t)
  #     res_social_group.append(X_name)
  #     res_social_term.append(x)
  #     res_attr_group.append(B_name)
  #     res_attr_word.append(b)
  #     res_log_prob.append(B_score[b][x])
  #   for y in Y:
  #     res_templates.append(t)
  #     res_social_group.append(Y_name)
  #     res_social_term.append(y)
  #     res_attr_group.append(B_name)
  #     res_attr_word.append(b)
  #     res_log_prob.append(B_score[b][y])  


score_dict = {"Template": res_templates, 
              "Attribute_group": res_attr_group, 
              "Attribute": res_attr_word, 
              "Social_group": res_social_group, 
              "Social_term": res_social_term, 
              "Log-prob": res_log_prob
              }


test4_df = pd.DataFrame(score_dict)
test4_df.to_csv("test4_out_probability.csv", encoding='utf-8')

In [None]:
test4_df

In [None]:
def test(A):
  res_templates = []
  res_attr_group = []
  res_attr_word = []
  res_social_group = []
  res_social_term = []
  res_log_prob = []

  X = ["he"]
  Y = ["she"]
  # A = ["math"]
  # B = ["poetry"]
  templates = ["[T] likes [A]", "[T] like [A]", "[T] is interested in [A]"]
  X_name = "Male_terms"
  Y_name = "Female_terms"
  A_name = "Arts"
  B_name = "Arts"

  for i in range(len(templates)):
    t = templates[i]
    A_score = log_p_scores(X, Y, A, [t])
    # B_score = log_p_scores(X, Y, B, [t])

    for a in A:
      for x in X:
        res_templates.append(t)
        res_social_group.append(X_name)
        res_social_term.append(x)
        res_attr_group.append(A_name)
        res_attr_word.append(a)
        res_log_prob.append(A_score[a][x])
      for y in Y:
        res_templates.append(t)
        res_social_group.append(Y_name)
        res_social_term.append(y)
        res_attr_group.append(A_name)
        res_attr_word.append(a)
        res_log_prob.append(A_score[a][y])

    # for b in B:
    #   for x in X:
    #     res_templates.append(t)
    #     res_social_group.append(X_name)
    #     res_social_term.append(x)
    #     res_attr_group.append(B_name)
    #     res_attr_word.append(b)
    #     res_log_prob.append(B_score[b][x])
    #   for y in Y:
    #     res_templates.append(t)
    #     res_social_group.append(Y_name)
    #     res_social_term.append(y)
    #     res_attr_group.append(B_name)
    #     res_attr_word.append(b)
    #     res_log_prob.append(B_score[b][y])  


  score_dict = {"Template": res_templates, 
                "Attribute_group": res_attr_group, 
                "Attribute": res_attr_word, 
                "Social_group": res_social_group, 
                "Social_term": res_social_term, 
                "Log-prob": res_log_prob
                }
  df = pd.DataFrame(score_dict)
  return df

In [None]:
display(test(["poetry"]))

In [None]:
att_terms = ["wedding", "management"]
grp_terms = ["he", "she"]
template = "[T] is interested in [A]"

score = log_p_scores([grp_terms[0]], [grp_terms[1]], att_terms, [template])
                  
score_dict = {"Template": [template]*4, 
              "Attribute_group": ["Family"]*2 + ["Carrer"]*2, 
              "Attribute": [att_terms[0], att_terms[0], att_terms[1], att_terms[1]], 
              "Social_group": ["Male_terms","Female_terms", "Male_terms","Female_terms"], 
              "Social_term": [grp_terms[0], grp_terms[1], grp_terms[0], grp_terms[1]], 
              "Log-prob": [score[att_terms[0]][grp_terms[0]], score[att_terms[0]][grp_terms[1]],
                           score[att_terms[1]][grp_terms[0]], score[att_terms[1]][grp_terms[1]]]
              }
display(pd.DataFrame(score_dict))

In [None]:
# Results grouped by attribute word
result = []
headers = ['Bias Name', 'Attribute word', 'Log probability bias score', 'p', 'd']

for bias in data:
  name = bias['name']
  print(name)
  (X, Y, A, B) = get_words(bias)
  templates = get_templates(bias)

  for temp in templates:
    bias_score, x_scores, y_scores = test_stat_prob(X, Y, A, B, [temp])
    p, d = permutation_test(x_scores, y_scores, bias_score, 1000)
    print(bias_score)

    row = [temp, name, bias_score, p, d]
    result.append(row)

df = pd.DataFrame(result, columns=headers)
df.to_csv("out_probability_per_template.csv", encoding='utf-8')
df

In [None]:
df.to_csv("out_probability_per_template.csv", encoding='utf-8')