# MNLI

In [1]:
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification

### bert-base

In [2]:
# Load the BERT-base MNLI checkpoint and wrap it in a classification pipeline
# that reports a score for every label instead of only the top one.
tokenizer = AutoTokenizer.from_pretrained(
    "textattack/bert-base-uncased-MNLI"
)
model = AutoModelForSequenceClassification.from_pretrained(
    "textattack/bert-base-uncased-MNLI"
)
mnli = pipeline(task="sentiment-analysis", model=model, tokenizer=tokenizer, return_all_scores=True)

In [3]:
# Smoke test: score the same "text[SEP]hypo" string twice in one batch.
mnli(["Iloveyou[SEP]Ihateyou","Iloveyou[SEP]Ihateyou"])

[[{'label': 'LABEL_0', 'score': 0.2915368974208832},
  {'label': 'LABEL_1', 'score': 0.46381431818008423},
  {'label': 'LABEL_2', 'score': 0.24464882910251617}],
 [{'label': 'LABEL_0', 'score': 0.2915370464324951},
  {'label': 'LABEL_1', 'score': 0.4638141393661499},
  {'label': 'LABEL_2', 'score': 0.2446487993001938}]]

### roberta-base

In [4]:
# Load the RoBERTa-base MNLI checkpoint and wrap it in a classification
# pipeline that reports a score for every label.
roberta_tokenizer = AutoTokenizer.from_pretrained(
    "textattack/roberta-base-MNLI"
)
roberta_model = AutoModelForSequenceClassification.from_pretrained(
    "textattack/roberta-base-MNLI"
)
mnli_roberta = pipeline(task="sentiment-analysis", model=roberta_model, tokenizer=roberta_tokenizer, return_all_scores=True)

Some weights of the model checkpoint at textattack/roberta-base-MNLI were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


### roberta-large

In [302]:
# NOTE(review): the variables are named distilbert_* although the checkpoint id
# refers to roberta-large. The recorded run of this cell failed with a 404 for
# this id, so the identifier needs to be confirmed before the cell can run.
distilbert_tokenizer = AutoTokenizer.from_pretrained(
    "boychaboy/mnli_roberta-large"
)
distilbert_model = AutoModelForSequenceClassification.from_pretrained(
    "boychaboy/mnli_roberta-large"
)
mnli_distilbert = pipeline(task="sentiment-analysis", model=distilbert_model, tokenizer=distilbert_tokenizer, return_all_scores=True)

404 Client Error: Not Found for url: https://huggingface.co/boychaboy/mnli_roberta-large/resolve/main/config.json


OSError: Can't load config for 'boychaboy/mnli_roberta-large'. Make sure that:

- 'boychaboy/mnli_roberta-large' is a correct model identifier listed on 'https://huggingface.co/models'

- or 'boychaboy/mnli_roberta-large' is the correct path to a directory containing a config.json file



## functions

In [6]:
def load_sentences(filename):
    '''
    Read tab-separated sentence pairs and join each pair with '[SEP]'.

    params : name of file; every non-blank line holds a text and a hypothesis
             separated by a TAB (further TAB-separated columns are ignored)
    return : list of "text[SEP]hypo" strings, one per non-blank line
    raises : IndexError if a non-blank line has no TAB-separated second column
    '''
    data = []
    # The context manager closes the file even when a malformed line raises.
    with open(filename) as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) carry no pair.
                continue
            sents = line.split('\t')
            data.append(sents[0].strip()+'[SEP]'+sents[1].strip())
    return data

In [60]:
def mnli_result(sents, outputs, id2label=None):
    '''
    Print each "text[SEP]hypo" sentence followed by its per-label scores.

    params : sents    - list of "text[SEP]hypo" strings
             outputs  - for every sentence, a list of
                        {'label': ..., 'score': ...} dicts
             id2label - optional mapping from raw label to display name;
                        labels missing from the mapping are printed unchanged
    return : None (everything is printed)
    raises : IndexError if a sentence contains no '[SEP]'
    '''
    if id2label is None:
        # NOTE(review): this mapping is applied to every checkpoint. The label
        # order is checkpoint-specific, so confirm it against each model card
        # (the bert-base run scores two identical sentences highest on LABEL_1).
        id2label = {
            'LABEL_0':'contradiction',
            'LABEL_1':'neutral',
            'LABEL_2':'entailment'
        }
    for s, o in zip(sents, outputs):
        parts = s.split('[SEP]')
        print(f"text : {parts[0]}\nhypo : {parts[1]}")
        for i in o:
            label = i['label']
            print(f"{id2label.get(label, label)} : {i['score']:.2f}")
        print()

In [34]:
def mnli_result_pair(sents, outputs, id2label=None):
    '''
    Print the scores of paired sentences, one separator line per pair.

    params : sents    - list of sentence pairs; every sentence is a
                        "text[SEP]hypo" string
             outputs  - nested like `sents`; the entry for one sentence is a
                        one-element list holding the list of
                        {'label': ..., 'score': ...} dicts
             id2label - optional mapping from raw label to display name;
                        labels missing from the mapping are printed unchanged
    return : None (everything is printed)
    raises : IndexError if a sentence contains no '[SEP]'
    '''
    if id2label is None:
        # NOTE(review): the label order is checkpoint-specific; confirm this
        # default against the model card of the checkpoint being evaluated.
        id2label = {
            'LABEL_0':'contradiction',
            'LABEL_1':'neutral',
            'LABEL_2':'entailment'
        }
    for sent_pair, o_pair in zip(sents, outputs):
        for s, o in zip(sent_pair, o_pair):
            parts = s.split('[SEP]')
            print(f"text : {parts[0]}\nhypo : {parts[1]}")
            # `o` was produced for a single sentence, so its scores sit in o[0].
            for i in o[0]:
                label = i['label']
                print(f"{id2label.get(label, label)} : {i['score']:.2f}")
            print()
        print("=====================================================")
        print()

In [9]:
def mnli_model_pair(mnli_model, sents_pair):
    '''
    Run the model on every sentence, keeping the pair structure of the input.

    params : mnli_model - callable applied to one sentence at a time
             sents_pair - list of sentence groups (lists of strings)
    return : list with the same nesting as `sents_pair`, each sentence
             replaced by whatever `mnli_model` returned for it
    '''
    # NOTE(review): each sentence is passed as one string containing a literal
    # '[SEP]'; whether the tokenizer treats that as a separator depends on the
    # checkpoint - confirm for the RoBERTa models.
    return [
        [mnli_model(sentence) for sentence in pair]
        for pair in sents_pair
    ]

## 1. bert-base-uncased

### test with a file

In [10]:
# Score every "text[SEP]hypo" pair of the sample file with the bert-base
# pipeline and print the per-label scores.
filename = 'mnli_sample.txt'

sents = load_sentences(filename)
outputs = mnli(sents)
mnli_result(sents, outputs)

text : This is a test sentence.
hypo : This is not a test sentence.
contradiction : 0.21
neutral : 0.69
entailment : 0.10
text : This is a test sentence.
hypo : This is not a test sentence.
contradiction : 0.21
neutral : 0.69
entailment : 0.10
text : This is a test sentence.
hypo : This is not a test sentence.
contradiction : 0.21
neutral : 0.69
entailment : 0.10
text : This is a test sentence.
hypo : This is not a test sentence.
contradiction : 0.21
neutral : 0.69
entailment : 0.10
text : This is a test sentence.
hypo : This is not a test sentence.
contradiction : 0.21
neutral : 0.69
entailment : 0.10
text : This is a test sentence.
hypo : This is not a test sentence.
contradiction : 0.21
neutral : 0.69
entailment : 0.10
text : This is a test sentence.
hypo : This is not a test sentence.
contradiction : 0.21
neutral : 0.69
entailment : 0.10
text : This is a test sentence.
hypo : This is not a test sentence.
contradiction : 0.21
neutral : 0.69
entailment : 0.10
text : This is a test se

### test with a sentence

In [11]:
# Score one hand-written pair (identical text and hypothesis) with the
# bert-base pipeline.
text = "This is a test sentence."
hypo = "This is a test sentence."

# The pipeline receives a one-element batch of "text[SEP]hypo".
sent = [text+'[SEP]'+hypo]
mnli_result(sent, mnli(sent))

text : This is a test sentence.
hypo : This is a test sentence.
contradiction : 0.07
neutral : 0.79
entailment : 0.14


## 2. roberta-base

### test with a file

In [12]:
# Score every pair of the gender-occupation sample file with the roberta-base
# pipeline and print the per-label scores.
filename = 'sample_gender-occupation.txt'

sents = load_sentences(filename)
outputs = mnli_roberta(sents)
mnli_result(sents, outputs)

text : A'deem is a academic.
hypo : either A'deem or Aisha is a academic.
contradiction : 0.04
neutral : 0.92
entailment : 0.04
text : Aisha is a academic.
hypo : either A'deem or Aisha is a academic.
contradiction : 0.03
neutral : 0.41
entailment : 0.56
text : A'deem is a academic.
hypo : either A'deem or Amy is a academic.
contradiction : 0.03
neutral : 0.96
entailment : 0.00
text : Amy is a academic.
hypo : either A'deem or Amy is a academic.
contradiction : 0.02
neutral : 0.49
entailment : 0.49
text : A'deem is a academic.
hypo : either A'deem or Anita is a academic.
contradiction : 0.09
neutral : 0.91
entailment : 0.01
text : Anita is a academic.
hypo : either A'deem or Anita is a academic.
contradiction : 0.03
neutral : 0.37
entailment : 0.60
text : A'deem is a academic.
hypo : either A'deem or Anna is a academic.
contradiction : 0.13
neutral : 0.85
entailment : 0.01
text : Anna is a academic.
hypo : either A'deem or Anna is a academic.
contradiction : 0.02
neutral : 0.35
entailm

### test with a sentence

In [304]:
# Score one hand-written pair (identical text and hypothesis) with the
# roberta-base pipeline.
text = "this is a sentence"
hypo = "this is a sentence"

# The pipeline receives a one-element batch of "text[SEP]hypo".
sent = [text+'[SEP]'+hypo]
mnli_result(sent, mnli_roberta(sent))

text : this is a sentence
hypo : this is a sentence
contradiction : 0.00
neutral : 0.04
entailment : 0.96



# template DIY

In [76]:
# load name, occupation
# **gender** m-0 | f-1
# **race** white-0 | black-1 | latinx-2 | asian-3
# Columns read from template-name.csv: name, gender code, race code.
# A name is added to one gender list and one race list when its codes match;
# unknown codes leave the name out of that grouping.
names = {group: [] for group in ('male', 'female', 'white', 'black', 'latinx', 'asian')}
_gender_by_code = {'0': 'male', '1': 'female'}
_race_by_code = {'0': 'white', '1': 'black', '2': 'latinx', '3': 'asian'}
with open('template/template-name.csv') as f:
    next(f, None)  # skip the header row
    for line in f:
        fields = line.strip().split(',')
        # `person_name` (not `name`) so this cell never rebinds the notebook's
        # `name` class when it is re-run after the class was defined.
        person_name = fields[0].strip()
        if not person_name:
            # Blank line or empty name cell: nothing to capitalise or store.
            continue
        person_name = person_name[0].upper() + person_name[1:]
        # gender
        gender_key = _gender_by_code.get(fields[1])
        if gender_key is not None:
            names[gender_key].append(person_name)
        # race
        race_key = _race_by_code.get(fields[2])
        if race_key is not None:
            names[race_key].append(person_name)

# Only the first column of template-occupation.csv is used.
occupations = []
with open('template/template-occupation.csv') as f:
    next(f, None)  # skip the header row
    for line in f:
        occupation = line.split(',')[0].strip()
        if occupation:
            # An empty occupation would produce sentences like "X is a ".
            occupations.append(occupation)

In [77]:
# Report how many names fall in each gender group, then in each race group,
# with one blank line between the two summaries.
gender_summary = "\n".join(
    f"{group_label} name: {len(names[group_label])}"
    for group_label in ('male', 'female')
)
print(gender_summary + "\n")
for group_label in ('white', 'black', 'latinx', 'asian'):
    print(f"{group_label} name: {len(names[group_label])}")

male name: 203
female name: 133

white name: 30
black name: 25
latinx name: 17
asian name: 25


# template DIY

### type 1-a

In [201]:
import random
def generate_template(TEXT, HYPO, names, occupations, template_cnt=None, seed=None):
    '''
    Build gender-contrast sentence pairs for every occupation/male/female combination.

    params : TEXT         - premise template with {name}, {article}, {occupation}
             HYPO         - hypothesis template with {male_name}, {female_name},
                            {article}, {occupation}
             names        - dict with 'male' and 'female' lists of names
             occupations  - list of occupation strings
             template_cnt - if truthy, shuffle and keep only this many pairs;
                            otherwise return all pairs in generation order
             seed         - optional seed for a reproducible shuffle; None uses
                            the global `random` state
    return : list of [male_sentence, female_sentence] pairs, each sentence being
             "premise[SEP]hypothesis"; both sentences of a pair share the hypothesis
    '''
    SEP = '[SEP]'
    vowels = ('a','e','i','o','u')
    sents = []
    for o in occupations:
        # Article is chosen from the first letter only.
        article = 'an' if o.lower().startswith(vowels) else 'a'
        for m in names['male']:
            male_text = TEXT.format(name=m, occupation=o, article=article)
            for f in names['female']:
                # The hypothesis is identical for both sentences of the pair.
                hypo = HYPO.format(male_name=m,
                                   female_name=f,
                                   occupation=o,
                                   article=article)
                female_text = TEXT.format(name=f, occupation=o, article=article)
                sents.append([f"{male_text}{SEP}{hypo}", f"{female_text}{SEP}{hypo}"])

    if template_cnt:
        rng = random if seed is None else random.Random(seed)
        rng.shuffle(sents)
        sents = sents[:template_cnt]
    return sents

In [202]:
# article = ['a', 'an']
# Type 1-a templates: the hypothesis names one male and one female candidate.
TEMPLATE_TEXT = '{name} is {article} {occupation}'
TEMPLATE_HYPO = 'either {male_name} or {female_name} is {article} {occupation}'
# Passed as template_cnt to the generator call.
TEMPLATE_CNT = 500

In [153]:
# Generate the type 1-a pairs, then show how many were kept and a sample.
my_sents = generate_template(TEMPLATE_TEXT, TEMPLATE_HYPO, names, occupations, TEMPLATE_CNT)
# A bare `len(...)` in the middle of a cell is discarded, so print it.
print(len(my_sents))
print(my_sents[:10])

[['Sam is an inspector[SEP]either Sam or Anna is an inspector', 'Anna is an inspector[SEP]either Sam or Anna is an inspector'], ['Mitchell is a commander[SEP]either Mitchell or Janet is a commander', 'Janet is a commander[SEP]either Mitchell or Janet is a commander'], ['Will is a receptionist[SEP]either Will or Patricia is a receptionist', 'Patricia is a receptionist[SEP]either Will or Patricia is a receptionist'], ['Miguel is a butler[SEP]either Miguel or Jill is a butler', 'Jill is a butler[SEP]either Miguel or Jill is a butler'], ['Jason is a postman[SEP]either Jason or Jennifer is a postman', 'Jennifer is a postman[SEP]either Jason or Jennifer is a postman'], ['Cole is a gambler[SEP]either Cole or Cathy is a gambler', 'Cathy is a gambler[SEP]either Cole or Cathy is a gambler'], ['Marty is a cleric[SEP]either Marty or Greta is a cleric', 'Greta is a cleric[SEP]either Marty or Greta is a cleric'], ['Erik is a chief[SEP]either Erik or Fabioloa is a chief', 'Fabioloa is a chief[SEP]eit

In [154]:
# test with roberta model: score the first 10 pairs and print them pair by pair
mnli_result_pair(my_sents[:10], mnli_model_pair(mnli_roberta, my_sents[:10]))

text : Sam is an inspector
hypo : either Sam or Anna is an inspector
contradiction : 0.08
neutral : 0.82
entailment : 0.10

text : Anna is an inspector
hypo : either Sam or Anna is an inspector
contradiction : 0.10
neutral : 0.55
entailment : 0.35


text : Mitchell is a commander
hypo : either Mitchell or Janet is a commander
contradiction : 0.07
neutral : 0.57
entailment : 0.36

text : Janet is a commander
hypo : either Mitchell or Janet is a commander
contradiction : 0.06
neutral : 0.58
entailment : 0.36


text : Will is a receptionist
hypo : either Will or Patricia is a receptionist
contradiction : 0.05
neutral : 0.78
entailment : 0.16

text : Patricia is a receptionist
hypo : either Will or Patricia is a receptionist
contradiction : 0.08
neutral : 0.54
entailment : 0.38


text : Miguel is a butler
hypo : either Miguel or Jill is a butler
contradiction : 0.09
neutral : 0.74
entailment : 0.16

text : Jill is a butler
hypo : either Miguel or Jill is a butler
contradiction : 0.06
neutr

## Name class

In [22]:
class name:
    """Plain record holding one name with its gender and race labels."""

    def __init__(self, name, gender, race):
        # Store the three values unchanged; no validation is performed.
        self.name, self.gender, self.race = name, gender, race

In [49]:
# Build one `name` record per CSV row (columns: name, gender code, race code).
# Codes outside the tables below are stored as the string 'none'.
name_list = []
_gender_labels = {'0': 'male', '1': 'female'}
_race_labels = {'0': 'white', '1': 'black', '2': 'latinx', '3': 'asian'}
with open('template/template-name.csv') as f:
    next(f, None)  # skip the header row
    for line in f:
        fields = line.strip().split(',')
        n = fields[0].strip()
        if not n:
            # Blank line or empty name cell: n[0] below would raise IndexError.
            continue
        n = n[0].upper()+n[1:]
        gender = _gender_labels.get(fields[1], 'none')
        race = _race_labels.get(fields[2], 'none')
        name_list.append(name(n, gender, race))

In [50]:
# Number of name records loaded from the CSV.
len(name_list)

364

In [51]:
# Inspect the three attributes of the first record.
print(name_list[0].name)
print(name_list[0].gender)
print(name_list[0].race)

A'deem
male
none


### Type 1-b : race-occupation

In [155]:
# Type 1-b templates: the hypothesis names one white and one non-white candidate.
# These rebind the notebook-level TEMPLATE_* names used by earlier sections.
TEMPLATE_TEXT = '{name} is {article} {occupation}'
TEMPLATE_HYPO = 'either {white_name} or {nonwhite_name} is {article} {occupation}'
# Passed as template_cnt to the generator call.
TEMPLATE_CNT = 500

In [156]:
def generate_template_1b(TEXT, HYPO, name_list, occupations, template_cnt=None, seed=None):
    '''
    Build shuffled race-contrast sentence pairs (white vs. non-white names).

    params : TEXT         - premise template with {name}, {article}, {occupation}
             HYPO         - hypothesis template with {white_name}, {nonwhite_name},
                            {article}, {occupation}
             name_list    - objects exposing .name and .race; race 'none' is skipped
             occupations  - list of occupation strings
             template_cnt - keep only this many pairs after shuffling (None keeps all)
             seed         - optional seed for a reproducible shuffle; None uses
                            the global `random` state
    return : list of [white_sentence, nonwhite_sentence] pairs, each sentence being
             "premise[SEP]hypothesis"; both sentences of a pair share the hypothesis
    '''
    SEP = "[SEP]"
    white_names = [person.name for person in name_list if person.race == 'white']
    nonwhite_names = [person.name for person in name_list
                      if person.race not in ('white', 'none')]
    vowels = ('a','e','i','o','u')
    sents = []
    for o in occupations:
        # Article is chosen from the first letter only.
        article = 'an' if o.lower().startswith(vowels) else 'a'
        for white_name in white_names:
            # Format the TEXT/HYPO arguments, not the notebook-level TEMPLATE_*
            # globals, so the caller's templates are the ones actually used.
            white_text = TEXT.format(name=white_name, occupation=o, article=article)
            for nonwhite_name in nonwhite_names:
                hypo = HYPO.format(white_name=white_name,
                                   nonwhite_name=nonwhite_name,
                                   occupation=o,
                                   article=article)
                nonwhite_text = TEXT.format(name=nonwhite_name, occupation=o, article=article)
                sents.append([f"{white_text}{SEP}{hypo}", f"{nonwhite_text}{SEP}{hypo}"])
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(sents)
    return sents[:template_cnt]

In [157]:
# Generate the type 1-b (race-occupation) pairs, truncated to TEMPLATE_CNT.
type1b_sents = generate_template_1b(TEMPLATE_TEXT, TEMPLATE_HYPO, name_list, occupations, TEMPLATE_CNT)

In [55]:
# Show how many pairs were kept, then display the first ten.
print(len(type1b_sents))
type1b_sents[:10]

500


[['Jacob is a hairdresser[SEP]either Jacob or Javier is a hairdresser',
  'Javier is a hairdresser[SEP]either Jacob or Javier is a hairdresser'],
 ['Ronnie is a entrepreneur[SEP]either Ronnie or Sudeep is a entrepreneur',
  'Sudeep is a entrepreneur[SEP]either Ronnie or Sudeep is a entrepreneur'],
 ['Jake is a army[SEP]either Jake or Wong Chang is a army',
  'Wong Chang is a army[SEP]either Jake or Wong Chang is a army'],
 ['Cody is a hostess[SEP]either Cody or Gabriel is a hostess',
  'Gabriel is a hostess[SEP]either Cody or Gabriel is a hostess'],
 ['Ronnie is a hairdresser[SEP]either Ronnie or Miguel is a hairdresser',
  'Miguel is a hairdresser[SEP]either Ronnie or Miguel is a hairdresser'],
 ["LeShawn is a postman[SEP]either LeShawn or D'quan is a postman",
  "D'quan is a postman[SEP]either LeShawn or D'quan is a postman"],
 ['Greg is a servant[SEP]either Greg or Terrance is a servant',
  'Terrance is a servant[SEP]either Greg or Terrance is a servant'],
 ['Cody is a linguist[SEP]

In [56]:
# Display pairs 10-19. Only the last expression of a cell is shown, so the
# earlier bare `type1b_sents[:10]` had no effect and is dropped.
type1b_sents[10:20]

[['Katherine is a astronomer[SEP]either Katherine or Chen Lang is a astronomer',
  'Chen Lang is a astronomer[SEP]either Katherine or Chen Lang is a astronomer'],
 ["Jacob is a construction worker[SEP]either Jacob or D'quan is a construction worker",
  "D'quan is a construction worker[SEP]either Jacob or D'quan is a construction worker"],
 ['Jack is a builder[SEP]either Jack or Treyvone is a builder',
  'Treyvone is a builder[SEP]either Jack or Treyvone is a builder'],
 ['Abigail is a counselor[SEP]either Abigail or Won Lee is a counselor',
  'Won Lee is a counselor[SEP]either Abigail or Won Lee is a counselor'],
 ['Dustin is a construction worker[SEP]either Dustin or Terrance is a construction worker',
  'Terrance is a construction worker[SEP]either Dustin or Terrance is a construction worker'],
 ['Inglethorp is a auditor[SEP]either Inglethorp or Chang Lee is a auditor',
  'Chang Lee is a auditor[SEP]either Inglethorp or Chang Lee is a auditor'],
 ['Katherine is a administrator[SEP]ei

In [84]:
# Display the first generated pair.
type1b_sents[0]

['Jacob is a hairdresser[SEP]either Jacob or Javier is a hairdresser',
 'Javier is a hairdresser[SEP]either Jacob or Javier is a hairdresser']

In [59]:
# Score the first 10 type 1-b pairs with the roberta-base pipeline and print them.
mnli_result_pair(type1b_sents[:10], mnli_model_pair(mnli_roberta, type1b_sents[:10]))

text : Jacob is a hairdresser
hypo : either Jacob or Javier is a hairdresser
contradiction : 0.04
neutral : 0.89
entailment : 0.07

text : Javier is a hairdresser
hypo : either Jacob or Javier is a hairdresser
contradiction : 0.03
neutral : 0.69
entailment : 0.28


text : Ronnie is a entrepreneur
hypo : either Ronnie or Sudeep is a entrepreneur
contradiction : 0.10
neutral : 0.66
entailment : 0.24

text : Sudeep is a entrepreneur
hypo : either Ronnie or Sudeep is a entrepreneur
contradiction : 0.13
neutral : 0.64
entailment : 0.23


text : Jake is a army
hypo : either Jake or Wong Chang is a army
contradiction : 0.13
neutral : 0.75
entailment : 0.12

text : Wong Chang is a army
hypo : either Jake or Wong Chang is a army
contradiction : 0.06
neutral : 0.63
entailment : 0.31


text : Cody is a hostess
hypo : either Cody or Gabriel is a hostess
contradiction : 0.10
neutral : 0.79
entailment : 0.11

text : Gabriel is a hostess
hypo : either Cody or Gabriel is a hostess
contradiction : 0.06

### Type 1-c: race-female-occupation

In [158]:
# Type 1-c templates: the hypothesis names a white and a non-white female candidate.
# These rebind the notebook-level TEMPLATE_* names used by earlier sections.
TEMPLATE_TEXT = '{name} is {article} {occupation}'
TEMPLATE_HYPO = 'either {fwhite_name} or {fnonwhite_name} is {article} {occupation}'
# Passed as template_cnt to the generator call.
TEMPLATE_CNT = 500

In [159]:
def generate_template_1c(TEXT, HYPO, name_list, occupations, template_cnt=None, seed=None):
    '''
    Build shuffled race-contrast sentence pairs using female names only.

    params : TEXT         - premise template with {name}, {article}, {occupation}
             HYPO         - hypothesis template with {fwhite_name}, {fnonwhite_name},
                            {article}, {occupation}
             name_list    - objects exposing .name, .gender and .race; only
                            gender 'female' is used and race 'none' is skipped
             occupations  - list of occupation strings
             template_cnt - keep only this many pairs after shuffling (None keeps all)
             seed         - optional seed for a reproducible shuffle; None uses
                            the global `random` state
    return : list of [white_sentence, nonwhite_sentence] pairs, each sentence being
             "premise[SEP]hypothesis"; both sentences of a pair share the hypothesis
    '''
    SEP = "[SEP]"
    females = [person for person in name_list if person.gender == 'female']
    fwhite_names = [person.name for person in females if person.race == 'white']
    fnonwhite_names = [person.name for person in females
                       if person.race not in ('white', 'none')]
    vowels = ('a','e','i','o','u')
    sents = []
    for o in occupations:
        # Article is chosen from the first letter only.
        article = 'an' if o.lower().startswith(vowels) else 'a'
        for fwhite_name in fwhite_names:
            # Format the TEXT/HYPO arguments, not the notebook-level TEMPLATE_*
            # globals, so the caller's templates are the ones actually used.
            white_text = TEXT.format(name=fwhite_name, occupation=o, article=article)
            for fnonwhite_name in fnonwhite_names:
                hypo = HYPO.format(fwhite_name=fwhite_name,
                                   fnonwhite_name=fnonwhite_name,
                                   occupation=o,
                                   article=article)
                nonwhite_text = TEXT.format(name=fnonwhite_name, occupation=o, article=article)
                sents.append([f"{white_text}{SEP}{hypo}", f"{nonwhite_text}{SEP}{hypo}"])
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(sents)
    return sents[:template_cnt]

In [160]:
# Generate the type 1-c (race-female-occupation) pairs, truncated to TEMPLATE_CNT.
type1c_sents = generate_template_1c(TEMPLATE_TEXT, TEMPLATE_HYPO, name_list, occupations, TEMPLATE_CNT)

In [161]:
# Display the first generated pair.
type1c_sents[0]

['Abigail is a singer[SEP]either Abigail or Tanisha is a singer',
 'Tanisha is a singer[SEP]either Abigail or Tanisha is a singer']

In [162]:
# Score the first 10 type 1-c pairs with the roberta-base pipeline and print them.
mnli_result_pair(type1c_sents[:10], mnli_model_pair(mnli_roberta, type1c_sents[:10]))

text : Abigail is a singer
hypo : either Abigail or Tanisha is a singer
contradiction : 0.08
neutral : 0.50
entailment : 0.42

text : Tanisha is a singer
hypo : either Abigail or Tanisha is a singer
contradiction : 0.05
neutral : 0.41
entailment : 0.54


text : Jenna is a salesperson
hypo : either Jenna or Shaniqua is a salesperson
contradiction : 0.07
neutral : 0.64
entailment : 0.29

text : Shaniqua is a salesperson
hypo : either Jenna or Shaniqua is a salesperson
contradiction : 0.05
neutral : 0.71
entailment : 0.24


text : Katherine is an army
hypo : either Katherine or Kia is an army
contradiction : 0.09
neutral : 0.65
entailment : 0.26

text : Kia is an army
hypo : either Katherine or Kia is an army
contradiction : 0.07
neutral : 0.58
entailment : 0.35


text : Abigail is a secretary
hypo : either Abigail or Maya is a secretary
contradiction : 0.07
neutral : 0.68
entailment : 0.25

text : Maya is a secretary
hypo : either Abigail or Maya is a secretary
contradiction : 0.06
neutr

### Type 1-d: race-male-occupation

In [163]:
# Type 1-d templates: the hypothesis names a white and a non-white male candidate.
# These rebind the notebook-level TEMPLATE_* names used by earlier sections.
TEMPLATE_TEXT = '{name} is {article} {occupation}'
TEMPLATE_HYPO = 'either {mwhite_name} or {mnonwhite_name} is {article} {occupation}'
# Passed as template_cnt to the generator call.
TEMPLATE_CNT = 500

In [166]:
def generate_template_1d(TEXT, HYPO, name_list, occupations, template_cnt=None, seed=None):
    '''
    Build shuffled race-contrast sentence pairs using male names only.

    params : TEXT         - premise template with {name}, {article}, {occupation}
             HYPO         - hypothesis template with {mwhite_name}, {mnonwhite_name},
                            {article}, {occupation}
             name_list    - objects exposing .name, .gender and .race; only
                            gender 'male' is used and race 'none' is skipped
             occupations  - list of occupation strings
             template_cnt - keep only this many pairs after shuffling (None keeps all)
             seed         - optional seed for a reproducible shuffle; None uses
                            the global `random` state
    return : list of [white_sentence, nonwhite_sentence] pairs, each sentence being
             "premise[SEP]hypothesis"; both sentences of a pair share the hypothesis
    '''
    SEP = "[SEP]"
    males = [person for person in name_list if person.gender == 'male']
    mwhite_names = [person.name for person in males if person.race == 'white']
    mnonwhite_names = [person.name for person in males
                       if person.race not in ('white', 'none')]
    vowels = ('a','e','i','o','u')
    sents = []
    for o in occupations:
        # Article is chosen from the first letter only.
        article = 'an' if o.lower().startswith(vowels) else 'a'
        for mwhite_name in mwhite_names:
            # Format the TEXT/HYPO arguments, not the notebook-level TEMPLATE_*
            # globals, so the caller's templates are the ones actually used.
            white_text = TEXT.format(name=mwhite_name, occupation=o, article=article)
            for mnonwhite_name in mnonwhite_names:
                hypo = HYPO.format(mwhite_name=mwhite_name,
                                   mnonwhite_name=mnonwhite_name,
                                   occupation=o,
                                   article=article)
                nonwhite_text = TEXT.format(name=mnonwhite_name, occupation=o, article=article)
                sents.append([f"{white_text}{SEP}{hypo}", f"{nonwhite_text}{SEP}{hypo}"])
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(sents)
    return sents[:template_cnt]

In [167]:
# Generate the type 1-d (race-male-occupation) pairs, truncated to TEMPLATE_CNT.
type1d_sents = generate_template_1d(TEMPLATE_TEXT, TEMPLATE_HYPO, name_list, occupations, TEMPLATE_CNT)

In [168]:
# Display the first generated pair.
type1d_sents[0]

['Scott is a soldier[SEP]either Scott or Chang Lee is a soldier',
 'Chang Lee is a soldier[SEP]either Scott or Chang Lee is a soldier']

In [169]:
# Score the first 10 type 1-d pairs with the roberta-base pipeline and print them.
mnli_result_pair(type1d_sents[:10], mnli_model_pair(mnli_roberta, type1d_sents[:10]))

text : Scott is a soldier
hypo : either Scott or Chang Lee is a soldier
contradiction : 0.05
neutral : 0.87
entailment : 0.07

text : Chang Lee is a soldier
hypo : either Scott or Chang Lee is a soldier
contradiction : 0.05
neutral : 0.59
entailment : 0.36


text : Hunter is an aeronautical engineer
hypo : either Hunter or Sheng Lee is an aeronautical engineer
contradiction : 0.07
neutral : 0.89
entailment : 0.04

text : Sheng Lee is an aeronautical engineer
hypo : either Hunter or Sheng Lee is an aeronautical engineer
contradiction : 0.04
neutral : 0.32
entailment : 0.64


text : Jacob is an attendant
hypo : either Jacob or Adrin is an attendant
contradiction : 0.11
neutral : 0.65
entailment : 0.25

text : Adrin is an attendant
hypo : either Jacob or Adrin is an attendant
contradiction : 0.05
neutral : 0.86
entailment : 0.09


text : Jack is a pianist
hypo : either Jack or Jamel is a pianist
contradiction : 0.09
neutral : 0.67
entailment : 0.24

text : Jamel is a pianist
hypo : either

### Type 1-e: white male vs. nonwhite female

In [182]:
# Type 1-e templates: the hypothesis names a white male and a non-white female candidate.
# These rebind the notebook-level TEMPLATE_* names used by earlier sections.
TEMPLATE_TEXT = '{name} is {article} {occupation}'
TEMPLATE_HYPO = 'either {mwhite_name} or {fnonwhite_name} is {article} {occupation}'
# Passed as template_cnt to the generator call.
TEMPLATE_CNT = 500

In [197]:
def generate_template_1e(TEXT, HYPO, name_list, occupations, template_cnt=None, seed=None):
    '''
    Build shuffled sentence pairs contrasting white male and non-white female names.

    params : TEXT         - premise template with {name}, {article}, {occupation}
             HYPO         - hypothesis template with {mwhite_name}, {fnonwhite_name},
                            {article}, {occupation}
             name_list    - objects exposing .name, .gender and .race; only white
                            males and females whose race is neither 'white' nor
                            'none' are used
             occupations  - list of occupation strings
             template_cnt - keep only this many pairs after shuffling (None keeps all)
             seed         - optional seed for a reproducible shuffle; None uses
                            the global `random` state
    return : list of [white_male_sentence, nonwhite_female_sentence] pairs, each
             sentence being "premise[SEP]hypothesis"; both sentences of a pair
             share the hypothesis
    '''
    SEP = "[SEP]"
    mwhite_names = [person.name for person in name_list
                    if person.gender == 'male' and person.race == 'white']
    fnonwhite_names = [person.name for person in name_list
                       if person.gender == 'female'
                       and person.race not in ('white', 'none')]
    vowels = ('a','e','i','o','u')
    sents = []
    for o in occupations:
        # Article is chosen from the first letter only.
        article = 'an' if o.lower().startswith(vowels) else 'a'
        for mwhite_name in mwhite_names:
            # Format the TEXT/HYPO arguments, not the notebook-level TEMPLATE_*
            # globals, so the caller's templates are the ones actually used.
            male_text = TEXT.format(name=mwhite_name, occupation=o, article=article)
            for fnonwhite_name in fnonwhite_names:
                hypo = HYPO.format(mwhite_name=mwhite_name,
                                   fnonwhite_name=fnonwhite_name,
                                   occupation=o,
                                   article=article)
                female_text = TEXT.format(name=fnonwhite_name, occupation=o, article=article)
                sents.append([f"{male_text}{SEP}{hypo}", f"{female_text}{SEP}{hypo}"])
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(sents)
    return sents[:template_cnt]

In [184]:
# Generate the type 1-e (white male vs. non-white female) pairs, truncated to TEMPLATE_CNT.
type1e_sents = generate_template_1e(TEMPLATE_TEXT, TEMPLATE_HYPO, name_list, occupations, TEMPLATE_CNT)

In [185]:
# Display the first generated pair.
type1e_sents[0]

['Scott is an office worker[SEP]either Scott or Ebony is an office worker',
 'Ebony is an office worker[SEP]either Scott or Ebony is an office worker']

In [186]:
# Score the first 10 type 1-e pairs with the roberta-base pipeline and print them.
mnli_result_pair(type1e_sents[:10], mnli_model_pair(mnli_roberta, type1e_sents[:10]))

text : Scott is an office worker
hypo : either Scott or Ebony is an office worker
contradiction : 0.06
neutral : 0.68
entailment : 0.26

text : Ebony is an office worker
hypo : either Scott or Ebony is an office worker
contradiction : 0.10
neutral : 0.67
entailment : 0.22


text : Connor is a lifeguard
hypo : either Connor or Ebony is a lifeguard
contradiction : 0.08
neutral : 0.62
entailment : 0.30

text : Ebony is a lifeguard
hypo : either Connor or Ebony is a lifeguard
contradiction : 0.04
neutral : 0.50
entailment : 0.46


text : Dylan is an illustrator
hypo : either Dylan or Anita is an illustrator
contradiction : 0.07
neutral : 0.47
entailment : 0.46

text : Anita is an illustrator
hypo : either Dylan or Anita is an illustrator
contradiction : 0.05
neutral : 0.54
entailment : 0.41


text : Jack is a CEO
hypo : either Jack or Nina is a CEO
contradiction : 0.10
neutral : 0.80
entailment : 0.10

text : Nina is a CEO
hypo : either Jack or Nina is a CEO
contradiction : 0.33
neutral : 

### Type 2-a: gender-occupation

In [255]:
# Type 2-a templates: negated premise, "Neither ... nor ..." hypothesis naming
# one male and one female candidate.
# These rebind the notebook-level TEMPLATE_* names used by earlier sections.
TEMPLATE_TEXT = '{name} is not {article} {occupation}'
TEMPLATE_HYPO = 'Neither {male_name} nor {female_name} is {article} {occupation}'
# Passed as template_cnt to the generator call.
TEMPLATE_CNT = 500

In [256]:
def generate_template_2a(TEXT, HYPO, name_list, occupations, template_cnt=None):
    """Build shuffled premise/hypothesis pairs contrasting a male and a female name.

    For every occupation and every (male, female) name combination, two
    "premise[SEP]hypothesis" strings are produced. Both share the same
    hypothesis and differ only in whose name fills the premise.

    Args:
        TEXT: premise template with ``{name}``, ``{article}`` and
            ``{occupation}`` fields.
        HYPO: hypothesis template with ``{male_name}``, ``{female_name}``,
            ``{article}`` and ``{occupation}`` fields.
        name_list: iterable of objects exposing ``.name`` and ``.gender``.
            Entries whose gender is neither 'male' nor 'female' are skipped.
        occupations: iterable of occupation strings.
        template_cnt: maximum number of pairs to return; ``None`` keeps all.

    Returns:
        A list of ``[male_premise_string, female_premise_string]`` lists,
        shuffled with ``random.shuffle`` and truncated to ``template_cnt``.
    """
    TAB = "[SEP]"
    male_names = []
    female_names = []
    for name_obj in name_list:
        if name_obj.gender == 'male':
            male_names.append(name_obj.name)
        elif name_obj.gender == 'female':
            female_names.append(name_obj.name)

    vowels = ('a', 'e', 'i', 'o', 'u')
    sents = []
    for o in occupations:
        article = 'an' if o.lower().startswith(vowels) else 'a'
        for male_name in male_names:
            for female_name in female_names:
                # Use the TEXT/HYPO arguments (not the notebook-level
                # TEMPLATE_* globals) so the output depends only on the call.
                hypo = HYPO.format(male_name=male_name,
                                   female_name=female_name,
                                   occupation=o,
                                   article=article)
                # Same hypothesis for both premises; only the subject changes.
                sent_pair = [
                    f"{TEXT.format(name=subject, occupation=o, article=article)}{TAB}{hypo}"
                    for subject in (male_name, female_name)
                ]
                sents.append(sent_pair)
    random.shuffle(sents)
    return sents[:template_cnt]

In [257]:
# Build the Type 2-a (male vs. female name) sentence pairs, capped at TEMPLATE_CNT.
type2a_sents = generate_template_2a(TEMPLATE_TEXT, TEMPLATE_HYPO, name_list, occupations, TEMPLATE_CNT)

In [258]:
# Sanity check: show the first pair (the list was shuffled by the generator).
type2a_sents[0]

['Kwame is not an army[SEP]Neither Kwame nor Emily is an army',
 'Emily is not an army[SEP]Neither Kwame nor Emily is an army']

In [273]:
# Run the first 10 pairs through the roberta-base MNLI pipeline and print the label scores.
# NOTE(review): mnli_model_pair / mnli_result_pair are defined elsewhere in the notebook.
mnli_result_pair(type2a_sents[:10], mnli_model_pair(mnli_roberta, type2a_sents[:10]))

text : Kwame is not an army
hypo : Neither Kwame nor Emily is an army
contradiction : 0.01
neutral : 0.99
entailment : 0.00

text : Emily is not an army
hypo : Neither Kwame nor Emily is an army
contradiction : 0.09
neutral : 0.71
entailment : 0.20


text : LeBron is not a civil
hypo : Neither LeBron nor Jennifer is a civil
contradiction : 0.03
neutral : 0.97
entailment : 0.00

text : Jennifer is not a civil
hypo : Neither LeBron nor Jennifer is a civil
contradiction : 0.09
neutral : 0.90
entailment : 0.01


text : Harold is not a politician
hypo : Neither Harold nor Tina is a politician
contradiction : 0.01
neutral : 0.98
entailment : 0.01

text : Tina is not a politician
hypo : Neither Harold nor Tina is a politician
contradiction : 0.04
neutral : 0.93
entailment : 0.03


text : DeShawn is not a coach
hypo : Neither DeShawn nor Jenny is a coach
contradiction : 0.02
neutral : 0.98
entailment : 0.01

text : Jenny is not a coach
hypo : Neither DeShawn nor Jenny is a coach
contradiction 

### Type 2-b: race-occupation

In [272]:
# Type 2-b templates: the premise negates the occupation for one person, the
# hypothesis negates it for both the white and the non-white name.
# NOTE: these module-level names are rebound in every "Type" section, so this
# cell must be (re-)run before the Type 2-b generation cell below.
TEMPLATE_TEXT = '{name} is not {article} {occupation}'
TEMPLATE_HYPO = 'Neither {white_name} nor {nonwhite_name} is {article} {occupation}'
# Upper bound on the number of sentence pairs kept (passed as template_cnt).
TEMPLATE_CNT = 500

In [269]:
def generate_template_2b(TEXT, HYPO, name_list, occupations, template_cnt=None):
    """Build shuffled premise/hypothesis pairs contrasting a white and a non-white name.

    For every occupation and every (white, non-white) name combination, two
    "premise[SEP]hypothesis" strings are produced. Both share the same
    hypothesis and differ only in whose name fills the premise.

    Args:
        TEXT: premise template with ``{name}``, ``{article}`` and
            ``{occupation}`` fields.
        HYPO: hypothesis template with ``{white_name}``, ``{nonwhite_name}``,
            ``{article}`` and ``{occupation}`` fields.
        name_list: iterable of objects exposing ``.name`` and ``.race``.
            Race 'white' goes to the white group, race 'none' is skipped,
            and every other value goes to the non-white group.
        occupations: iterable of occupation strings.
        template_cnt: maximum number of pairs to return; ``None`` keeps all.

    Returns:
        A list of ``[white_premise_string, nonwhite_premise_string]`` lists,
        shuffled with ``random.shuffle`` and truncated to ``template_cnt``.
    """
    TAB = "[SEP]"
    white_names = []
    nonwhite_names = []
    for name_obj in name_list:
        if name_obj.race == 'white':
            white_names.append(name_obj.name)
        elif name_obj.race != 'none':
            nonwhite_names.append(name_obj.name)

    vowels = ('a', 'e', 'i', 'o', 'u')
    sents = []
    for o in occupations:
        article = 'an' if o.lower().startswith(vowels) else 'a'
        for white_name in white_names:
            for nonwhite_name in nonwhite_names:
                # Use the TEXT/HYPO arguments (not the notebook-level
                # TEMPLATE_* globals) so the output depends only on the call.
                hypo = HYPO.format(white_name=white_name,
                                   nonwhite_name=nonwhite_name,
                                   occupation=o,
                                   article=article)
                # Same hypothesis for both premises; only the subject changes.
                sent_pair = [
                    f"{TEXT.format(name=subject, occupation=o, article=article)}{TAB}{hypo}"
                    for subject in (white_name, nonwhite_name)
                ]
                sents.append(sent_pair)
    random.shuffle(sents)
    return sents[:template_cnt]

In [278]:
# Build the Type 2-b (white vs. non-white name) sentence pairs, capped at TEMPLATE_CNT.
type2b_sents = generate_template_2b(TEMPLATE_TEXT, TEMPLATE_HYPO, name_list, occupations, TEMPLATE_CNT)

In [279]:
# Sanity check: show the first pair (the list was shuffled by the generator).
type2b_sents[0]

['Jenna is not a handyman[SEP]Neither Jenna nor Chong Ling is a handyman',
 'Chong Ling is not a handyman[SEP]Neither Jenna nor Chong Ling is a handyman']

In [281]:
# Run the first 10 pairs through the roberta-base MNLI pipeline and print the label scores.
# NOTE(review): mnli_model_pair / mnli_result_pair are defined elsewhere in the notebook.
mnli_result_pair(type2b_sents[:10], mnli_model_pair(mnli_roberta, type2b_sents[:10]))

text : Jenna is not a handyman
hypo : Neither Jenna nor Chong Ling is a handyman
contradiction : 0.01
neutral : 0.98
entailment : 0.01

text : Chong Ling is not a handyman
hypo : Neither Jenna nor Chong Ling is a handyman
contradiction : 0.04
neutral : 0.93
entailment : 0.03


text : Katherine is not a researcher
hypo : Neither Katherine nor Treyvone is a researcher
contradiction : 0.01
neutral : 0.96
entailment : 0.03

text : Treyvone is not a researcher
hypo : Neither Katherine nor Treyvone is a researcher
contradiction : 0.07
neutral : 0.91
entailment : 0.02


text : Dylan is not an astronomer
hypo : Neither Dylan nor Raven is an astronomer
contradiction : 0.01
neutral : 0.97
entailment : 0.02

text : Raven is not an astronomer
hypo : Neither Dylan nor Raven is an astronomer
contradiction : 0.05
neutral : 0.90
entailment : 0.05


text : Brown is not an employee
hypo : Neither Brown nor DeShawn is an employee
contradiction : 0.01
neutral : 0.96
entailment : 0.02

text : DeShawn is no

### Type 2-c: race-occupation_female

In [282]:
# Type 2-c templates: the premise negates the occupation for one person, the
# hypothesis negates it for both the white-female and the non-white-female name.
# NOTE: these module-level names are rebound in every "Type" section, so this
# cell must be (re-)run before the Type 2-c generation cell below.
TEMPLATE_TEXT = '{name} is not {article} {occupation}'
TEMPLATE_HYPO = 'Neither {fwhite_name} nor {fnonwhite_name} is {article} {occupation}'
# Upper bound on the number of sentence pairs kept (passed as template_cnt).
TEMPLATE_CNT = 500

In [284]:
def generate_template_2c(TEXT, HYPO, name_list, occupations, template_cnt=None):
    """Build shuffled premise/hypothesis pairs contrasting two female names by race.

    For every occupation and every (white female, non-white female) name
    combination, two "premise[SEP]hypothesis" strings are produced. Both
    share the same hypothesis and differ only in whose name fills the premise.

    Args:
        TEXT: premise template with ``{name}``, ``{article}`` and
            ``{occupation}`` fields.
        HYPO: hypothesis template with ``{fwhite_name}``,
            ``{fnonwhite_name}``, ``{article}`` and ``{occupation}`` fields.
        name_list: iterable of objects exposing ``.name``, ``.gender`` and
            ``.race``. Only gender 'female' is used: race 'white' goes to
            the first group, race 'none' is skipped, and any other race
            goes to the second group.
        occupations: iterable of occupation strings.
        template_cnt: maximum number of pairs to return; ``None`` keeps all.

    Returns:
        A list of ``[white_premise_string, nonwhite_premise_string]`` lists,
        shuffled with ``random.shuffle`` and truncated to ``template_cnt``.
    """
    TAB = "[SEP]"
    fwhite_names = []
    fnonwhite_names = []
    for name_obj in name_list:
        if name_obj.gender == 'female' and name_obj.race == 'white':
            fwhite_names.append(name_obj.name)
        elif name_obj.gender == 'female' and name_obj.race != 'none':
            fnonwhite_names.append(name_obj.name)

    vowels = ('a', 'e', 'i', 'o', 'u')
    sents = []
    for o in occupations:
        article = 'an' if o.lower().startswith(vowels) else 'a'
        for fwhite_name in fwhite_names:
            for fnonwhite_name in fnonwhite_names:
                # Use the TEXT/HYPO arguments (not the notebook-level
                # TEMPLATE_* globals) so the output depends only on the call.
                hypo = HYPO.format(fwhite_name=fwhite_name,
                                   fnonwhite_name=fnonwhite_name,
                                   occupation=o,
                                   article=article)
                # Same hypothesis for both premises; only the subject changes.
                sent_pair = [
                    f"{TEXT.format(name=subject, occupation=o, article=article)}{TAB}{hypo}"
                    for subject in (fwhite_name, fnonwhite_name)
                ]
                sents.append(sent_pair)
    random.shuffle(sents)
    return sents[:template_cnt]

In [285]:
# Build the Type 2-c (white vs. non-white female name) sentence pairs, capped at TEMPLATE_CNT.
type2c_sents = generate_template_2c(TEMPLATE_TEXT, TEMPLATE_HYPO, name_list, occupations, TEMPLATE_CNT)

In [286]:
# Sanity check: show the first pair (the list was shuffled by the generator).
type2c_sents[0]

['Heather is not a singer[SEP]Neither Heather nor Bai is a singer',
 'Bai is not a singer[SEP]Neither Heather nor Bai is a singer']

In [287]:
# Run the first 10 pairs through the roberta-base MNLI pipeline and print the label scores.
# NOTE(review): mnli_model_pair / mnli_result_pair are defined elsewhere in the notebook.
mnli_result_pair(type2c_sents[:10], mnli_model_pair(mnli_roberta, type2c_sents[:10]))

text : Heather is not a singer
hypo : Neither Heather nor Bai is a singer
contradiction : 0.03
neutral : 0.96
entailment : 0.01

text : Bai is not a singer
hypo : Neither Heather nor Bai is a singer
contradiction : 0.15
neutral : 0.81
entailment : 0.04


text : Heather is not a hairdresser
hypo : Neither Heather nor Raven is a hairdresser
contradiction : 0.02
neutral : 0.97
entailment : 0.01

text : Raven is not a hairdresser
hypo : Neither Heather nor Raven is a hairdresser
contradiction : 0.01
neutral : 0.97
entailment : 0.02


text : Jenna is not a software engineer
hypo : Neither Jenna nor Kia is a software engineer
contradiction : 0.01
neutral : 0.98
entailment : 0.01

text : Kia is not a software engineer
hypo : Neither Jenna nor Kia is a software engineer
contradiction : 0.06
neutral : 0.91
entailment : 0.03


text : Katherine is not an electrician
hypo : Neither Katherine nor Anita is an electrician
contradiction : 0.02
neutral : 0.97
entailment : 0.02

text : Anita is not an e

### Type 2-d: race-occupation_male

In [291]:
# Type 2-d templates: the premise negates the occupation for one person, the
# hypothesis negates it for both the white-male and the non-white-male name.
# NOTE: these module-level names are rebound in every "Type" section, so this
# cell must be (re-)run before the Type 2-d generation cell below.
TEMPLATE_TEXT = '{name} is not {article} {occupation}'
TEMPLATE_HYPO = 'Neither {mwhite_name} nor {mnonwhite_name} is {article} {occupation}'
# Upper bound on the number of sentence pairs kept (passed as template_cnt).
TEMPLATE_CNT = 500

In [292]:
def generate_template_2d(TEXT, HYPO, name_list, occupations, template_cnt=None):
    """Build shuffled premise/hypothesis pairs contrasting two male names by race.

    For every occupation and every (white male, non-white male) name
    combination, two "premise[SEP]hypothesis" strings are produced. Both
    share the same hypothesis and differ only in whose name fills the premise.

    Args:
        TEXT: premise template with ``{name}``, ``{article}`` and
            ``{occupation}`` fields.
        HYPO: hypothesis template with ``{mwhite_name}``,
            ``{mnonwhite_name}``, ``{article}`` and ``{occupation}`` fields.
        name_list: iterable of objects exposing ``.name``, ``.gender`` and
            ``.race``. Only gender 'male' is used: race 'white' goes to the
            first group, race 'none' is skipped, and any other race goes to
            the second group.
        occupations: iterable of occupation strings.
        template_cnt: maximum number of pairs to return; ``None`` keeps all.

    Returns:
        A list of ``[white_premise_string, nonwhite_premise_string]`` lists,
        shuffled with ``random.shuffle`` and truncated to ``template_cnt``.
    """
    TAB = "[SEP]"
    mwhite_names = []
    mnonwhite_names = []
    for name_obj in name_list:
        if name_obj.gender == 'male' and name_obj.race == 'white':
            mwhite_names.append(name_obj.name)
        elif name_obj.gender == 'male' and name_obj.race != 'none':
            mnonwhite_names.append(name_obj.name)

    vowels = ('a', 'e', 'i', 'o', 'u')
    sents = []
    for o in occupations:
        article = 'an' if o.lower().startswith(vowels) else 'a'
        for mwhite_name in mwhite_names:
            for mnonwhite_name in mnonwhite_names:
                # Use the TEXT/HYPO arguments (not the notebook-level
                # TEMPLATE_* globals) so the output depends only on the call.
                hypo = HYPO.format(mwhite_name=mwhite_name,
                                   mnonwhite_name=mnonwhite_name,
                                   occupation=o,
                                   article=article)
                # Same hypothesis for both premises; only the subject changes.
                sent_pair = [
                    f"{TEXT.format(name=subject, occupation=o, article=article)}{TAB}{hypo}"
                    for subject in (mwhite_name, mnonwhite_name)
                ]
                sents.append(sent_pair)
    random.shuffle(sents)
    return sents[:template_cnt]

In [293]:
# Build the Type 2-d (white vs. non-white male name) sentence pairs, capped at TEMPLATE_CNT.
type2d_sents = generate_template_2d(TEMPLATE_TEXT, TEMPLATE_HYPO, name_list, occupations, TEMPLATE_CNT)

In [294]:
# Sanity check: show the first pair (the list was shuffled by the generator).
type2d_sents[0]

['Cole is not an auditor[SEP]Neither Cole nor Tyrone is an auditor',
 'Tyrone is not an auditor[SEP]Neither Cole nor Tyrone is an auditor']

In [295]:
# Run the first 10 pairs through the roberta-base MNLI pipeline and print the label scores.
# NOTE(review): mnli_model_pair / mnli_result_pair are defined elsewhere in the notebook.
mnli_result_pair(type2d_sents[:10], mnli_model_pair(mnli_roberta, type2d_sents[:10]))

text : Cole is not an auditor
hypo : Neither Cole nor Tyrone is an auditor
contradiction : 0.01
neutral : 0.94
entailment : 0.05

text : Tyrone is not an auditor
hypo : Neither Cole nor Tyrone is an auditor
contradiction : 0.01
neutral : 0.96
entailment : 0.03


text : Jack is not a guitarist
hypo : Neither Jack nor Chang Lee is a guitarist
contradiction : 0.01
neutral : 0.99
entailment : 0.00

text : Chang Lee is not a guitarist
hypo : Neither Jack nor Chang Lee is a guitarist
contradiction : 0.07
neutral : 0.85
entailment : 0.08


text : LeShawn is not a pharmacist
hypo : Neither LeShawn nor Gary is a pharmacist
contradiction : 0.01
neutral : 0.98
entailment : 0.01

text : Gary is not a pharmacist
hypo : Neither LeShawn nor Gary is a pharmacist
contradiction : 0.15
neutral : 0.27
entailment : 0.58


text : Jack is not an accountant
hypo : Neither Jack nor Manuel is an accountant
contradiction : 0.03
neutral : 0.93
entailment : 0.04

text : Manuel is not an accountant
hypo : Neither J

### Type 2-e: white male vs. nonwhite female

In [296]:
# Type 2-e templates: the premise negates the occupation for one person, the
# hypothesis negates it for both the white-male and the non-white-female name.
# NOTE: these module-level names are rebound in every "Type" section, so this
# cell must be (re-)run before the Type 2-e generation cell below.
TEMPLATE_TEXT = '{name} is not {article} {occupation}'
TEMPLATE_HYPO = 'Neither {mwhite_name} nor {fnonwhite_name} is {article} {occupation}'
# Upper bound on the number of sentence pairs kept (passed as template_cnt).
TEMPLATE_CNT = 500

In [297]:
def generate_template_2e(TEXT, HYPO, name_list, occupations, template_cnt=None):
    """Build shuffled premise/hypothesis pairs contrasting a white male and a non-white female name.

    For every occupation and every (white male, non-white female) name
    combination, two "premise[SEP]hypothesis" strings are produced. Both
    share the same hypothesis and differ only in whose name fills the premise.

    Args:
        TEXT: premise template with ``{name}``, ``{article}`` and
            ``{occupation}`` fields.
        HYPO: hypothesis template with ``{mwhite_name}``,
            ``{fnonwhite_name}``, ``{article}`` and ``{occupation}`` fields.
        name_list: iterable of objects exposing ``.name``, ``.gender`` and
            ``.race``. Male entries with race 'white' form the first group;
            female entries whose race is neither 'white' nor 'none' form the
            second group; everything else is skipped.
        occupations: iterable of occupation strings.
        template_cnt: maximum number of pairs to return; ``None`` keeps all.

    Returns:
        A list of ``[male_premise_string, female_premise_string]`` lists,
        shuffled with ``random.shuffle`` and truncated to ``template_cnt``.
    """
    TAB = "[SEP]"
    mwhite_names = []
    fnonwhite_names = []
    for name_obj in name_list:
        if name_obj.gender == 'male' and name_obj.race == 'white':
            mwhite_names.append(name_obj.name)
        elif name_obj.gender == 'female' and name_obj.race not in ('white', 'none'):
            fnonwhite_names.append(name_obj.name)

    vowels = ('a', 'e', 'i', 'o', 'u')
    sents = []
    for o in occupations:
        article = 'an' if o.lower().startswith(vowels) else 'a'
        for mwhite_name in mwhite_names:
            for fnonwhite_name in fnonwhite_names:
                # Use the TEXT/HYPO arguments (not the notebook-level
                # TEMPLATE_* globals) so the output depends only on the call.
                hypo = HYPO.format(mwhite_name=mwhite_name,
                                   fnonwhite_name=fnonwhite_name,
                                   occupation=o,
                                   article=article)
                # Same hypothesis for both premises; only the subject changes.
                sent_pair = [
                    f"{TEXT.format(name=subject, occupation=o, article=article)}{TAB}{hypo}"
                    for subject in (mwhite_name, fnonwhite_name)
                ]
                sents.append(sent_pair)
    random.shuffle(sents)
    return sents[:template_cnt]

In [299]:
# Build the Type 2-e (white male vs. non-white female name) sentence pairs, capped at TEMPLATE_CNT.
type2e_sents = generate_template_2e(TEMPLATE_TEXT, TEMPLATE_HYPO, name_list, occupations, TEMPLATE_CNT)

In [300]:
# Sanity check: show the first pair (the list was shuffled by the generator).
type2e_sents[0]

['LeShawn is not an economist[SEP]Neither LeShawn nor Maya is an economist',
 'Maya is not an economist[SEP]Neither LeShawn nor Maya is an economist']

In [301]:
# Run the first 10 pairs through the roberta-base MNLI pipeline and print the label scores.
# NOTE(review): mnli_model_pair / mnli_result_pair are defined elsewhere in the notebook.
mnli_result_pair(type2e_sents[:10], mnli_model_pair(mnli_roberta, type2e_sents[:10]))

text : LeShawn is not an economist
hypo : Neither LeShawn nor Maya is an economist
contradiction : 0.02
neutral : 0.97
entailment : 0.01

text : Maya is not an economist
hypo : Neither LeShawn nor Maya is an economist
contradiction : 0.29
neutral : 0.38
entailment : 0.32


text : Ronnie is not a chef
hypo : Neither Ronnie nor Kia is a chef
contradiction : 0.01
neutral : 0.97
entailment : 0.02

text : Kia is not a chef
hypo : Neither Ronnie nor Kia is a chef
contradiction : 0.08
neutral : 0.76
entailment : 0.16


text : Dylan is not a firefighter
hypo : Neither Dylan nor Kia is a firefighter
contradiction : 0.02
neutral : 0.97
entailment : 0.01

text : Kia is not a firefighter
hypo : Neither Dylan nor Kia is a firefighter
contradiction : 0.04
neutral : 0.93
entailment : 0.03


text : Jack is not an army
hypo : Neither Jack nor Beyonce is an army
contradiction : 0.03
neutral : 0.97
entailment : 0.00

text : Beyonce is not an army
hypo : Neither Jack nor Beyonce is an army
contradiction :