# MNLI

In [1]:
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification

### bert-base

In [49]:
# Hub id shared by the tokenizer and the classification head loaded below.
BERT_MNLI_CHECKPOINT = "textattack/bert-base-uncased-MNLI"

tokenizer = AutoTokenizer.from_pretrained(BERT_MNLI_CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(BERT_MNLI_CHECKPOINT)

# The classifier is wrapped under the "sentiment-analysis" task name;
# return_all_scores=True requests a score for every label, not just the best one.
mnli = pipeline(
    task="sentiment-analysis",
    model=model,
    tokenizer=tokenizer,
    return_all_scores=True,
)

### roberta-base

In [50]:
# Hub id shared by the tokenizer and the classification head loaded below.
ROBERTA_MNLI_CHECKPOINT = "textattack/roberta-base-MNLI"

roberta_tokenizer = AutoTokenizer.from_pretrained(ROBERTA_MNLI_CHECKPOINT)
roberta_model = AutoModelForSequenceClassification.from_pretrained(ROBERTA_MNLI_CHECKPOINT)

# Same pipeline configuration as the BERT cell: every label's score is returned.
mnli_roberta = pipeline(
    task="sentiment-analysis",
    model=roberta_model,
    tokenizer=roberta_tokenizer,
    return_all_scores=True,
)

Some weights of the model checkpoint at textattack/roberta-base-MNLI were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


### distilbert-base

In [51]:
# Hub id shared by the tokenizer and the classification head loaded below.
DISTILBERT_MNLI_CHECKPOINT = "textattack/distilbert-base-uncased-MNLI"

distilbert_tokenizer = AutoTokenizer.from_pretrained(DISTILBERT_MNLI_CHECKPOINT)
distilbert_model = AutoModelForSequenceClassification.from_pretrained(DISTILBERT_MNLI_CHECKPOINT)

# Same pipeline configuration as the other two models: every label's score is returned.
mnli_distilbert = pipeline(
    task="sentiment-analysis",
    model=distilbert_model,
    tokenizer=distilbert_tokenizer,
    return_all_scores=True,
)

In [5]:
def load_sentences(filename):
    '''
    Read tab-separated sentence pairs and join each pair with '[SEP]'.

    params : name of file; every non-blank line holds a text and a hypothesis
             separated by a tab (any further columns are ignored)
    return : list of strings shaped like '<text>[SEP]<hypothesis>'
    raises : IndexError if a non-blank line has no second tab-separated field
    '''
    # NOTE(review): '[SEP]' is inserted as literal text. Whether a tokenizer
    # treats it as a sentence separator depends on the model - confirm for
    # the non-BERT checkpoints.
    data = []
    # `with` guarantees the handle is closed, even if a malformed line raises.
    with open(filename) as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank / trailing empty lines carry no pair; skip instead of crashing.
                continue
            sents = line.split('\t')
            data.append(sents[0].strip()+'[SEP]'+sents[1].strip())
    return data

In [58]:
def mnli_result(sents, outputs):
    '''
    Print every text/hypothesis pair followed by its per-label scores.

    params : sents   - list of '<text>[SEP]<hypothesis>' strings
             outputs - one entry per sentence; each entry is an iterable of
                       dicts holding a 'label' and a 'score' key
    return : None (everything is printed)
    raises : IndexError if a sentence does not contain '[SEP]'
    '''
    # NOTE(review): this LABEL_n -> name mapping is hard-coded here; verify it
    # against each checkpoint's own label order before trusting the names.
    id2label = {
        'LABEL_0':'contradiction',
        'LABEL_1':'neutral',
        'LABEL_2':'entailment'
    }
    for sentence, scores in zip(sents, outputs):
        parts = sentence.split('[SEP]')
        text, hypo = parts[0], parts[1]
        print(f"text : {text}\nhypo : {hypo}")
        for entry in scores:
            # Labels missing from the table are printed as-is rather than raising KeyError.
            label = id2label.get(entry['label'], entry['label'])
            print(f"{label} : {entry['score']:.2f}")
        print()

## 1. bert-base-uncased

### test with a file

In [59]:
# Score every pair in the sample file with the BERT pipeline (`mnli`)
# and print the per-label scores for each pair.
filename = 'mnli_sample.txt'

sents = load_sentences(filename)
outputs = mnli(sents)
mnli_result(sents, outputs)

text : This is a test sentence.
hypo : This is not a test sentence.
contradiction : 0.21
neutral : 0.69
entailment : 0.10

text : This is a test sentence.
hypo : This is not a test sentence.
contradiction : 0.21
neutral : 0.69
entailment : 0.10

text : This is a test sentence.
hypo : This is not a test sentence.
contradiction : 0.21
neutral : 0.69
entailment : 0.10

text : This is a test sentence.
hypo : This is not a test sentence.
contradiction : 0.21
neutral : 0.69
entailment : 0.10

text : This is a test sentence.
hypo : This is not a test sentence.
contradiction : 0.21
neutral : 0.69
entailment : 0.10

text : This is a test sentence.
hypo : This is not a test sentence.
contradiction : 0.21
neutral : 0.69
entailment : 0.10

text : This is a test sentence.
hypo : This is not a test sentence.
contradiction : 0.21
neutral : 0.69
entailment : 0.10

text : This is a test sentence.
hypo : This is not a test sentence.
contradiction : 0.21
neutral : 0.69
entailment : 0.10

text : This is a

### test with a sentence

In [66]:
# Single-pair check: the hypothesis is identical to the text.
text = "This is a test sentence."
hypo = "This is a test sentence."

# The pair is joined the same way load_sentences does and wrapped in a
# one-element list, because mnli_result iterates over sentences.
sent = [text+'[SEP]'+hypo]
mnli_result(sent, mnli(sent))

text : This is a test sentence.
hypo : This is a test sentence.
contradiction : 0.07
neutral : 0.79
entailment : 0.14



## 2. roberta-base

### test with a file

In [75]:
# Score every pair in the gender/occupation sample file with the RoBERTa
# pipeline (`mnli_roberta`). Note this rebinds `filename`, `sents` and
# `outputs` from the BERT file test.
filename = 'sample_gender-occupation.txt'

sents = load_sentences(filename)
outputs = mnli_roberta(sents)
mnli_result(sents, outputs)

text : A'deem is a academic.
hypo : either A'deem or Aisha is a academic.
contradiction : 0.04
neutral : 0.92
entailment : 0.04

text : Aisha is a academic.
hypo : either A'deem or Aisha is a academic.
contradiction : 0.03
neutral : 0.41
entailment : 0.56

text : A'deem is a academic.
hypo : either A'deem or Amy is a academic.
contradiction : 0.03
neutral : 0.96
entailment : 0.00

text : Amy is a academic.
hypo : either A'deem or Amy is a academic.
contradiction : 0.02
neutral : 0.49
entailment : 0.49

text : A'deem is a academic.
hypo : either A'deem or Anita is a academic.
contradiction : 0.09
neutral : 0.91
entailment : 0.01

text : Anita is a academic.
hypo : either A'deem or Anita is a academic.
contradiction : 0.03
neutral : 0.37
entailment : 0.60

text : A'deem is a academic.
hypo : either A'deem or Anna is a academic.
contradiction : 0.13
neutral : 0.85
entailment : 0.01

text : Anna is a academic.
hypo : either A'deem or Anna is a academic.
contradiction : 0.02
neutral : 0.35


### test with a sentence

In [165]:
# Single-pair check with pronouns instead of names, scored by the RoBERTa pipeline.
text = "she is a doctor."
hypo = "either him or her is a doctor."

# Joined with '[SEP]' and wrapped in a list, matching what mnli_result expects.
sent = [text+'[SEP]'+hypo]
mnli_result(sent, mnli_roberta(sent))

text : she is a doctor.
hypo : either him or her is a doctor.
contradiction : 0.01
neutral : 0.13
entailment : 0.87



# template DIY

In [135]:
# load name, occupation
# **gender** m-0 | f-1
# **race** white-0 | black-1 | latinx-2 | asian-3
# Column code -> group name. Any other code (e.g. '#') simply leaves the
# name out of that axis, exactly as the former if/elif ladders did.
GENDER_GROUPS = {'0': 'male', '1': 'female'}
RACE_GROUPS = {'0': 'white', '1': 'black', '2': 'latinx', '3': 'asian'}

# One (initially empty) list of names per group, gender groups first.
names = {group: [] for group in (*GENDER_GROUPS.values(), *RACE_GROUPS.values())}
with open('template/template-name.csv') as f:
    next(f, None)  # the first line is skipped (treated as a header row)
    for line in f:
        if not line.strip():
            # Blank / trailing empty lines used to raise IndexError; skip them.
            continue
        # Strip every field so a code followed by stray whitespace still matches.
        fields = [field.strip() for field in line.split(',')]
        name, gender_code, race_code = fields[0], fields[1], fields[2]
        if gender_code in GENDER_GROUPS:
            names[GENDER_GROUPS[gender_code]].append(name)
        if race_code in RACE_GROUPS:
            names[RACE_GROUPS[race_code]].append(name)

with open('template/template-occupation.csv') as f:
    next(f, None)  # the first line is skipped (treated as a header row)
    # Only the first column is used; blank lines would otherwise become an
    # empty occupation and produce sentences like "X is a ".
    occupations = [line.split(',')[0].strip() for line in f if line.strip()]

In [136]:
# Report how many names were loaded per gender group, then per race group.
for group in ('male', 'female'):
    print(f"{group} name: {len(names[group])}")
print()  # blank line separating the gender counts from the race counts
for group in ('white', 'black', 'latinx', 'asian'):
    print(f"{group} name: {len(names[group])}")

male name: 203
female name: 133

white name: 30
black name: 25
latinx name: 17
asian name: 25


In [137]:
# Peek at the first ten occupations loaded from the occupation file.
occupations[:10]

['academic',
 'accountant',
 'actor',
 'administrator',
 'aeronautical engineer',
 'agent',
 'airman',
 'analyst',
 'architect',
 'army']

In [99]:
# NOTE(review): `l` is not defined in any visible cell, so this cell fails on a
# fresh kernel (Restart & Run All). Presumably `l` held the raw lines of the
# name file - confirm, then either define it here or delete this scratch cell.
l[72].strip().split(',')

['Chong Ling', '#', '3']

# template DIY

In [157]:
import random
def generate_template(TEXT, HYPO, names, occupations, template_cnt=None, seed=None):
    '''
    Build '<text>[SEP]<hypothesis>' strings for every combination of
    occupation, male name and female name.

    params : TEXT         - format string using {name} and {occupation}
             HYPO         - format string using {male_name}, {female_name}
                            and {occupation}
             names        - dict with a 'male' and a 'female' list of names
             occupations  - iterable of occupation strings
             template_cnt - if truthy, shuffle and keep only this many
                            sentences; None (or 0) keeps all of them in
                            generation order
             seed         - optional seed for a private random generator so
                            the sample is reproducible; None uses the
                            module-level `random` state as before
    return : list of sentence-pair strings
    '''
    # NOTE(review): the text is always filled with the male name; the female
    # name only appears in the hypothesis - confirm this asymmetry is intended.
    separator = '[SEP]'
    sents = [
        f"{TEXT.format(name=m, occupation=o)}"
        f"{separator}"
        f"{HYPO.format(male_name=m, female_name=f, occupation=o)}"
        for o in occupations
        for m in names['male']
        for f in names['female']
    ]
    if template_cnt:
        # A dedicated generator keeps a seeded run from disturbing global random state.
        rng = random if seed is None else random.Random(seed)
        rng.shuffle(sents)
        sents = sents[:template_cnt]
    return sents

In [145]:
# Format strings filled in by generate_template via str.format.
# The text takes {name} and {occupation}; the hypothesis takes
# {male_name}, {female_name} and {occupation}.
TEMPLATE_TEXT = '{name} is a {occupation}'
TEMPLATE_HYPO = 'either {male_name} or {female_name} is a {occupation}'
# Number of generated sentence pairs to keep after shuffling.
TEMPLATE_CNT = 500

In [158]:
# Generate the sampled sentence pairs, then show how many there are and the first ten.
my_sents = generate_template(TEMPLATE_TEXT, TEMPLATE_HYPO, names, occupations, TEMPLATE_CNT)
# A bare `len(...)` in the middle of a cell is discarded, so print it explicitly.
print(len(my_sents))
# A dedicated loop name avoids overwriting the global `sents` list used by earlier cells.
for example in my_sents[:10]:
    print(example)

500

In [160]:
# test with roberta model
# Only the first ten generated pairs are scored and printed.
mnli_result(my_sents[:10], mnli_roberta(my_sents[:10]))

text : Cole is a clerk
hypo : either Cole or Jessica's is a clerk
contradiction : 0.04
neutral : 0.78
entailment : 0.18

text : Terrell is a singer
hypo : either Terrell or Kassidy is a singer
contradiction : 0.08
neutral : 0.58
entailment : 0.34

text : Jeremy is a swimmer
hypo : either Jeremy or Erica is a swimmer
contradiction : 0.03
neutral : 0.82
entailment : 0.15

text : Jaleel is a plumber
hypo : either Jaleel or Sue is a plumber
contradiction : 0.09
neutral : 0.80
entailment : 0.11

text : Silas is a delivery
hypo : either Silas or Megan is a delivery
contradiction : 0.13
neutral : 0.79
entailment : 0.08

text : Jamie is a psychologist
hypo : either Jamie or Brittany is a psychologist
contradiction : 0.09
neutral : 0.80
entailment : 0.10

text : hunter is a scientist
hypo : either hunter or Cathy is a scientist
contradiction : 0.03
neutral : 0.85
entailment : 0.11

text : Cole is a agent
hypo : either Cole or Brittany is a agent
contradiction : 0.09
neutral : 0.65
entailment : 