# 🍇 MNLI Templates

### 1. Functions
### 2. Models
- bert-base-cased
- roberta-base
- roberta-large

### 3. Templates
- Type 1 a~e
- Type 2 a~e
- Type 3
- Type 4

## 1. Functions

In [1]:
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import pandas as pd
import random

In [2]:
# Load the table of names; the preview in the next cell shows `name`, `gender` and `race` columns.
name_df = pd.read_csv("names.csv")
# Occupation titles are taken from the CSV column labelled '0' and converted to a plain Python list.
occupations = pd.read_csv("occupations.csv")['0'].tolist() # list of occupation strings

In [3]:
# Notebook display of the loaded name table (rows carry name, gender and race).
name_df

Unnamed: 0,name,gender,race
0,A'deem,male,none
1,Aaron,male,none
2,Abigail,female,white
3,Adam,male,none
4,Adrin,male,latinx
...,...,...,...
359,Wong Chang,none,asian
360,Wright,none,black
361,Yang Wang,none,asian
362,Yolanda,female,none


In [4]:
# Peek at the first five occupation strings to sanity-check the loaded list.
occupations[:5]

['academic', 'accountant', 'actor', 'administrator', 'aeronautical engineer']

In [5]:
# Boolean row masks over name_df; the template cases further down combine them
# to pick the two name groups that are compared.
is_male = name_df['gender'].eq('male')
is_female = name_df['gender'].eq('female')
is_white = name_df['race'].eq('white')
is_none = name_df['race'].eq('none')

# Summing a boolean mask counts its True rows, i.e. the names in that group.
# "Non-white" is every row whose race is neither 'white' nor the 'none' placeholder.
print(f"Male names : {int(is_male.sum())}")
print(f"Female names : {int(is_female.sum())}")
print(f"White names : {int(is_white.sum())}")
print(f"Non-White names : {int((~(is_white | is_none)).sum())}")

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [None]:
def generate_template(prem, hypo, name1, name2, occupations, tmpl_cnt=None, seed=None):
    """Build shuffled premise/hypothesis sentence pairs for two groups of names.

    For every occupation and every (n1, n2) combination drawn from ``name1`` x
    ``name2``, two strings are produced. Both share the same premise and differ
    only in which of the two names the hypothesis mentions:

        [premise + "[SEP]" + hypothesis about n1,
         premise + "[SEP]" + hypothesis about n2]

    Args:
        prem: Premise template with ``{name1}``, ``{name2}``, ``{article}`` and
            ``{occupation}`` placeholders (``str.format`` syntax).
        hypo: Hypothesis template with ``{name}``, ``{article}`` and
            ``{occupation}`` placeholders.
        name1: Iterable of names for the first group.
        name2: Iterable of names for the second group.
        occupations: Iterable of occupation strings.
        tmpl_cnt: If not None, keep only the first ``tmpl_cnt`` pairs after
            shuffling.
        seed: If not None, seeds the module-level ``random`` generator before
            shuffling so the order (and therefore the kept subset) is
            reproducible. Note that this reseeds the global generator.

    Returns:
        A list of two-element lists of strings, in shuffled order.
    """
    sep = "[SEP]"
    vowels = ('a', 'e', 'i', 'o', 'u')
    sents = []
    for o in occupations:
        # Pick the indefinite article from the occupation's first letter.
        article = 'an' if o.lower().startswith(vowels) else 'a'
        for n1 in name1:
            for n2 in name2:
                # Format into fresh locals: the templates themselves must stay
                # untouched so they can be reused on the next iteration.
                premise = prem.format(name1=n1,
                                      name2=n2,
                                      occupation=o,
                                      article=article)
                sent_pair = []
                for n in (n1, n2):
                    hypothesis = hypo.format(name=n,
                                             occupation=o,
                                             article=article)
                    sent_pair.append(f"{premise}{sep}{hypothesis}")
                sents.append(sent_pair)
    # Compare against None so that seed=0 and tmpl_cnt=0 are honoured.
    if seed is not None:
        random.seed(seed)
    random.shuffle(sents)
    if tmpl_cnt is not None:
        sents = sents[:tmpl_cnt]
    return sents

## 2. Models

### `bert-base-cased`

In [7]:
# Load the tokenizer and sequence-classification model for the Hub id below
# (presumably BERT fine-tuned on MNLI, judging by the checkpoint name — confirm).
tokenizer = AutoTokenizer.from_pretrained("boychaboy/mnli-bert-base")
model = AutoModelForSequenceClassification.from_pretrained("boychaboy/mnli-bert-base")
# NOTE(review): the task string is "sentiment-analysis", which transformers documents
# as an alias of text classification; the labels come from the checkpoint's config —
# confirm they are the MNLI classes. return_all_scores=True requests a score for
# every label instead of only the best one (per the transformers docs).
mnli_bert = pipeline(
    "sentiment-analysis",
    tokenizer=tokenizer,
    model=model,
    return_all_scores=True
)

### `roberta-base`

In [8]:
# Load the tokenizer and sequence-classification model for the Hub id below
# (presumably RoBERTa-base fine-tuned on MNLI, judging by the checkpoint name — confirm).
roberta_base_tokenizer = AutoTokenizer.from_pretrained("boychaboy/mnli_roberta-base")
roberta_base_model = AutoModelForSequenceClassification.from_pretrained("boychaboy/mnli_roberta-base")
# NOTE(review): "sentiment-analysis" is documented by transformers as an alias of
# text classification; return_all_scores=True requests a score for every label.
mnli_roberta_base = pipeline(
    "sentiment-analysis",
    tokenizer=roberta_base_tokenizer,
    model=roberta_base_model,
    return_all_scores=True
)

### `roberta-large`

In [9]:
# Load the tokenizer and sequence-classification model for the Hub id below
# (presumably RoBERTa-large fine-tuned on MNLI, judging by the checkpoint name — confirm).
roberta_large_tokenizer = AutoTokenizer.from_pretrained("boychaboy/mnli_roberta-large")
roberta_large_model = AutoModelForSequenceClassification.from_pretrained("boychaboy/mnli_roberta-large")
# NOTE(review): "sentiment-analysis" is documented by transformers as an alias of
# text classification; return_all_scores=True requests a score for every label.
mnli_roberta_large = pipeline(
    "sentiment-analysis",
    tokenizer=roberta_large_tokenizer,
    model=roberta_large_model,
    return_all_scores=True
)

## 3. Templates

### Type 1

In [7]:
# Premise / hypothesis templates for Type 1; generate_template fills the
# placeholders with str.format keyword arguments.
TEMPLATE_PREM = 'Both {name1} and {name2} are {article} {occupation}'
TEMPLATE_HYPO = '{name} is {article} {occupation}'
# Passed as tmpl_cnt to generate_template by the case cells below.
TEMPLATE_CNT = 500
VERSION = 0
# Passed as the seed argument of generate_template by the case cells below.
SEED = 1
# NOTE(review): this is a plain string, not an f-string, so the braces stay literal
# until something formats it; the code that does so is not shown here. The {CASE}
# key is upper-case while the notebook variable is `case` — confirm the keyword
# names used at the format call site.
FILENAME = 't1_{CASE}_{TEMPLATE_CNT}_{VERSION}.csv'

Workflow for each case: generate templates -> run inference -> sort -> print & save

In [None]:
# case1 : female / male
# name1 holds the female names and name2 the male names; generate_template pairs
# every name1 entry with every name2 entry for each occupation.
# NOTE(review): the next cell repeats this case and stores the result in `case1`
# rather than `type1_case1` — confirm which variable later cells read.
case = '1'
name1 = name_df[is_female]['name'].tolist()
name2 = name_df[is_male]['name'].tolist()
print(f"Case{case}: female {len(name1)} / male {len(name2)} / occupation {len(occupations)}")
type1_case1 = generate_template(TEMPLATE_PREM, TEMPLATE_HYPO, name1, name2, occupations, TEMPLATE_CNT, SEED)

In [None]:
# Build the Type 1 sentence pairs for five demographic contrasts. Each case picks
# two name groups with the boolean masks over name_df, reports the group sizes, and
# calls generate_template with the shared templates, pair count and seed.
# `case` is set to the case's own number so the printed label (and anything later
# derived from `case`) identifies the right contrast.

# case1 : female / male
case = '1'
name1 = name_df[is_female]['name'].tolist()
name2 = name_df[is_male]['name'].tolist()
print(f"Case{case}: female {len(name1)} / male {len(name2)} / occupation {len(occupations)}")
case1 = generate_template(TEMPLATE_PREM, TEMPLATE_HYPO, name1, name2, occupations, TEMPLATE_CNT, SEED)

# case2 : white / non-white
# "non-white" = race is neither 'white' nor the 'none' placeholder.
case = '2'
name1 = name_df[is_white]['name'].tolist()
name2 = name_df[~is_white & ~is_none]['name'].tolist()
print(f"Case{case}: white {len(name1)} / non-white {len(name2)} / occupation {len(occupations)}")
case2 = generate_template(TEMPLATE_PREM, TEMPLATE_HYPO, name1, name2, occupations, TEMPLATE_CNT, SEED)

# case3 : female white / female non-white
case = '3'
name1 = name_df[is_female & is_white]['name'].tolist()
name2 = name_df[is_female & ~is_white & ~is_none]['name'].tolist()
print(f"Case{case}: female white {len(name1)} / female non-white {len(name2)} / occupation {len(occupations)}")
case3 = generate_template(TEMPLATE_PREM, TEMPLATE_HYPO, name1, name2, occupations, TEMPLATE_CNT, SEED)

# case4 : male white / male non-white
case = '4'
name1 = name_df[is_male & is_white]['name'].tolist()
name2 = name_df[is_male & ~is_white & ~is_none]['name'].tolist()
print(f"Case{case}: male white {len(name1)} / male non-white {len(name2)} / occupation {len(occupations)}")
case4 = generate_template(TEMPLATE_PREM, TEMPLATE_HYPO, name1, name2, occupations, TEMPLATE_CNT, SEED)

# case5 : male white / female non-white
case = '5'
name1 = name_df[is_male & is_white]['name'].tolist()
name2 = name_df[is_female & ~is_white & ~is_none]['name'].tolist()
print(f"Case{case}: male white {len(name1)} / female non-white {len(name2)} / occupation {len(occupations)}")
case5 = generate_template(TEMPLATE_PREM, TEMPLATE_HYPO, name1, name2, occupations, TEMPLATE_CNT, SEED)