# 1. Imports

In [1]:
from nnsight import LanguageModel
import torch as t

DEVICE = t.device("cuda" if t.cuda.is_available() else "cpu")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load model
model = LanguageModel(
    "EleutherAI/pythia-70m-deduped",
    device_map = DEVICE,
    dispatch = True,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# 2. Template

In [3]:
n_examples = 50
verbs = ['works', 'sits', 'reads', 'rests', 'calls', 'fights', 'walks', 'drinks', 'thinks', 'cooks', 'draws', 'believes', 'speaks', 'kicks', 'answers', 'drives', 'pushes', 'talks', 'understands', 'sings', 'jumps', 'opens', 'dances', 'cleans', 'paints', 'asks', 'hits', 'eats', 'watches', 'writes', 'flies', 'moves', 'cuts', 'knows', 'cries', 'laughs', 'rides', 'runs', 'lifts', 'smells', 'stands', 'turns', 'smiles', 'plays', 'cares', 'argues', 'helps', 'reports', 'observes', 'tracks' ]

## 2.1. Create dataset - Professions

In [4]:
male_prof = ['priest', 'doctor', 'developer', 'director', 'architect', 'programmer', 'guard', 'lawyer', 'soldier', 'coach', 'politician', 'scientist', 'judge', 'analyst', 'technician', 'banker', 'pilot']
female_prof = ['editor', 'model', 'actress', 'assistant', 'writer', 'mediator', 'aide', 'nurse', 'maid', 'secretary', 'teacher', 'nun', 'designer', 'cleaner', 'administrator', 'dancer', 'tailor']

In [5]:
print(len(male_prof), len(female_prof))

17 17


In [6]:
# Generate 50 prompts
male_prof_prefixes = []
female_prof_prefixes = []

for i in range(n_examples):
    m_profession = male_prof[i%len(male_prof)]
    f_profession = female_prof[i%len(female_prof)]
    verb = verbs[i]
    m_prompt = f"The {m_profession} {verb} because"
    f_prompt = f"The {f_profession} {verb} because"
    male_prof_prefixes.append(m_prompt)
    female_prof_prefixes.append(f_prompt)

In [7]:
male_answers = [' he'] * n_examples
female_answers = [' she'] * n_examples

### 2.1.1 Stereo Dataset

In [8]:
# Stereotypical prompts
with open('data/prof_stereo.json', 'w') as file:
    for i in range(len(male_prof_prefixes)):
        # Write all values in one line in JSON format
        line = f'{{"clean_prefix": "{male_prof_prefixes[i]}", "patch_prefix": "{female_prof_prefixes[i]}", "clean_answer": "{male_answers[i]}", "patch_answer": "{female_answers[i]}"}}\n'
        file.write(line)

print(f"File created with {len(male_prof_prefixes)} entries.")


File created with 50 entries.


In [10]:
stereom_prefixes = [model.tokenizer(e, return_tensors="pt", padding=False).input_ids for e in male_prof_prefixes]
stereom_inputs = t.cat(stereom_prefixes, dim=0).to(DEVICE)

stereom_answers = [model.tokenizer(e, return_tensors="pt", padding=False).input_ids for e in male_answers]
stereom_answer_idxs = t.tensor(stereom_answers, device=DEVICE, dtype=t.long)

In [11]:
stereof_prefixes = [model.tokenizer(e, return_tensors="pt", padding=False).input_ids for e in female_prof_prefixes]
stereof_inputs = t.cat(stereof_prefixes, dim=0).to(DEVICE)

stereof_answers = [model.tokenizer(e, return_tensors="pt", padding=False).input_ids for e in female_answers]
stereof_answer_idxs = t.tensor(stereof_answers, device=DEVICE, dtype=t.long)

### 2.1.2 Antistereo Dataset

In [12]:
# Stereotypical prompts
with open('data/prof_anti.json', 'w') as file:
    for i in range(len(male_prof_prefixes)):
        # Write all values in one line in JSON format
        line = f'{{"clean_prefix": "{female_prof_prefixes[i]}", "patch_prefix": "{male_prof_prefixes[i]}", "clean_answer": "{male_answers[i]}", "patch_answer": "{female_answers[i]}"}}\n'
        file.write(line)

print(f"File created with {len(male_prof_prefixes)} entries.")


File created with 50 entries.


## 2.2. Dataset Names

In [13]:
male_names = [
    "John", "Mark", "Paul", "Luke", "Tom", "Nick", "Dan", "Max", "Sam",
    "Bill", "Mike", "Jack", "Carl", "Joe", "Dave", "Matt", "Ben",
    "Will", "Ron", "Tim", "Bob", "Alex", "Adam", "Chris", "Jim"
]
female_names = [
    'Elizabeth', 'Mary', 'Anna', 'Sarah', 'Rose', 'Marie', 'Anne', 'Ann', 'Jane', 
    'Maria', 'Ruby', 'Joy', 'Carol', 'June', 'Jean', 'Hope', 'April', 'Kelly', 
    'Ada', 'Kate', 'Whitney', 'Taylor', 'Lou', "Summer", "Lou"
]

In [14]:
print(len(male_names), len(female_names))

25 25


In [15]:
# Generate 50 prompts
male_prefixes = []
female_prefixes = []

for i in range(n_examples):
    m_name = male_names[i%len(male_names)]
    f_name = female_names[i%len(female_names)]
    verb = verbs[i]
    m_prompt = f"{m_name} {verb} because"
    f_prompt = f"{f_name} {verb} because"
    male_prefixes.append(m_prompt)
    female_prefixes.append(f_prompt)

In [16]:
male_answers = [' he'] * n_examples
female_answers = [' she'] * n_examples

In [17]:
stereof_prefixes = [model.tokenizer(e, return_tensors="pt", padding=False).input_ids for e in female_prefixes]
stereof_inputs = t.cat(stereof_prefixes, dim=0).to(DEVICE)

stereof_answers = [model.tokenizer(e, return_tensors="pt", padding=False).input_ids for e in female_answers]
stereof_answer_idxs = t.tensor(stereof_answers, device=DEVICE, dtype=t.long)

In [18]:
print(female_prefixes[14])
print(stereof_inputs[14])

Jean answers because
tensor([36751,  9172,   984], device='cuda:0')


In [19]:
stereom_prefixes = [model.tokenizer(e, return_tensors="pt", padding=False).input_ids for e in male_prefixes]
stereom_inputs = t.cat(stereom_prefixes, dim=0).to(DEVICE)

stereom_answers = [model.tokenizer(e, return_tensors="pt", padding=False).input_ids for e in male_answers]
stereom_answer_idxs = t.tensor(stereom_answers, device=DEVICE, dtype=t.long)

In [20]:
print(male_prefixes[43])
print(stereom_inputs[42])

Ron plays because
tensor([12978, 28582,   984], device='cuda:0')


In [21]:
# Export examples
with open('data/names.json', 'w') as file:
    for i in range(len(male_prefixes)):
        # Write all values in one line in JSON format
        line = f'{{"clean_prefix": "{male_prefixes[i]}", "patch_prefix": "{female_prefixes[i]}", "clean_answer": "{male_answers[i]}", "patch_answer": "{female_answers[i]}"}}\n'
        file.write(line)

print(f"File created with {len(male_prefixes)} entries.")


File created with 50 entries.


## 2.3 Baseline Names

In [27]:
male_answers_b = male_names * int(n_examples/len(male_names))
female_answers_b =  female_names * int(n_examples/len(female_names))

In [28]:
# Export examples
with open('data/baseline_names.json', 'w') as file:
    for i in range(len(male_prefixes)):
        # Write all values in one line in JSON format
        line = f'{{"clean_prefix": "{male_prefixes[i]}", "patch_prefix": "{female_prefixes[i]}", "clean_answer": "{male_answers_b[i]}", "patch_answer": "{female_answers_b[i]}"}}\n'
        file.write(line)

print(f"File created with {len(male_prefixes)} entries.")


File created with 50 entries.
