# 0. Imports

In [2]:
import pandas as pd
import torch as t
from feature_steering import FeatureSteeringModule
# Use the GPU when PyTorch reports that CUDA is available; otherwise use the CPU.
if t.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# 1. Individual Feature

In [3]:
# Steering target: hook point, feature index, steering coefficient and prompt
layer = '2.hook_resid_post'
feature_id = 1995
coeff = 30
prompt = "The doctor runs because"

# Sampling settings passed through to the generation call
sampling_kwargs = {
    "temperature": 1,
    "top_p": 0.3,
    "freq_penalty": 1.0,
    "seed": 12,
    "max_new_tokens": 50,
}

# Build the steering module for the model on the selected device
feature_steering = FeatureSteeringModule("EleutherAI/pythia-70m-deduped", device=DEVICE)

# Generate continuations of the prompt for the chosen layer/feature
# (table=False; presumably prints the generations instead of returning a table -- confirm)
feature_steering.run_with_layer_and_feature(
    prompt,
    layer,
    feature_id,
    coeff=coeff,
    sampling_kwargs=sampling_kwargs,
    table=False,
)

Loaded pretrained model EleutherAI/pythia-70m-deduped into HookedTransformer
Moving model to device:  cpu


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
100%|██████████| 50/50 [00:01<00:00, 31.46it/s]


Generation with steering:
The doctor runs because she was her mother. She had her mother's maiden name, and she was not a woman. She didn't have any of the names of her sisters, but she did know that the name of her sister was written to her by someone else.

--------------------------------------------------------------------------------

The doctor runs because she's her mother, she's her mother. She's her mother, she's the daughter of the Lord. She lives in a home with a young woman who has been with the Lord since they were married and had children.

She is

--------------------------------------------------------------------------------

The doctor runs because she was not herself. She is a mother who is in her early twenties and she has been with her husband for years. She has had some problems with her life, but she never does it again.

She will always be happy to


100%|██████████| 50/50 [00:01<00:00, 32.80it/s]


Generation without steering:
The doctor runs because he's afraid of you. The job is to be an employee and keep your family safe.

If you're a student, you can't afford to have any other career opportunities, but there are many things that matter. You'll be able

--------------------------------------------------------------------------------

The doctor runs because the computer is a laptop, but it's not.

I have to get the computer out of my system. I'm trying to figure out how to do it. The question is, how do I get the information back from my computer?

--------------------------------------------------------------------------------

The doctor runs because he is not the one who is responsible for the rest of his life.

In fact, there are many other factors that can influence how people behave in their own lives. For example, when you walk around a park and look at an empty





# 2. Feature Set

## 2.0 Gender Features Conversion

In [4]:
# (layer, index) pairs of the features in the gender set. Indices are kept as
# strings; each pair is expanded below into a dict that also carries the model id.
_GENDER_FEATURE_LOCATIONS = [
    ('3-res-sm', '31453'),
    ('2-res-sm', '29295'),
    ('0-att-sm', '19062'),
    ('3-res-sm', '19558'),
    ('3-res-sm', '27334'),
    ('4-res-sm', '12420'),
    ('4-res-sm', '30220'),
    ('5-res-sm', '26074'),
    ('5-res-sm', '10643'),
    ('5-res-sm', '31975'),
    ('5-res-sm', '14088'),
    ('2-res-sm', '1995'),
    ('2-att-sm', '27472'),
    ('3-att-sm', '2959'),
    ('3-att-sm', '19128'),
    ('3-att-sm', '4659'),
    ('4-att-sm', '31101'),
    ('4-att-sm', '22821'),
    ('5-att-sm', '22001'),
]

# One descriptor per feature: {'modelId', 'layer', 'index'}, in the order listed above.
gender_features = [
    {'modelId': 'pythia-70m-deduped', 'layer': layer_name, 'index': feature_index}
    for layer_name, feature_index in _GENDER_FEATURE_LOCATIONS
]

In [5]:
# Convert each feature descriptor into a (hook name, feature index) pair.
# The descriptor's 'layer' string has the form '<layer number>-<layer type>-...'
# (e.g. '3-res-sm'); the layer type selects the hook suffix below.
hook_type_by_layer_type = {
    'res': 'hook_resid_post',
    'att': 'hook_attn_out',
    'mlp': 'hook_mlp_out',
}

transformed_features = []
for feature in gender_features:
    layer_parts = feature['layer'].split('-')
    if len(layer_parts) < 2 or layer_parts[1] not in hook_type_by_layer_type:
        # Fail loudly: the previous if/elif chain silently reused the hook type of
        # the preceding feature (or raised NameError) for an unknown layer type.
        raise ValueError(f"Unrecognised layer specification: {feature['layer']!r}")
    layer_number, layer_type = layer_parts[0], layer_parts[1]
    hook_type = hook_type_by_layer_type[layer_type]
    # The index is kept as a string here; later cells convert it with int().
    transformed_features.append((f"{layer_number}.{hook_type}", feature['index']))

transformed_features


[('3.hook_resid_post', '31453'),
 ('2.hook_resid_post', '29295'),
 ('0.hook_attn_out', '19062'),
 ('3.hook_resid_post', '19558'),
 ('3.hook_resid_post', '27334'),
 ('4.hook_resid_post', '12420'),
 ('4.hook_resid_post', '30220'),
 ('5.hook_resid_post', '26074'),
 ('5.hook_resid_post', '10643'),
 ('5.hook_resid_post', '31975'),
 ('5.hook_resid_post', '14088'),
 ('2.hook_resid_post', '1995'),
 ('2.hook_attn_out', '27472'),
 ('3.hook_attn_out', '2959'),
 ('3.hook_attn_out', '19128'),
 ('3.hook_attn_out', '4659'),
 ('4.hook_attn_out', '31101'),
 ('4.hook_attn_out', '22821'),
 ('5.hook_attn_out', '22001')]

## 2.1 Male Prompt

In [8]:
# Sweep every (layer, feature) pair over a range of steering coefficients for the
# "doctor" prompt and collect the per-run results into a single DataFrame `df`.
prompt = "The doctor runs because"
# max_new_tokens=1: one new token is requested per run
sampling_kwargs = dict(temperature=1, top_p=0.3, freq_penalty=1.0, seed=12, max_new_tokens=1)
# Steering coefficients to try for each feature
coefficients = [1, 2, 3, 4, 5, 10, 20, 30, 60, 100]

# NOTE(review): the device is hard-coded to 'cpu' here while the earlier cell uses DEVICE -- confirm this is intended
feature_steering = FeatureSteeringModule("EleutherAI/pythia-70m-deduped", device='cpu')

# Collect one frame per run and concatenate once at the end; concatenating inside
# the loop re-copies all accumulated rows on every iteration.
result_frames = []
for layer, feature_id in transformed_features:
    for c in coefficients:
        result = feature_steering.run_with_layer_and_feature(prompt, layer, int(feature_id), coeff=c, sampling_kwargs=sampling_kwargs)
        result_frames.append(pd.DataFrame(result))

# Stays None when nothing was run, as before
df = pd.concat(result_frames, axis=0) if result_frames else None

Loaded pretrained model EleutherAI/pythia-70m-deduped into HookedTransformer
Moving model to device:  cpu


100%|██████████| 1/1 [00:00<00:00, 24.68it/s]
100%|██████████| 1/1 [00:00<00:00, 31.38it/s]
100%|██████████| 1/1 [00:00<00:00, 32.34it/s]
100%|██████████| 1/1 [00:00<00:00, 31.21it/s]
100%|██████████| 1/1 [00:00<00:00, 33.12it/s]
100%|██████████| 1/1 [00:00<00:00, 30.47it/s]
100%|██████████| 1/1 [00:00<00:00, 30.96it/s]
100%|██████████| 1/1 [00:00<00:00, 33.71it/s]
100%|██████████| 1/1 [00:00<00:00, 32.45it/s]
100%|██████████| 1/1 [00:00<00:00, 32.76it/s]
100%|██████████| 1/1 [00:00<00:00, 28.73it/s]
100%|██████████| 1/1 [00:00<00:00, 32.34it/s]
100%|██████████| 1/1 [00:00<00:00, 30.47it/s]
100%|██████████| 1/1 [00:00<00:00, 31.19it/s]
100%|██████████| 1/1 [00:00<00:00, 32.99it/s]
100%|██████████| 1/1 [00:00<00:00, 29.80it/s]
100%|██████████| 1/1 [00:00<00:00, 31.55it/s]
100%|██████████| 1/1 [00:00<00:00, 31.33it/s]
100%|██████████| 1/1 [00:00<00:00, 32.41it/s]
100%|██████████| 1/1 [00:00<00:00, 32.01it/s]
100%|██████████| 1/1 [00:00<00:00, 27.10it/s]
100%|██████████| 1/1 [00:00<00:00,

In [9]:
# For each feature, find the smallest steering coefficient at which the prediction
# switches from ' he' (without steering) to ' she' (with steering).
# NOTE(review): rows are grouped by 'feature_id' alone, so the same index appearing
# in two different layers would be merged -- confirm the indices are unique.

# Coefficient recorded for features that never switch. It lies outside the tested
# sweep, so it cannot be mistaken for a genuine switch at coefficient 100, and it
# matches the placeholder used in the she -> he analysis.
no_switch_coefficient = 1000

result_list = []

# Iterate over unique feature_id groups
for feature_id, feature_group in df.groupby('feature_id'):
    # Flag indicating whether a switch was found for the current feature_id
    switch_found = False

    # groupby yields the coefficient groups in ascending key order, so the first
    # coefficient with a switch is the smallest one
    for coefficient, group in feature_group.groupby('coefficient'):
        # Rows that predicted ' he' without steering and ' she' with steering
        switched_rows = group[(group['without_steering'] == ' he') & (group['with_steering'] == ' she')]
        if not switched_rows.empty:
            first_switch = switched_rows.iloc[0]
            result_list.append({'feature_id': feature_id, 'coefficient': coefficient, 'without_steering': first_switch['without_steering'], 'with_steering': first_switch['with_steering']})
            switch_found = True
            break  # Move to the next feature_id after finding the first switch

    # `coefficient` still holds the last (largest) coefficient of this feature, so
    # the placeholder row is only added when the sweep reached 100 without a switch
    if not switch_found and coefficient == 100:
        result_list.append({'feature_id': feature_id, 'coefficient': no_switch_coefficient, 'without_steering': ' he', 'with_steering': ' he'})


# Convert the result list to a DataFrame
result_df = pd.DataFrame(result_list)

# Display the resulting dataframe
print(result_df)


    feature_id  coefficient without_steering with_steering
0         1995            2               he           she
1         2959            4               he           she
2         4659         1000               he            he
3        10643         1000               he            he
4        12420           20               he           she
5        14088         1000               he            he
6        19062         1000               he            he
7        19128         1000               he            he
8        19558            3               he           she
9        22001         1000               he            he
10       22821         1000               he            he
11       26074         1000               he            he
12       27334         1000               he            he
13       27472            2               he           she
14       29295            2               he           she
15       30220         1000               he            

In [10]:
print('SHE WITHOUT', len(df[df['without_steering']== ' she']))
print('SHE WITH', len(df[df['with_steering']== ' she']))

SHE WITHOUT 0
SHE WITH 156


In [11]:
print('HE WITHOUT', len(df[df['without_steering']==' he']))
print('HE WITH', len(df[df['with_steering']==' he']))

HE WITHOUT 380
HE WITH 277


In [12]:
df['without_steering'].unique()

array([' he', ' the'], dtype=object)

In [13]:
df['with_steering'].unique()

array([' he', ' the', ' she', ' they', ' it'], dtype=object)

## 2.2 Female Prompt (Female to Male)

In [14]:
# Sweep every (layer, feature) pair over a range of steering coefficients for the
# "nurse" prompt and collect the per-run results into a single DataFrame `df_f`.
prompt = "The nurse runs because"
# max_new_tokens=1: one new token is requested per run
sampling_kwargs = dict(temperature=1, top_p=0.3, freq_penalty=1.0, seed=12, max_new_tokens=1)
# Steering coefficients to try for each feature
coefficients = [1, 2, 3, 4, 5, 10, 20, 30, 60, 100]

# NOTE(review): the device is hard-coded to 'cpu' here while the earlier cell uses DEVICE -- confirm this is intended
feature_steering = FeatureSteeringModule("EleutherAI/pythia-70m-deduped", device='cpu')

# Collect one frame per run and concatenate once at the end; concatenating inside
# the loop re-copies all accumulated rows on every iteration.
result_frames_f = []
for layer, feature_id in transformed_features:
    for c in coefficients:
        result = feature_steering.run_with_layer_and_feature(prompt, layer, int(feature_id), coeff=c, sampling_kwargs=sampling_kwargs)
        result_frames_f.append(pd.DataFrame(result))

# Stays None when nothing was run, as before
df_f = pd.concat(result_frames_f, axis=0) if result_frames_f else None

Loaded pretrained model EleutherAI/pythia-70m-deduped into HookedTransformer
Moving model to device:  cpu


100%|██████████| 1/1 [00:00<00:00, 24.85it/s]
100%|██████████| 1/1 [00:00<00:00, 30.69it/s]
100%|██████████| 1/1 [00:00<00:00, 31.39it/s]
100%|██████████| 1/1 [00:00<00:00, 29.82it/s]
100%|██████████| 1/1 [00:00<00:00, 30.27it/s]
100%|██████████| 1/1 [00:00<00:00, 30.17it/s]
100%|██████████| 1/1 [00:00<00:00, 32.09it/s]
100%|██████████| 1/1 [00:00<00:00, 30.53it/s]
100%|██████████| 1/1 [00:00<00:00, 34.23it/s]
100%|██████████| 1/1 [00:00<00:00, 33.84it/s]
100%|██████████| 1/1 [00:00<00:00, 35.29it/s]
100%|██████████| 1/1 [00:00<00:00, 34.76it/s]
100%|██████████| 1/1 [00:00<00:00, 34.96it/s]
100%|██████████| 1/1 [00:00<00:00, 34.26it/s]
100%|██████████| 1/1 [00:00<00:00, 35.28it/s]
100%|██████████| 1/1 [00:00<00:00, 14.19it/s]
100%|██████████| 1/1 [00:00<00:00, 31.06it/s]
100%|██████████| 1/1 [00:00<00:00, 31.37it/s]
100%|██████████| 1/1 [00:00<00:00, 32.56it/s]
100%|██████████| 1/1 [00:00<00:00, 31.95it/s]
100%|██████████| 1/1 [00:00<00:00, 30.89it/s]
100%|██████████| 1/1 [00:00<00:00,

In [15]:
# For each feature, record the lowest steering coefficient at which the prediction
# changes from ' she' (without steering) to ' he' (with steering). A feature with
# no such coefficient gets a placeholder row with coefficient 1000, provided the
# last coefficient examined for it was 100.
result_list_f = []

for feature_id, feature_group in df_f.groupby('feature_id'):
    # Walk the coefficient groups in groupby order and stop at the first one
    # that contains a ' she' -> ' he' row
    for coefficient, group in feature_group.groupby('coefficient'):
        is_switch = (group['without_steering'] == ' she') & (group['with_steering'] == ' he')
        if is_switch.any():
            first_switch = group[is_switch].iloc[0]
            result_list_f.append({
                'feature_id': feature_id,
                'coefficient': coefficient,
                'without_steering': first_switch['without_steering'],
                'with_steering': first_switch['with_steering'],
            })
            break
    else:
        # No coefficient produced a switch; `coefficient` is the last one examined
        if coefficient == 100:
            result_list_f.append({
                'feature_id': feature_id,
                'coefficient': 1000,
                'without_steering': ' she',
                'with_steering': ' she',
            })


# Turn the collected rows into a DataFrame
result_df_f = pd.DataFrame(result_list_f)

# Show the summary table
print(result_df_f)


    feature_id  coefficient without_steering with_steering
0         1995         1000              she           she
1         2959         1000              she           she
2         4659           20              she            he
3        10643         1000              she           she
4        12420         1000              she           she
5        14088         1000              she           she
6        19062         1000              she           she
7        19128           10              she            he
8        19558         1000              she           she
9        22001         1000              she           she
10       22821         1000              she           she
11       26074         1000              she           she
12       27334           10              she            he
13       27472         1000              she           she
14       29295         1000              she           she
15       30220         1000              she           s