In [1]:
import pandas as pd
import numpy as np
import scipy.optimize as opt
from scipy.special import erf, erfinv

In [2]:
df = pd.read_csv('/data/kebl6672/dpo-toxic-general/toxicity/gpt2_all_neuron_metrics.csv')

In [3]:
df.head()

Unnamed: 0,layer_idx,neuron_idx,pt_projection_value,dpo_projection_value,pt_activation_value,dpo_activation_value,pt_cossim,dpo_cossim,gpt2_value_vector_projs,dpo_value_vector_projs,projection_diff,activation_diff
0,0,0,-0.004434,-0.004821,-0.086628,-0.094152,0.020808,0.020818,0.051181,0.051206,0.0003874512,0.007524
1,0,1,0.000381,0.000381,-0.010077,-0.010096,-0.015356,-0.015325,-0.037849,-0.037771,7.888138e-08,1.9e-05
2,0,2,0.008536,0.008656,-0.065236,-0.066241,-0.044885,-0.044824,-0.130852,-0.130673,-0.0001195893,0.001005
3,0,3,0.00426,0.004129,-0.081713,-0.079245,-0.015817,-0.015809,-0.052139,-0.052108,0.0001311275,-0.002468
4,0,4,3e-06,-3e-06,-0.073461,-0.079985,-1.8e-05,1.5e-05,-4.6e-05,3.8e-05,6.465025e-06,0.006524


In [4]:
df = df[['layer_idx', 'neuron_idx', 'pt_activation_value']]
df_2 = pd.read_csv('/data/kebl6672/dpo-toxic-general/toxicity/gpt2_embed_neuron_cossims.csv')
df = pd.merge(df, df_2, on=['layer_idx', 'neuron_idx'], how='inner')
print(df.head())

   layer_idx  neuron_idx  pt_activation_value  pt_cossim
0          0           0            -0.086628   0.040009
1          0           1            -0.010077  -0.039795
2          0           2            -0.065236  -0.010002
3          0           3            -0.081713  -0.019028
4          0           4            -0.073461  -0.009697


##### Extract all neuron indexes from each neuron group

In [13]:
tuples_list = [
    (int(row['layer_idx']), int(row['neuron_idx']), row['dpo_activation_value'])
    for _, row in df.iterrows()
    if (row['pt_cossim'] < 0 and row['pt_activation_value'] < 0 and row['projection_diff'] > 0) or # AN
    (row['pt_cossim'] > 0 and row['pt_activation_value'] > 0 and row['projection_diff'] > 0) or # TP
    (row['pt_cossim'] > 0 and row['pt_activation_value'] < 0 and row['projection_diff'] > 0) # TN
]

print(len(tuples_list))

55663


In [15]:
tuples_list = [
    (int(row['layer_idx']), int(row['neuron_idx']), row['dpo_activation_value'])
    for _, row in df.iterrows()
    if (row['projection_diff'] < 0) # all uparrow ones
]

print(len(tuples_list))

40801


In [16]:
df_subset = pd.DataFrame(tuples_list, columns=["layer_idx", "neuron_idx", "assigned_value"])
df_subset.to_csv("gpt2_all_uparrow_neuron_configs.csv", index=False)

In [40]:
tuples_list = [
    (int(row['layer_idx']), int(row['neuron_idx']), row['dpo_activation_value'])
    for _, row in df.iterrows()
    if (row['pt_cossim'] < 0 and row['pt_activation_value'] > 0 and row['projection_diff'] > 0) or # AP
       (row['pt_cossim'] < 0 and row['pt_activation_value'] < 0 and row['projection_diff'] > 0) or # AN
       (row['pt_cossim'] > 0 and row['pt_activation_value'] < 0 and row['projection_diff'] > 0) or # TN
       (row['pt_cossim'] > 0 and row['pt_activation_value'] > 0 and row['projection_diff'] > 0) # TP
]

print(len(tuples_list))


57501


In [41]:
df_subset = pd.DataFrame(tuples_list, columns=["layer_idx", "neuron_idx", "assigned_value"])
df_subset.to_csv("gpt2_all_four_neuron_configs.csv", index=False)

##### Activation hacking on the existing groups

In [18]:
# Halve all four groups 
tuples_list = [
    (int(row['layer_idx']), int(row['neuron_idx']), row['pt_activation_value']/2)
    for _, row in df.iterrows()
    if (row['pt_cossim'] < 0 and row['pt_activation_value'] > 0 and row['projection_diff'] > 0) or # AP
       (row['pt_cossim'] < 0 and row['pt_activation_value'] < 0 and row['projection_diff'] > 0) or # AN
       (row['pt_cossim'] > 0 and row['pt_activation_value'] < 0 and row['projection_diff'] > 0) or # TN
       (row['pt_cossim'] > 0 and row['pt_activation_value'] > 0 and row['projection_diff'] > 0) # TP
]

print(len(tuples_list))


57501


In [43]:
# halve TP 
tuples_list = [
    (int(row['layer_idx']), int(row['neuron_idx']), row['pt_activation_value']/2)
    for _, row in df.iterrows()
    if (row['pt_cossim'] > 0 and row['pt_activation_value'] > 0 and row['projection_diff'] > 0) # TP  
]
print(len(tuples_list))

3967


In [45]:
# halve AN
tuples_list = [
    (int(row['layer_idx']), int(row['neuron_idx']), row['pt_activation_value']/2)
    for _, row in df.iterrows()
    if (row['pt_cossim'] < 0 and row['pt_activation_value'] < 0 and row['projection_diff'] > 0) # AN
]
print(len(tuples_list))

29252


In [44]:
df_subset = pd.DataFrame(tuples_list, columns=["layer_idx", "neuron_idx", "assigned_value"])
df_subset.to_csv("gpt2_halve_two_neuron_configs.csv", index=False)

In [3]:
# halve TP + AN
tuples_list = [
    (int(row['layer_idx']), int(row['neuron_idx']), row['pt_activation_value']/2)
    for _, row in df.iterrows()
    if (row['pt_cossim'] < 0 and row['pt_activation_value'] < 0 and row['projection_diff'] > 0) or # AN
       (row['pt_cossim'] > 0 and row['pt_activation_value'] > 0 and row['projection_diff'] > 0) # TP  
]
print(len(tuples_list))

33219


In [4]:
# 1.5* TN
tuples_list = [
    (int(row['layer_idx']), int(row['neuron_idx']), row['pt_activation_value']*1.5)
    for _, row in df.iterrows()
    if (row['pt_cossim'] > 0 and row['pt_activation_value'] < 0 and row['projection_diff'] > 0) # TN  
]
print(len(tuples_list))

22444


In [6]:
# 1.5* AP
tuples_list = [
    (int(row['layer_idx']), int(row['neuron_idx']), row['pt_activation_value']*1.5)
    for _, row in df.iterrows()
    if (row['pt_cossim'] < 0 and row['pt_activation_value'] > 0 and row['projection_diff'] > 0) # AP
]
print(len(tuples_list))

1838


In [8]:
# 1.5* (AP + TN)
tuples_list = [
    (int(row['layer_idx']), int(row['neuron_idx']), row['pt_activation_value']*1.5)
    for _, row in df.iterrows()
    if (row['pt_cossim'] < 0 and row['pt_activation_value'] > 0 and row['projection_diff'] > 0) or # AP
       (row['pt_cossim'] > 0 and row['pt_activation_value'] < 0 and row['projection_diff'] > 0) # TN  
]
print(len(tuples_list))

24282


In [10]:
# halve for TP and AN, 1.5* for AP and TN
tuples_list = [
    (int(row['layer_idx']), int(row['neuron_idx']), row['pt_activation_value'] / 2)  # Halve activation
    if (row['pt_cossim'] > 0 and row['pt_activation_value'] > 0 and row['projection_diff'] > 0) or  # TP
       (row['pt_cossim'] < 0 and row['pt_activation_value'] < 0 and row['projection_diff'] > 0)    # AN
    else (int(row['layer_idx']), int(row['neuron_idx']), row['pt_activation_value'] * 1.5)  # 1.5* activation
    for _, row in df.iterrows()
    if (row['pt_cossim'] < 0 and row['pt_activation_value'] > 0 and row['projection_diff'] > 0) or  # AP
       (row['pt_cossim'] > 0 and row['pt_activation_value'] < 0 and row['projection_diff'] > 0) or  # TN
       (row['pt_cossim'] > 0 and row['pt_activation_value'] > 0 and row['projection_diff'] > 0) or  # TP
       (row['pt_cossim'] < 0 and row['pt_activation_value'] < 0 and row['projection_diff'] > 0)    # AN
]

print(len(tuples_list))


57501


In [11]:
df_subset = pd.DataFrame(tuples_list, columns=["layer_idx", "neuron_idx", "assigned_value"])
df_subset.to_csv("gpt2_halve_1.5_neuron_configs.csv", index=False)

In [16]:
# all neurons increase projection
tuples_list = [
    (int(row['layer_idx']), int(row['neuron_idx']), row['dpo_activation'])
    for _, row in df.iterrows()
    # if (row['pt_cossim'] < 0 and row['pt_activation'] > 0 and row['activation_diff'] > 0) or # AP-
    #    (row['pt_cossim'] < 0 and row['pt_activation'] < 0 and row['activation_diff'] < 0)  # AN+
    if (row['pt_cossim'] > 0 and row['pt_activation'] < 0 and row['activation_diff'] < 0) or # TN-
       (row['pt_cossim'] > 0 and row['pt_activation'] > 0 and row['activation_diff'] < 0) # TP+    
]

print(len(tuples_list))

25794


##### Tuning-free group identification

In [25]:
# halve TP 
tuples_list = [
    (int(row['layer_idx']), int(row['neuron_idx']), row['pt_activation_value']/2)
    for _, row in df.iterrows()
    if (row['pt_cossim'] > 0 and row['pt_activation_value'] > 0) # TP  %  and row['projection_diff'] > 0
]
print(len(tuples_list))

5770


In [31]:
# halve AN
tuples_list = [
    (int(row['layer_idx']), int(row['neuron_idx']), row['pt_activation_value']/2)
    for _, row in df.iterrows()
    if (row['pt_cossim'] < 0 and row['pt_activation_value'] < 0) # AN
]
print(len(tuples_list))

44040


In [27]:
# halve TP + AN
tuples_list = [
    (int(row['layer_idx']), int(row['neuron_idx']), row['pt_activation_value']/2)
    for _, row in df.iterrows()
    if (row['pt_cossim'] < 0 and row['pt_activation_value'] < 0) or # AN
       (row['pt_cossim'] > 0 and row['pt_activation_value'] > 0) # TP  
]
print(len(tuples_list))

49810


In [12]:
# 1.5* AP 
tuples_list = [
    (int(row['layer_idx']), int(row['neuron_idx']), row['pt_activation_value']*1.5)
    for _, row in df.iterrows()
    if (row['pt_cossim'] < 0 and row['pt_activation_value'] > 0) # AP
]
print(len(tuples_list))

6728


In [17]:
df_subset = pd.DataFrame(tuples_list, columns=["layer_idx", "neuron_idx", "assigned_value"])
df_subset.to_csv("gpt2_1.5_two_free_neuron_configs.csv", index=False)

In [14]:
# 1.5* TN
tuples_list = [
    (int(row['layer_idx']), int(row['neuron_idx']), row['pt_activation_value']*1.5)
    for _, row in df.iterrows()
    if (row['pt_cossim'] > 0 and row['pt_activation_value'] < 0) # TN  
]
print(len(tuples_list))

41764


In [16]:
# 1.5* (AP + TN)
tuples_list = [
    (int(row['layer_idx']), int(row['neuron_idx']), row['pt_activation_value']*1.5)
    for _, row in df.iterrows()
    if (row['pt_cossim'] < 0 and row['pt_activation_value'] > 0) or # AP
       (row['pt_cossim'] > 0 and row['pt_activation_value'] < 0) # TN  
]
print(len(tuples_list))

48492


In [18]:
# halve for TP and AN, 1.5* for AP and TN
tuples_list = [
    (int(row['layer_idx']), int(row['neuron_idx']), row['pt_activation_value'] / 2)  # Halve activation
    if (row['pt_cossim'] > 0 and row['pt_activation_value'] > 0) or  # TP
       (row['pt_cossim'] < 0 and row['pt_activation_value'] < 0)    # AN
    else (int(row['layer_idx']), int(row['neuron_idx']), row['pt_activation_value'] * 1.5)  # 1.5*activation
    for _, row in df.iterrows()
    if (row['pt_cossim'] < 0 and row['pt_activation_value'] > 0) or  # AP
       (row['pt_cossim'] > 0 and row['pt_activation_value'] < 0) or  # TN
       (row['pt_cossim'] > 0 and row['pt_activation_value'] > 0) or  # TP
       (row['pt_cossim'] < 0 and row['pt_activation_value'] < 0)    # AN
]

print(len(tuples_list))


98302


In [19]:
df_subset = pd.DataFrame(tuples_list, columns=["layer_idx", "neuron_idx", "assigned_value"])
df_subset.to_csv("gpt2_halve_1.5_free_neuron_configs.csv", index=False)

#### Select 128/256 toxic-aligned neurons

In [6]:
def select_fraction(df, num_of_neurons, top=True):
    """
    Selects a fraction of the rows from df based on descending cossim.
    """
    sorted_group = df.sort_values(by='pt_cossim', ascending=not top) # descending
    selected = sorted_group.head(num_of_neurons)
    return selected

In [7]:
toxic_aligned = select_fraction(df, 256)
df_subset = pd.DataFrame(toxic_aligned, columns=["layer_idx", "neuron_idx", "dpo_activation_value"])
df_subset.to_csv("gpt2_256_patch.csv", index=False)

In [5]:
df_subset.head()

Unnamed: 0,layer_idx,neuron_idx,dpo_activation_value
78594,19,770,-0.008374
49923,12,771,0.025833
76397,18,2669,0.001922
53916,13,668,-0.033308
65791,16,255,-0.000552


#### Tuning-free inspired by DPO 

In [9]:
# def select_fraction(df, condition, fraction=0.5, top=True):
#     """
#     Selects a fraction of the rows from df based on abs(pt_activation_value), either top or bottom fraction.
#     """
#     group = df[condition].copy()
#     group['abs_val'] = group['pt_activation_value'].abs()
#     sorted_group = group.sort_values(by='abs_val', ascending=top) # ascending
#     n = int(len(sorted_group) * fraction)
#     selected = sorted_group.head(n)
#     return selected

In [13]:
def select_fraction(df, condition, fraction=0.5, top=True):
    """
    Selects a fraction of the rows from df based on abs_cossim 
    after removing the top 20% of neurons by pt_activation_value.
    """
    # Remove top 10% neurons with highest pt_activation_value
    threshold = df['pt_activation_value'].quantile(0.8)
    df_removed = df[df['pt_activation_value'] <= threshold]
    
    group = df_removed[condition].copy()

    group['abs_cossim'] = group['pt_cossim'].abs()
    
    sorted_group = group.sort_values(by='abs_cossim', ascending=not top) # descending
    n = int(len(sorted_group) * fraction)
    selected = sorted_group.head(n)
    
    return selected


In [14]:
tp_condition = (df['pt_cossim'] > 0) & (df['pt_activation_value'] > 0)
an_condition = (df['pt_cossim'] < 0) & (df['pt_activation_value'] < 0)
ap_condition = (df['pt_cossim'] < 0) & (df['pt_activation_value'] > 0)
tn_condition = (df['pt_cossim'] > 0) & (df['pt_activation_value'] < 0)

# beta
beta = 0.5

tp_sel = select_fraction(df, tp_condition, fraction=beta, top=True)
an_sel = select_fraction(df, an_condition, fraction=beta, top=True)
ap_sel = select_fraction(df, ap_condition, fraction=beta, top=False)
tn_sel = select_fraction(df, tn_condition, fraction=beta, top=False)

  group = df_removed[condition].copy()
  group = df_removed[condition].copy()
  group = df_removed[condition].copy()
  group = df_removed[condition].copy()


In [15]:
# alpha
alpha = 0.05

intervened = []

for _, row in pd.concat([tp_sel, an_sel]).iterrows():
    intervened.append((int(row['layer_idx']), int(row['neuron_idx']), row['pt_activation_value'] * (1-alpha)))

for _, row in pd.concat([ap_sel, tn_sel]).iterrows():
    intervened.append((int(row['layer_idx']), int(row['neuron_idx']), row['pt_activation_value'] * (1+alpha)))

print(len(intervened))

39321


In [12]:
df_subset = pd.DataFrame(intervened, columns=["layer_idx", "neuron_idx", "assigned_value"])
df_subset.to_csv("gpt2_0.95_1.05_two_0.5_cossim_embed_dpo.csv", index=False)

#### Get the activations for top 128 toxic neurons / 36 positively activated toxic neurons

In [21]:
# Filter the DataFrame for the top 128 rows by cosine_similarity
top_128_rows = df.nlargest(128, 'pt_cossim')

# Get the total number of such rows
total_filtered_rows = top_128_rows.shape[0]

# Print the result
print("Total number of rows with highest cosine similarity and positive GPT-2 activation:", total_filtered_rows)

# Extract the list of tuples
filtered_tuples = list(top_128_rows[['layer_idx', 'neuron_idx', 'dpo_pregelu_activation']].itertuples(index=False, name=None))

# Print the result
print("List of tuples (layer_index, neuron_index, dpo_pregelu_activation):")
print(filtered_tuples)

Total number of rows with highest cosine similarity and positive GPT-2 activation: 128
List of tuples (layer_index, neuron_index, dpo_pregelu_activation):
[(19, 770, -0.016977283051654416), (12, 771, 0.04969537966124989), (18, 2669, 0.0038321316381261807), (13, 668, -0.07058810171775307), (16, 255, -0.0011058073715099835), (12, 882, -0.113394102185773), (19, 1438, 0.15294501710233885), (9, 545, -0.10686375994200416), (8, 2854, -0.05395217817731824), (3, 3680, -0.015262110350101077), (14, 1958, -0.1336680829902643), (7, 1735, -0.11794289513106787), (13, 2258, -0.10931066076252646), (11, 1550, -0.11230842916543096), (3, 704, -0.10765209018216775), (10, 3477, -0.09382688046268217), (13, 1023, -0.09818235291497254), (13, 253, -0.12699634360404508), (10, 2936, -0.15094208805809126), (0, 2352, -0.024648923979238926), (7, 1916, -0.13753848921096778), (3, 3742, -0.038883108764911044), (11, 2844, -0.19994687566480335), (11, 4021, -0.06456283217355124), (11, 175, -0.03322151300842863), (19, 3341

In [15]:
# Filter the DataFrame for the top 60 rows by cosine_similarity
top_60_rows = df.nlargest(60, 'pt_cossim')

# Get the total number of such rows
total_filtered_rows = top_60_rows.shape[0]

# Print the result
print("Total number of rows with highest cosine similarity and positive GPT-2 activation:", total_filtered_rows)

# Extract the list of tuples
filtered_tuples = list(top_60_rows[['layer_idx', 'neuron_idx', 'dpo_pregelu_activation']].itertuples(index=False, name=None))

# Print the result
print("List of tuples (layer_index, neuron_index, dpo_pregelu_activation):")
print(filtered_tuples)

Total number of rows with highest cosine similarity and positive GPT-2 activation: 60
List of tuples (layer_index, neuron_index, dpo_pregelu_activation):
[(19, 770, -0.016977283051654416), (12, 771, 0.04969537966124989), (18, 2669, 0.0038321316381261807), (13, 668, -0.07058810171775307), (16, 255, -0.0011058073715099835), (12, 882, -0.113394102185773), (19, 1438, 0.15294501710233885), (9, 545, -0.10686375994200416), (8, 2854, -0.05395217817731824), (3, 3680, -0.015262110350101077), (14, 1958, -0.1336680829902643), (7, 1735, -0.11794289513106787), (13, 2258, -0.10931066076252646), (11, 1550, -0.11230842916543096), (3, 704, -0.10765209018216775), (10, 3477, -0.09382688046268217), (13, 1023, -0.09818235291497254), (13, 253, -0.12699634360404508), (10, 2936, -0.15094208805809126), (0, 2352, -0.024648923979238926), (7, 1916, -0.13753848921096778), (3, 3742, -0.038883108764911044), (11, 2844, -0.19994687566480335), (11, 4021, -0.06456283217355124), (11, 175, -0.03322151300842863), (19, 3341,

In [17]:
# Filter the DataFrame for the top 128 rows by cosine_similarity
top_128_rows = df.nlargest(128, 'pt_cossim')

# Further filter for rows with positive gpt2_activation
filtered_rows = top_128_rows[top_128_rows['pt_activation'] > 0]

# Get the total number of such rows
total_filtered_rows = filtered_rows.shape[0]

# Print the result
print("Total number of rows with highest cosine similarity and positive GPT-2 activation:", total_filtered_rows)

# Extract the list of tuples
filtered_tuples = list(filtered_rows[['layer_idx', 'neuron_idx', 'dpo_pregelu_activation']].itertuples(index=False, name=None))

# Print the result
print("List of tuples (layer_index, neuron_index, dpo_pregelu_activation):")
print(filtered_tuples)


Total number of rows with highest cosine similarity and positive GPT-2 activation: 36
List of tuples (layer_index, neuron_index, dpo_pregelu_activation):
[(19, 770, -0.016977283051654416), (12, 771, 0.04969537966124989), (18, 2669, 0.0038321316381261807), (13, 668, -0.07058810171775307), (16, 255, -0.0011058073715099835), (12, 882, -0.113394102185773), (19, 1438, 0.15294501710233885), (8, 2854, -0.05395217817731824), (3, 3680, -0.015262110350101077), (14, 1958, -0.1336680829902643), (13, 2258, -0.10931066076252646), (11, 1550, -0.11230842916543096), (10, 3477, -0.09382688046268217), (0, 2352, -0.024648923979238926), (3, 3742, -0.038883108764911044), (11, 4021, -0.06456283217355124), (11, 175, -0.03322151300842863), (19, 3341, -0.0400893763734337), (16, 603, -0.0830782511479893), (11, 2617, -0.08237272546877626), (8, 3200, 0.0812912976275755), (19, 2312, -0.07821941281372657), (20, 3210, 0.04564807678187968), (12, 3413, -0.09751140549980601), (6, 3972, 0.2822371293946999), (0, 3393, 0.1