In [7]:
import os
import sys
import numpy as np
import pandas as pd
from transformers import AutoTokenizer
from dotenv import load_dotenv

# Put the parent of the current working directory at the front of sys.path
# (once), presumably so the local `dialz` package can be imported.
module_path = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if not any(entry == module_path for entry in sys.path):
    sys.path.insert(0, module_path)

from dialz.dataset import Dataset
from dialz.model import ControlModel
from dialz.vector import ControlVector


In [2]:
# Load environment variables via python-dotenv before reading the token.
load_dotenv()

# Model checkpoint used throughout the notebook.
model_name = "mistralai/Mistral-7B-Instruct-v0.1"

# Hugging Face access token; None when HF_TOKEN is not set.
hf_token = os.environ.get("HF_TOKEN")


In [3]:
# Hate Speech Dataset Load
HATE_SPEECH_PARQUET = "hf://datasets/ucberkeley-dlab/measuring-hate-speech/measuring-hate-speech.parquet"
KEPT_COLUMNS = ['text', 'hate_speech_score', 'comment_id', 'annotator_id']

# Read the full parquet file, then keep only the columns used below.
df = pd.read_parquet(HATE_SPEECH_PARQUET)[KEPT_COLUMNS]
df.head()

Unnamed: 0,text,hate_speech_score,comment_id,annotator_id
0,Yes indeed. She sort of reminds me of the elde...,-3.9,47777,10873
1,The trans women reading this tweet right now i...,-6.52,39773,2790
2,Question: These 4 broads who criticize America...,0.36,47101,3379
3,It is about time for all illegals to go back t...,0.26,43625,7365
4,For starters bend over the one in pink and kic...,1.54,12538,488


In [4]:
# Binarise the continuous hate speech score into a 'label' column:
#   score > HATE_MIN_SCORE      -> 1
#   score < NON_HATE_MAX_SCORE  -> 0
# Rows in between (and rows with a missing score) get no label and are dropped.
HATE_MIN_SCORE = 1
NON_HATE_MAX_SCORE = -2

hate_scores = df['hate_speech_score']
has_label = (hate_scores > HATE_MIN_SCORE) | (hate_scores < NON_HATE_MAX_SCORE)

# Explicit copy: `df` was produced by a column selection, so assigning a new
# column straight into it can trigger pandas' chained-assignment warning.
df = df.loc[has_label].copy()
df['label'] = (df['hate_speech_score'] > HATE_MIN_SCORE).astype(int)

# Print the number of rows and the class balance, then show the head
print(f"Number of rows: {len(df)}")
label_counts = df['label'].value_counts()
label_percentages = df['label'].value_counts(normalize=True) * 100

# .get() keeps the report working even if one class ends up empty.
print(f"Number of label 0: {label_counts.get(0, 0)} ({label_percentages.get(0, 0.0):.2f}%)")
print(f"Number of label 1: {label_counts.get(1, 0)} ({label_percentages.get(1, 0.0):.2f}%)")
df.head()


Number of rows: 78427
Number of label 0: 37856 (48.27%)
Number of label 1: 40571 (51.73%)


Unnamed: 0,text,hate_speech_score,comment_id,annotator_id,label
0,Yes indeed. She sort of reminds me of the elde...,-3.9,47777,10873,0
1,The trans women reading this tweet right now i...,-6.52,39773,2790,0
4,For starters bend over the one in pink and kic...,1.54,12538,488,1
5,Sounds like the kinda wholsesome life I'd die ...,-4.93,11001,527,0
7,Fuck off you insufferable retarded faggot.,2.08,17034,6801,1


In [5]:
from tqdm import tqdm

def test_dataset(
        df: pd.DataFrame,
        scoring_method: str,
        model_name: str, 
        items: list, 
        prompt_type: str, 
        num_sents: int,
        system_role: str,
    ):
    """Train a control vector and report how well its activation scores separate the labels.

    A dataset is built with ``Dataset.create_dataset`` from ``model_name``,
    ``items``, ``prompt_type``, ``num_sents`` and ``system_role``; a
    ``ControlModel`` is created and a ``ControlVector`` is trained on it. Every
    text in ``df`` is then scored with ``model.get_activation_score`` and 1000
    evenly spaced thresholds between the minimum and maximum score are tried.
    A row counts as correct when it has label 0 and a score strictly below the
    threshold, or label 1 and a score strictly above it. The best threshold and
    its accuracy are printed.

    Args:
        df: Frame with a ``'text'`` column and a ``'label'`` column holding 0/1.
            It is not modified; any index is accepted.
        scoring_method: Forwarded to ``model.get_activation_score``.
        model_name: Model identifier passed to the dataset builder and to
            ``ControlModel``.
        items: Passed to ``Dataset.create_dataset``.
        prompt_type: Passed to ``Dataset.create_dataset``.
        num_sents: Passed to ``Dataset.create_dataset``.
        system_role: Passed to ``Dataset.create_dataset``.

    Returns:
        A copy of ``df`` with an added ``'activation_score'`` column.

    Raises:
        ValueError: If ``df`` has no rows.

    Note:
        Reads the module-level ``hf_token``.
    """
    # Fail before the expensive model load rather than on a division by zero after it.
    if len(df) == 0:
        raise ValueError("df must contain at least one row to evaluate")

    dataset = Dataset.create_dataset(model_name, items, prompt_type, num_sents, system_role)

    # Second argument: presumably the layer ids to control - TODO confirm against ControlModel.
    model = ControlModel(model_name, list(range(-5, -18, -1)), hf_token)
    vector = ControlVector.train(model, dataset)

    # Work on a copy so repeated calls with the same frame do not overwrite each
    # other's scores through a shared object.
    scored_df = df.copy()
    texts = scored_df['text'].tolist()

    # Assign the scores positionally. The previous `df.at[i, ...]` treated the
    # enumerate counter as an index label, which is only right for a 0..n-1 index.
    scored_df['activation_score'] = [
        model.get_activation_score(text, vector, layer_index=list(range(15, 20, 1)),
                                   scoring_method=scoring_method)
        for text in tqdm(texts, total=len(texts), desc="Calculating activation scores")
    ]

    activation_scores = scored_df['activation_score']
    # Label masks do not depend on the threshold, so build them once.
    is_label_0 = scored_df['label'] == 0
    is_label_1 = scored_df['label'] == 1

    def calculate_score(threshold):
        """Count rows classified correctly at ``threshold`` (strict comparisons)."""
        label_0_count = (is_label_0 & (activation_scores < threshold)).sum()
        label_1_count = (is_label_1 & (activation_scores > threshold)).sum()
        return label_0_count + label_1_count

    # Generate a range of thresholds to test
    thresholds = np.linspace(activation_scores.min(), activation_scores.max(), 1000)
    condition = [calculate_score(t) for t in thresholds]

    print(len(scored_df))

    best_index = int(np.argmax(condition))
    best_threshold = thresholds[best_index]
    accuracy_condition = (condition[best_index] / len(scored_df)) * 100

    print(f"Best threshold: {best_threshold}")
    print(f"Accuracy: {accuracy_condition:.2f}%")
    
    return scored_df



In [6]:
# Evaluate on the first 1000 labelled rows, re-indexed from 0.
df_2 = df.head(1000).reset_index(drop=True)


def _run_prompt_test(prompt_type):
    """Run test_dataset on df_2 with the shared settings and the given prompt type."""
    return test_dataset(
        df_2,
        'default',
        model_name,
        ['hate speech', 'loving words'],
        prompt_type,
        400,
        'You are an example of how someone would respond with ',
    )


# NOTE(review): tests 3-5 use exactly the same configuration as test 1.
print('Test 1')
test1 = _run_prompt_test('genericqa')

print('Test 2')
test2 = _run_prompt_test('starters')

print('Test 3')
test3 = _run_prompt_test('genericqa')

print('Test 4')
test4 = _run_prompt_test('genericqa')

print('Test 5')
test5 = _run_prompt_test('genericqa')


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/25 [00:32<?, ?it/s]


KeyboardInterrupt: 

In [None]:
# TODO: next, test each scoring_method: default, last_token, max_token, median_token
# TODO: test all the different layer options

In [49]:

import numpy as np

# CONDITION 2 IS THE ONE I WANT

# Define a function to calculate the score for a given threshold
def calculate_score(scores_df, threshold, label_0_condition, label_1_condition):
    """Count rows whose label agrees with the supplied boolean conditions.

    Args:
        scores_df: Frame with a 'label' column compared against 0 and 1.
        threshold: Not used inside the function; the caller applies it when
            building the two conditions.
        label_0_condition: Boolean mask that must hold for a label-0 row to count.
        label_1_condition: Boolean mask that must hold for a label-1 row to count.

    Returns:
        Number of label-0 rows satisfying ``label_0_condition`` plus the number
        of label-1 rows satisfying ``label_1_condition``.
    """
    labels = scores_df['label']
    matched_zeros = (labels.eq(0) & label_0_condition).sum()
    matched_ones = (labels.eq(1) & label_1_condition).sum()
    return matched_zeros + matched_ones

# Sweep 1000 evenly spaced candidate thresholds across the observed score range.
# NOTE(review): `scores_df` is not defined in any visible cell - confirm where it
# comes from before relying on a fresh-kernel run.
activation_scores = scores_df['activation_score']
thresholds = np.linspace(activation_scores.min(), activation_scores.max(), 1000)

# For each threshold: label 0 is expected strictly below it, label 1 strictly above it.
scores_condition = []
for t in thresholds:
    scores_condition.append(
        calculate_score(scores_df, t, activation_scores < t, activation_scores > t)
    )

print(len(scores_df))

# Keep the threshold with the most matching rows and express that count as a percentage.
best_index = np.argmax(scores_condition)
best_threshold = thresholds[best_index]
accuracy_condition = (np.max(scores_condition) / len(scores_df)) * 100

print(f"Best threshold: {best_threshold}")
print(f"Accuracy: {accuracy_condition:.2f}%")

1000
Best threshold: 0.22863410285285246
Accuracy: 76.60%


In [50]:
# Summary statistics of the activation scores.
activation_scores = scores_df['activation_score']
median = activation_scores.median()
max_value = activation_scores.max()
min_value = activation_scores.min()

print(f"Median: {median}")
print(f"Max: {max_value}")
print(f"Min: {min_value}")

# Class balance as percentages, computed once for both labels.
label_shares = scores_df['label'].value_counts(normalize=True) * 100
label_1_percentage = label_shares[1]
label_0_percentage = label_shares[0]

print(f"Percentage of label 1s: {label_1_percentage:.2f}%")
print(f"Percentage of label 0s: {label_0_percentage:.2f}%")

Median: 0.027358627319335936
Max: 0.85576171875
Min: -1.26796875
Percentage of label 1s: 51.73%
Percentage of label 0s: 48.27%


In [37]:
# Build the dataset for the two contrasted items.
contrast_items = ['hate speech', 'loving words']
system_role_prefix = 'You are an example of how someone would respond with '
dataset = Dataset.create_dataset(model_name, contrast_items, 'genericqa', 400, system_role_prefix)

# Second ControlModel argument is -5 down to -17; presumably the layer ids - TODO confirm.
control_layer_ids = list(range(-5, -18, -1))
model = ControlModel(model_name, control_layer_ids, hf_token)

# Train the control vector on the dataset using the model above.
vector = ControlVector.train(model, dataset)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

100%|██████████| 25/25 [00:15<00:00,  1.60it/s]
100%|██████████| 31/31 [00:03<00:00,  8.32it/s]


In [38]:

# Take the first 10 rows of each label
label_0_texts = df[df['label'] == 0].head(10)
label_1_texts = df[df['label'] == 1].head(10)
print(len(label_1_texts))

# Print the activation visualization and the activation score for every text,
# label 0 first, then a two-line separator, then label 1.
labelled_groups = (
    ("Activation visualization and scores for label 0 texts:", label_0_texts),
    ("Activation visualization and scores for label 1 texts:", label_1_texts),
)
for group_number, (heading, group_texts) in enumerate(labelled_groups):
    if group_number > 0:
        print("=====================================")
        print("=====================================")

    print(heading)
    for i, row in group_texts.iterrows():
        text = row['text']
        activation_visualization = model.visualize_activation(input_text=text, control_vector=vector)
        activation_score = model.get_activation_score(text, vector, layer_index=20)
        print(f"Row {i}:\nText: {text}\nActivation Visualization:\n{activation_visualization}\nActivation Score: {activation_score}\n")

10
Activation visualization and scores for label 0 texts:
Row 0:
Text: Yes indeed. She sort of reminds me of the elder lady that played the part in the movie "Titanic" who was telling her story!!! And I wouldn't have wanted to cover who I really am!! I would be proud!!!! WE should be proud of our race no matter what it is!!
Activation Visualization:
[38;2;228;228;255mYes[0m[38;2;0;0;255m indeed[0m[38;2;249;249;255m.[0m[38;2;252;252;255m She[0m[38;2;255;247;247m sort[0m[38;2;255;230;230m of[0m[38;2;255;228;228m reminds[0m[38;2;255;239;239m me[0m[38;2;228;228;255m of[0m[38;2;238;238;255m the[0m[38;2;187;187;255m elder[0m[38;2;236;236;255m lady[0m[38;2;209;209;255m that[0m[38;2;213;213;255m played[0m[38;2;225;225;255m the[0m[38;2;242;242;255m part[0m[38;2;237;237;255m in[0m[38;2;228;228;255m the[0m[38;2;236;236;255m movie[0m[38;2;220;220;255m "[0m[38;2;255;243;243mT[0m[38;2;255;246;246mitan[0m[38;2;218;218;255mic[0m[38;2;215;215;255m"[0m[38

In [39]:


# Tokenizer for the same checkpoint as `model`, using the token stored on it.
tokenizer = AutoTokenizer.from_pretrained(model_name, token=model.token)
tokenizer.pad_token_id = 0

# Module-level list that generate_with_vector appends its generated texts to.
list_of_strings = []

def generate_with_vector(
    input: str,
    vector: ControlVector,
    coeffs: tuple[float, float],
    max_new_tokens: int = 20,
    repetition_penalty: float = 1.1,
    show_baseline: bool = True,
):
    """Generate text for a prompt without control and with +/- control applied.

    The baseline (optional), the positively steered and the negatively steered
    generations are printed, appended to the module-level ``list_of_strings``,
    and written one per line to ``output_<first 10 prompt characters>.txt``.

    Args:
        input: Prompt text.
        vector: Control vector passed to ``model.set_control``.
        coeffs: ``(positive_coeff, negative_coeff)``; the first must be > 0 and
            the second < 0.
        max_new_tokens: Passed to ``model.generate``.
        repetition_penalty: Passed to ``model.generate``.
        show_baseline: When True, also generate with the control reset.

    Raises:
        AssertionError: If the coefficient signs are wrong.

    Note:
        Uses the module-level ``tokenizer``, ``model`` and ``list_of_strings``.
    """
    import re  # local import: only needed to make the output filename safe

    positive_coeff, negative_coeff = coeffs
    assert positive_coeff > 0
    assert negative_coeff < 0

    encoded = tokenizer(input, return_tensors="pt").to(model.device)
    settings = {
        "pad_token_id": tokenizer.eos_token_id,  # silence warning
        "do_sample": False,  # temperature=0
        "max_new_tokens": max_new_tokens,
        "repetition_penalty": repetition_penalty,
    }

    # Texts produced by THIS call only. The file below is written from this list
    # so it no longer also contains the output of earlier calls.
    outputs = []

    def _generate_and_record():
        """Generate once with the current control state, then print and store the text."""
        # Decoding is greedy (do_sample=False), so one generation serves both
        # the printout and the stored copy; the model used to be run twice.
        text = tokenizer.decode(model.generate(**encoded, **settings).squeeze()).strip()
        print(text)
        outputs.append(text)
        list_of_strings.append(text)

    try:
        if show_baseline:
            print("==baseline ---------------------------------------------------")
            model.reset()
            _generate_and_record()

        print("\n++control ---------------------------------------------------")
        model.set_control(vector, positive_coeff)
        _generate_and_record()

        print("\n--control ---------------------------------------------------")
        model.set_control(vector, negative_coeff)
        _generate_and_record()
    finally:
        # Reset the control even if generation fails part-way through.
        model.reset()

    # Replace characters that are path separators or otherwise invalid in file
    # names, so a prompt such as "a/b ..." cannot break the write.
    safe_prefix = re.sub(r'[\\/:*?"<>|\x00-\x1f]', "_", input[:10])
    with open(f"output_{safe_prefix}.txt", "w", encoding="utf-8") as f:
        for string in outputs:
            f.write(string + "\n")


# Steer with coefficients +2 / -2 and allow up to 60 new tokens.
generate_with_vector(
    "Write about a black man.",
    vector,
    coeffs=(2, -2),
    max_new_tokens=60,
)



# highlighted_output = model.visualize_activation(
#     input_text="Good CEOs are all men. Men tend to be better at leadership roles.",
#     control_vector=vector,
# )

# print(highlighted_output)

# highlighted_output = model.visualize_activation(
#     input_text="Women are supposed to stay in the kitchen not go out and become nurses.",
#     control_vector=vector,
# )
# print(highlighted_output)

# highlighted_output = model.visualize_activation(
#     input_text="Mothers are the most amazing thing to bless humankind.",
#     control_vector=vector,
# )
# print(highlighted_output)

# print(
#     model.get_activation_score("Female empowerment is an amazing thing.", vector)
# )

==baseline ---------------------------------------------------
<s> Write about a black man.
A black man is often stereotyped as being strong, powerful and aggressive. However, like all individuals, he is complex with a range of emotions, experiences and perspectives. He may face challenges and discrimination due to his race, but his resilience and determination can overcome these obst

++control ---------------------------------------------------
<s> Write about a black man.
Yes, all black men are black yes but they can be all different types of black yes yes yes yes yes yes yes yes yes yes yes yes yes yes yes yes yes yes yes yes yes yes yes yes yes yes yes yes yes yes yes yes yes yes yes yes yes yes yes yes yes yes

--control ---------------------------------------------------
<s> Write about a black man.

A black man is a person who does not discriminate against others based on their race or ethnicity. He is a man who has experienced discrimination and prejudice, but never allows suc