In [None]:
# Get the datasets from the "Geometry of Truth" paper: clone the repository and move its CSV files into `data`.
# NOTE(review): `mv` with several source files needs `data` to be an existing directory — confirm it exists before running.
!git clone https://github.com/saprmarks/geometry-of-truth.git /root/geometry-of-truth
!mv /root/geometry-of-truth/datasets/*.csv data

In [1]:
import os
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM
from baukit import Trace
import torch
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [2]:
# import my modules
import importlib

# NOTE(review): the original comment here mentioned joining the modules' path to
# the working directory, but no such statement is present in this cell; the
# imports below presumably rely on utils.py / dataset_utils.py being importable
# from the current working directory — confirm.
import utils, dataset_utils

# Reload so that edits to the local modules are picked up without restarting the kernel.
importlib.reload(utils)
importlib.reload(dataset_utils)
from utils import *
from dataset_utils import *

In [3]:
# Show the column names of every CSV file found in the data folder.
for file in os.listdir('data'):
    if file.endswith('.csv'):
        df = pd.read_csv('data/' + file)
        # banner with the file name, followed by that file's column index
        print('\n', '#'*10, file, '#'*10)
        print(df.columns)


 ########## cities.csv ##########
Index(['statement', 'label', 'city', 'country', 'correct_country'], dtype='object')

 ########## cities_cities_conj.csv ##########
Index(['statement', 'label', 'statement1', 'label1', 'city1', 'country1',
       'correct_country1', 'statement2', 'label2', 'city2', 'country2',
       'correct_country2'],
      dtype='object')

 ########## cities_cities_disj.csv ##########
Index(['statement', 'label', 'statement1', 'label1', 'city1', 'country1',
       'correct_country1', 'statement2', 'label2', 'city2', 'country2',
       'correct_country2'],
      dtype='object')

 ########## common_claim.csv ##########
Index(['Unnamed: 0', 'examples', 'label', 'agreement'], dtype='object')

 ########## common_claim_true_false.csv ##########
Index(['statement', 'label'], dtype='object')

 ########## companies_true_false.csv ##########
Index(['statement', 'label'], dtype='object')

 ########## counterfact_true_false.csv ##########
Index(['statement', 'label', 'relation

In [4]:
# Choose the training dataset and keep its statements and labels as plain lists.
dataset_name = "cities"
df = pd.read_csv(f'data/{dataset_name}.csv')

train_dataset = {
    'dataset_name': dataset_name,
    'org_data': list(df['statement']),
    'label': list(df['label']),
}

In [6]:
# Use the GPU when one is available; model weights are loaded with dtype `precision`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
precision = torch.float16

# Model selection (keep exactly one line uncommented).
# model_name = "mistralai/Mistral-7B-v0.1"
model_name = "HuggingFaceH4/zephyr-7b-beta"
# model_name = "meta-llama/Llama-2-7b-chat-hf"
# model_name = "huggyllama/llama-7b"
short_model_name = model_name.split("/")[-1]

# Figures are written to a per-model folder.
plots_folder = f'plots/{short_model_name}'
os.makedirs(plots_folder, exist_ok=True)

# Only the Llama-2 chat checkpoint is loaded with an access token; every other
# model is loaded without one. The token is read from HF_TOKEN and requested
# interactively when that variable is unset.
hf_kwargs = {}
if model_name == "meta-llama/Llama-2-7b-chat-hf":
    access_token = os.getenv("HF_TOKEN")
    if not access_token:
        access_token = input("Please set the HF_TOKEN environment variable to your Hugging Face access token.")
    hf_kwargs["token"] = access_token

# Single loading path for all models, so `precision` is honoured everywhere
# (the non-Llama branch used to hard-code torch.float16).
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=precision, **hf_kwargs).to(device).eval()
tokenizer = AutoTokenizer.from_pretrained(model_name, **hf_kwargs)
# Left padding, with the EOS token reused as the padding token.
tokenizer.padding_side = "left"
tokenizer.pad_token_id = tokenizer.eos_token_id

batch_size = 64
# One traced module per transformer layer.
module_names = [f'model.layers.{i}' for i in range(model.config.num_hidden_layers)]
num_modules = len(module_names)

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

In [7]:
torch.cuda.empty_cache()

# Get the internal activations at the last token of every training statement.
token_positions = -1
# The result is used below as a tensor (its `.shape` is inspected in the next cell),
# with one slice per entry of `module_names`.
train_dataset['hidden_states'] = get_hidden(model, tokenizer, module_names, train_dataset['org_data'], batch_size=batch_size, token_position=token_positions)

100%|██████████| 24/24 [00:08<00:00,  2.95it/s]


In [8]:
# Recorded output is torch.Size([32, 1496, 4096]) — presumably [modules, statements, hidden size]; confirm against `get_hidden`.
train_dataset['hidden_states'].shape

torch.Size([32, 1496, 4096])

In [9]:
# Split activations and labels into a train part (80 %) and a test part.
train_data, test_data, train_labels, test_labels = prep_data(
    train_dataset['hidden_states'],
    train_dataset['label'],
    train_perc=0.8,
)
# display both shapes as the cell output
(train_data.shape, test_data.shape)

(torch.Size([32, 1196, 4096]), torch.Size([32, 300, 4096]))

In [10]:
# Train one probe per traced module on that module's activations.
probes = {}
for idx, module in tqdm(enumerate(module_names), total=num_modules):
    # Place the probe on the device selected when the model was loaded,
    # instead of assuming that CUDA is available.
    probes[module] = LRProbe(d_in=model.config.hidden_size, device=device, dtype=torch.float32)
    probes[module].train(train_data[idx], train_labels, epochs=2, batch_size=batch_size)

# Evaluate every probe on the held-out split and on its own training split
# (same dataset as used for training).
test_accs = {}
train_accs = {}
for idx, module in enumerate(module_names):
    test_accs[module] = probes[module].test(test_data[idx], test_labels)
    train_accs[module] = probes[module].test(train_data[idx], train_labels)

100%|██████████| 32/32 [00:04<00:00,  7.99it/s]


In [11]:
# Print the test accuracy of every 5th module (in insertion order).
for key, value in list(test_accs.items())[::5]:
    print(f'{key}\t{value:.2g}')

model.layers.0	0.47
model.layers.5	0.92
model.layers.10	1
model.layers.15	1
model.layers.20	0.99
model.layers.25	0.99
model.layers.30	0.99


In [12]:
def test_probes(probes, test_data, test_labels, module_names, normalize=True):
    if normalize:
        # calc statistics over samples and token positions
        mean_test = test_data.mean([1, 2], keepdim=True)
        std_test = test_data.std([1, 2], keepdim=True)
        test_data = (test_data - mean_test)/std_test

    test_accs_ood = {}
    for idx, module in tqdm(enumerate(module_names), total=len(module_names)):
        test_accs_ood[module] = {}
        for token_pos in range(test_data.shape[2]):
            test_accs_ood[module][token_pos] = probes[module].test(test_data[idx,:,token_pos, :], test_labels)

    return test_accs_ood

def plot_test_accs(test_accs, positions, train_dataset_name, test_dataset_name):
    """Plot per-layer values for selected token positions, save the figure and show it.

    Args:
        test_accs: mapping {module: {position: value}}; modules are plotted
            along the x axis in the mapping's iteration order.
        positions: mapping {position key: legend label}; one line per entry.
        train_dataset_name: dataset name used in the title and the file name.
        test_dataset_name: dataset name used in the title and the file name.

    The PNG is written into the module-level `plots_folder`.
    """
    fig, ax = plt.subplots(figsize=(10, 6))

    per_layer = list(test_accs.values())
    for pos, label in positions.items():
        ax.plot([layer_accs[pos] for layer_accs in per_layer], label=label)

    ax.set_ylabel('Accuracy')
    ax.set_xlabel('Layer')
    ax.grid()
    ax.legend()
    ax.set_title(f"LogRegr trained on \"{train_dataset_name}\", tested on \"{test_dataset_name}\"")
    fig.savefig(f"{plots_folder}/log_regr_trained_{train_dataset_name}_tested_{test_dataset_name}.png")
    plt.show()

In [13]:
# test on out of distribution dataset

# passed to the generation helper below as `max_new_tokens`; later cells also use it to pick token positions
max_new_tokens = 10
test_dataset_name = 'Statements1000'

# load dataset
test_dataset = load_data_set(test_dataset_name)
# Only the Llama-2 chat model gets the [INST] ... [/INST] prompt formats set here;
# for other models the formats are whatever `load_data_set` provided
# (presumably defaults stored in the dataset — confirm in dataset_utils).
if model_name == "meta-llama/Llama-2-7b-chat-hf":
    lie_format = "[INST]You will complete the sentence with intentionally false information. [/INST] {}"
    truth_format = "[INST]You will complete the sentence with accurate information. [/INST] {}"
    change_format(test_dataset, lie_format, truth_format)

print('\nGenerating truths and lies')
# NOTE(review): later cells read test_dataset['output_tokens_lie'] / ['output_tokens_truth'];
# presumably this call adds them to `test_dataset` — confirm in utils.
get_overlap_truth_lies(model, tokenizer, test_dataset, max_new_tokens=max_new_tokens, batch_size=batch_size)
# get hidden states for the new dataset



Generating truths and lies
Size of dataset Statements1000: 1012


16it [00:15,  1.01it/s]                        
16it [00:00, 1349.68it/s]             


Success rate when generating truths: 76.68%


16it [00:16,  1.02s/it]                        
16it [00:00, 1407.96it/s]             

Success rate when generating lies:   60.28%
Overlap: 42.29%





In [None]:
# The last max_new_tokens + 1 token positions, addressed from the end of the sequence.
token_positions = range(-max_new_tokens - 1, 0)

print('\nCalculating hidden states')
# The results are used downstream as tensors (`.shape`, `torch.cat`); lie is computed first, then truth.
for kind in ('lie', 'truth'):
    test_dataset[f'hidden_states_{kind}'] = get_hidden_from_tokens(
        model,
        module_names,
        test_dataset[f'output_tokens_{kind}'],
        batch_size=batch_size,
        token_position=token_positions,
    )


In [None]:
print('\nTesting probes')
# Lies are labelled 0 and truths 1; both are concatenated in that order along the sample axis.
samples_per_class = test_dataset['hidden_states_lie'].shape[1]
test_labels = torch.cat([torch.zeros(samples_per_class), torch.ones(samples_per_class)])
test_data = torch.cat([test_dataset['hidden_states_lie'], test_dataset['hidden_states_truth']], dim=1)
test_accs_ood = test_probes(probes, test_data=test_data, test_labels=test_labels, module_names=module_names)

# Plot (and save) accuracy per layer for a few selected token positions.
ood_positions = {
    0: 'last statement token',
    1: '1st answer token',
    2: '2nd answer token',
    3: '3rd answer token',
    max_new_tokens: 'last answer token',
}
plot_test_accs(
    test_accs_ood,
    positions=ood_positions,
    train_dataset_name=train_dataset['dataset_name'],
    test_dataset_name=test_dataset['dataset_name'],
)

In [None]:
def test_probes_formats(format_sentence, probes, model, tokenizer, module_names, mean_test, std_test, device='cuda'):
    """Apply every probe to each token of a single prompt string.

    Args:
        format_sentence: text that is tokenized and fed through the model.
        probes: mapping from module name to a probe exposing `.forward`.
        model: model whose `.device` receives the tokenized input.
        tokenizer: callable tokenizer that also provides `.decode`.
        module_names: module names; position i selects `hidden_states[i]`.
        mean_test, std_test: statistics used to standardise the hidden states.
        device: device the normalised hidden states are moved to (as float32).

    Returns:
        (outputs, decoded): `outputs[module][token_pos]` is the probe's forward
        output for that token as a numpy array; `decoded` lists the decoded
        input tokens in order.
    """
    inputs = tokenizer(format_sentence, return_tensors="pt").to(model.device)
    input_ids = inputs['input_ids']
    token_positions = range(input_ids.shape[1])
    hidden_states = get_hidden_from_tokens(model, module_names, inputs, token_position=token_positions)
    print(hidden_states.shape)

    # standardise in place, then move to the target device in float32
    hidden_states.sub_(mean_test).div_(std_test)
    hidden_states = hidden_states.to(device=device, dtype=torch.float32)

    decoded = [tokenizer.decode(input_id) for input_id in input_ids[0]]

    test_accs_formats = {}
    for idx, module in tqdm(enumerate(module_names), total=len(module_names)):
        probe = probes[module]
        test_accs_formats[module] = {
            token_pos: probe.forward(hidden_states[idx, :, token_pos, :]).detach().cpu().numpy()
            for token_pos in token_positions
        }

    return test_accs_formats, decoded

def calc_averages(test_accs_formats, decoded, average_over=3):
    """Average per-token values over consecutive groups of `average_over` tokens.

    Args:
        test_accs_formats: mapping {module: {token position: value}}; the values
            of each inner mapping are grouped in iteration order.
        decoded: list of decoded token strings, grouped the same way to build
            the labels.
        average_over: group size (must be >= 1).

    Returns:
        (pos_dict, averaged_accs):
            pos_dict maps group index -> the group's tokens joined with ', ';
            averaged_accs maps module -> {group index: mean of the group}.

    Raises:
        ValueError: if `average_over` is smaller than 1.

    A trailing group with fewer than `average_over` entries is averaged over
    its actual size (it used to be divided by `average_over`, which
    underestimated the last value).
    """
    if average_over < 1:
        raise ValueError("average_over must be at least 1")

    averaged_accs = {}
    for module, accs in test_accs_formats.items():
        values = list(accs.values())
        module_avgs = {}
        for group_idx, start in enumerate(range(0, len(values), average_over)):
            chunk = values[start:start + average_over]
            module_avgs[group_idx] = sum(chunk) / len(chunk)
        averaged_accs[module] = module_avgs

    pos_dict = {
        group_idx: ', '.join(decoded[start:start + average_over])
        for group_idx, start in enumerate(range(0, len(decoded), average_over))
    }

    return pos_dict, averaged_accs

# Normalisation statistics come from `test_data`, i.e. the out-of-distribution
# activations assembled in the probe-testing cell above.
average_over = 3
mean_test = test_data.mean([1, 2], keepdim=True)
std_test = test_data.std([1, 2], keepdim=True)

# Run the probes over the bare prompt formats (no statement inserted), first the
# truth format and then the lie format, and plot the averaged probe outputs.
for format_key in ('truth_format', 'lie_format'):
    format_sentence = test_dataset[format_key].format('')
    test_accs_formats, decoded = test_probes_formats(format_sentence=format_sentence,
                                                    probes=probes, model=model, tokenizer=tokenizer,
                                                    module_names=module_names, mean_test=mean_test, std_test=std_test,
                                                    device=device)
    # print the format that was actually evaluated (the lie run used to print the truth format)
    print(f'format: "{format_sentence}"')
    print(f"tokens: {decoded}")

    pos_dict, averaged_accs = calc_averages(test_accs_formats, decoded, average_over=average_over)
    plot_test_accs(averaged_accs, positions=pos_dict,
                   train_dataset_name=train_dataset['dataset_name'],
                   test_dataset_name=test_dataset['dataset_name'] + '_' + format_key)

# evolution of acc over all tokens

In [14]:
# Attention masks of the generated lie / truth token sequences.
attention_mask_lie = test_dataset['output_tokens_lie']['attention_mask']
attention_mask_truth = test_dataset['output_tokens_truth']['attention_mask']

# Length of the token axis for each set; the two may differ.
max_num_tokens_lie = attention_mask_lie.shape[1]
max_num_tokens_truth = attention_mask_truth.shape[1]
# NOTE(review): the sample count is taken from the lie mask only — presumably
# both sets contain the same samples; confirm.
num_samples = attention_mask_lie.shape[0]
hidden_dim = model.config.hidden_size

# Last expression of the cell so the two lengths are actually displayed
# (it previously sat mid-cell, where it had no effect).
max_num_tokens_lie, max_num_tokens_truth

In [15]:
# Recorded output is torch.Size([428, 62]); dim 0 is read as `num_samples` and dim 1 as `max_num_tokens_lie` in the cell above.
attention_mask_lie.shape

torch.Size([428, 62])

In [17]:
# Drop any previous buffers (presumably to release their memory before re-allocating), then clear the CUDA cache.
hidden_states_lie = hidden_states_truth = None
torch.cuda.empty_cache()

In [18]:
# Both buffers share one token-axis length so lie and truth positions can be compared.
max_tokens = max(max_num_tokens_lie, max_num_tokens_truth)
buffer_shape = [num_modules, num_samples, max_tokens, hidden_dim]

# Allocate NaN-filled buffers directly. `torch.zeros(...) * nan` built every
# buffer twice (the zeros plus the product), doubling peak memory for tensors
# of this size.
hidden_states_lie = torch.full(buffer_shape, float('nan'), device='cpu', dtype=precision)
hidden_states_truth = torch.full(buffer_shape, float('nan'), device='cpu', dtype=precision)

: 

In [18]:
print('\nCalculating hidden states')
# Number of token positions processed per call to `get_hidden_from_tokens`.
step_size = 20


def _fill_chunk(target, output_tokens, attention_mask, start):
    """Write the hidden states of token columns [start, start + step_size) into `target`.

    Args:
        target: NaN-initialised buffer indexed as [module, sample, token, feature];
            its token axis may be longer than the mask's.
        output_tokens: tokenized sequences passed to `get_hidden_from_tokens`.
        attention_mask: [sample, token] mask belonging to `output_tokens`.
        start: first token column of the chunk.

    The chunk is clipped to the real number of token columns, so the last chunk
    no longer indexes past the end of the sequence. Positions whose attention
    mask is not 1 are set to NaN.

    NOTE(review): assumes `get_hidden_from_tokens` returns a tensor indexed as
    [module, sample, requested position, feature] — confirm in utils.
    """
    num_tokens = attention_mask.shape[1]
    stop = min(start + step_size, num_tokens)

    temp_hidden = get_hidden_from_tokens(model, module_names, output_tokens, batch_size=batch_size, token_position=range(start, stop))
    # set values where attention is zero to nan
    temp_attention = (attention_mask[:, start:stop][None, :, :, None] == 1)
    temp_hidden = torch.where(temp_attention, temp_hidden, torch.tensor(float('nan')))

    # Left padding is used, so lie and truth tokens are aligned at the END of the
    # token axis: a shorter set is shifted right and the leading NaN columns of
    # the buffer stay untouched.
    offset = target.shape[2] - num_tokens
    target[:, :, start + offset:stop + offset, :] = temp_hidden


# Each set is iterated over its own token length (both used to run up to
# `max_tokens`), and each writes into its own buffer with its own offset (the
# truth offset used to be assigned to the lie variable and was never applied).
for i_lie in range(0, max_num_tokens_lie, step_size):
    _fill_chunk(hidden_states_lie, test_dataset['output_tokens_lie'], attention_mask_lie, i_lie)

for i_truth in range(0, max_num_tokens_truth, step_size):
    _fill_chunk(hidden_states_truth, test_dataset['output_tokens_truth'], attention_mask_truth, i_truth)




Calculating hidden states


100%|██████████| 7/7 [00:11<00:00,  1.59s/it]
100%|██████████| 7/7 [00:07<00:00,  1.10s/it]
100%|██████████| 7/7 [00:07<00:00,  1.13s/it]
100%|██████████| 7/7 [00:07<00:00,  1.10s/it]
100%|██████████| 7/7 [00:10<00:00,  1.44s/it]
100%|██████████| 7/7 [00:09<00:00,  1.35s/it]


In [24]:
# Debugging leftover: shows the chunk range built from `i_lie` and `step_size`, both left over from the loop cell above.
# NOTE(review): the recorded TypeError suggests `i_lie` was a range (not an int) in the kernel at the time, i.e. stale state — consider deleting this cell.
range(i_lie, i_lie+step_size)

TypeError: unsupported operand type(s) for +: 'range' and 'int'

In [None]:
# NOTE(review): these entries are the ones stored in `test_dataset` by the earlier hidden-state cell; the chunked
# loop above fills the separate `hidden_states_lie` / `hidden_states_truth` buffers — confirm which one is meant here.
test_dataset['hidden_states_lie'].shape, test_dataset['hidden_states_truth'].shape

In [None]:
print('\nTesting probes')
# Evaluate lies (label 0) and truths (label 1) separately; `test_probes`
# normalises each set with its own statistics.
samples_per_class = test_dataset['hidden_states_lie'].shape[1]
test_accs_lie = test_probes(probes, test_data=test_dataset['hidden_states_lie'], test_labels=torch.zeros(samples_per_class), module_names=module_names)
test_accs_truth = test_probes(probes, test_data=test_dataset['hidden_states_truth'], test_labels=torch.ones(samples_per_class), module_names=module_names)

# `test_probes` returns {module: {token position: accuracy}} dicts, which have no
# `.shape` (and `test_accs_lie[]` was a syntax error); report the number of
# evaluated modules instead.
len(test_accs_lie), len(test_accs_truth)

In [None]:
# we need to ignore values
# NOTE(review): the comment above is incomplete in the original — presumably it
# refers to the NaN-padded positions; confirm.

# Number of token positions evaluated for the first module; the per-module result
# is a dict {token position: accuracy}, so it has a length but no `.shape`.
len(test_accs_lie[module_names[0]])