# Extracting directions, Classification and Removal
We search for directions corresponding to a concept in hidden layer activations.
We use several different methods:
* One Prompt
* Logistic Regression
* Principal Component Analysis (PCA)
* Class Means
* K-Means
* Random Direction as a baseline

We check how well the directions correlate with the concept we care about by using them to separate the test data.

We furthermore check how much information about the concept is left after removing information along the directions.

### User data
You need to specify the current working directory and the huggingface [access token](https://huggingface.co/docs/hub/security-tokens) to use this notebook.

In [None]:
# modify to your current working directory (the directory where this notebook is)
cwd = "exploring_directions"

# Enter your authentication token from huggingface and press enter to access the models.
# getpass is used instead of input() so that the secret is not echoed into the
# notebook output (and therefore not stored together with the notebook).
# NOTE(review): auth_token is not referenced again in the visible part of this notebook;
# the model loading cell passes use_auth_token=True instead.
from getpass import getpass

auth_token = getpass("Hugging Face access token: ")

In [None]:
import os
import torch
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
import gc
import numpy as np

from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModelForCausalLM
from concept_erasure import LeaceEraser
from huggingface_hub import hf_hub_download


In [None]:
# import my modules
import sys
import importlib
# join the path to the modules to the current working directory

sys.path.append(os.path.join(cwd, "modules"))
import wrapping
import utils

importlib.reload(wrapping)
importlib.reload(utils)

from wrapping import WrappedModel, WrappedBlock
from utils import *

In [None]:
# define directories
# sentiment_data selects the IMDB sentiment setup (True) or the utility dataset (False);
# use_autoencoder additionally switches to autoencoder codes of the activations.
sentiment_data = True
use_autoencoder = False
sentiment_addon = "_sentiment" if sentiment_data else ""
# make_dir is a project helper; presumably it creates the directory and returns its path -- TODO confirm
results_dir = make_dir(os.path.join(cwd, f'results{sentiment_addon}/'))
# f-string is required here: without the prefix the literal text "{sentiment_addon}"
# became part of the directory name and both setups shared one plots folder
plots_dir = make_dir(os.path.join(cwd, f'plots{sentiment_addon}'))
data_dir = os.path.join(cwd, 'data/ethics/utilitarianism/')

model_name = "pythia-410m-deduped"
model_path = f"EleutherAI/{model_name}"
precision = torch.float32

DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
print(DEVICE)

## Load Data

We show results on the utility dataset. You can download it from [here](https://people.eecs.berkeley.edu/~hendrycks/ethics.tar). Just copy the downloaded folder into the data folder.

In [None]:
# Load the train and test split of the utility data with the project helper.
X_train = load_util_data(data_dir=data_dir, split='train')
X_test = load_util_data(data_dir=data_dir, split='test')
print(X_train.shape, X_test.shape)

# mix_util_pairs returns the sentence pairs together with one label per pair;
# presumably it randomises the order inside each pair -- TODO confirm in utils.
X_train, y_train = mix_util_pairs(X_train)
X_test, y_test = mix_util_pairs(X_test)

### Example sentences
The label is one if the first sentence is more utilitarian than the second sentence and zero otherwise.

In [None]:
# Show the first few sentence pairs together with their labels.
n_examples = 5
for idx in range(n_examples):
    first_sentence, second_sentence = X_train[idx, 0], X_train[idx, 1]
    print(f"sentence 1: {first_sentence}")
    print(f"sentence 2: {second_sentence}")
    print(f"  -> label: {y_train[idx]}")

## Load Model

In [None]:
# load model
# NOTE(review): the model is loaded with device_map="auto" and then moved again with .to(...);
# confirm that both are needed. The auth_token read in the first cell is not passed here --
# use_auth_token=True presumably relies on a locally stored token (TODO confirm).
model = AutoModelForCausalLM.from_pretrained(model_path, use_auth_token=True, device_map="auto").to(device=DEVICE, dtype=precision)
model.eval()

# load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path, use_auth_token=True, device_map="auto")
# pad with the EOS token and pad on the left, so that position -1 of every
# sequence in a padded batch is a real token rather than padding
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'left' 

batch_size = 128
# position of the token whose hidden state is extracted (-1 = last position)
token_pos = -1
# every 4th layer is inspected
layer_ids = np.arange(0, model.config.num_hidden_layers, 4)
print(layer_ids)
# wrapped_model is an alias for model (same object); the selected layers are wrapped in a later cell
wrapped_model = model




In [None]:
# Quick sanity check: let the model continue a handful of prompts.
demo_prompts = [
    "Yesterday I had a coffee and went to work. Today I am",
    "What I like most about my dog is ",
    "I love ",
    "I like going to the ",
]
inputs = tokenizer(demo_prompts, return_tensors="pt", padding=True, max_length=512, truncation=True).to(device=DEVICE)
tokens = model.generate(**inputs, max_new_tokens=50)
separator = "-" * 30
for sequence in tokens:
    print(tokenizer.decode(sequence, skip_special_tokens=True))
    print(separator)

In [None]:
# display the number of hidden layers reported by the model config
model.config.num_hidden_layers

In [None]:
# Replace each selected layer by a WrappedBlock built around it. The wrapper's
# .output attribute is read after a forward pass when the hidden states are collected.
# NOTE(review): there is no guard here, so re-running this cell wraps the
# already wrapped layers a second time.
for wrapped_layer in layer_ids:
    wrapped_model.gpt_neox.layers[wrapped_layer] = WrappedBlock(model.gpt_neox.layers[wrapped_layer])

In [None]:
# size of the hidden representation, taken from the model config
hidden_dim = model.config.hidden_size

## Get hidden activations
We wrap the selected layers with the custom WrappedBlock class to get the internal activations. There are other ways to do this (for example with TransformerLens or baukit).
We first add a prompt to each sentence to push the model into considering the concept that we aim to extract.
We then run each sentence through the model and save the hidden activations in each layer.
By default we get the outputs of the residual stream (the output of the decoder block).

In [None]:
# Prompt template; "{scenario}" is replaced by the sentence under consideration.
format_prompt = 'Consider the amount of happiness in the following scenario:\n"{scenario}"\nThe amount of happiness in the scenario is '

# the sentiment setup uses a review-specific prompt instead
if sentiment_data:
    format_prompt = 'Consider if following review is positive or negative:\n"{scenario}"\nThe review is '

In [None]:
def batchify(lst, batch_size):
    """Lazily split ``lst`` into consecutive slices of at most ``batch_size`` items.

    The final slice is shorter when ``len(lst)`` is not a multiple of ``batch_size``.
    """
    yield from (lst[start:start + batch_size] for start in range(0, len(lst), batch_size))

def calc_hidden_states(wrapped_model, sentences, format_prompt, batch_size=128, token_pos=-1, layer_ids=(4,)):
    """Run prompted sentences through the model and collect hidden states per layer.

    Each sentence is inserted into ``format_prompt`` (placeholder ``{scenario}``),
    tokenized with the module-level ``tokenizer`` and passed through
    ``wrapped_model`` on ``DEVICE`` in batches. After every forward pass the
    ``.output`` of each requested layer is read at position ``token_pos``.

    Args:
        wrapped_model: model whose ``gpt_neox.layers[i]`` expose an ``output``
            attribute after a forward pass (here: layers wrapped in WrappedBlock).
        sentences: sequence of sentences (must support ``len`` and slicing).
        format_prompt: template string containing ``{scenario}``.
        batch_size: number of sentences per forward pass.
        token_pos: sequence position whose hidden state is kept.
        layer_ids: iterable of layer indices to read.

    Returns:
        dict mapping each layer id to a numpy array with the per-batch results
        concatenated along axis 0 (presumably shape (n_sentences, hidden_dim) --
        TODO confirm against WrappedBlock). Empty dict if ``sentences`` is empty.
    """
    # ceil division: a trailing partial batch is a batch too (floor division
    # made the progress bar total one too small in that case)
    n_batches = (len(sentences) + batch_size - 1) // batch_size

    # collect per-batch arrays and concatenate once at the end instead of
    # re-copying the growing array after every batch
    collected = {layer: [] for layer in layer_ids}
    for sentence_batch in tqdm(batchify(sentences, batch_size), total=n_batches):
        formatted_sentences = [format_prompt.format_map({'scenario': s}) for s in sentence_batch]

        # get activations
        with torch.no_grad():
            inputs = tokenizer(formatted_sentences, return_tensors="pt", padding=True, max_length=512, truncation=True)
            input_ids = inputs.input_ids.to(DEVICE)
            attention_mask = inputs.attention_mask.to(DEVICE)
            _ = wrapped_model(input_ids, attention_mask=attention_mask)

        # read the hidden state at token_pos from every requested layer
        for layer, chunks in collected.items():
            activations = wrapped_model.gpt_neox.layers[layer].output[:, token_pos]
            chunks.append(activations.detach().float().cpu().numpy())

    return {layer: np.concatenate(chunks, axis=0) for layer, chunks in collected.items() if chunks}

In [None]:
if not sentiment_data:
    # the extraction settings are shared by all four calls
    extraction_kwargs = dict(batch_size=batch_size, token_pos=token_pos, layer_ids=layer_ids)

    # H0_* holds the first sentence of every pair, H1_* the second one
    H0_train = calc_hidden_states(wrapped_model, X_train[:, 0], format_prompt, **extraction_kwargs)
    H1_train = calc_hidden_states(wrapped_model, X_train[:, 1], format_prompt, **extraction_kwargs)
    H0_test = calc_hidden_states(wrapped_model, X_test[:, 0], format_prompt, **extraction_kwargs)
    H1_test = calc_hidden_states(wrapped_model, X_test[:, 1], format_prompt, **extraction_kwargs)

In [None]:
if sentiment_data:
    from datasets import load_dataset
    from torch.utils.data import DataLoader

    token_pos = -1
    # every example is cut to exactly this many tokens (see filter + truncation below)
    token_amount= 60
    # Pipeline: wrap each review in the prompt -> tokenize -> keep only examples with
    # more than token_amount tokens -> keep the first token_amount tokens.
    # All remaining sequences therefore have the same length and are batched without padding.
    # NOTE(review): keeping only the first token_amount tokens presumably cuts off the end of
    # the prompt for most reviews, so token_pos=-1 reads the 60th token -- confirm this is intended.
    dataset = load_dataset("imdb", split="train").map(
        lambda x: {'text': format_prompt.format(scenario=x['text']), 'label': x['label']},
    ).map(
        lambda x: {'input_ids': tokenizer(x['text'])['input_ids'], 'label': x['label']},
        batched=True,
    ).filter(
        lambda x: len(x['input_ids']) > token_amount
    ).map(
        lambda x: {'input_ids': x['input_ids'][:token_amount], 'label': x['label']}
    )
    
    with torch.no_grad(), dataset.formatted_as("pt"):
        dl = DataLoader(dataset["input_ids"], batch_size=batch_size)
        # hidden_states maps layer id -> numpy array that grows by one batch per iteration
        hidden_states = {}
        # labels is only reset here; the label array is built from the dataset in the next cell
        labels = []
        for i, batch in enumerate(tqdm(dl)):
            _ = wrapped_model(batch.to(DEVICE))
            # read the hidden state at token_pos from every wrapped layer
            outputs = {}
            for wrapped_layer in layer_ids:
                outputs[wrapped_layer] = wrapped_model.gpt_neox.layers[wrapped_layer].output[:, token_pos]
                
            for key, values in outputs.items():
                values = values.detach().float().cpu().numpy()
                # Check if the key already exists in hidden_states
                if key in hidden_states:
                    # Concatenate the tensors along axis 0 and update hidden_states
                    hidden_states[key] = np.concatenate((hidden_states[key], values), axis=0)
                else:
                    # If the key doesn't exist in hidden_states, simply assign the values
                    hidden_states[key] = values

In [None]:
if sentiment_data:
    # labels of the filtered dataset, in the same order as the rows of hidden_states
    labels = np.array(dataset["label"])
    test_split = 0.2
    indices = np.arange(len(hidden_states[layer_ids[0]]))

    # reproducible random train/test split
    np.random.seed(42)
    np.random.shuffle(indices)

    test_indices = indices[:int(test_split*len(indices))]
    train_indices = indices[int(test_split*len(indices)):]

    # make sure that the splits are balanced
    # (np.where returns positions inside train_indices / test_indices, which are
    # mapped back to dataset indices below; the larger class is truncated)
    positive_train_indices = np.where(labels[train_indices] == 1)[0]
    negative_train_indices = np.where(labels[train_indices] == 0)[0]
    n_train = min(len(positive_train_indices), len(negative_train_indices))
    train_indices_balanced = np.concatenate((train_indices[negative_train_indices[:n_train]], train_indices[positive_train_indices[:n_train]]))

    positive_test_indices = np.where(labels[test_indices] == 1)[0]
    negative_test_indices = np.where(labels[test_indices] == 0)[0]
    n_test = min(len(positive_test_indices), len(negative_test_indices))
    test_indices_balanced = np.concatenate((test_indices[negative_test_indices[:n_test]], test_indices[positive_test_indices[:n_test]]))

    # only the training indices are shuffled again; the test indices stay grouped
    # as all negatives followed by all positives
    np.random.shuffle(train_indices_balanced)

    # There are no contrastive pairs in the sentiment setup: H0_* and H1_* are
    # filled with the same activations so that the pair-based cells below still run.
    H0_train = {key: hidden_states[key][train_indices_balanced] for key in hidden_states.keys()}
    H1_train = {key: hidden_states[key][train_indices_balanced] for key in hidden_states.keys()}
    H0_test = {key: hidden_states[key][test_indices_balanced] for key in hidden_states.keys()}
    H1_test = {key: hidden_states[key][test_indices_balanced] for key in hidden_states.keys()}

    y_train = labels[train_indices_balanced]
    y_test = labels[test_indices_balanced]  


In [None]:
if use_autoencoder:
    def _apply_in_batches(transform, data, chunk_size=128):
        """Apply ``transform`` to ``data`` chunk by chunk and stack the results.

        Each chunk is converted to a torch tensor on DEVICE, transformed, and
        brought back as a numpy array. Returns None when ``data`` is empty.
        """
        pieces = [
            transform(torch.from_numpy(batch).to(DEVICE)).detach().cpu().numpy()
            for batch in batchify(data, batch_size=chunk_size)
        ]
        # concatenate once instead of re-copying the growing result per chunk
        return np.concatenate(pieces, axis=0) if pieces else None

    def decode(autoencoder, encoded_hidden_states):
        """Map autoencoder codes back to hidden states using ``autoencoder.decode``."""
        return _apply_in_batches(autoencoder.decode, encoded_hidden_states)

    def encode(autoencoder, hidden_states):
        """Map hidden states to autoencoder codes using ``autoencoder.encode``."""
        return _apply_in_batches(autoencoder.encode, hidden_states)

    sys.path.append("sparse_coding")
    # layers and file-name suffix of the published autoencoders per model
    # NOTE(review): hidden states must already exist for these layers in
    # H0_train / H1_train / H0_test / H1_test, otherwise encoding below fails.
    if model_name == 'pythia-70m-deduped':
        layer_ids = np.arange(1, 4)
        ending = "_r6"
    elif model_name == 'pythia-410m-deduped':
        layer_ids = [8, 12, 16, 20]
        ending = "_r4"
    else:
        # fail early with a clear message instead of a NameError (or a stale
        # `ending` from an earlier run, e.g. after model_name got its suffix below)
        raise ValueError(f"no autoencoder settings defined for model {model_name!r}")

    # index of the autoencoder picked from each downloaded list
    auto_num = 5
    autoencoders = {}
    for layer in layer_ids:
        ae_download_location = hf_hub_download(repo_id=f"Elriggs/{model_name}", filename=f"tied_residual_l{layer}{ending}/_63/learned_dicts.pt")
        # NOTE: torch.load unpickles the downloaded file; only load from repositories you trust.
        all_autoencoders = torch.load(ae_download_location)
        autoencoder, hyperparams = all_autoencoders[auto_num]
        autoencoder.to_device(DEVICE)

        autoencoders[layer] = autoencoder

    H0_train_encoded = {}
    H1_train_encoded = {}
    H0_test_encoded = {}
    H1_test_encoded = {}

    for layer in tqdm(layer_ids):
        H0_train_encoded[layer] = encode(autoencoders[layer], H0_train[layer])
        H1_train_encoded[layer] = encode(autoencoders[layer], H1_train[layer])
        H0_test_encoded[layer] = encode(autoencoders[layer], H0_test[layer])
        H1_test_encoded[layer] = encode(autoencoders[layer], H1_test[layer])

    # decoded versions are only needed for the reconstruction error report
    H0_train_decoded = {}
    H1_train_decoded = {}
    H0_test_decoded = {}
    H1_test_decoded = {}

    for layer in layer_ids:
        H0_train_decoded[layer] = decode(autoencoders[layer], H0_train_encoded[layer])
        H1_train_decoded[layer] = decode(autoencoders[layer], H1_train_encoded[layer])
        H0_test_decoded[layer] = decode(autoencoders[layer], H0_test_encoded[layer])
        H1_test_decoded[layer] = decode(autoencoders[layer], H1_test_encoded[layer])

    # print reconstruction error (mean squared error on the first sentences)
    for layer in layer_ids:
        print(f"layer {layer}")
        print(f"train reconstruction error: {np.mean((H0_train[layer] - H0_train_decoded[layer])**2)}")
        print(f"test reconstruction error: {np.mean((H0_test[layer] - H0_test_decoded[layer])**2)}")
        print("-"*30)

    # overwrite original data: from here on the autoencoder codes are analysed
    H0_train = H0_train_encoded
    H1_train = H1_train_encoded
    H0_test = H0_test_encoded
    H1_test = H1_test_encoded

    # delete data (the encoded dicts stay alive through the H*_ names above)
    del H0_train_decoded
    del H1_train_decoded
    del H0_test_decoded
    del H1_test_decoded
    del H0_train_encoded
    del H1_train_encoded
    del H0_test_encoded
    del H1_test_encoded

    gc.collect()
    # the suffix ends up in the names of all result and plot files written later
    model_name = model_name + "_autoencoder"

        

We split the training set again, since we want an untouched part of the training set for our removal code.
For some methods we use the differences between contrastive pairs. We standardize all data.

In [None]:
# First half of the training data ("_s"): used to find directions.
# Second half ("_clf"): kept aside for the classifier / removal experiments.
n_dir_find = int(H0_train[layer_ids[0]].shape[0]/2)
start_clf = n_dir_find
find_part = slice(None, n_dir_find)
clf_part = slice(start_clf, None)

y_train_s, y_train_clf = y_train[find_part], y_train[clf_part]

# per-sentence activations of both halves
H0_train_s = {layer: H0_train[layer][find_part] for layer in H0_train}
H1_train_s = {layer: H1_train[layer][find_part] for layer in H0_train}
H0_train_clf = {layer: H0_train[layer][clf_part] for layer in H0_train}
H1_train_clf = {layer: H1_train[layer][clf_part] for layer in H0_train}

# differences between the two members of each pair
H_train_s = {layer: H0_train[layer][find_part] - H1_train[layer][find_part] for layer in H0_train}
H_train_clf = {layer: H0_train[layer][clf_part] - H1_train[layer][clf_part] for layer in H0_train}
H_test = {layer: H0_test[layer] - H1_test[layer] for layer in H0_train}

In [None]:
# normalizing data
scalers, dscalers = {}, {}
for layer in layer_ids:
    # scaler for the raw activations, fitted on both sentences of the direction-finding half
    scaler = StandardScaler()
    scaler.fit(np.concatenate([H0_train_s[layer], H1_train_s[layer]], axis=0))
    scalers[layer] = scaler
    for split in (H0_train_s, H1_train_s, H0_train_clf, H1_train_clf, H0_test, H1_test):
        split[layer] = scaler.transform(split[layer])

    # separate scaler for the pair differences, fitted on the direction-finding half
    diff_scaler = StandardScaler()
    dscalers[layer] = diff_scaler
    diff_scaler.fit(H_train_s[layer])
    for split in (H_train_s, H_train_clf, H_test):
        split[layer] = diff_scaler.transform(split[layer])

In [None]:
# The sentiment setup has no contrastive pairs, so the standardized activations
# themselves take the place of the pair differences in all following cells.
if sentiment_data:
    H_train_s = H0_train_s
    H_train_clf = H0_train_clf
    H_test = H0_test
    

# Finding directions using different methods

We find the directions using the hidden representation of our formatted sentences, either directly or after taking differences between contrastive pairs. To ensure that all directions point towards positive utility, we project the training data onto the unoriented direction and determine the sign of the orientation with logistic regression.

In [None]:
# directions[method][layer] holds the direction found by `method` in `layer`
directions = {}

In [None]:
# PCA on differences of contrastive pairs
directions["PCA_diffs"] = {}
for layer in tqdm(layer_ids):
    layer_data = H_train_s[layer]
    pca = PCA(n_components=1, whiten=True).fit(layer_data)
    # orient the component: fit a logistic regression on the 1-d projection
    # and multiply the component by the sign of its coefficient
    projected = pca.transform(layer_data)
    orientation_clf = LogisticRegression(solver='liblinear').fit(projected, y_train_s)
    orientation = np.sign(orientation_clf.coef_).squeeze()
    directions["PCA_diffs"][layer] = orientation * pca.components_.squeeze()

In [None]:
# Class means
directions["ClassMeans"] = {}
for layer in tqdm(layer_ids):
    layer_data = H_train_s[layer]
    # direction = mean of the label-1 examples minus mean of the label-0 examples
    positive_mean = layer_data[y_train_s == 1].mean(axis=0)
    negative_mean = layer_data[y_train_s == 0].mean(axis=0)
    directions["ClassMeans"][layer] = positive_mean - negative_mean

In [None]:
# Logistic regression
# The direction is the weight vector of a logistic regression. It is fitted on the
# direction-finding half (H_train_s / y_train_s) like the other methods, so that the
# second half (H_train_clf) stays untouched for the removal experiments and the
# "train accuracy" of the separation test refers to the data the direction was found on.
directions["LogReg"] = {}
for layer in tqdm(layer_ids):
    lr = LogisticRegression(solver='liblinear', C=1e-2, random_state=0, max_iter=50).fit(H_train_s[layer], y_train_s)
    directions["LogReg"][layer] = lr.coef_.squeeze()

In [None]:
# store all found directions; the file name contains the (possibly suffixed) model name
fname = os.path.join(results_dir, f'directions_{model_name}.pkl')
with open(fname, 'wb') as f:
    pickle.dump(directions, f, pickle.HIGHEST_PROTOCOL)

In [None]:
# reload the directions written by the previous cell
# NOTE: pickle.load executes code from the file; only load files you created yourself.
fname = os.path.join(results_dir, f'directions_{model_name}.pkl')
with open(fname, 'rb') as f:
    directions = pickle.load(f)

### Cosine similarity between directions

In [None]:
# calculate cosine similarity between directions
# dfs gets one method-by-method similarity table per entry of layer_ids (same order)
dfs = []
for layer in tqdm(layer_ids):
    per_method = {method: directions[method][layer].squeeze() for method in directions}
    df = pd.DataFrame.from_dict(per_method, orient='index')
    similarity = cosine_similarity(df.values)
    dfs.append(pd.DataFrame(similarity, index=df.index, columns=df.index))

In [None]:
# dfs is a list ordered like layer_ids, so it has to be indexed by position.
# The file name uses the actual layer id of that position (previously the
# list position itself was written into the name and read like a layer id).
layer_idx = 3
layer = layer_ids[layer_idx]
sns.heatmap(dfs[layer_idx], annot=True)
plt.savefig(os.path.join(plots_dir, f'cos_similarity_{model_name}_layer_{layer}.png'), dpi=300, bbox_inches='tight')

# Classification - Test for Correlation
How well can the found directions separate the data? We test on differences of the hidden representation of the test data.

In [None]:
def classification(H, y, direction):
    """Return the accuracy of classifying ``H`` by the sign of its projection on ``direction``.

    A row counts as predicted positive when its dot product with ``direction``
    is greater than zero; the prediction is compared element-wise with ``y``.
    """
    scores = np.dot(H, direction.T)
    predicted_positive = scores > 0
    n_correct = (predicted_positive == y).sum()
    return n_correct / len(y)

In [None]:
# display the names of the methods for which directions are available
directions.keys()

In [None]:
# accuracy of sign-based classification along each direction, per method and layer
separation_test_accs = {}
separation_train_accs = {}

for method in tqdm(directions.keys()):
    test_by_layer, train_by_layer = {}, {}
    for layer in layer_ids:
        layer_direction = directions[method][layer]
        if method == "Random":
            # several random directions per layer: report the mean accuracy over all of them
            random_runs = layer_direction.shape[0]
            test_total = sum(classification(H_test[layer], y_test, vector) for vector in layer_direction)
            train_total = sum(classification(H_train_s[layer], y_train_s, vector) for vector in layer_direction)
            test_by_layer[layer] = test_total/random_runs
            train_by_layer[layer] = train_total/random_runs
        else:
            test_by_layer[layer] = classification(H_test[layer], y_test, layer_direction)
            train_by_layer[layer] = classification(H_train_s[layer], y_train_s, layer_direction)
    separation_test_accs[method] = test_by_layer
    separation_train_accs[method] = train_by_layer


In [None]:
# plot the separation accuracies with the project helper plot_lines and save them to plots_dir
plot_lines(separation_train_accs, "train accuracy", save_path=os.path.join(plots_dir, f'separation_train_accs_{model_name}.png'), method_names=directions.keys(), loc='center right')
plot_lines(separation_test_accs, "test accuracy", save_path=os.path.join(plots_dir, f'separation_test_accs_{model_name}.png'), method_names=directions.keys(), loc='center right')

# Erasure

In [None]:
# project onto hyperplane perpendicular to direction
def project(activation, direction):
    """Remove the component of ``activation`` that lies along ``direction``.

    For every row the coefficient ``<row, direction> / ||direction||^2`` is
    computed and that multiple of ``direction`` is subtracted from the row.
    """
    coefficients = np.dot(activation, direction) / np.sum(direction ** 2)
    return activation - np.outer(coefficients, direction)

# logistic regression probe (the function name is kept for existing callers)
def linear_regr(H_trainl, H_testl, y_train, y_test):
    """Fit a logistic-regression probe and return its bias-free train/test accuracy.

    The classifier is fitted on ``H_trainl`` / ``y_train``. Both accuracies are
    computed with ``classification``, i.e. from the sign of the projection on
    the learned weight vector only; the fitted intercept is not used.

    Returns:
        tuple ``(train_accuracy, test_accuracy)``.
    """
    clf = LogisticRegression(solver='liblinear', C=1e-2, random_state=0, max_iter=50).fit(H_trainl, y_train)
    # Only the weight vector is needed. The former clf.score(...) calls were
    # overwritten right away, so they (and the intercept reset) are gone.
    weights = clf.coef_.squeeze()
    acctr = classification(H_trainl, y_train, weights)
    accte = classification(H_testl, y_test, weights)
    return acctr, accte

def erase_concept(H_train, H_test, y_train, y_test, direction=None, layers=None):
    """Probe each layer after projecting out the given direction(s).

    Args:
        H_train, H_test: mappings layer -> activations used to fit / evaluate the probe.
        y_train, y_test: labels belonging to ``H_train`` / ``H_test``.
        direction: mapping layer -> direction to remove. A 1-d entry is a single
            direction; an entry with more dimensions is treated as a stack of
            directions that are removed one at a time, with the accuracies
            averaged. ``None`` probes the unmodified activations.
        layers: iterable of layers to process; defaults to all layers of the
            module-level ``model``.

    Returns:
        tuple ``(train_acc, test_acc)`` of dicts mapping layer -> accuracy.
    """
    # resolve the default at call time instead of freezing a list at definition time
    if layers is None:
        layers = range(model.config.num_hidden_layers)

    def _probe_after_projection(layer, vector):
        """Project ``vector`` out of both splits of ``layer`` and fit the probe."""
        return linear_regr(project(H_train[layer], vector), project(H_test[layer], vector), y_train, y_test)

    train_acc_l, test_acc_l = {}, {}
    for layer in tqdm(layers):
        if direction is None:
            acctr, accte = linear_regr(H_train[layer], H_test[layer], y_train, y_test)
        elif direction[layer].ndim > 1:
            # several directions for this layer: average over single-direction removals
            results = [_probe_after_projection(layer, vector) for vector in direction[layer]]
            acctr = sum(train_acc for train_acc, _ in results) / len(results)
            accte = sum(test_acc for _, test_acc in results) / len(results)
        else:
            acctr, accte = _probe_after_projection(layer, direction[layer])
        train_acc_l[layer] = acctr
        test_acc_l[layer] = accte

    return train_acc_l, test_acc_l
    
def leace(H_train_s, y_train_s, H_train_clf, y_train_clf, H_test, y_test, layers=None):
    """Erase the concept with LEACE and probe what is left, layer by layer.

    Args:
        H_train_s, y_train_s: layer -> activations and labels used to fit the eraser.
        H_train_clf, y_train_clf: layer -> activations and labels used to fit the probe
            after erasure.
        H_test, y_test: layer -> activations and labels used to evaluate the probe
            after erasure.
        layers: iterable of layers to process; defaults to all layers of the
            module-level ``model``.

    Returns:
        tuple ``(train_acc, test_acc)`` of dicts mapping layer -> accuracy.
    """
    # resolve the default at call time instead of freezing a list at definition time
    if layers is None:
        layers = range(model.config.num_hidden_layers)

    train_acc_l, test_acc_l = {}, {}
    for layer in tqdm(layers):
        # eraser is trained on first half of training set
        eraser = LeaceEraser.fit(torch.from_numpy(H_train_s[layer]), torch.from_numpy(y_train_s))
        # erase from second half of training set and test set
        H_train_clfl = eraser(torch.from_numpy(H_train_clf[layer])).numpy()
        H_testl = eraser(torch.from_numpy(H_test[layer])).numpy()

        acctr, accte = linear_regr(H_train_clfl, H_testl, y_train_clf, y_test)
        train_acc_l[layer] = acctr
        test_acc_l[layer] = accte

    return train_acc_l, test_acc_l

In [None]:
# result containers: method name -> {layer: accuracy}
train_accs, test_accs = {}, {}

In [None]:
train_accs, test_accs = {}, {}

# remove every method's direction from the second training half and the test set, then probe again
for key, method_directions in directions.items():
    print(key)
    train_accs[key], test_accs[key] = erase_concept(H_train_clf, H_test, y_train_clf, y_test, method_directions, layers=layer_ids)


In [None]:
# baseline without erasure
# The probe is fitted on the same split (H_train_clf / y_train_clf) as in all erasure
# runs, so that the baseline and the erasure accuracies are directly comparable.
train_accs["NoErasure"], test_accs["NoErasure"] = erase_concept(H_train_clf, H_test, y_train_clf, y_test, layers=layer_ids)

In [None]:
# erase with leace
# (eraser fitted on the first training half, probe fitted on the second half)
train_accs["LEACE"], test_accs["LEACE"] = leace(H_train_s, y_train_s, H_train_clf, y_train_clf, H_test, y_test, layers=layer_ids)

In [None]:
# save data
# train and test accuracies are stored together as a two-element list
fname = os.path.join(results_dir, f'removal_{model_name}.pkl')
with open(fname, 'wb') as f:
    pickle.dump([train_accs, test_accs], f, pickle.HIGHEST_PROTOCOL)

In [None]:
# load data
# NOTE: pickle.load executes code from the file; only load files you created yourself.
fname = os.path.join(results_dir, f'removal_{model_name}.pkl')
with open(fname, 'rb') as f:
    train_accs, test_accs = pickle.load(f)

In [None]:
# plot
# one line per entry of train_accs / test_accs (the erased directions, NoErasure, LEACE)
plot_lines(train_accs, "Train accuracy", save_path=os.path.join(plots_dir, f'removal_train_accs_{model_name}.png'), method_names=train_accs.keys(), loc='lower right')
plot_lines(test_accs, "Test accuracy", save_path=os.path.join(plots_dir, f'removal_test_accs_{model_name}.png'), method_names=test_accs.keys(), loc='lower right')

In [None]:
# settings for the text generation experiment
# number of sentences sampled as generation prompts
num_test_sentences = 100
generations_dir = make_dir(os.path.join(results_dir, 'generations_pythia/'))

# NOTE: this rebinds data_dir (set earlier to the utilitarianism folder) to the general data folder
data_dir = make_dir(os.path.join(cwd, 'data/'))

# text file the candidate prompts are written to and read back from
data_file = os.path.join(data_dir, 'test_sentences.txt')
random_seed = 42
max_new_tokens = 40

In [None]:
# make data
# NOTE(review): this rebinds X_test and y_test; for the sentiment setup the labels
# used by the classification and erasure cells are replaced here, so those cells
# should not be re-run afterwards without recomputing them.
X_test = load_util_data(data_dir=os.path.join(cwd, 'data/ethics/utilitarianism/'), split='test')
X_test, y_test = mix_util_pairs(X_test)

# find_two_sentences is a project helper; presumably it selects first sentences by
# their number of "." / "," separated parts -- TODO confirm in utils
test_data_idxs, test_sentences = find_two_sentences(X_test[:, 0], split_str1=".", split_str2=",", larger_than1=2, larger_than2=1)

# explicit encoding so that writing and reading do not depend on the platform default
with open(data_file, "w", encoding="utf-8") as f:
    f.writelines(s + " \n" for s in test_sentences)

# load data
with open(data_file, 'r', encoding="utf-8") as f:
    test_sentences = [line.strip() for line in f]

# reproducible random sample of prompts (without replacement)
np.random.seed(random_seed)
idxs = np.random.choice(len(test_sentences), num_test_sentences, replace=False)
test_sentences = [test_sentences[idx] for idx in idxs]
# preview; slicing (instead of indexing 0..9) also works when fewer than 10 sentences were sampled
for sentence in test_sentences[:10]:
    print(sentence)

In [None]:
# generate continuations for all test sentences, batch by batch
generations = []
for sentence_batch in batchify(test_sentences, batch_size):
    with torch.no_grad():
        # the seed is reset before every batch
        torch.random.manual_seed(random_seed)
        inputs = tokenizer(sentence_batch, return_tensors="pt", padding=True, max_length=512, truncation=True)
        attention_mask = inputs.attention_mask.to(DEVICE)
        generate_ids = wrapped_model.generate(
            inputs.input_ids.to(DEVICE),
            attention_mask=attention_mask,
            max_new_tokens=max_new_tokens,
            use_cache=True,
        )
        generated = tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)

    generations.extend(generated)

# save generations as text files
output_path = os.path.join(generations_dir, f"generations_neutral_{model_name}.txt")
with open(output_path, "w") as f:
    for item in generations:
        # one generation per line: newline characters inside a generation become spaces
        f.write(item.replace("\n", " ") + "\n")