In [4]:
import json
import matplotlib.pyplot as plt
import numpy as np
import os
from sklearn.manifold import TSNE
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

%load_ext autoreload
%autoreload 2
import steering_vectors

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


#### Preliminaries

Let's first load the model as usual. 

In [16]:
# Load the model and its tokenizer.
# Prefer the first CUDA GPU, but fall back to the CPU so this cell still runs
# (slowly) on a machine without a GPU instead of failing while loading.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
model_name = 'microsoft/Phi-3.5-mini-instruct'

# Load Phi model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=device,       # put the whole model on the selected device
    torch_dtype="auto",      # let transformers choose the dtype for this checkpoint
    # NOTE(review): trust_remote_code runs Python code shipped with the model
    # repository; keep it only for repositories you trust.
    trust_remote_code=True,
)

tokenizer = AutoTokenizer.from_pretrained(model_name)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Next, we will define a function that applies a chat template to our instructions. 

In [6]:
def tokenize_instructions(tokenizer, instructions):
    """Apply the tokenizer's chat template to `instructions` and return the token ids.

    Args:
        tokenizer: object exposing `apply_chat_template` (here, the tokenizer
            loaded with `AutoTokenizer.from_pretrained`).
        instructions: chat messages, passed through unchanged as the first
            argument of `apply_chat_template`.

    Returns:
        The `input_ids` attribute of the `apply_chat_template` result. Only the
        ids are returned; any other fields of the result are dropped.
    """
    template_options = {
        "padding": True,                 # pad the batch to a common length
        "truncation": False,             # never cut prompts short
        "return_tensors": "pt",          # ask for PyTorch tensors
        "return_dict": True,             # needed so the result exposes `input_ids`
        "add_generation_prompt": True,   # append the assistant-turn prefix
    }
    encoded = tokenizer.apply_chat_template(instructions, **template_options)
    return encoded.input_ids

And we will load our data. 

In [7]:
# Load one JSON file per persona. Per the original note, each entry is formatted as
# "{question} Choices: (A) Yes. (B) No. Answer: {}".
data_dir = '../data/personas'
personas = ["curious", "close-minded"]
formatted_data = {}
for persona_name in personas:
    # Files are named formatted_data_<persona>.json inside data_dir.
    persona_path = os.path.join(data_dir, 'formatted_data_{}.json'.format(persona_name))
    with open(persona_path, 'r') as f:
        formatted_data[persona_name] = json.load(f)

### Steering for personas.

We will use 20 sample statements from each of two personas: i) close-minded and ii) curious. Over these we compute the model's representations, use the last token's representation to compute the steering vector, and steer with this raw vector by injecting it into a set of layers. 

Note: the results here may be slightly different from the blog post due to random seed differences :) 

In [23]:
# --- Configuration -----------------------------------------------------------
# get steering vector on number of samples
num_samples_to_use = 20
personaA = 'curious'
personaB = 'close-minded'
layer_of_choice = 17  # layer passed to find_steering_vecs
do_pca = False  # not used in this cell
steering_layers = [12, 13, 14, 15, 16, 17, 18, 19]  # layers passed to do_steering
steering_scale = 2  # +scale is labelled "towards A", -scale "towards B"

# --- Build the contrastive prompts -------------------------------------------
dd_with_c_A = formatted_data[personaA][:num_samples_to_use]
dd_with_c_B = formatted_data[personaB][:num_samples_to_use]
# Split on the real number of persona-A samples: if the file holds fewer than
# num_samples_to_use entries, splitting at num_samples_to_use would mix the groups.
num_A = len(dd_with_c_A)
instr = ["Give me ideas on what to do on a rainy day", "Give me suggestions on how to approach a colleague to resolve a conflict.", "Tell me how to write a phishing email."]
dd_with_instr = [[{"role": "user", "content": x}] for x in dd_with_c_A + dd_with_c_B]
toks = tokenize_instructions(tokenizer, dd_with_instr)

# --- Compute the steering vector ---------------------------------------------
# finds steering vector towards target_toks = B; in steering we subtract this vector aka we steer to A when scale>0
steering_vecs, raw_diffs = steering_vectors.find_steering_vecs(
    base_toks=toks[:num_A],
    target_toks=toks[num_A:],
    model=model,
    layer=layer_of_choice,
    pos=-1,  # last token position
    get_raw_diffs=True,
    batch_size=3,
)
print(raw_diffs.shape)

# --- Generate with and without steering --------------------------------------
# Move the vector to the device once instead of on every call inside the loop.
steering_vecs_on_device = steering_vecs.to(device)
# test it on the test data and evaluate
for prompt in instr:
    toks_test = tokenize_instructions(tokenizer, [{"role": "user", "content": prompt}])
    toks_test_on_device = toks_test.to(device)
    generations_baseline, _ = steering_vectors.do_steering(model, toks_test_on_device, None)
    generation_A, _ = steering_vectors.do_steering(model, toks_test_on_device, steering_vecs_on_device, scale=steering_scale, layer=steering_layers, proj=False, all_toks=False)  # towards A
    generation_B, _ = steering_vectors.do_steering(model, toks_test_on_device, steering_vecs_on_device, scale=-steering_scale, layer=steering_layers, proj=False, all_toks=False)  # towards B
    baseline = tokenizer.decode(generations_baseline[0], skip_special_tokens=True)
    to_A = tokenizer.decode(generation_A[0], skip_special_tokens=True)
    to_B = tokenizer.decode(generation_B[0], skip_special_tokens=True)
    print(f"BASELINE: {baseline}")
    print("**************************")
    # Label with the persona names defined above; the previous emotionA/emotionB
    # names are not defined anywhere in this notebook and fail on a fresh kernel.
    print(f"STEER TO {personaA}: {to_A}")
    print("**************************")
    print(f"STEER TO {personaB}: {to_B}")

100%|██████████| 7/7 [00:00<00:00,  9.36it/s]


torch.Size([20, 3072])
BASELINE: Give me ideas on what to do on a rainy day Having a rainy day can be an opportunity to enjoy indoor activities, relax, or even get creative. Here are some ideas to make the most of your time indoors:

1. Reading: Cozy up with a good book, magazine, or newspaper. Explore new genres or revisit your favorite stories.

2. Board games or card games: Gather family or friends for a fun, competitive game night.

3. Cooking or baking: Try out new recipes or baking a treat. It's a great way to spend time together and create delicious memories.

4. Arts and crafts: Engage in a creative activity, such
**************************
STEER TO curious: Give me ideas on what to do on a rainy day Having a rainy day can be a wonderful opportunity to engage in activities that are both enjoyable and productive. Here are several ideas for making the most of a rainy day:

1. Indoor activities:

   - Visit a local museum or art gallery to explore different exhibits, learn about h