# Generating LLM Answers via Steering and Applying SEPs

#### This notebook demonstrates how we applied the SEPs and the steering vectors when generating LLM answers

In [None]:
from model_loader import HuggingfaceModel
import pandas as pd
import numpy as np
from datasets import load_dataset
from openai import OpenAI
import joblib

### Get the SQuAD 2.0 dataset

In [None]:
# Load the SQuAD 2.0 dataset
squad_dataset = load_dataset("squad_v2")

# Convert the training and validation splits to pandas DataFrames
train_df = pd.DataFrame(squad_dataset['train'])
validation_df = pd.DataFrame(squad_dataset['validation'])


def _build_base_prompt(row):
    """Return the context-grounded QA prompt for one dataset row.

    Reads ``row['context']`` and ``row['question']`` and embeds them in the
    fixed instruction text. Shared by the train and validation frames so the
    two prompts cannot drift apart.
    """
    return (
        "Answer the following question using only the context provided. "
        "If you cannot answer it given the context provided, then you can refuse to answer the question. "
        "For example, if the answer is not clear from the context, you should refuse. "
        "You must NOT use knowledge outside the context provided to answer the question.\n"
        f"Context: {row['context']}\n"
        f"Question: {row['question']}\n"
        "Answer:"
    )


def _has_no_answer(answers):
    """Return True when the ``answers`` record carries no answer text."""
    return len(answers['text']) == 0


# Make a base prompt column
train_df["base_prompt"] = train_df.apply(_build_base_prompt, axis=1)
validation_df["base_prompt"] = validation_df.apply(_build_base_prompt, axis=1)

# Check for impossible samples in the dataset (no answer)
train_df['is_impossible'] = train_df['answers'].apply(_has_no_answer)
validation_df['is_impossible'] = validation_df['answers'].apply(_has_no_answer)

# For a quick smoke test, subsample the validation set before splitting, e.g.
# validation_df.sample(frac=0.2, random_state=42).

# Split the validation set into a new train and test set (70% train, 30% test)
train_val_split = validation_df.sample(frac=0.7, random_state=42)
test_val_split = validation_df.drop(train_val_split.index)
assert len(train_val_split) + len(test_val_split) == len(validation_df), \
    "train/test split does not partition the validation set"

# Separate possible and impossible samples and tag each with its type.
# `.assign` returns a new frame, so no column is written onto a filtered
# slice (which is what raised SettingWithCopyWarning before).
possible_samples_train = train_val_split[~train_val_split['is_impossible']].assign(type='possible')
impossible_samples_train = train_val_split[train_val_split['is_impossible']].assign(type='impossible')
possible_samples_test = test_val_split[~test_val_split['is_impossible']].assign(type='possible')
impossible_samples_test = test_val_split[test_val_split['is_impossible']].assign(type='impossible')

# Concatenate the sampled datasets into one
combined_samples_train = pd.concat([possible_samples_train, impossible_samples_train], ignore_index=True)
combined_samples_test = pd.concat([possible_samples_test, impossible_samples_test], ignore_index=True)

# Shuffle the combined dataset to mix possible and impossible samples
combined_samples_train = combined_samples_train.sample(frac=1, random_state=42).reset_index(drop=True)
combined_samples_test = combined_samples_test.sample(frac=1, random_state=42).reset_index(drop=True)





In [None]:
# Write both shuffled splits to disk; to_csv is called with its defaults,
# exactly as before, so the row index is written as the first column.
for _split_frame, _csv_path in (
    (combined_samples_train, "combined_samples_train_new.csv"),
    (combined_samples_test, "combined_samples_test_new.csv"),
):
    _split_frame.to_csv(_csv_path)

### Instantiate Models and Steering Vectors

##### Wrapper to Apply Activation Steering

This code, adapted from https://github.com/nrimsky/CAA, allows us to apply steering vectors. Please download the repository so that the `CAA` package can be imported directly, as shown below.

In [None]:
import torch as t
from transformers import AutoTokenizer, AutoModelForCausalLM
from matplotlib import pyplot as plt
from matplotlib.ticker import ScalarFormatter
from CAA.utils.helpers import add_vector_from_position, find_instruction_end_postion, get_model_path
from CAA.utils.tokenize import (
    tokenize_llama_chat,
    tokenize_llama_base,
    ADD_FROM_POS_BASE,
    ADD_FROM_POS_CHAT,
)
from typing import Optional


class AttnWrapper(t.nn.Module):
    """
    Pass-through around an attention module that remembers the first element
    of the most recent output it produced.
    """

    def __init__(self, attn):
        super().__init__()
        # Filled in on every forward pass; None until the first call.
        self.activations = None
        self.attn = attn

    def forward(self, *args, **kwargs):
        # Delegate untouched, keep a handle on element 0, hand the result back.
        attn_result = self.attn(*args, **kwargs)
        self.activations = attn_result[0]
        return attn_result


class BlockOutputWrapper(t.nn.Module):
    """
    Wrapper for a transformer block that saves its output, can add a steering
    vector to that output, and can unembed intermediate results.

    Per forward pass it:
      * stores ``output[0]`` of the wrapped block in ``self.activations``;
      * if ``calc_dot_product_with`` is set, appends ``(top_token, cosine)``
        to ``self.dot_products`` for the last position of batch item 0;
      * if ``add_activations`` is set, replaces ``output[0]`` with the result
        of ``add_vector_from_position``;
      * if ``save_internal_decodings`` is True, stores the unembedded block
        output, attention output, intermediate residual and MLP output.
    """

    def __init__(self, block, unembed_matrix, norm, tokenizer):
        super().__init__()
        self.block = block
        self.unembed_matrix = unembed_matrix
        self.norm = norm
        self.tokenizer = tokenizer

        # Wrap the attention module so its output can be read after the pass.
        self.block.self_attn = AttnWrapper(self.block.self_attn)
        self.post_attention_layernorm = self.block.post_attention_layernorm

        self.attn_out_unembedded = None
        self.intermediate_resid_unembedded = None
        self.mlp_out_unembedded = None
        # forward() writes ``block_output_unembedded``; it was previously never
        # initialised, so reading it before a decoding pass raised
        # AttributeError.
        self.block_output_unembedded = None
        # Legacy attribute name, never written by forward(); kept so existing
        # attribute reads do not break.
        self.block_out_unembedded = None

        self.activations = None
        self.add_activations = None
        self.from_position = None

        self.save_internal_decodings = False

        self.calc_dot_product_with = None
        self.dot_products = []

    def forward(self, *args, **kwargs):
        """Run the wrapped block and apply the recording/steering options.

        Returns the block's output tuple; element 0 is replaced by the
        steered tensor when ``add_activations`` is set.
        """
        output = self.block(*args, **kwargs)
        self.activations = output[0]
        if self.calc_dot_product_with is not None:
            last_token_activations = self.activations[0, -1, :]
            decoded_activations = self.unembed_matrix(self.norm(last_token_activations))
            top_token_id = t.topk(decoded_activations, 1)[1][0]
            top_token = self.tokenizer.decode(top_token_id)
            # Cosine similarity between the last-token activation and the probe vector.
            dot_product = t.dot(last_token_activations, self.calc_dot_product_with) / (
                t.norm(last_token_activations) * t.norm(self.calc_dot_product_with)
            )
            self.dot_products.append((top_token, dot_product.cpu().item()))
        if self.add_activations is not None:
            augmented_output = add_vector_from_position(
                matrix=output[0],
                vector=self.add_activations,
                position_ids=kwargs["position_ids"],
                from_pos=self.from_position,
            )
            output = (augmented_output,) + output[1:]

        if not self.save_internal_decodings:
            return output

        # Whole block unembedded
        self.block_output_unembedded = self.unembed_matrix(self.norm(output[0]))

        # Self-attention unembedded
        attn_output = self.block.self_attn.activations
        self.attn_out_unembedded = self.unembed_matrix(self.norm(attn_output))

        # Intermediate residual unembedded.
        # The block input may arrive positionally or as ``hidden_states=...``.
        block_input = args[0] if args else kwargs["hidden_states"]
        # Out-of-place add: the previous ``+=`` overwrote the tensor stored in
        # ``self.block.self_attn.activations`` with attention + residual.
        intermediate_resid = attn_output + block_input
        self.intermediate_resid_unembedded = self.unembed_matrix(self.norm(intermediate_resid))

        # MLP unembedded
        mlp_output = self.block.mlp(self.post_attention_layernorm(intermediate_resid))
        self.mlp_out_unembedded = self.unembed_matrix(self.norm(mlp_output))

        return output

    def add(self, activations):
        """Set the vector that forward() adds to the block output."""
        self.add_activations = activations

    def reset(self):
        """Clear steering, saved activations, start position and dot-product state."""
        self.add_activations = None
        self.activations = None
        self.block.self_attn.activations = None
        self.from_position = None
        self.calc_dot_product_with = None
        self.dot_products = []


import torch as t
from transformers import AutoTokenizer, AutoModelForCausalLM
from typing import Optional
import matplotlib.pyplot as plt
from matplotlib.ticker import ScalarFormatter

class LlamaWrapper:
    """
    Loads a Hugging Face causal language model and replaces every decoder
    layer with a ``BlockOutputWrapper`` so that layer activations can be read,
    steered with added vectors, and decoded through the unembedding.
    """

    def __init__(
        self,
        size: str = "7b",
        model_path = "meta-llama/Llama-2-7b-chat-hf",
        use_chat: bool = True,
        override_model_weights_path: Optional[str] = None,
        cache_dir: Optional[str] = "./",
        gpu_device = 1,  # Default to device 1
    ):
        """
        Args:
            size: Model size tag; any value other than "7b" converts the model
                to half precision.
            model_path: Hugging Face model id or local path.
            use_chat: Selects chat vs. base tokenisation and the matching
                ADD_FROM_POS marker.
            override_model_weights_path: Optional state-dict file loaded over
                the pretrained weights.
            cache_dir: Cache directory passed to ``from_pretrained``.
            gpu_device: CUDA device index (or device string) to run on.
        """
        # Previously hard-coded to "cuda:1", which ignored `gpu_device` and
        # disagreed with the device_map below for any non-default value.
        self.device = self._resolve_device(gpu_device)
        self.use_chat = use_chat
        self.model_name_path = model_path

        # Load tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.model_name_path,
            cache_dir=cache_dir,
            use_fast=False,
            trust_remote_code=True
        )

        # Load model directly onto the resolved device
        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_name_path,
            cache_dir=cache_dir,
            device_map={"": self.device},
            trust_remote_code=True
        )

        # Override weights if specified.
        # NOTE(security): torch.load unpickles the file; only point this at
        # checkpoints from a trusted source.
        if override_model_weights_path is not None:
            self.model.load_state_dict(t.load(override_model_weights_path))

        # Convert model to half-precision for non-7b models
        if size != "7b":
            self.model = self.model.half()

        # Move model to device
        self.model = self.model.to(self.device)

        # Token ids of the ADD_FROM_POS_* marker with the first id dropped
        # (presumably the BOS token added by encode — TODO confirm). Used to
        # find the position from which steering is applied.
        marker = ADD_FROM_POS_CHAT if use_chat else ADD_FROM_POS_BASE
        self.END_STR = t.tensor(self.tokenizer.encode(marker)[1:]).to(self.device)

        # Wrap layers with BlockOutputWrapper
        for i, layer in enumerate(self.model.model.layers):
            self.model.model.layers[i] = BlockOutputWrapper(
                layer, self.model.lm_head, self.model.model.norm, self.tokenizer
            )

    @staticmethod
    def _resolve_device(gpu_device, cuda_available: Optional[bool] = None) -> str:
        """Translate ``gpu_device`` into a torch device string.

        Returns "cpu" when CUDA is unavailable, ``"cuda:<n>"`` for an integer
        index, and ``str(gpu_device)`` otherwise. ``cuda_available`` defaults
        to ``torch.cuda.is_available()`` and exists so the logic can be
        exercised without a GPU.
        """
        if cuda_available is None:
            cuda_available = t.cuda.is_available()
        if not cuda_available:
            return "cpu"
        if isinstance(gpu_device, int):
            return f"cuda:{gpu_device}"
        return str(gpu_device)

    def _tokenize(self, user_input: str, model_output: Optional[str] = None, system_prompt: Optional[str] = None):
        """Tokenise a prompt (chat or base format) into a (1, n) tensor on ``self.device``."""
        if self.use_chat:
            token_ids = tokenize_llama_chat(
                tokenizer=self.tokenizer, user_input=user_input, model_output=model_output, system_prompt=system_prompt
            )
        else:
            token_ids = tokenize_llama_base(tokenizer=self.tokenizer, user_input=user_input, model_output=model_output)
        return t.tensor(token_ids).unsqueeze(0).to(self.device)

    def set_save_internal_decodings(self, value: bool):
        """Enable or disable storing unembedded intermediate outputs on every layer."""
        for layer in self.model.model.layers:
            layer.save_internal_decodings = value

    def set_from_positions(self, pos: int):
        """Set the position from which steering vectors are added, on every layer."""
        for layer in self.model.model.layers:
            layer.from_position = pos

    def generate(self, tokens, max_new_tokens=100):
        """Generate from a token tensor and return the decoded first sequence."""
        with t.no_grad():
            instr_pos = find_instruction_end_postion(tokens[0], self.END_STR)
            self.set_from_positions(instr_pos)
            generated = self.model.generate(
                inputs=tokens, max_new_tokens=max_new_tokens, top_k=1, temperature=1.0
            )
            return self.tokenizer.batch_decode(generated)[0]

    def generate_text(self, user_input: str, model_output: Optional[str] = None, system_prompt: Optional[str] = None, max_new_tokens: int = 50) -> str:
        """Tokenise the prompt and return the decoded generation (prompt included)."""
        tokens = self._tokenize(user_input, model_output=model_output, system_prompt=system_prompt)
        return self.generate(tokens, max_new_tokens=max_new_tokens)

    def get_logits(self, tokens):
        """Run a forward pass on ``tokens`` and return the model logits."""
        with t.no_grad():
            instr_pos = find_instruction_end_postion(tokens[0], self.END_STR)
            self.set_from_positions(instr_pos)
            logits = self.model(tokens).logits
            return logits

    def get_logits_from_text(self, user_input: str, model_output: Optional[str] = None, system_prompt: Optional[str] = None) -> t.Tensor:
        """Tokenise the prompt and return the model logits for it."""
        tokens = self._tokenize(user_input, model_output=model_output, system_prompt=system_prompt)
        return self.get_logits(tokens)

    def get_last_activations(self, layer):
        """Return the activations saved by the given layer on its last forward pass."""
        return self.model.model.layers[layer].activations

    def set_add_activations(self, layer, activations):
        """Register a steering vector to be added to the given layer's output."""
        self.model.model.layers[layer].add(activations)

    def set_calc_dot_product_with(self, layer, vector):
        """Make the given layer record cosine similarities against ``vector``."""
        self.model.model.layers[layer].calc_dot_product_with = vector

    def get_dot_products(self, layer):
        """Return the ``(top_token, similarity)`` records collected by the given layer."""
        return self.model.model.layers[layer].dot_products

    def reset_all(self):
        """Clear steering and recorded state on every layer."""
        for layer in self.model.model.layers:
            layer.reset()

    def print_decoded_activations(self, decoded_activations, label, topk=10):
        """Print ``label`` followed by the top-k (token, integer percent) pairs."""
        data = self.get_activation_data(decoded_activations, topk)[0]
        print(label, data)

    def decode_all_layers(
        self,
        tokens,
        topk=10,
        print_attn_mech=True,
        print_intermediate_res=True,
        print_mlp=True,
        print_block=True,
    ):
        """Run a forward pass and print the decoded intermediate outputs of every layer.

        Requires ``set_save_internal_decodings(True)`` beforehand; otherwise
        the per-layer decodings are still None.
        """
        tokens = tokens.to(self.device)
        self.get_logits(tokens)
        for i, layer in enumerate(self.model.model.layers):
            print(f"Layer {i}: Decoded intermediate outputs")
            if print_attn_mech:
                self.print_decoded_activations(
                    layer.attn_out_unembedded, "Attention mechanism", topk=topk
                )
            if print_intermediate_res:
                self.print_decoded_activations(
                    layer.intermediate_resid_unembedded,
                    "Intermediate residual stream",
                    topk=topk,
                )
            if print_mlp:
                self.print_decoded_activations(
                    layer.mlp_out_unembedded, "MLP output", topk=topk
                )
            if print_block:
                self.print_decoded_activations(
                    layer.block_output_unembedded, "Block output", topk=topk
                )

    def plot_decoded_activations_for_layer(self, layer_number, tokens, topk=10):
        """Run a forward pass and plot the top-k decoded tokens for one layer.

        Requires ``set_save_internal_decodings(True)`` beforehand.
        """
        tokens = tokens.to(self.device)
        self.get_logits(tokens)
        layer = self.model.model.layers[layer_number]

        data = {}
        data["Attention mechanism"] = self.get_activation_data(
            layer.attn_out_unembedded, topk
        )[1]
        data["Intermediate residual stream"] = self.get_activation_data(
            layer.intermediate_resid_unembedded, topk
        )[1]
        data["MLP output"] = self.get_activation_data(layer.mlp_out_unembedded, topk)[1]
        data["Block output"] = self.get_activation_data(
            layer.block_output_unembedded, topk
        )[1]

        # Plotting
        fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(8, 6))
        fig.suptitle(f"Layer {layer_number}: Decoded Intermediate Outputs", fontsize=21)

        for ax, (mechanism, values) in zip(axes.flatten(), data.items()):
            # Distinct names so the `tokens` argument is not shadowed.
            token_labels, scores = zip(*values)
            ax.barh(token_labels, scores, color="skyblue")
            ax.set_title(mechanism)
            ax.set_xlabel("Value")
            ax.set_ylabel("Token")

            # Set scientific notation for x-axis labels when numbers are small
            ax.xaxis.set_major_formatter(ScalarFormatter(useMathText=True))
            ax.ticklabel_format(style="sci", scilimits=(0, 0), axis="x")

        plt.tight_layout(rect=[0, 0.03, 1, 0.95])
        plt.show()

    def get_activation_data(self, decoded_activations, topk=10):
        """Return the top-k tokens for ``decoded_activations[0][-1]``.

        The slice is softmaxed over its last dimension. Returns two lists of
        ``(token, value)`` pairs: the first with probabilities truncated to
        integer percent, the second with the raw probabilities.
        """
        softmaxed = t.nn.functional.softmax(decoded_activations[0][-1], dim=-1)
        values, indices = t.topk(softmaxed, topk)
        raw_probs = values.tolist()
        probs_percent = [int(v * 100) for v in raw_probs]
        tokens = self.tokenizer.batch_decode(indices.unsqueeze(-1))
        return list(zip(tokens, probs_percent)), list(zip(tokens, raw_probs))


In [None]:
# Load the HF model used for the unsteered generations and hidden states.
# NOTE: Due to compute constraints we loaded the two models separately. You
# can comment out one loader and use the other; comment out the code that
# depends on it in the following cells accordingly.
import torch
model_name = "meta-llama/Llama-2-7b-chat-hf"
cache_dir = "./"
hf_model = HuggingfaceModel(model_name, cache_dir, max_new_tokens=200)

# Per-layer probe classifiers (indexed by layer number in the next cell).
# NOTE(security): joblib.load unpickles the file; only load trusted files.
model_dict_slt_ent = joblib.load('model_dict_slt_ent.pkl')
model_dict_tbg_ent = joblib.load('model_dict_tbg_ent.pkl')

# Layer whose output is steered.
layer = 13
# map_location="cpu" makes loading independent of the device the vectors were
# saved from; they are moved to the steering model's GPU before use.
hall_vec = torch.load(
    f"CAA/normalized_vectors/hallucination/vec_layer_{layer}_Llama-2-7b-chat-hf.pt",
    map_location="cpu",
)
ref_vec = torch.load(
    f"CAA/normalized_vectors/refusal/vec_layer_{layer}_Llama-2-7b-chat-hf.pt",
    map_location="cpu",
)


# Steering strengths: adjust the multipliers for 1xHallucination and
# 2xHallucination. Each vector is scaled by its multiplier before being added
# to the layer output, so a negative value subtracts the direction.
hall_multiplier = -2
ref_multiplier = 2

# load the model to steer (separate from other model); reuse the id and cache
# directory defined above instead of repeating the literals
hf_model2 = LlamaWrapper(model_path=model_name, cache_dir=cache_dir)


In [None]:
## END to END flow doing generations and running probes

from collections import Counter


def _probe_probabilities(hidden_states):
    """Run the per-layer probes on the base model's hidden states.

    ``hidden_states[1]`` feeds the "slt" probes and ``hidden_states[-1]`` the
    "tbg" probes (second-last token and last token before generation, per the
    original variable names — TODO confirm against HuggingfaceModel.predict).
    Each is indexed ``[layer][0]`` and its first dimension sets the number of
    layers probed.

    Returns a dict mapping ``base_slt_layer_{n}_prob`` and
    ``base_tbg_layer_{n}_prob`` to the probe's ``predict_proba(...)[0][1]``.
    """
    sec_last_token_embedding = hidden_states[1]
    last_tok_bef_gen_embedding = hidden_states[-1]

    probabilities = {}
    for layer_num in range(sec_last_token_embedding.shape[0]):
        # Only the probabilities are stored, so the separate .predict() calls
        # (whose results were discarded) are no longer made.
        probabilities[f"base_slt_layer_{layer_num}_prob"] = model_dict_slt_ent[layer_num].predict_proba(
            [np.asarray(sec_last_token_embedding[layer_num][0])]
        )[0][1]
        probabilities[f"base_tbg_layer_{layer_num}_prob"] = model_dict_tbg_ent[layer_num].predict_proba(
            [np.asarray(last_tok_bef_gen_embedding[layer_num][0])]
        )[0][1]
    return probabilities


def _generate_steered_answer(prompt, steering_vector, max_new_tokens=200):
    """Generate an answer with ``steering_vector`` added at ``layer``.

    Steering state is cleared before and after the generation so the model is
    never left steered. Returns the text after the last "[/INST]" with "</s>"
    removed and surrounding whitespace stripped.
    """
    hf_model2.reset_all()
    hf_model2.set_add_activations(layer, steering_vector)
    try:
        raw_text = hf_model2.generate_text(prompt, max_new_tokens=max_new_tokens)
    finally:
        hf_model2.reset_all()
    return raw_text.split("[/INST]")[-1].replace("</s>", "").strip()


# One split is used for the loop bound, the slice and the output file names.
# NOTE(review): the previous code bounded the loop by len(combined_samples_test)
# but sliced combined_samples_train while writing files named "test_..."; the
# test split is assumed to be the intended one — confirm.
samples = combined_samples_test

# Scaled steering vectors are loop-invariant: move and multiply them once.
hall_steering = hall_multiplier * hall_vec.to(hf_model2.device)
ref_steering = ref_multiplier * ref_vec.to(hf_model2.device)

chunk_size = 100
for start_index in range(0, len(samples), chunk_size):
    end_index = min(start_index + chunk_size, len(samples))
    chunk = samples.iloc[start_index:end_index]

    # Rows (original columns + predictions) collected for this chunk
    combined_rows = []

    for index, row in chunk.iterrows():
        print(index)

        # Unsteered generation; the answer is the text after the echoed prompt.
        base_output_text, base_hidden_states = hf_model.predict(row['base_prompt'], temperature=1.0, return_latent=True)
        base_answer = base_output_text[len(row['base_prompt']):].strip()
        model_probs_preds_base = {"base_answer": base_answer}

        print(f"Non steered: {base_answer}")

        ## RUN PROBES
        model_probs_preds_base.update(_probe_probabilities(base_hidden_states))

        ## Steered generations (hallucination vector, then refusal vector)
        model_probs_preds_base["base_hall_steer_answer"] = _generate_steered_answer(
            row['base_prompt'], hall_steering
        )
        model_probs_preds_base["base_ref_steer_answer"] = _generate_steered_answer(
            row['base_prompt'], ref_steering
        )

        # Combine the current row with its predictions into a single dictionary
        combined_row = row.to_dict()
        combined_row.update(model_probs_preds_base)
        combined_rows.append(combined_row)

    # Create a DataFrame from the combined rows
    chunk_with_predictions = pd.DataFrame(combined_rows)

    # Save the chunk with an identifier
    chunk_identifier = f"{start_index}_{end_index - 1}"
    filename = f"test_twox_latest_B_prompt_processed_chunk_{chunk_identifier}.csv"
    chunk_with_predictions.to_csv(filename, index=False)
    print(f"Saved chunk: {filename}")