## Imports and Setup

In [None]:
!pip install transformers==4.34.0
!pip install datasets==2.14.5
!pip install torch=='2.0.1+cu118'
!pip install circuitsvis==1.41.0
!pip install openai==0.28.1
!pip install wandb==0.15.12
!pip install nltk==3.8.1

In [2]:
import openai
import os
import wandb
import csv
import nltk
import zipfile
import random
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

from itertools import product
from getpass import getpass
from wandb import Artifact
from wandb import Api
from datasets import load_dataset
from collections import defaultdict
from scipy.optimize import linear_sum_assignment
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from transformers import AutoModel, AutoTokenizer

In [3]:
# Shared default device: the GPU when CUDA is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Autoencoders

In [4]:
class SparseAutoencoder(nn.Module):
    """
    Autoencoder with a ReLU hidden layer whose decoder reuses the encoder weight.

    The encoder weight has shape (hidden_size, input_size). In the forward pass its
    rows are L2-normalized, and the transpose of that same normalized matrix is used
    for decoding, so encoder and decoder weights are tied.

    Args:
        input_size: Size of the last dimension of the input.
        hidden_size: Number of hidden features.
        l1_coef: Stored on the instance and in ``kwargs``; not used inside this class.
    """

    def __init__(self, input_size, hidden_size, l1_coef):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.l1_coef = l1_coef

        # Constructor arguments, kept so the model can be rebuilt with SparseAutoencoder(**kwargs).
        self.kwargs = {'input_size': input_size, 'hidden_size': hidden_size, 'l1_coef': l1_coef}

        # Random draw first, then overwritten in place with an orthogonal initialization.
        self.encoder_weight = nn.Parameter(torch.randn(hidden_size, input_size))
        nn.init.orthogonal_(self.encoder_weight)

        self.encoder_bias = nn.Parameter(torch.zeros(self.hidden_size))
        self.decoder_bias = nn.Parameter(torch.zeros(input_size))

    def forward(self, x):
        """
        Encode ``x`` and decode it again.

        Returns:
            (features, reconstruction): the ReLU-activated hidden features and the
            reconstruction obtained from them.
        """
        unit_rows = F.normalize(self.encoder_weight, p=2, dim=1)

        pre_activation = F.linear(x, unit_rows, self.encoder_bias)
        features = F.relu(pre_activation)

        # Tied weights: decode with the transpose of the normalized encoder matrix.
        reconstruction = F.linear(features, unit_rows.t(), self.decoder_bias)
        return features, reconstruction

In [None]:
# Weights & Biases identifiers; these are combined into project and artifact paths.
entity_name = 'nlp_and_interpretability'

policy_model_name="gpt_neo_125m_utility_reward"
project_prefix = 'Autoencoder_training'

# Start a wandb run in the "<project_prefix>_<policy_model_name>_interp" project.
# `run` must exist before load_autoencoders_for_artifact is defined, because that
# function captures it as a default argument value.
interp_project_name = f"{project_prefix}_{policy_model_name}_interp"
run=wandb.init(project=interp_project_name)

In [6]:
# Path components used by the save/load helpers to build wandb artifact names.
# entity_name and project_prefix repeat the values assigned in the previous cell.
entity_name = 'nlp_and_interpretability'
project_prefix = 'Autoencoder_training'
artifact_prefix = 'autoencoders'

def save_models_to_folder(model_dict, save_dir):
    """
    Save PyTorch models from a dictionary to a specified directory.

    Each model is written with ``torch.save`` as the two-element list
    ``[model.kwargs, model.state_dict()]``.

    Args:
        model_dict (dict): Maps a model name to either a single model or a
            list/tuple of models. Every model must expose ``kwargs`` and
            ``state_dict()``.
        save_dir (str): The directory where models will be saved (created if missing).

    File naming:
        A single model, or a one-element list, is saved as ``<save_dir>/<model_name>``.
        When a list holds several models, each one is saved as
        ``<save_dir>/<model_name>_<index>`` so that later entries do not overwrite
        earlier ones.
    """
    os.makedirs(save_dir, exist_ok=True)

    for model_name, models in model_dict.items():
        # Accept a bare model as well as a list, e.g. a dict produced by loading.
        if not isinstance(models, (list, tuple)):
            models = [models]

        for i, model in enumerate(models):
            # Only disambiguate with the index when there is more than one model.
            file_name = f'{model_name}' if len(models) == 1 else f'{model_name}_{i}'
            model_path = os.path.join(save_dir, file_name)
            torch.save([model.kwargs, model.state_dict()], model_path)
            print(f"Saved {model_name} to {model_path}")

def save_autoencoders_for_artifact(
        autoencoders_base_big, autoencoders_base_small, autoencoders_rlhf_big, autoencoders_rlhf_small,
        policy_model_name, hyperparameters, alias, run
    ):
    '''
    Writes the four autoencoder groups to disk under 'saves/' and logs that folder as a
    wandb artifact of type 'model'. Note that these paths are to some extent hardcoded.

    The artifact is named '<artifact_prefix>_<simplified policy name>', where the simplified
    name is the part of policy_model_name after the last '/', with '-' replaced by '_'.
    It is logged on `run` with the aliases: simplified policy name, 'latest',
    'weights_tied' and `alias` (deduplicated and sorted). `hyperparameters` is attached
    as the artifact metadata.
    '''
    save_dir = 'saves'
    groups = {
        'base_big': autoencoders_base_big,
        'base_small': autoencoders_base_small,
        'rlhf_big': autoencoders_rlhf_big,
        'rlhf_small': autoencoders_rlhf_small,
    }
    for subfolder, group in groups.items():
        save_models_to_folder(group, save_dir=f'{save_dir}/{subfolder}')

    simplified_policy_name = policy_model_name.split('/')[-1].replace("-", "_")
    saved_artifact = Artifact(
        f'{artifact_prefix}_{simplified_policy_name}', metadata=hyperparameters, type='model'
    )
    saved_artifact.add_dir(save_dir, name=save_dir)

    # A set removes duplicate aliases; sorting makes the order deterministic.
    alias_list = sorted({simplified_policy_name, 'latest', 'weights_tied', alias})
    run.log_artifact(saved_artifact, aliases=alias_list)

def load_autoencoders_for_artifact(policy_model_name, alias='latest', run=run):
    '''
    Downloads a wandb artifact through `run` and loads its four autoencoder groups.
    Note that these paths are to some extent hardcoded.
    For example, try autoencoders_dict = load_autoencoders_for_artifact('pythia_70m_sentiment_reward')

    The artifact path is
    '<entity_name>/<project_prefix>_<policy_model_name>/<artifact_prefix>_<simplified name>:<alias>',
    where the simplified name is the part of policy_model_name after the last '/', with
    '-' replaced by '_'. The project segment uses policy_model_name as given.

    Returns a dict with keys 'base_big', 'base_small', 'rlhf_big' and 'rlhf_small', each
    holding the result of load_models_from_folder for the matching subfolder of 'saves'.
    '''
    simplified_policy_model_name = policy_model_name.split('/')[-1].replace('-', '_')
    project_path = f'{entity_name}/{project_prefix}_{policy_model_name}'
    artifact_ref = f'{artifact_prefix}_{simplified_policy_model_name}:{alias}'
    full_path = f'{project_path}/{artifact_ref}'
    print(f'Loading artifact from {full_path}')

    directory = run.use_artifact(full_path).download()
    save_dir = f'{directory}/saves'

    group_names = ('base_big', 'base_small', 'rlhf_big', 'rlhf_small')
    return {
        group: load_models_from_folder(f'{save_dir}/{group}')
        for group in group_names
    }

def load_models_from_folder(load_dir):
    """
    Load every saved autoencoder found directly inside a directory.

    Each entry of ``load_dir`` is read with ``torch.load`` and is expected to hold a
    ``(kwargs, state_dict)`` pair. A ``SparseAutoencoder`` is built from the kwargs,
    given the state dict, moved to the GPU when CUDA is available (CPU otherwise)
    and put in eval mode. Entries are processed in sorted name order.

    Args:
        load_dir (str): The directory from which models will be loaded.

    Returns:
        dict: Maps each entry name in ``load_dir`` to its loaded model.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    loaded = {}

    for file_name in sorted(os.listdir(load_dir)):
        file_path = os.path.join(load_dir, file_name)

        # NOTE(review): torch.load may unpickle arbitrary objects; only use on trusted files.
        init_kwargs, weights = torch.load(file_path, map_location=device)

        autoencoder = SparseAutoencoder(**init_kwargs)
        autoencoder.load_state_dict(weights)
        autoencoder.to(device)
        autoencoder.eval()

        loaded[file_name] = autoencoder
        print(f"Loaded {file_name} from {file_path}")

    return loaded

In [None]:
# Download the 'latest' autoencoder artifact for the policy model and load all four groups.
loaded_models_dict = load_autoencoders_for_artifact(policy_model_name=policy_model_name, alias="latest", run=run)

## Cosine Similarity

In [8]:
def calculate_MMCS_hungarian(small_weights, big_weights):
    """
    Match the columns of two weight matrices one-to-one and average their cosine similarity.

    Columns are L2-normalized, all pairwise cosine similarities are computed, and
    ``linear_sum_assignment`` is run on ``1 - similarity`` to choose the pairing with
    the lowest total cost.

    Args:
        small_weights: 2-D array-like whose columns are the vectors to match.
        big_weights: 2-D array-like with the same number of rows as ``small_weights``.

    Returns:
        (mean_mmcs, sorted_indices): the mean cosine similarity over the matched pairs,
        and the positions of the matched pairs ordered from most to least similar.
        NOTE(review): these positions index the assignment output, not necessarily
        columns of ``big_weights``; confirm this is what callers expect.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    small_unit = torch.nn.functional.normalize(torch.tensor(small_weights).to(device), p=2, dim=0)
    big_unit = torch.nn.functional.normalize(torch.tensor(big_weights).to(device), p=2, dim=0)

    # The assignment solver minimizes cost, so similarity is turned into a distance.
    similarity = torch.mm(small_unit.T, big_unit)
    cost_matrix = 1 - similarity.cpu().numpy()
    matched_rows, matched_cols = linear_sum_assignment(cost_matrix)

    matched_similarities = 1 - cost_matrix[matched_rows, matched_cols]
    ranking = np.argsort(matched_similarities)[::-1]

    return np.mean(matched_similarities), ranking

In [9]:
def compare_autoencoders(small_dict, big_dict, top_k):
    """
    Compare paired small/big autoencoders with calculate_MMCS_hungarian.

    The two dicts are paired by iteration order, not by key; results are keyed by the
    names from ``small_dict``.

    Args:
        small_dict: Maps layer name to an autoencoder exposing ``encoder_weight``.
        big_dict: Same structure, with the same number of entries.
        top_k: How many of the highest-ranked indices to keep per layer.

    Returns:
        dict: layer name -> (MMCS value, list of the first ``top_k`` ranked indices).

    Raises:
        ValueError: If the two dicts do not have the same number of entries.
    """
    if len(small_dict) != len(big_dict):
        raise ValueError("Length of small and big autoencoders lists must be the same.")

    results = {}
    for (layer_name, small_ae), big_ae in zip(small_dict.items(), big_dict.values()):
        # Transposed so that each column is one feature's encoder vector.
        small_columns = small_ae.encoder_weight.detach().cpu().numpy().T
        big_columns = big_ae.encoder_weight.detach().cpu().numpy().T

        mmcs_value, ranking = calculate_MMCS_hungarian(small_columns, big_columns)
        results[layer_name] = (mmcs_value, ranking[:top_k].tolist())

    return results

## Tokenization and Activations

In [10]:
def tokenize_imdb_data(imdb_data, num_samples):
    """
    Tokenize a random sample of texts with the module-level ``tokenizer``.

    ``num_samples`` texts are drawn without replacement via ``random.sample``. Only the
    first 50 characters of each text are tokenized, one text per tokenizer call.

    Returns:
        list: One tokenizer output (requested with ``return_tensors='pt'``) per sampled text.
    """
    sampled_texts = random.sample(imdb_data, num_samples)
    return [
        tokenizer(review[:50], return_tensors='pt', padding=True, truncation=True)
        for review in sampled_texts
    ]

In [11]:
def normalize_activations(activations, max_activation):
    """
    Clamp negative activations to zero and rescale so ``max_activation`` maps to 10.

    Unlike a plain ``10 * activations / max_activation``, this does not modify the
    caller's array, and it does not divide by a non-positive maximum.

    Args:
        activations: Array-like of raw activation values.
        max_activation: Scalar used as the normalization denominator.

    Returns:
        np.ndarray: ``10 * max(activations, 0) / max_activation``. When
        ``max_activation`` is zero, negative or NaN, a floating-point array of
        zeros with the same shape is returned instead of NaN/inf values.
    """
    # np.maximum allocates a new array, leaving the input untouched.
    clipped = np.maximum(np.asarray(activations), 0)

    # `not (x > 0)` also catches NaN, which a plain `x <= 0` would let through.
    if not max_activation > 0:
        return np.zeros(clipped.shape, dtype=np.result_type(clipped.dtype, np.float32))

    return 10 * clipped / max_activation

In [12]:
def discretize_activations(normalized_activations):
    """Round each activation to the nearest integer and return an integer array."""
    rounded = np.round(normalized_activations)
    return rounded.astype(int)

In [13]:
def handle_sparse_activations(tokens, discretized_activations):
    """
    Repeat the non-zero (token, activation) pairs at the end of a sparse record.

    A record counts as sparse when fewer than 20% of its activations are non-zero.
    For such a record, the tokens and activations at the non-zero positions are
    appended once more after the original sequence. Dense records, records with no
    non-zero activation, and empty records are returned unchanged.

    Args:
        tokens: Sequence of tokens, aligned with ``discretized_activations``.
        discretized_activations: 1-D array-like of integer activations.

    Returns:
        (tokens, activations): a new token list and an activation array. The inputs
        are never modified in place, and the activation dtype is preserved.
    """
    discretized_activations = np.asarray(discretized_activations)
    # Work on a copy so the caller's list is not extended as a side effect.
    tokens = list(tokens)

    total = len(discretized_activations)
    if total == 0:
        # Nothing to measure sparsity on; avoids a division by zero below.
        return tokens, discretized_activations

    non_zero_indices = np.where(discretized_activations != 0)[0]
    # With no non-zero entries there is nothing to repeat; concatenating an empty
    # Python list here would silently promote the integer array to float64.
    if len(non_zero_indices) == 0 or len(non_zero_indices) / total >= 0.2:
        return tokens, discretized_activations

    repeated_tokens = [tokens[i] for i in non_zero_indices]
    # Index the array directly so the repeated values keep the original dtype.
    repeated_activations = discretized_activations[non_zero_indices]

    return (
        tokens + repeated_tokens,
        np.concatenate([discretized_activations, repeated_activations]),
    )

## Autointerpretability

In [14]:
# Prompt for the OpenAI API key without echoing it; read by the OpenAI helper functions.
key = getpass('API Key: ')

API Key: ··········


In [15]:
def get_feature_explanation(feature_index, top_5_activation_records_for_feature):
    """
    Ask the OpenAI chat model for an explanation of one feature.

    The conversation holds a system prompt describing the task and activation format,
    one user message per activation record, and a final user message requesting the
    explanation. The module-level ``key`` is used as the API key.

    Args:
        feature_index: Index of the feature, quoted in the prompts.
        top_5_activation_records_for_feature: Iterable of strings, each holding the
            "token<tab>activation" lines of one example.

    Returns:
        str: The model's reply with surrounding whitespace stripped.
    """
    system_message = {"role": "system", "content": "We are studying features in a large language model. Each feature looks for some particular pattern in a dataset. Look at the parts of the dataset the feature activates for, and summarize in a single sentence what the feature is looking for. The activation format is token<tab>activation. Activation values range from 0 to 10. A feature finding what it's looking for is represented by a non-zero activation value. The higher the activation value, the stronger the match."}

    example_messages = [
        {"role": "user", "content": f"Feature {feature_index}\nTop Activation Example {record_idx}:\n{activation_str}"}
        for record_idx, activation_str in enumerate(top_5_activation_records_for_feature)
    ]

    request_message = {"role": "user", "content": f"Explain what the feature at index {feature_index} in a large language model might be doing based on the top 5 activation records."}

    messages = [system_message, *example_messages, request_message]

    openai.api_key = key
    response = openai.ChatCompletion.create(
        model="gpt-4-turbo-preview",
        messages=messages,
        temperature=0,
    )

    reply_text = response['choices'][0]['message']['content']
    return reply_text.strip()

In [16]:
def classify_feature(feature_desc):
    """
    Ask the OpenAI chat model whether a feature description is related to sentiment.

    The model is instructed to answer with a single token, 1 (sentiment-related) or 0.
    The module-level ``key`` is used as the API key.

    Args:
        feature_desc: Text description of the feature.

    Returns:
        str: The model's reply with surrounding whitespace stripped.
    """
    messages = [
        {"role": "system", "content": "We are studying features in a large language model. Each feature looks for some particular pattern in a dataset. Classify features based on descriptions of them. You absolutely must return a valid classification for every input. Do not state that there is not enough information."},
        {"role": "user", "content": f"Feature description: {feature_desc}"},
        {"role": "user", "content": "Return 1 if the feature is related to sentiment, and 0 otherwise. For example, 1 would be the correct classification for a feature that detects negative sentiment, positive sentiment, or words related to sentiment. Only return a single token: 1 or 0."},
    ]

    openai.api_key = key
    response = openai.ChatCompletion.create(
        model="gpt-4-turbo-preview",
        messages=messages,
        temperature=0,
    )

    reply_text = response['choices'][0]['message']['content']
    return reply_text.strip()

In [17]:
def get_activations(dtokens, model, autoencoder, layer_num, device=device):
    """
    Run tokens through the first ``layer_num + 1`` blocks of ``model`` and return the
    autoencoder's reconstruction of the MLP output computed from the resulting hidden state.

    Args:
        dtokens: Mapping with an ``'input_ids'`` tensor; indexed as (batch, sequence).
        model: Model exposing ``wte``, ``wpe``, ``drop`` and a block sequence ``h``,
            where every block returns a tuple whose first item is the hidden state
            and has an ``mlp`` submodule.
        autoencoder: Callable returning ``(features, reconstruction)``.
        layer_num: Index of the last block to run and of the block whose MLP is applied.
        device: Device the input ids and position ids are placed on.

    Returns:
        The second output of ``autoencoder`` (the reconstruction).

    NOTE(review): the first autoencoder output (the features) is discarded, yet the
    caller indexes the result by feature index; confirm the reconstruction is intended.
    NOTE(review): ``mlp`` of block ``layer_num`` is applied to that block's own output,
    and the embedding/dropout step runs outside ``torch.no_grad()``; confirm both.
    """
    input_ids_tensor = dtokens['input_ids'].to(device)

    # One position id per token, repeated for every row of the batch.
    seq_len = input_ids_tensor.size(1)
    position_ids = torch.arange(seq_len, dtype=torch.long, device=device)
    position_ids = position_ids.unsqueeze(0).expand_as(input_ids_tensor)

    hidden = model.drop(model.wte(input_ids_tensor) + model.wpe(position_ids))

    with torch.no_grad():
        for block_idx in range(layer_num + 1):
            # Each block returns a tuple; its first item is the new hidden state.
            hidden = model.h[block_idx](hidden)[0]

        mlp_activations = model.h[layer_num].mlp(hidden)
        _, reconstructed_activations = autoencoder(mlp_activations)

    return reconstructed_activations

## Classify Features

In [18]:
def calculate_utility_from_model(model, tokenizer, autoencoders, imdb_data, num_samples, k, layers, device=device):
    """
    Collect natural-language explanations for up to ``k`` features per autoencoder layer.

    Steps, as implemented:
      1. Tokenize ``num_samples`` random texts from ``imdb_data``.
      2. Rank features per layer with ``compare_autoencoders`` on the 'small' and 'big'
         autoencoder dicts, keeping ``k + 500`` candidate indices.
      3. For each candidate, compute its activation on every tokenized sample,
         normalize by the per-sample maximum and discretize.
      4. Keep the 20 samples with the highest discretized activation, pick 5 of them
         at random, and skip the feature when none of the picks has a positive activation.
      5. Format the picks as "token<tab>activation" lines and request an explanation
         via ``get_feature_explanation``.

    Args:
        model: Language model passed through to ``get_activations``.
        tokenizer: Used here only to turn input ids back into token strings.
        autoencoders: Dict with keys 'small' and 'big', each mapping layer name to autoencoder.
        imdb_data: List of texts to sample from.
        num_samples: Number of texts to sample.
        k: Maximum number of explained features per layer name.
        layers: Iterable of layer numbers passed to ``get_activations``.
        device: Device the 'big' autoencoders are moved to.

    Returns:
        dict: layer name -> {feature index -> explanation string}.

    NOTE(review): the ``return`` below is indented inside the ``for layer_num`` loop, so
    the function returns after the first element of ``layers``; later layer numbers
    are never used, and an empty ``layers`` makes the function return None. Every
    autoencoder in ``similarity_results`` is evaluated with that single ``layer_num``.
    Confirm the intended pairing of layer numbers and layer names before changing this.
    """
    tokenized_data  = tokenize_imdb_data(imdb_data, num_samples)
    results_dict = {}
    # Filled in below but not returned or read again in this function.
    top_5_activation_records = defaultdict(dict)
    # Request k + 500 candidates so that skipped features can be replaced.
    similarity_results = compare_autoencoders(autoencoders['small'], autoencoders['big'], k + 500)

    for layer_num in layers:
        for layer_name, (_, top_k_indices) in similarity_results.items():
            autoencoder = autoencoders['big'][layer_name].to(device)
            # Resets any explanations already stored for this layer name.
            results_dict[layer_name] = {}

            valid_features_processed = 0
            for feature_index in top_k_indices:
                # Progress output: number of features explained so far for this layer.
                print(valid_features_processed)
                if valid_features_processed >= k:
                    break

                activations_for_feature = []

                for dtokens in tokenized_data:
                    # First batch row, all positions, one index of the last dimension.
                    # NOTE(review): the model forward pass is repeated for every
                    # (feature, sample) pair even though it does not depend on the feature.
                    real_activations = get_activations(dtokens, model, autoencoder, layer_num)[0, :, feature_index].detach().cpu().numpy()
                    # Per-sample maximum, taken before negatives are clamped.
                    max_activation = np.max(real_activations)
                    normalized_activations = normalize_activations(real_activations, max_activation)
                    discretized_activations = discretize_activations(normalized_activations)

                    activations_for_feature.append((dtokens, discretized_activations))

                # Samples ranked by their highest discretized activation.
                top_20_for_feature = sorted(activations_for_feature, key=lambda x: np.max(x[1]), reverse=True)[:20]

                if len(top_20_for_feature) >= 5:
                    selected_activations = random.sample(top_20_for_feature, 5)
                else:
                    selected_activations = top_20_for_feature

                # A feature with no positive activation in any selected sample is not explained.
                if all(np.max(activations) <= 0 for _, activations in selected_activations):
                    print(f"Skipping feature index {feature_index} due to no significant activation")
                    continue

                valid_features_processed += 1
                top_5_activation_records[layer_name][feature_index] = selected_activations

                top_5_activation_examples = []
                for dtokens, activations in selected_activations:
                    tokens = tokenizer.convert_ids_to_tokens(dtokens['input_ids'][0])
                    tokens, activations = handle_sparse_activations(tokens, activations)

                    # One "token<tab>activation" line per token, matching the prompt format.
                    activation_strings = [f"{token}\t{activation}" for token, activation in zip(tokens, activations)]
                    top_5_activation_examples.append("\n".join(activation_strings))

                results_dict[layer_name][feature_index] = get_feature_explanation(feature_index, top_5_activation_examples)

        # NOTE(review): executes at the end of the first outer-loop iteration.
        return results_dict

In [19]:
def classify_dict(feature_dict):
    """
    Return the (layer, feature_index) pairs whose description is classified as "1".

    Args:
        feature_dict: Maps layer -> {feature index -> description string}.

    Returns:
        list: (layer, feature_index) tuples, in iteration order, for which
        ``classify_feature`` returned exactly the string "1".
    """
    return [
        (layer, feature_index)
        for layer, features in feature_dict.items()
        for feature_index, desc in features.items()
        if classify_feature(desc) == "1"
    ]

In [None]:
# Load the IMDB test split and keep only the raw text of each entry.
imdb_dataset = load_dataset('imdb', split='test')
imdb_data = [entry['text'] for entry in imdb_dataset]

In [None]:
# Load the policy model and its tokenizer from the Hugging Face hub.
language_model = "gpt-neo-125m_utility_reward"
model = AutoModel.from_pretrained(f"amirabdullah19852020/{language_model}").to(device)
tokenizer = AutoTokenizer.from_pretrained(f"amirabdullah19852020/{language_model}")
# Compare the small and big autoencoders from the 'rlhf_*' groups.
autoencoders = {'small': loaded_models_dict['rlhf_small'], 'big': loaded_models_dict['rlhf_big']}

layers = [1, 2, 3, 4, 5]

# Explain up to 30 features per layer, then keep the (layer, feature) pairs classified as "1".
feature_dict = calculate_utility_from_model(model, tokenizer, autoencoders, imdb_data, num_samples=1000, k=30, layers=layers)
to_ablate = classify_dict(feature_dict)