# Sparse Coding

## Imports and Setup

In [1]:
!pip install transformers==4.34.0
!pip install datasets==2.14.5
!pip install torch=='2.0.1+cu118'
!pip install einops==0.7.0
!pip install circuitsvis==1.41.0
!pip install openai==0.28.1
!pip install wandb==0.15.12
!pip install tqdm==4.66.1
!pip install nltk==3.8.1

Collecting transformers==4.34.0
  Downloading transformers-4.34.0-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m27.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers==4.34.0)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m59.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers==4.34.0)
  Downloading huggingface_hub-0.17.3-py3-none-any.whl (295 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m33.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: huggingface-hub, tokenizers, transformers
  Attempting uninstall: huggingface-hub
    Found existing installation: huggingface-hub 0.19.4
    Uninstalling huggingface-hub-0.19.4:
      Successfully uninstalled huggingface-h

In [2]:
from transformers import AutoModel, AutoTokenizer, pipeline, AutoModelForCausalLM
from transformers import AutoTokenizer
from datasets import load_dataset
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from collections import defaultdict
from scipy.optimize import linear_sum_assignment
from scipy.stats import pearsonr
import matplotlib.pyplot as plt
from einops import rearrange
from circuitsvis.activations import text_neuron_activations
from torch.utils.data import DataLoader, TensorDataset
from itertools import product
import openai
from getpass import getpass
import os
import wandb
from wandb import Artifact
from wandb import Api
from tqdm import tqdm
import csv
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import zipfile
import random
import traceback

In [3]:
# Module-level compute device: CUDA when torch reports it as available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Train Autoencoders

In [4]:
class SparseAutoencoder(nn.Module):
    """
    Tied-weight autoencoder with a ReLU feature layer.

    One weight matrix of shape (hidden_size, input_size) serves both directions:
    on every forward pass its rows are L2-normalised, applied directly to encode
    and applied transposed to decode.

    Args:
        input_size (int): Width of the vectors being encoded.
        hidden_size (int): Number of features in the hidden layer.
        l1_coef (float): Stored on the module and in `kwargs`; `forward` does not use it.
    """

    def __init__(self, input_size, hidden_size, l1_coef):
        super().__init__()
        self.hidden_size = hidden_size
        self.input_size = input_size
        self.l1_coef = l1_coef

        # Constructor arguments, kept so the module can be rebuilt with SparseAutoencoder(**kwargs).
        self.kwargs = dict(input_size=input_size, hidden_size=hidden_size, l1_coef=l1_coef)

        # Random draw first, then overwritten in place by an orthogonal initialisation.
        self.encoder_weight = nn.Parameter(torch.randn(hidden_size, input_size))
        nn.init.orthogonal_(self.encoder_weight)

        self.encoder_bias = nn.Parameter(torch.zeros(self.hidden_size))
        self.decoder_bias = nn.Parameter(torch.zeros(input_size))

    def forward(self, x):
        """Return `(features, reconstruction)` for input `x` (last dimension = input_size)."""
        unit_rows = F.normalize(self.encoder_weight, p=2, dim=1)

        features = F.relu(F.linear(x, unit_rows, self.encoder_bias))

        # Decoding reuses the same normalised matrix, transposed (tied weights).
        reconstruction = F.linear(features, unit_rows.t(), self.decoder_bias)

        return features, reconstruction

In [5]:
# First path segment of the artifact references built later in this notebook.
entity_name = 'nlp_and_interpretability'

# Policy model whose autoencoders this notebook analyses.
policy_model_name="pythia_70m_utility_reward"
project_prefix = 'Autoencoder_training'

# This notebook logs to "<project_prefix>_<policy_model_name>_interp".
interp_project_name = f"{project_prefix}_{policy_model_name}_interp"
# `run` is a module-level handle; load_autoencoders_for_artifact uses it as its default `run` argument.
run=wandb.init(project=interp_project_name)  # Optionally pass config={dict_of_params_run} to record run parameters

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [6]:
# Naming constants used by the save/load helpers below.
# entity_name and project_prefix repeat the values assigned in the wandb setup cell.
entity_name = 'nlp_and_interpretability'
project_prefix = 'Autoencoder_training'
# Artifacts are named "<artifact_prefix>_<simplified policy model name>".
artifact_prefix = 'autoencoders'

def save_models_to_folder(model_dict, save_dir):
    """
    Save model checkpoints from a dictionary to a specified directory.

    Each checkpoint is written with ``torch.save`` as the two-element list
    ``[model.kwargs, model.state_dict()]``.

    Args:
        model_dict (dict): Maps a model name to either a single model or a
            list/tuple of models. Every model must expose ``kwargs`` and
            ``state_dict()``.
        save_dir (str): The directory where models will be saved (created if missing).

    File naming:
        * a single model, or a one-element list -> ``<save_dir>/<model_name>``
        * a list holding several models -> ``<save_dir>/<model_name>_<i>``, so
          later entries no longer overwrite earlier ones.
    """
    os.makedirs(save_dir, exist_ok=True)

    for model_name, entry in model_dict.items():
        # Accept a bare model as well as a list, matching the {name: model}
        # shape that load_models_from_folder returns.
        models = list(entry) if isinstance(entry, (list, tuple)) else [entry]

        for i, model in enumerate(models):
            # Only disambiguate when there is something to disambiguate; the
            # single-model file name stays exactly `model_name`.
            file_name = f'{model_name}' if len(models) == 1 else f'{model_name}_{i}'
            model_path = os.path.join(save_dir, file_name)
            torch.save([model.kwargs, model.state_dict()], model_path)
            print(f"Saved {model_name} to {model_path}")

def save_autoencoders_for_artifact(
        autoencoders_base_big, autoencoders_base_small, autoencoders_rlhf_big, autoencoders_rlhf_small,
        policy_model_name, hyperparameters, alias, run
    ):
    '''
    Writes the four autoencoder groups of one run to disk and logs them as a wandb artifact.

    The groups are saved under the hardcoded local folder `saves/` (sub-folders
    base_big, base_small, rlhf_big, rlhf_small). That folder is then added to an
    artifact of type 'model' named "<artifact_prefix>_<simplified policy name>",
    carrying `hyperparameters` as metadata, and logged through `run` with the
    aliases {simplified policy name, 'latest', 'weights_tied', alias}.
    '''
    save_dir = 'saves'
    groups = (
        ('base_big', autoencoders_base_big),
        ('base_small', autoencoders_base_small),
        ('rlhf_big', autoencoders_rlhf_big),
        ('rlhf_small', autoencoders_rlhf_small),
    )
    for group_name, group_models in groups:
        save_models_to_folder(group_models, save_dir=f'{save_dir}/{group_name}')

    # Keep only the part after the last '/', with dashes turned into underscores.
    simplified_policy_name = policy_model_name.split('/')[-1].replace("-", "_")

    saved_artifact = Artifact(
        f'{artifact_prefix}_{simplified_policy_name}', metadata=hyperparameters, type='model'
    )
    saved_artifact.add_dir(save_dir, name=save_dir)

    # A set removes duplicates (e.g. alias == 'latest'); sorting fixes the order.
    alias_list = sorted({simplified_policy_name, 'latest', 'weights_tied', alias})
    run.log_artifact(saved_artifact, aliases=alias_list)

def load_autoencoders_for_artifact(policy_model_name, alias='latest', run=run):
    '''
    Downloads one run's autoencoder artifact and loads its four groups into memory.

    The artifact reference is
    "<entity_name>/<project_prefix>_<policy_model_name>/<artifact_prefix>_<simplified name>:<alias>".
    Only the artifact-name segment uses the simplified name (text after the last
    '/', dashes replaced by underscores); the project segment uses
    `policy_model_name` as given. The folder names are hardcoded.

    The default for `run` is the module-level `run` as it existed when this
    function was defined.

    For example, try autoencoders_dict = load_autoencoders_for_artifact('pythia_70m_sentiment_reward')

    Returns a dict with keys 'base_big', 'base_small', 'rlhf_big', 'rlhf_small',
    each holding the result of load_models_from_folder for that sub-folder.
    '''
    simplified_policy_model_name = policy_model_name.split('/')[-1].replace('-', '_')
    project = f'{project_prefix}_{policy_model_name}'
    artifact_name = f'{artifact_prefix}_{simplified_policy_model_name}'
    full_path = f'{entity_name}/{project}/{artifact_name}:{alias}'
    print(f'Loading artifact from {full_path}')

    download_root = run.use_artifact(full_path).download()
    save_dir = f'{download_root}/saves'

    return {
        group: load_models_from_folder(f'{save_dir}/{group}')
        for group in ('base_big', 'base_small', 'rlhf_big', 'rlhf_small')
    }

def load_models_from_folder(load_dir):
    """
    Rebuild a SparseAutoencoder from every entry found directly inside a directory.

    Each entry is read with ``torch.load`` and must unpack into
    ``(kwargs, state_dict)``; the model is constructed from ``kwargs``, given the
    state dict, moved to CUDA when available (CPU otherwise) and put in eval mode.

    Args:
        load_dir (str): The directory from which models will be loaded.

    Returns:
        model_dict (dict): Keys are the entry names in `load_dir` (visited in
            sorted order); values are the loaded models.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model_dict = {}

    for model_name in sorted(os.listdir(load_dir)):
        model_path = os.path.join(load_dir, model_name)

        # NOTE(review): torch.load may unpickle arbitrary objects; only point this
        # at checkpoint files from a trusted source.
        kwargs, state = torch.load(model_path, map_location=device)

        model = SparseAutoencoder(**kwargs)
        model.load_state_dict(state)
        model_dict[model_name] = model.to(device).eval()

        print(f"Loaded {model_name} from {model_path}")

    return model_dict

In [7]:
loaded_models_dict = load_autoencoders_for_artifact(policy_model_name=policy_model_name, alias="latest", run=run)

Loading artifact from nlp_and_interpretability/Autoencoder_training_pythia_70m_utility_reward/autoencoders_pythia_70m_utility_reward:latest


[34m[1mwandb[0m:   20 of 20 files downloaded.  


Loaded 1 from ./artifacts/autoencoders_pythia_70m_utility_reward:v11/saves/base_big/1
Loaded 2 from ./artifacts/autoencoders_pythia_70m_utility_reward:v11/saves/base_big/2
Loaded 3 from ./artifacts/autoencoders_pythia_70m_utility_reward:v11/saves/base_big/3
Loaded 4 from ./artifacts/autoencoders_pythia_70m_utility_reward:v11/saves/base_big/4
Loaded 5 from ./artifacts/autoencoders_pythia_70m_utility_reward:v11/saves/base_big/5
Loaded 1 from ./artifacts/autoencoders_pythia_70m_utility_reward:v11/saves/base_small/1
Loaded 2 from ./artifacts/autoencoders_pythia_70m_utility_reward:v11/saves/base_small/2
Loaded 3 from ./artifacts/autoencoders_pythia_70m_utility_reward:v11/saves/base_small/3
Loaded 4 from ./artifacts/autoencoders_pythia_70m_utility_reward:v11/saves/base_small/4
Loaded 5 from ./artifacts/autoencoders_pythia_70m_utility_reward:v11/saves/base_small/5
Loaded 1 from ./artifacts/autoencoders_pythia_70m_utility_reward:v11/saves/rlhf_big/1
Loaded 2 from ./artifacts/autoencoders_pythi

# Quantify Reward Modeling Efficacy

## Cosine Similarity

In [8]:
def calculate_MMCS_hungarian(small_weights, big_weights):
    """
    Match the columns of two weight matrices one-to-one and score the matching.

    Columns of both inputs are L2-normalised, all pairwise cosine similarities
    are computed, and `linear_sum_assignment` picks the pairing that minimises
    the total of (1 - cosine similarity).

    Args:
        small_weights: 2-D array-like; each column is one vector to match.
        big_weights: 2-D array-like with the same number of rows.

    Returns:
        (mean_similarity, ranking): the mean cosine similarity over matched
        pairs, and the positions of those pairs (in the order returned by
        `linear_sum_assignment`) sorted from most to least similar.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def _unit_columns(weights):
        return torch.nn.functional.normalize(torch.tensor(weights).to(device), p=2, dim=0)

    small_unit = _unit_columns(small_weights)
    big_unit = _unit_columns(big_weights)

    # cost[i, j] = 1 - cos(small column i, big column j); the solver minimises cost.
    cost = 1 - torch.mm(small_unit.T, big_unit).cpu().numpy()
    matched_rows, matched_cols = linear_sum_assignment(cost)

    # Turn the matched costs back into similarities.
    matched_similarities = 1 - cost[matched_rows, matched_cols]

    ranking = np.argsort(matched_similarities)[::-1]
    return np.mean(matched_similarities), ranking

In [9]:
def compare_autoencoders(small_dict, big_dict, top_k):
    """
    Run calculate_MMCS_hungarian on each pair of small/big autoencoders.

    The two dictionaries are paired by iteration order (not by key); results are
    stored under the keys of `small_dict`.

    Args:
        small_dict (dict): name -> autoencoder exposing `encoder_weight`.
        big_dict (dict): name -> autoencoder exposing `encoder_weight`.
        top_k (int): How many of the highest-ranked indices to keep per entry.

    Returns:
        dict: name -> (MMCS value, list of the first `top_k` ranked indices).

    Raises:
        ValueError: If the two dictionaries have different sizes.
    """
    if len(small_dict) != len(big_dict):
        raise ValueError("Length of small and big autoencoders lists must be the same.")

    def _encoder_columns(autoencoder):
        # Transposed so each column is one row of encoder_weight.
        return autoencoder.encoder_weight.detach().cpu().numpy().T

    mmcs_results = {}
    for (layer_name, small_autoencoder), big_autoencoder in zip(small_dict.items(), big_dict.values()):
        MMCS_value, sorted_indices = calculate_MMCS_hungarian(
            _encoder_columns(small_autoencoder), _encoder_columns(big_autoencoder)
        )
        mmcs_results[layer_name] = (MMCS_value, sorted_indices[:top_k].tolist())

    return mmcs_results

## Tokenization and Activations

In [10]:
def add_space_prefix(tokens):
    """Return a new list in which every token starts with 'Ġ', prepending it where missing."""
    prefixed = []
    for token in tokens:
        if token.startswith('Ġ'):
            prefixed.append(token)
        else:
            prefixed.append('Ġ' + token)
    return prefixed

In [11]:
def tokenize_imdb_data(imdb_data, num_samples):
    """
    Tokenize a random sample of texts using the module-level `tokenizer`.

    `num_samples` texts are drawn with `random.sample`; only the first 50
    characters of each are tokenized.

    Returns:
        (tokenized_data, tokenized_texts): for each sampled text, the tokenizer
        output requested with return_tensors='pt', and a space-joined string of
        its tokens after add_space_prefix.
    """
    tokenized_data, tokenized_texts = [], []

    for text in random.sample(imdb_data, num_samples):
        snippet = text[:50]
        tokenized_data.append(
            tokenizer(snippet, return_tensors='pt', padding=True, truncation=True)
        )
        tokenized_texts.append(' '.join(add_space_prefix(tokenizer.tokenize(snippet))))

    return tokenized_data, tokenized_texts

In [12]:
def get_encoder_activations_and_reconstruction(input_ids_tensor, attention_mask_tensor, transformer_model, autoencoder_model):
    """
    Feed the transformer's first output through the autoencoder.

    The transformer is called under `torch.no_grad()`; element 0 of its output is
    cast to float32 and passed to `autoencoder_model`, which is called outside
    the no-grad context.

    Returns:
        (features, reconstruction) as produced by `autoencoder_model`.
    """
    with torch.no_grad():
        hidden_states = transformer_model(input_ids_tensor, attention_mask=attention_mask_tensor)[0]

    features, reconstruction = autoencoder_model(hidden_states.float())
    return features, reconstruction

In [13]:
def normalize_activations(activations, max_activation):
    """
    Clip negative activations to zero and rescale so `max_activation` maps to 10.

    Args:
        activations: Array-like of raw activation values. It is copied, so the
            caller's data is left untouched.
        max_activation: Value that should map to 10 (normally the maximum of
            `activations`).

    Returns:
        numpy.ndarray: ``10 * clipped / max_activation``. When `max_activation`
        is a scalar equal to 0 there is nothing to scale by, so an all-zero
        float array is returned instead of dividing by zero (which used to
        yield NaN/inf and a RuntimeWarning).
    """
    # np.array copies, so zeroing the negatives does not write into the input.
    clipped = np.array(activations)
    clipped[clipped < 0] = 0

    if np.ndim(max_activation) == 0 and max_activation == 0:
        return np.zeros_like(clipped, dtype=float)

    return 10 * clipped / max_activation

In [14]:
def discretize_activations(normalized_activations):
    """Round each value to the nearest whole number (numpy rounding) and cast to int."""
    rounded = np.round(normalized_activations, decimals=0)
    return rounded.astype(int)

In [15]:
def handle_sparse_activations(tokens, discretized_activations):
    """
    Repeat the non-zero (token, activation) pairs at the end of a sparse record.

    When fewer than 20% of the activations are non-zero, the tokens and
    activations at the non-zero positions are appended once more to the end of
    both sequences. Otherwise the inputs are returned unchanged.

    Args:
        tokens (list): Tokens aligned position-by-position with the activations.
        discretized_activations: 1-D array-like of discretized activations.

    Returns:
        (tokens, activations): a new token list and a numpy array. The caller's
        `tokens` list is never modified, and the array keeps its dtype.
    """
    discretized_activations = np.asarray(discretized_activations)
    tokens = list(tokens)  # copy: do not extend the caller's list in place

    total = len(discretized_activations)
    if total == 0:
        # Nothing to measure sparsity on; avoids dividing by zero below.
        return tokens, discretized_activations

    non_zero_indices = np.where(discretized_activations != 0)[0]

    # With no non-zero entries there is nothing to repeat; returning early also
    # avoids concatenating with an empty float list, which would upcast the
    # integer activations to float64.
    if len(non_zero_indices) == 0 or len(non_zero_indices) / total >= 0.2:
        return tokens, discretized_activations

    repeated_tokens = [tokens[i] for i in non_zero_indices]
    repeated_activations = discretized_activations[non_zero_indices]

    return tokens + repeated_tokens, np.concatenate([discretized_activations, repeated_activations])

## Autointerpretability

In [16]:
# Prompt for the OpenAI API key without echoing it; get_feature_explanation reads this module-level `key`.
key = getpass('API Key: ')

API Key: ··········


In [17]:
def get_feature_explanation(feature_index, top_5_activation_records_for_feature):
    """
    Ask the "gpt-4" chat model for an explanation of one autoencoder feature.

    The conversation consists of a system instruction, a user message describing
    the "token<tab>activation" format, one user message per activation record,
    and a closing request for an explanation. The API key is taken from the
    module-level `key`.

    Args:
        feature_index: Index of the feature, quoted in the prompts.
        top_5_activation_records_for_feature: Iterable of pre-formatted activation strings.

    Returns:
        str: The content of the first returned choice, stripped of surrounding whitespace.
    """
    def _message(role, content):
        return {"role": role, "content": content}

    conversation = [
        _message("system", "We're studying features in an autoencoder model. Each feature looks for some particular pattern in a short document. Look at the parts of the document the feature activates for and summarize in a single sentence what the feature is looking for."),
        _message("user", "The activation format is token<tab>activation. Activation values range from 0 to 10. A feature finding what it's looking for is represented by a non-zero activation value. The higher the activation value, the stronger the match."),
    ]

    conversation.extend(
        _message("user", f"Feature {feature_index}\nTop Activation Example {record_idx}:\n{activation_str}")
        for record_idx, activation_str in enumerate(top_5_activation_records_for_feature)
    )

    conversation.append(
        _message("user", f"Explain what the feature at index {feature_index} in an autoencoder might be doing based on the top 5 activation records.")
    )

    # The key is (re)assigned on the openai module on every call.
    openai.api_key = key
    response = openai.ChatCompletion.create(model="gpt-4", messages=conversation)

    return response['choices'][0]['message']['content'].strip()

In [18]:
def get_activations_and_tensors(dtokens, model, autoencoder):
    """
    Move one tokenized sample to the module-level `device` and compute its autoencoder features.

    Args:
        dtokens: Mapping with 'input_ids' and, optionally, 'attention_mask'.
        model: Transformer passed on to get_encoder_activations_and_reconstruction.
        autoencoder: Autoencoder passed on to the same helper.

    Returns:
        (input_ids_tensor, attention_mask_tensor, features); the mask is None
        when `dtokens` has none. The autoencoder's reconstruction is discarded.
    """
    input_ids_tensor = dtokens['input_ids'].to(device)

    attention_mask_tensor = dtokens.get('attention_mask', None)
    attention_mask_tensor = None if attention_mask_tensor is None else attention_mask_tensor.to(device)

    features, _reconstruction = get_encoder_activations_and_reconstruction(
        input_ids_tensor, attention_mask_tensor, model, autoencoder
    )
    return input_ids_tensor, attention_mask_tensor, features

## Efficacy Scores and Correlation

In [19]:
# Fetch the VADER lexicon package through nltk.
nltk.download('vader_lexicon')

# NOTE(review): the paths below are hard-coded to /root/nltk_data — this assumes
# nltk downloaded there (e.g. a notebook running as root); confirm on other setups.
zip_file_path = '/root/nltk_data/sentiment/vader_lexicon.zip'
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall('/root/nltk_data/sentiment')

lexicon_file_path = os.path.join('/root/nltk_data/sentiment', 'vader_lexicon/vader_lexicon.txt')

# Module-level word -> score table used by compute_sentiment.
vader_lexicon = {}
with open(lexicon_file_path, 'r') as f:
    lines = f.readlines()
    for line in lines:
        # Lines are tab-separated; only the first two fields (word, score) are kept.
        word, score = line.strip().split('\t')[:2]
        vader_lexicon[word] = float(score)

def compute_sentiment(sentence):
    """
    Mean `vader_lexicon` score of the words in `sentence`.

    The sentence is lower-cased and split on whitespace; only words present in
    the module-level `vader_lexicon` contribute. Returns 0.0 when the sentence
    has no words or none of them is in the lexicon.
    """
    scores = [
        vader_lexicon[word]
        for word in sentence.lower().split()
        if word in vader_lexicon
    ]
    if not scores:
        return 0.0

    return sum(scores) / len(scores)

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


In [20]:
def generate_completions(model, tokenizer, text_prefixes, max_length=100):
    """
    Generate one continuation per prefix with a 'text-generation' pipeline on the CPU.

    Args:
        model: Model handed to the pipeline.
        tokenizer: Tokenizer handed to the pipeline.
        text_prefixes: Iterable of prompt strings.
        max_length (int): Forwarded to the generator as `max_length`.

    Returns:
        dict: prefix -> generated text with the first len(prefix) characters
        removed and surrounding whitespace stripped. A repeated prefix keeps only
        its last completion.
    """
    generator = pipeline('text-generation', model=model, tokenizer=tokenizer, device="cpu")

    def _complete(prefix):
        generated = generator(prefix, max_length=max_length, num_return_sequences=1)
        return generated[0]['generated_text'][len(prefix):].strip()

    return {prefix: _complete(prefix) for prefix in text_prefixes}

In [44]:
def calculate_utility_from_model(model, gen_model, tokenizer, autoencoders, imdb_data, num_samples, num_gen_samples, max_len, k, device="cpu"):
    """
    Score a model by the sentiment of its feature explanations and of its generations.

    Steps:
      1. Tokenize `num_samples` randomly sampled texts (tokenize_imdb_data, which
         uses the module-level `tokenizer`, not the `tokenizer` argument).
      2. For each layer, rank features with compare_autoencoders and walk the
         ranking until `k` features with some non-zero activation are found.
      3. For each such feature, pick 5 of its 20 strongest samples at random,
         format them as "token<tab>activation" lines and ask
         get_feature_explanation for an explanation.
      4. Sum |compute_sentiment(explanation)| over all explained features.
      5. Generate completions for the first `num_gen_samples` texts (each cut to
         `max_len` characters) and sum |compute_sentiment(completion)|.

    Args:
        model: Transformer whose first output is fed to the autoencoders.
        gen_model: Causal LM used for text generation.
        tokenizer: Used to turn ids back into tokens and for generation.
        autoencoders (dict): {'small': {layer: autoencoder}, 'big': {layer: autoencoder}}.
        imdb_data (list[str]): Raw texts.
        num_samples (int): Number of texts sampled for activations.
        num_gen_samples (int): Number of texts used as generation prefixes.
        max_len (int): Character length of each generation prefix.
        k (int): Number of features to explain per layer.
        device: Kept for backward compatibility. The autoencoder is placed on the
            same device as `model`, because the activations it receives live
            there; `device` is only used if `model` exposes no parameters.

    Returns:
        (top_k_sum, gen_utility): the two sentiment sums from steps 4 and 5.
    """
    tokenized_data, _ = tokenize_imdb_data(imdb_data, num_samples)
    sample_texts = [text[:max_len] for text in imdb_data[:num_gen_samples]]

    # The activations come from `model`, so the autoencoder must sit on the
    # model's device. Moving it to the `device` default ("cpu") while the model
    # runs on CUDA raised a device-mismatch error.
    try:
        autoencoder_device = next(model.parameters()).device
    except (AttributeError, StopIteration):
        autoencoder_device = device

    results_dict = {}
    # Ask for 15 extra candidates so features skipped below can be replaced.
    similarity_results = compare_autoencoders(autoencoders['small'], autoencoders['big'], k + 15)
    top_k_sum = 0
    gen_utility = 0

    for layer_name, (_, top_k_indices) in similarity_results.items():
        autoencoder = autoencoders['big'][layer_name].to(autoencoder_device)
        results_dict[layer_name] = {}

        # NOTE(review): top_k_indices rank the matched pairs returned by
        # calculate_MMCS_hungarian, yet they are used here to index features of
        # the *big* autoencoder — confirm that this is the intended dictionary.
        valid_features_processed = 0
        for feature_index in top_k_indices:
            if valid_features_processed >= k:
                break

            activations_for_feature = []

            for dtokens in tokenized_data:
                # [2] selects the features tensor; [0, :, feature_index] takes the
                # first batch element, all positions, one feature.
                real_activations = get_activations_and_tensors(dtokens, model, autoencoder)[2][0, :, feature_index].detach().cpu().numpy()
                max_activation = np.max(real_activations)
                normalized_activations = normalize_activations(real_activations, max_activation)
                discretized_activations = discretize_activations(normalized_activations)

                activations_for_feature.append((dtokens, discretized_activations))

            top_20_for_feature = sorted(activations_for_feature, key=lambda x: np.max(x[1]), reverse=True)[:20]

            if len(top_20_for_feature) >= 5:
                selected_activations = random.sample(top_20_for_feature, 5)
            else:
                selected_activations = top_20_for_feature

            if all(np.max(activations) <= 0 for _, activations in selected_activations):
                print(f"Skipping feature index {feature_index} due to no significant activation")
                continue

            valid_features_processed += 1

            top_5_activation_examples = []
            for dtokens, activations in selected_activations:
                tokens = tokenizer.convert_ids_to_tokens(dtokens['input_ids'][0])
                tokens, activations = handle_sparse_activations(tokens, activations)

                activation_strings = [f"{token}\t{activation}" for token, activation in zip(tokens, activations)]
                top_5_activation_examples.append("\n".join(activation_strings))

            results_dict[layer_name][feature_index] = get_feature_explanation(feature_index, top_5_activation_examples)

    # Sentiment magnitude of every explanation, summed over layers and features.
    for layer_name in results_dict:
        for explanation in results_dict[layer_name].values():
            top_k_sum += abs(compute_sentiment(explanation))

    completions_dict = generate_completions(gen_model, tokenizer, sample_texts, max_length=50)

    # Sentiment magnitude of every generated completion.
    for completion in completions_dict.values():
        gen_utility += abs(compute_sentiment(completion))

    print(results_dict)
    return top_k_sum, gen_utility

In [None]:
# RLHF: evaluate the reward-tuned policy with the 'rlhf_*' autoencoders.
language_model = "pythia-70m_utility_reward"
model = AutoModel.from_pretrained(f"amirabdullah19852020/{language_model}").to(device)
gen_model = AutoModelForCausalLM.from_pretrained(f"amirabdullah19852020/{language_model}").to(device)
# `tokenizer` is also read as a module-level global by tokenize_imdb_data.
# NOTE(review): unlike the BASE cell, pad_token is not assigned here — confirm this tokenizer already defines one.
tokenizer = AutoTokenizer.from_pretrained(f"amirabdullah19852020/{language_model}")
autoencoders = {'small': loaded_models_dict['rlhf_small'], 'big': loaded_models_dict['rlhf_big']}
imdb_dataset = load_dataset('imdb', split='test')
imdb_data = [entry['text'] for entry in imdb_dataset]
# Despite its name, `sentiment_score` receives the tuple (top_k_sum, gen_utility).
sentiment_score = calculate_utility_from_model(model, gen_model, tokenizer, autoencoders, imdb_data, num_samples=500, num_gen_samples=100, max_len=30, k=30)

Some weights of the model checkpoint at amirabdullah19852020/pythia-70m_utility_reward were not used when initializing GPTNeoXForCausalLM: ['v_head.summary.weight', 'v_head.summary.bias']
- This IS expected if you are initializing GPTNeoXForCausalLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing GPTNeoXForCausalLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
  normalized_activations = 10 * activations / max_activation


Skipping feature index 510 due to no significant activation
Skipping feature index 291 due to no significant activation
Skipping feature index 351 due to no significant activation
Skipping feature index 254 due to no significant activation
Skipping feature index 378 due to no significant activation


In [None]:
# Shows the (top_k_sum, gen_utility) tuple from the most recent calculate_utility_from_model call.
print(sentiment_score)

In [None]:
# BASE: evaluate the untuned base model with the 'base_*' autoencoders.
language_model = "pythia-70m"
model = AutoModel.from_pretrained(f"eleutherai/{language_model}").to(device)
gen_model = AutoModelForCausalLM.from_pretrained(f"eleutherai/{language_model}").to(device)
# `tokenizer` is also read as a module-level global by tokenize_imdb_data.
tokenizer = AutoTokenizer.from_pretrained(f"eleutherai/{language_model}")
# Reuse the end-of-sequence token for padding.
tokenizer.pad_token = tokenizer.eos_token
autoencoders = {'small': loaded_models_dict['base_small'], 'big': loaded_models_dict['base_big']}
imdb_dataset = load_dataset('imdb', split='test')
imdb_data = [entry['text'] for entry in imdb_dataset]
# Overwrites the RLHF result; receives the tuple (top_k_sum, gen_utility).
sentiment_score = calculate_utility_from_model(model, gen_model, tokenizer, autoencoders, imdb_data, num_samples=500, num_gen_samples=100, max_len=30, k=30)

In [None]:
# Shows the (top_k_sum, gen_utility) tuple from the most recent calculate_utility_from_model call.
print(sentiment_score)