## Setup dependencies

In [None]:
!pip install wandb datasets nltk

Collecting wandb
  Using cached wandb-0.16.2-py3-none-any.whl (2.2 MB)
Collecting datasets
  Using cached datasets-2.16.1-py3-none-any.whl (507 kB)
Collecting GitPython!=3.1.29,>=1.0.0 (from wandb)
  Using cached GitPython-3.1.41-py3-none-any.whl (196 kB)
Collecting sentry-sdk>=1.0.0 (from wandb)
  Using cached sentry_sdk-1.40.0-py2.py3-none-any.whl (257 kB)
Collecting docker-pycreds>=0.4.0 (from wandb)
  Using cached docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting setproctitle (from wandb)
  Downloading setproctitle-1.3.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (30 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import wandb
import os
import zipfile
import matplotlib.pyplot as plt
import numpy as np
import nltk
import string

from wandb import Artifact
from functools import partial
from transformers import AutoModel, AutoTokenizer, pipeline, AutoModelForCausalLM, AutoModelForSequenceClassification, AdamW, GPTNeoModel, GPTNeoConfig, GPTNeoForCausalLM
from datasets import load_dataset
from torch.utils.data import DataLoader, Dataset
from collections import defaultdict
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import TweetTokenizer
from nltk import download

In [None]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Load only the first 1000 rows of the IMDb training split.
# NOTE(review): the slice is positional, not a random sample — confirm that the
# label mix of these rows is what the experiment expects.
train_dataset = load_dataset("imdb", split='train[:1000]')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [None]:
class IMDbDataset(Dataset):
    """Map-style dataset over pre-tokenized reviews.

    Args:
        encodings: Mapping from field name to a per-example indexable
            collection (every value is indexed with the example index).
        labels: Per-example labels; its length defines the dataset length.
        texts: Per-example raw text, returned untouched alongside the tensors.
    """

    def __init__(self, encodings, labels, texts):
        self.encodings, self.labels, self.texts = encodings, labels, texts

    def __len__(self):
        # The label collection is the single source of truth for the size.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return one example: every encoding field and the label as tensors,
        plus the raw text under ``'text'``."""
        example = {}
        for field, column in self.encodings.items():
            example[field] = torch.tensor(column[idx])
        example['labels'] = torch.tensor(self.labels[idx])
        example['text'] = self.texts[idx]
        return example

In [None]:
# Tokenizer of the Pythia-70m checkpoint (plain string: there is nothing to interpolate).
gpt_neox_tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-70m")
# Reuse the end-of-sequence token for padding so that batched tokenization with
# `padding=True` has a pad token to use (presumably none is defined by default —
# verify against the checkpoint's tokenizer config).
gpt_neox_tokenizer.pad_token = gpt_neox_tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/396 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

In [None]:
def tokenize_data(dataset):
    """Tokenize the ``"text"`` column of ``dataset`` and wrap the result.

    Args:
        dataset: Object indexable by ``"text"`` and ``"label"``.

    Returns:
        IMDbDataset holding the tokenizer output, the labels and the raw texts.
        Sequences are truncated to at most 512 tokens and padded.
    """
    review_texts, review_labels = dataset["text"], dataset["label"]
    tokenizer_options = dict(truncation=True, padding=True, max_length=512)
    encoded = gpt_neox_tokenizer(review_texts, **tokenizer_options)
    return IMDbDataset(encoded, review_labels, review_texts)

In [None]:
# Tokenize the training subset once and keep it as an IMDbDataset.
tokenized_train = tokenize_data(train_dataset)

In [None]:
# One review per batch, visited in shuffled order (no seed is set in this cell,
# so the order differs between runs).
train_loader = DataLoader(tokenized_train, batch_size=1, shuffle=True)

### Load autoencoder

In [None]:
class SparseAutoencoder(nn.Module):
    """Single-layer ReLU autoencoder whose decoder reuses the encoder weights.

    The encoder matrix is row-normalized (L2) on every forward pass and its
    transpose serves as the decoder matrix; only the two bias vectors are
    separate parameters.

    Args:
        input_size: Width of the activations being encoded.
        hidden_size: Number of autoencoder features.
        l1_coef: Stored on the module (and in ``kwargs``); it is not used by
            ``forward``.
    """

    def __init__(self, input_size, hidden_size, l1_coef):
        super().__init__()
        self.hidden_size, self.input_size = hidden_size, input_size

        # Constructor arguments, kept so a checkpoint can rebuild the module.
        self.kwargs = dict(input_size=input_size, hidden_size=hidden_size, l1_coef=l1_coef)
        self.l1_coef = l1_coef

        # Random draw first, then overwritten in place by an orthogonal init.
        self.encoder_weight = nn.Parameter(torch.randn(hidden_size, input_size))
        nn.init.orthogonal_(self.encoder_weight)

        self.encoder_bias = nn.Parameter(torch.zeros(self.hidden_size))
        self.decoder_bias = nn.Parameter(torch.zeros(input_size))

    def forward(self, x):
        """Encode ``x`` and decode it again.

        Returns:
            ``(features, reconstruction)``: the ReLU feature activations and
            their projection back to the input width.
        """
        unit_rows = F.normalize(self.encoder_weight, p=2, dim=1)

        pre_activation = F.linear(x, unit_rows, self.encoder_bias)
        features = F.relu(pre_activation)

        # Tied weights: decode with the transpose of the normalized encoder.
        reconstruction = F.linear(features, unit_rows.t(), self.decoder_bias)
        return features, reconstruction

In [None]:
# Weights & Biases identifiers used by this notebook.
# NOTE(review): `entity_name` is not passed to `wandb.init` below, so the run is
# created under whatever entity W&B resolves by default — confirm this is intended.
entity_name = 'nlp_and_interpretability'

policy_model_name="pythia_70m_utility_reward"
project_prefix = 'Autoencoder_training'

# Project that receives this interpretability run.
interp_project_name = f"{project_prefix}_{policy_model_name}_interp"
run=wandb.init(project=interp_project_name)

VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

In [None]:
# Components of the W&B artifact path read by load_autoencoders_for_artifact.
# `entity_name` and `project_prefix` repeat the values assigned in the previous cell.
entity_name = 'nlp_and_interpretability'
project_prefix = 'Autoencoder_training'
artifact_prefix = 'autoencoders'

def load_autoencoders_for_artifact(policy_model_name, alias='latest', run=run):
    '''
    Download one autoencoder artifact through W&B and load every model group in it.

    The artifact path is assembled from the module-level `entity_name`,
    `project_prefix` and `artifact_prefix`, so it is partly hardcoded.
    For example: autoencoders_dict = load_autoencoders_for_artifact('pythia_70m_sentiment_reward')

    Returns a dict with the keys 'base_big', 'base_small', 'rlhf_big' and
    'rlhf_small'; each value is what load_models_from_folder returns for the
    sub-directory of that name under `<download dir>/saves`.
    '''
    # Artifact names use the last path component with dashes turned into underscores.
    artifact_suffix = policy_model_name.split('/')[-1].replace('-', '_')
    full_path = f'{entity_name}/{project_prefix}_{policy_model_name}/{artifact_prefix}_{artifact_suffix}:{alias}'
    print(f'Loading artifact from {full_path}')

    download_dir = run.use_artifact(full_path).download()
    save_dir = f'{download_dir}/saves'

    model_groups = ('base_big', 'base_small', 'rlhf_big', 'rlhf_small')
    return {
        group: load_models_from_folder(f'{save_dir}/{group}')
        for group in model_groups
    }

def load_models_from_folder(load_dir):
    """
    Load every autoencoder checkpoint file found directly inside a directory.

    Each file is expected to hold a ``(kwargs, state_dict)`` pair: ``kwargs`` are
    the SparseAutoencoder constructor arguments and ``state_dict`` its weights.
    Entries that are not regular files (for example a ``.ipynb_checkpoints``
    directory) are skipped instead of being handed to ``torch.load``.

    Args:
        load_dir (str): The directory from which models will be loaded.

    Returns:
        model_dict (dict): Maps each checkpoint file name to the loaded model,
        which is moved to the GPU when available and put in eval mode.
    """
    model_dict = {}

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Sorted so that the load order (and the printed log) is deterministic.
    for model_name in sorted(os.listdir(load_dir)):
        model_path = os.path.join(load_dir, model_name)

        if not os.path.isfile(model_path):
            continue

        # NOTE: torch.load unpickles the file — only load artifacts you trust.
        kwargs, state = torch.load(model_path, map_location=device)

        model = SparseAutoencoder(**kwargs)
        model.load_state_dict(state)
        model.to(device)
        model.eval()

        model_dict[model_name] = model
        print(f"Loaded {model_name} from {model_path}")

    return model_dict

In [None]:
# Download the latest autoencoder artifact for the policy model through the active
# W&B run and load all four model groups into memory.
loaded_models_dict = load_autoencoders_for_artifact(policy_model_name=policy_model_name, alias="latest", run=run)

Loading artifact from nlp_and_interpretability/Autoencoder_training_pythia_70m_utility_reward/autoencoders_pythia_70m_utility_reward:latest


[34m[1mwandb[0m:   20 of 20 files downloaded.  


Loaded 1 from /content/artifacts/autoencoders_pythia_70m_utility_reward:v11/saves/base_big/1
Loaded 2 from /content/artifacts/autoencoders_pythia_70m_utility_reward:v11/saves/base_big/2
Loaded 3 from /content/artifacts/autoencoders_pythia_70m_utility_reward:v11/saves/base_big/3
Loaded 4 from /content/artifacts/autoencoders_pythia_70m_utility_reward:v11/saves/base_big/4
Loaded 5 from /content/artifacts/autoencoders_pythia_70m_utility_reward:v11/saves/base_big/5
Loaded 1 from /content/artifacts/autoencoders_pythia_70m_utility_reward:v11/saves/base_small/1
Loaded 2 from /content/artifacts/autoencoders_pythia_70m_utility_reward:v11/saves/base_small/2
Loaded 3 from /content/artifacts/autoencoders_pythia_70m_utility_reward:v11/saves/base_small/3
Loaded 4 from /content/artifacts/autoencoders_pythia_70m_utility_reward:v11/saves/base_small/4
Loaded 5 from /content/artifacts/autoencoders_pythia_70m_utility_reward:v11/saves/base_small/5
Loaded 1 from /content/artifacts/autoencoders_pythia_70m_uti

In [None]:
# The Pythia checkpoint is of model type `gpt_neox`. Instantiating it through the
# GPT-Neo classes matches none of the checkpoint's parameter names, so the weights
# are left randomly initialised (transformers reports this as "Some weights ...
# were not initialized from the model checkpoint"). AutoModelForCausalLM reads the
# model type from the checkpoint config and builds the matching architecture.
# NOTE: a GPT-NeoX model exposes its blocks as `model.gpt_neox.layers`, not
# `model.transformer.h`; code that indexes layers must use the former.
model = AutoModelForCausalLM.from_pretrained("EleutherAI/pythia-70m")
config = model.config
config.is_decoder = True

config.json:   0%|          | 0.00/567 [00:00<?, ?B/s]

You are using a model of type gpt_neox to instantiate a model of type gpt_neo. This is not supported for all configurations of models and can yield errors.


model.safetensors:   0%|          | 0.00/166M [00:00<?, ?B/s]

Some weights of GPTNeoForCausalLM were not initialized from the model checkpoint at EleutherAI/pythia-70m and are newly initialized: ['h.0.mlp.c_proj.weight', 'ln_f.bias', 'h.3.attn.attention.out_proj.weight', 'h.3.ln_1.weight', 'h.5.attn.attention.out_proj.weight', 'h.4.mlp.c_fc.weight', 'wte.weight', 'h.0.ln_2.weight', 'h.2.attn.attention.out_proj.bias', 'h.3.ln_1.bias', 'h.1.attn.attention.v_proj.weight', 'h.0.ln_1.weight', 'h.5.attn.attention.v_proj.weight', 'h.5.mlp.c_proj.weight', 'h.0.attn.attention.v_proj.weight', 'h.4.attn.attention.v_proj.weight', 'h.5.attn.attention.k_proj.weight', 'h.3.mlp.c_fc.bias', 'h.0.attn.attention.out_proj.weight', 'h.3.attn.attention.v_proj.weight', 'h.2.mlp.c_fc.bias', 'h.4.ln_2.weight', 'h.4.ln_1.weight', 'h.1.attn.attention.out_proj.bias', 'h.4.ln_1.bias', 'h.2.ln_1.bias', 'h.1.mlp.c_proj.weight', 'h.1.ln_2.bias', 'lm_head.weight', 'h.4.attn.attention.out_proj.weight', 'h.3.attn.attention.out_proj.bias', 'h.5.ln_2.bias', 'h.3.ln_2.weight', 'h.2.l

## Extract and Inject Activations Via Hooking

In [None]:
# (layer, feature_idx) pairs to ablate; the layer entry is converted with int()
# when the pairs are grouped by layer. Empty here, so no layer is selected.
ablation_data = []

In [None]:
# Group the feature indices to ablate by (integer) layer number,
# keeping the order in which they appear in `ablation_data`.
ablation_indices_by_layer = {}
for layer_str, feature_idx in ablation_data:
    layer = int(layer_str)
    ablation_indices_by_layer.setdefault(layer, []).append(feature_idx)

In [None]:
ablation_indices_by_layer

{}

In [None]:
# One 'base_small' autoencoder per layer that has features to ablate
# (the loaded models are keyed by the layer number as a string).
autoencoder_models = {
    layer_num: loaded_models_dict['base_small'][str(layer_num)]
    for layer_num in ablation_indices_by_layer
}
# Make sure each selected autoencoder lives on the notebook's compute device.
for ae in autoencoder_models.values():
    ae.to(device)

In [None]:
def create_mlp_hook(layer_num, ablation_indices):
    """Build a forward hook that ablates autoencoder features of an MLP output.

    The hook encodes the module output with ``autoencoder_models[layer_num]``,
    sets the feature activations listed in ``ablation_indices`` to zero, decodes
    the remaining features with the autoencoder's tied decoder, and returns that
    reconstruction as the module's new output.

    The ablation happens in feature space (last axis of the autoencoder's
    ``features``), so any index below the autoencoder's hidden size is valid.
    Zeroing the same index on the decoded activations would instead blank an
    activation channel and fail for indices beyond the activation width.

    Args:
        layer_num: Key into the module-level ``autoencoder_models`` dict.
        ablation_indices: Iterable of feature indices to zero.

    Returns:
        A function with the ``(module, input, output)`` forward-hook signature.
    """
    def mlp_forward_hook(module, input, output):
        autoencoder_model = autoencoder_models[layer_num]
        encoder_weight = autoencoder_model.encoder_weight

        # The autoencoder may sit on a different device than the hooked model.
        activations = output.to(device=encoder_weight.device, dtype=encoder_weight.dtype)
        features, _ = autoencoder_model(activations)

        # Out-of-place fill keeps the hook usable when gradients are tracked.
        ablate = torch.tensor(list(ablation_indices), dtype=torch.long, device=features.device)
        features = features.index_fill(-1, ablate, 0.0)

        # Same decoder as SparseAutoencoder.forward: transpose of the
        # row-normalized encoder matrix plus the decoder bias.
        decoder_weight = F.normalize(encoder_weight, p=2, dim=1).t()
        reconstructed_activations = F.linear(features, decoder_weight, autoencoder_model.decoder_bias)

        return reconstructed_activations.to(device=output.device, dtype=output.dtype)
    return mlp_forward_hook

In [None]:
# Detach hooks left over from an earlier run of this cell so that re-running it
# does not stack several hooks on the same MLP.
for stale_handle in globals().get("hook_handles", []):
    stale_handle.remove()

# GPT-NeoX models (e.g. Pythia) expose their blocks as `gpt_neox.layers`;
# GPT-Neo style models expose them as `transformer.h`.
if hasattr(model, "gpt_neox"):
    transformer_blocks = model.gpt_neox.layers
else:
    transformer_blocks = model.transformer.h

# Register one ablation hook per layer that has feature indices to ablate and
# keep the handles so the hooks can be removed later.
hook_handles = []
for layer_num, indices in ablation_indices_by_layer.items():
    mlp_layer = transformer_blocks[layer_num].mlp
    hook_fn = create_mlp_hook(layer_num, indices)
    handle = mlp_layer.register_forward_hook(hook_fn)
    hook_handles.append(handle)

In [None]:
def output_and_decode(input_ids_tensor, model):
    """Run ``model`` on a batch of token ids and decode its arg-max predictions.

    Args:
        input_ids_tensor: Token ids passed to the model as ``input_ids``.
        model: Callable returning an object with a ``logits`` attribute.

    Returns:
        One decoded string per row: at every position the highest-scoring token
        of the logits is taken, and the row is decoded with
        ``gpt_neox_tokenizer`` (special tokens skipped).
    """
    with torch.no_grad():
        logits = model(input_ids=input_ids_tensor).logits
        predicted_ids = logits.argmax(dim=-1)

        decoded_rows = []
        for row in predicted_ids:
            decoded_rows.append(gpt_neox_tokenizer.decode(row, skip_special_tokens=True))
        return decoded_rows

In [None]:
download('vader_lexicon')

# Let NLTK locate the lexicon in whichever data directory it was downloaded to
# and read it straight out of the zip archive; this avoids hard-coding
# `/root/nltk_data` and unpacking the archive by hand.
lexicon_text = nltk.data.load(
    'sentiment/vader_lexicon.zip/vader_lexicon/vader_lexicon.txt', format='text'
)

# Each line is tab-separated; the first two fields are the token and its score.
vader_lexicon = {}
for line in lexicon_text.split('\n'):
    if not line.strip():
        continue  # tolerate blank lines (e.g. a trailing newline)
    word, score = line.strip().split('\t')[:2]
    vader_lexicon[word] = float(score)

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [None]:
def calculate_sentiment(text, lexicon):
    """Sum the lexicon scores of the whitespace-separated tokens of ``text``.

    Tokens are looked up exactly as they appear (no case folding or punctuation
    stripping); tokens missing from ``lexicon`` contribute 0.0.

    Returns:
        The summed score, or 0.0 when ``text`` contains no tokens.
    """
    words = text.split()
    if not words:
        return 0.0
    return sum(lexicon.get(word, 0.0) for word in words)

In [None]:
# Only the first MAX_PROMPT_TOKENS token positions of each review are fed to the model.
MAX_PROMPT_TOKENS = 30

total_score = 0.0
total_texts = 0

try:
    for batch in train_loader:
        input_ids = batch['input_ids'][:, :MAX_PROMPT_TOKENS]
        # Keep the inputs on the same device as the model's weights.
        input_ids = input_ids.to(model.device)
        decoded_texts = output_and_decode(input_ids, model)

        for text in decoded_texts:
            sentiment_score = calculate_sentiment(text, vader_lexicon)
            total_score += sentiment_score
            total_texts += 1
finally:
    # Always detach the ablation hooks, even if evaluation fails part-way,
    # so they do not stay attached to the model.
    for handle in hook_handles:
        handle.remove()

# Mean lexicon score per decoded text (0 when nothing was evaluated).
average_sentiment = total_score / total_texts if total_texts > 0 else 0
print("Average Sentiment Score:", average_sentiment)

Average Sentiment Score: 0.2617999999999998
