### Load training dataset and Vader sentiment analyzer

In [190]:
import dataclasses
import json
import os

import nltk
import spacy
import torch
import transformers
import wandb

from dataclasses import dataclass
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from torch import nn
from transformers import AutoModelForCausalLM, AutoTokenizer

In [162]:
model_name = 'gpt-neo-125m'
os.environ['WANDB_API_KEY'] = ''

In [374]:
full_model_name = f'EleutherAI/{model_name}'

model = AutoModelForCausalLM.from_pretrained(full_model_name)
tokenizer = AutoTokenizer.from_pretrained(full_model_name)
tokenizer.pad_token = tokenizer.eos_token

In [143]:
sentiment_analyzer = SentimentIntensityAnalyzer()
lexicon = sentiment_analyzer.lexicon

## Tokenization utilities.

In [187]:
def check_number_of_tokens(word, tokenizer=tokenizer):
    return len(tokenizer(word)['input_ids'])

In [326]:
def get_tokens_and_ids(text, tokenizer=tokenizer):
    input_ids = tokenizer(text.lower())['input_ids']
    
    tokens = [tokenizer.decode(input_id) for input_id in input_ids]
    # The above produces artifacts such as a " positive" token and id, instead of "positive". So we redo this.

    tokens = [token.lower().strip() for token in tokens]
    tokenizer
    return tokens, input_ids

In [327]:
def get_single_target_token_id(word, tokenizer=tokenizer):
    word = word.lower().strip()
    num_tokens = check_number_of_tokens(word)
    if num_tokens > 1:
        # Backoff to include a single space.
        word = f' {word}'
        num_tokens = check_number_of_tokens(word)

    return tokenizer(word)['input_ids'][0]

In [328]:
@dataclass
class TextTokensIdsTarget:
    text: str
    tokens: list[str]
    ids: list[int]
    target_token: str
    target_token_id: int

def trim_example(input_text: str, target_words: list[str], verbose=False, tokenizer=tokenizer):
    single_target_token_ids = [get_single_target_token_id(word.strip().lower()) for word in target_words]
    
    single_target_token_ids = [token_id for token_id in single_target_token_ids if token_id]
    single_target_tokens = [tokenizer.decode(token_id).strip().lower() for token_id in single_target_token_ids]

    input_tokens, input_token_ids = get_tokens_and_ids(input_text)

    trimmed_input_tokens = []
    trimmed_input_token_ids = []

    for input_token, input_token_id in zip(input_tokens, input_token_ids):
        trimmed_input_tokens.append(input_token)
        trimmed_input_token_ids.append(input_token_id)
        if input_token.strip().lower() in single_target_tokens:
            break

    last_token = None

    if trimmed_input_tokens:
        last_token = trimmed_input_tokens[-1].lower().strip()
        last_token_id = trimmed_input_token_ids[-1]
    
    if last_token and last_token in single_target_tokens:
        text = tokenizer.decode(trimmed_input_token_ids)
        return TextTokensIdsTarget(
            text=text, tokens=trimmed_input_tokens, ids=trimmed_input_token_ids, 
            target_token=last_token, target_token_id = last_token_id)
    else:
        if verbose:
            print(f'last token was {last_token} in {trimmed_input_tokens}.')
        return None

### Load training examples for linear probe.

In [292]:
def load_wandb_json_artifact(
    project_name='utility_reconstruction', artifact_name = 'contrastive_sentiment_pairs', version='v5'
):
    api = wandb.Api()
    artifact = api.artifact(f'nlp_and_interpretability/{project_name}/{artifact_name}:{version}', type='data')
    artifact_dir = artifact.download()

    with open(f'artifacts/{artifact_name}:{version}/{artifact_name}', 'r') as f_in:
        result = json.load(f_in)
        return result

all_input_dicts = load_wandb_json_artifact()

[34m[1mwandb[0m:   1 of 1 files downloaded.  


In [330]:
class TrainingPoint:

    def __init__(self, input_dict: dict, tokenizer=tokenizer):
        self.input_dict = input_dict
        self.positive_text = input_dict['input_text']
        self.negative_text = input_dict['output_text']

        self.positive_text_tokens, self.positive_input_ids = get_tokens_and_ids(self.positive_text)
        self.negative_text_tokens, self.negative_token_ids = get_tokens_and_ids(self.negative_text)
        
        self.positive_words = input_dict['positive_words']
        self.negative_words = list(input_dict['new_words'].values())

        try:
            self.trimmed_positive_example: TextTokensIdTarget = trim_example(self.positive_text, self.positive_words)
        except Exception as e:
            print(f'Caught exception {e} on {input_dict}.')
            self.trimmed_positive_example = None
        
        try:
            self.trimmed_negative_example: TextTokensIdTarget = trim_example(self.negative_text, self.negative_words)
        except Exception as e:
            print(f'Caught exception {e} on {input_dict}.')
            self.trimmed_negative_example = None

In [331]:
training_points = [TrainingPoint(input_dict) for input_dict in all_input_dicts]

Caught exception list index out of range on {'input_text': 'It seems like anybody can make a movie nowadays.', 'output_text': 'It seems anybody can make a movie nowadays.', 'positive_words': ['like'], 'new_words': {'like': ''}}.


In [332]:
successful_training_points = [
    training_point for training_point in training_points if 
    training_point.trimmed_positive_example and training_point.trimmed_negative_example
]

In [342]:
len(successful_training_points)

8352

### Load autoencoders for linear probe.

In [23]:
run = wandb.init('utility_reconstruction')

[34m[1mwandb[0m: Currently logged in as: [33mamirali1985[0m ([33mnlp_and_interpretability[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [343]:
class SparseAutoencoder(nn.Module):
    def __init__(self, input_size, hidden_size, l1_coef):
        super(SparseAutoencoder, self).__init__()
        self.hidden_size = hidden_size
        self.input_size = input_size

        self.kwargs = {'input_size': input_size, 'hidden_size': hidden_size, 'l1_coef': l1_coef}
        self.l1_coef = l1_coef

        self.encoder_weight = nn.Parameter(torch.randn(hidden_size, input_size))
        nn.init.orthogonal_(self.encoder_weight)

        self.encoder_bias = nn.Parameter(torch.zeros(self.hidden_size))
        self.decoder_bias = nn.Parameter(torch.zeros(input_size))

    def forward(self, x):
        normalized_encoder_weight = F.normalize(self.encoder_weight, p=2, dim=1)

        features = F.linear(x, normalized_encoder_weight, self.encoder_bias)
        features = F.relu(features)

        reconstruction = F.linear(features, normalized_encoder_weight.t(), self.decoder_bias)

        return features, reconstruction

In [344]:
entity_name = 'nlp_and_interpretability'
project_prefix = 'Autoencoder_training'
artifact_prefix = 'autoencoders'

def load_autoencoders_for_artifact(policy_model_name, alias='latest', run=run):
    '''
    Loads the autoencoders from one run into memory. Note that these paths are to some extent hardcoded
    For example, try autoencoders_dict = load_autoencoders_for_artifact('pythia_70m_sentiment_reward')
    '''
    simplified_policy_model_name = policy_model_name.split('/')[-1].replace('-', '_')
    full_path = f'{entity_name}/{project_prefix}_{simplified_policy_model_name}/{artifact_prefix}_{simplified_policy_model_name}:{alias}'
    print(f'Loading artifact from {full_path}')

    artifact = run.use_artifact(full_path)
    directory = artifact.download()

    save_dir = f'{directory}/saves'
    autoencoders_base_big = load_models_from_folder(f'{save_dir}/base_big')
    autoencoders_base_small = load_models_from_folder(f'{save_dir}/base_small')
    autoencoders_rlhf_big = load_models_from_folder(f'{save_dir}/rlhf_big')
    autoencoders_rlhf_small = load_models_from_folder(f'{save_dir}/rlhf_small')

    return {
        'base_big': autoencoders_base_big, 'base_small': autoencoders_base_small,
        'rlhf_big': autoencoders_rlhf_big, 'rlhf_small': autoencoders_rlhf_small
    }

def load_models_from_folder(load_dir):
    """
    Load PyTorch models from subfolders of a directory into a dictionary where keys are subfolder names.

    Args:
        load_dir (str): The directory from which models will be loaded.

    Returns:
        model_dict (dict): A dictionary where keys are subfolder names and values are PyTorch models.
    """
    model_dict = {}

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    for model_name in sorted(os.listdir(load_dir)):
        model_path = os.path.join(load_dir, model_name)

        kwargs, state = torch.load(model_path, map_location=device)

        model = SparseAutoencoder(**kwargs)
        model.load_state_dict(state)
        model.to(device)
        model.eval()

        model_dict[model_name] = model
        print(f"Loaded {model_name} from {model_path}")

    return model_dict

### Extract Activations.

In [113]:
gpt_neo_target_layers = [f'transformer.h.{layer_no}.mlp' for layer_no in range(12)]
gpt_neo_target_layers

['transformer.h.0.mlp',
 'transformer.h.1.mlp',
 'transformer.h.2.mlp',
 'transformer.h.3.mlp',
 'transformer.h.4.mlp',
 'transformer.h.5.mlp',
 'transformer.h.6.mlp',
 'transformer.h.7.mlp',
 'transformer.h.8.mlp',
 'transformer.h.9.mlp',
 'transformer.h.10.mlp',
 'transformer.h.11.mlp']

In [347]:
sample_training_points = successful_training_points[:5]

In [128]:
class ActivationHook:
    def __init__(self):
        self.activations = []

    def hook_fn(self, module, input, output):
        self.activations.append(output)

In [375]:
class ActivationsExtractor:
    def __init__(self, model, target_layers, tokenizer):
        self.model = model
        self.target_layers = target_layers
        self.tokenizer = tokenizer

        # Create an instance of ActivationHook
        self.activation_hooks = {}
        
        for layer_name in self.target_layers:
            activation_hook = ActivationHook()
            self.activation_hooks[layer_name] = activation_hook
            layer = dict(model.named_modules())[layer_name]
            # Register the forward hook to the chosen layer
            hook_handle = layer.register_forward_hook(activation_hook.hook_fn)

    def get_activations(self, texts: list[str]):
        # Forward pass your input through the model
        input_data = self.tokenizer(texts, return_tensors='pt', padding=True)  # Example input shape
        print(input_data)
        output = self.model(**input_data)
        print(f'We got {len(self.activation_hooks)}')

activations_extractor = ActivationsExtractor(
    model=model, target_layers=gpt_neo_target_layers, tokenizer=tokenizer
)

In [376]:
all_input_texts = [text['input_text'] for text in all_texts]
all_output_texts = [text['output_text'] for text in all_texts]

In [377]:
activations_extractor.get_activations(["This is a positive development", "This is a sad situation but we will figure this out bro."])

{'input_ids': tensor([[ 1212,   318,   257,  3967,  2478, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256],
        [ 1212,   318,   257,  6507,  3074,   475,   356,   481,  3785,   428,
           503,  1379,    13]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
We got 12


In [364]:
len(activations_extractor.activation_hooks['transformer.h.0.mlp'].activations)

2

In [378]:
activations_extractor.activation_hooks['transformer.h.1.mlp'].activations[].shape

torch.Size([2, 13, 768])

In [68]:
text = "This is a positive development"
tokenizer = AutoTokenizer.from_pretrained(f'EleutherAI/{model_name}')
input_ids = tokenizer.encode(text)

In [75]:
model.transformer.config

GPTNeoConfig {
  "_name_or_path": "EleutherAI/gpt-neo-125m",
  "activation_function": "gelu_new",
  "architectures": [
    "GPTNeoForCausalLM"
  ],
  "attention_dropout": 0,
  "attention_layers": [
    "global",
    "local",
    "global",
    "local",
    "global",
    "local",
    "global",
    "local",
    "global",
    "local",
    "global",
    "local"
  ],
  "attention_types": [
    [
      [
        "global",
        "local"
      ],
      6
    ]
  ],
  "bos_token_id": 50256,
  "classifier_dropout": 0.1,
  "embed_dropout": 0,
  "eos_token_id": 50256,
  "gradient_checkpointing": false,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": null,
  "layer_norm_epsilon": 1e-05,
  "max_position_embeddings": 2048,
  "model_type": "gpt_neo",
  "num_heads": 12,
  "num_layers": 12,
  "resid_dropout": 0,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "transform

In [None]:
class ModifiedGPTNeoXForCausalLM(GPTNeoXForCausalLM):
    def __init__(self, config, original_model=None):
        super().__init__(config)
        if original_model:
            self.embed_in = original_model.embed_in
            self.emb_dropout = original_model.emb_dropout
            self.layers = original_model.layers
            self.final_layer_norm = original_model.final_layer_norm

    def custom_forward(self, reconstructed_activations, start_layer=1):
        hidden_states = reconstructed_activations
        for layer_module in self.layers[start_layer:]:
            hidden_states = layer_module(hidden_states)[0]

        if hasattr(self, 'final_layer_norm'):
            hidden_states = self.final_layer_norm(hidden_states)

        logits = self.embed_out(hidden_states)
        return logits

In [None]:
def get_reconstructed_activations(autoencoders_dict, layer_number, model, tokenizer, text):
    with torch.no_grad():
        embeddings = original_model.embed_in(input_ids_tensor)
        embeddings = original_model.emb_dropout(embeddings)
        first_layer_output = original_model.layers[0](embeddings)
        mlp_activations = first_layer_output[0].float()

        _, reconstructed_activations = autoencoder_model(mlp_activations)

    return reconstructed_activations