### INSTALLS AND IMPORTS

In [None]:
import json
import os
import re

from datasets import Dataset, DatasetDict, load_dataset, load_from_disk
import wandb
import torch
import torch.nn as nn
import torch.nn.functional as F
from collections import OrderedDict
from transformers import AutoModel, AutoTokenizer
import requests
from scipy.sparse import csr_matrix
from tqdm import tqdm

In [None]:
import huggingface_hub
import transformers
from transformers import AutoModel, AutoTokenizer

In [None]:
! rm -rf Interpreting-Reward-Models || true
! git clone https://github.com/apartresearch/Interpreting-Reward-Models.git
! cd Interpreting-Reward-Models && pip install .

In [None]:
from reward_analyzer import SparseAutoencoder, TaskConfig
from reward_analyzer.utils.model_storage_utils import load_autoencoders_for_artifact, load_latest_model_from_hub, download_folder_from_hub
from reward_analyzer.utils.transformer_utils import batch
from reward_analyzer.configs.project_configs import HuggingfaceConfig

In [None]:
# Path of the contrastive-triples dataset. The same string is passed both as
# the folder to fetch from the hub and as the local folder to download into.
contrastive_triples_path = 'data/contrastive_triples_rlhf.dataset/2024-05-15_14'
download_folder_from_hub(folder_path=contrastive_triples_path, local_folder=contrastive_triples_path)

In [None]:
# Override with your own config if not using Amir's huggingface hub account.
# The `repo_id` of this config is the upload target used at the end of the notebook.
huggingface_config = HuggingfaceConfig()

In [None]:
# Load the dataset from the local folder populated by the download step.
contrastive_dataset = load_from_disk(contrastive_triples_path)

In [None]:
# Hub ids of the model checkpoints this notebook has layer-name handling for.
supported_model_names = ['EleutherAI/pythia-70m', 'EleutherAI/pythia-160m', 'google/gemma-2b-it', 'EleutherAI/gpt-neo-125m']

In [None]:
# Model used for the rest of the notebook. To switch, uncomment one of the
# alternatives below (a later assignment overrides this one).
model_name = 'EleutherAI/pythia-70m'
#model_name = 'EleutherAI/pythia-160m'
#model_name = 'EleutherAI/gpt-neo-125m'
#model_name = 'google/gemma-2b-it'

In [None]:
# Tokenizer for the selected model. The EOS token is used for padding only
# when the tokenizer does not already define a pad token.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.pad_token or tokenizer.eos_token

In [None]:
# Task configuration and artifact version used when loading the model and
# the autoencoder artifact further down.
task = TaskConfig.HH_RLHF
task_name = task.name
version = 'v0'

# Module-name template of the MLP layers for each model family. Entries are
# tried in order and the first substring found in `model_name` wins.
_layer_name_stems = (
    ('pythia', 'layers.{}.mlp'),
    ('neo', 'h.{}.mlp'),
    ('gemma', 'layers.{}.mlp'),
)

for _family, _stem in _layer_name_stems:
    if _family in model_name:
        layer_name_stem = _stem
        break
else:
    # No family matched: fail loudly rather than guess a layer naming scheme.
    raise Exception(f'Not familiar with model name family of {model_name}')

### Load model and autoencoder artifacts.

In [None]:
# NOTE(review): the tokenizer was already created in an earlier cell. This
# re-creates it and sets the pad token to EOS unconditionally, whereas the
# earlier cell keeps an existing pad token when there is one. Confirm which
# behaviour is intended (it matters for tokenizers that ship a pad token).
tokenizer =  AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Note that this downloads the RLHF tuned version of similar stem from our huggingface hub repo, not the base model.
model = load_latest_model_from_hub(model_name = model_name, task_config=task)
# Bare expression: as the last line of a notebook cell it displays the device.
model.device

In [None]:
# Start a W&B run in the per-task project; `wandb.log_artifact` at the end of
# the notebook logs into this run.
wandb.init(project = f'Autoencoder_training_{task.name}')

In [None]:
# Artifact reference built as
#   nlp_and_interpretability/Autoencoder_training_<task>/autoencoders_<model>_<task>:<version>
# where <model> is the part of `model_name` after the last '/', with '-' replaced by '_'.
autoencoders_dict = load_autoencoders_for_artifact(f'nlp_and_interpretability/Autoencoder_training_{task.name}/autoencoders_{model_name.split("/")[-1].replace("-", "_")}_{task_name}:{version}')

In [None]:
# Bare expression: displays the dataset's summary as the cell output.
contrastive_dataset

In [None]:
# Autoencoders stored under the 'rlhf_small' key of the loaded artifact.
rlhf_small = autoencoders_dict['rlhf_small']

# Same device choice as the activation-extraction code below: use the GPU when
# one is available, otherwise stay on the CPU instead of crashing in `.cuda()`.
_autoencoder_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
for autoencoder in rlhf_small.values():
    autoencoder.to(_autoencoder_device)

In [None]:
def dump_data_to_jsonl(data: dict, filename: str):
    """Append column-oriented ``data`` to ``filename`` as JSON Lines.

    ``data`` maps each field name to a list of values. Output row ``i`` is the
    JSON object ``{field: values[i]}`` taken across all fields. The file is
    opened in append mode, so repeated calls accumulate rows.

    Args:
        data: Mapping of field name to equally long lists of
            JSON-serialisable values. An empty mapping appends no rows.
        filename: Path of the JSONL file to append to (created if missing).

    Raises:
        ValueError: If the value lists do not all have the same length.
    """
    list_lengths = [len(value_list) for value_list in data.values()]

    # Comparing via a set (rather than min()/max()) also copes with an empty
    # mapping, and an explicit raise is not stripped under `python -O`.
    if len(set(list_lengths)) > 1:
        raise ValueError(f'Expected list lengths to be the same! Instead got {list_lengths}')

    field_names = list(data)
    # zip(*columns) transposes the column lists into one tuple per output row.
    all_lines = [
        json.dumps(dict(zip(field_names, row_values))) + '\n'
        for row_values in zip(*data.values())
    ]

    with open(filename, 'a') as jsonl_file:
        # Single batched write for all rows of this call.
        jsonl_file.writelines(all_lines)

In [None]:
def features_from_single_input(single_input):
    """Collapse one input's features into a single vector.

    Returns the mean of ``single_input`` over dimension 0.
    """
    return torch.mean(single_input, dim=0)


def extract_and_process_activations(texts, model, tokenizer, layer_name_stem, autoencoders_dict, with_full_activations=False):
    """Run ``model`` on ``texts`` and encode hooked layer outputs with autoencoders.

    Forward hooks capture the output of every module whose name equals
    ``layer_name_stem.format(key)`` for a key of ``autoencoders_dict``. Each
    captured activation is passed through the matching autoencoder, and the
    resulting features are averaged per input with
    ``features_from_single_input``.

    Args:
        texts: List of input strings (one batch).
        model: Module to run; its output must expose ``last_hidden_state``.
        tokenizer: Callable tokenizer returning a mapping that contains
            ``"input_ids"``; also used for ``convert_ids_to_tokens``.
        layer_name_stem: Format string turning an autoencoder key into a
            module name, e.g. ``'layers.{}.mlp'``.
        autoencoders_dict: Mapping of layer key to autoencoder. Each
            autoencoder is called on the activation tensor and must return a
            pair whose first element is the feature tensor. Note that every
            autoencoder is moved to the CPU by this function.
        with_full_activations: When true, also return the raw hooked
            activations under ``activations_<layer>``.

    Returns:
        Dict of equally long per-input lists: ``texts``, ``token_ids``,
        ``token_embeddings`` (the ``last_hidden_state`` entry at sequence
        index 0 of each input), ``tokens``, and for each layer
        ``averaged_reprs_<layer>`` (plus ``activations_<layer>`` if requested).

    Raises:
        KeyError: If no module named after one of the autoencoder keys
            produced an activation.
    """
    inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=128)
    # Keep the batch dimension. A bare squeeze() would drop it for a batch of
    # one text, leaving the per-input lists below with mismatched lengths.
    token_ids = inputs["input_ids"].tolist()

    # Send the inputs to wherever the model actually lives; only a module
    # without parameters falls back to the CUDA-if-available choice.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        device = first_param.device
    else:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    inputs = {key: value.to(device) for key, value in inputs.items()}
    activations = {}

    target_layer_names = {layer_name_stem.format(key) for key in autoencoders_dict}

    def get_activation(name):
        def hook(_module, _hook_inputs, output):
            activations[name] = output.detach()
        return hook

    hooks = [
        module.register_forward_hook(get_activation(name))
        for name, module in model.named_modules()
        if name in target_layer_names
    ]

    try:
        with torch.no_grad():
            outputs = model(**inputs)
    finally:
        # Always detach the hooks, even when the forward pass fails, so they
        # do not pile up on the model across calls.
        for hook in hooks:
            hook.remove()

    # Index 0 along dimension 1 of last_hidden_state, one vector per input.
    # Assumes last_hidden_state is (batch, seq, hidden) - TODO confirm.
    first_position_embeddings = outputs.last_hidden_state[:, 0].detach().tolist()

    all_features = {
        "texts": texts,
        "token_ids": token_ids,
        "token_embeddings": first_position_embeddings,
        "tokens": [tokenizer.convert_ids_to_tokens(ids) for ids in token_ids],
    }

    for layer_num, autoencoder in autoencoders_dict.items():
        activation_values = activations[layer_name_stem.format(layer_num)].cpu()
        autoencoder = autoencoder.cpu()
        # Inference only: no autograd graph is needed for the encoded features.
        with torch.no_grad():
            batch_features, _ = autoencoder(activation_values)

        if with_full_activations:
            all_features[f'activations_{layer_num}'] = activation_values.tolist()

        # NOTE(review): the tokenizer pads the batch, and the mean runs over
        # dimension 0 of each input's features, so padded positions appear to
        # be included in the average - confirm this is intended.
        all_features[f'averaged_reprs_{layer_num}'] = [
            features_from_single_input(single_feature).tolist()
            for single_feature in batch_features
        ]

    return all_features

import shutil

def remove_path_if_exists(path):
    """Delete ``path`` from disk if present, reporting the outcome on stdout.

    A regular file is removed with ``os.remove``; a directory is removed
    recursively with ``shutil.rmtree``. A missing path only produces a message.
    """
    if not os.path.exists(path):
        print(f"The path {path} does not exist.")
        return

    if os.path.isfile(path):
        os.remove(path)
        print(f"File {path} has been deleted.")
        return

    if os.path.isdir(path):
        shutil.rmtree(path)
        print(f"Directory {path} and all its contents have been deleted.")


def extract_features_batched(texts, model_name, model, tokenizer, layer_name_stem, autoencoders_dict, source='', output_filestem=None, batch_size=8, with_full_activations=False):
    """Extract features for ``texts`` batch by batch and save them as a dataset.

    Each batch is processed by ``extract_and_process_activations`` and appended
    to ``<output_filestem>.jsonl``. Once all batches are written, the JSONL
    file is loaded as a dataset and saved to ``<output_filestem>.hf``. Any
    pre-existing file or folder at either path is removed first.

    When ``output_filestem`` is not given, it is derived from the short model
    name, the module-level ``task`` and ``source``.

    Returns:
        Tuple ``(dataset, hf_path)``: the loaded dataset and the path it was
        saved to.
    """
    model_name = model_name.split("/")[-1].replace("-", "_")
    if not output_filestem:
        output_filestem = f'./{model_name}_{task.name}_{source}_activations_dataset'

    jsonl_path = f'{output_filestem}.jsonl'
    hf_path = f'{output_filestem}.hf'

    # Start from a clean slate: the JSONL file is written in append mode.
    for stale_path in (jsonl_path, hf_path):
        remove_path_if_exists(stale_path)

    for text_batch in tqdm(batch(texts, n=batch_size)):
        batch_features = extract_and_process_activations(
            text_batch,
            model,
            tokenizer,
            layer_name_stem,
            autoencoders_dict,
            with_full_activations=with_full_activations,
        )
        dump_data_to_jsonl(batch_features, filename=jsonl_path)

    dataset = load_dataset("json", data_files=jsonl_path, split='train')
    dataset.save_to_disk(hf_path)
    return dataset, hf_path

In [None]:
def generate_datasets(source_dataset, model_name, model, tokenizer, layer_name_stem, autoencoders_dict, task, with_full_activations=False):
    """Build one activations dataset per text column and save them together.

    The 'chosen' and 'new_rejected' columns of ``source_dataset`` are each run
    through ``extract_features_batched``. The results are combined into a
    ``DatasetDict`` keyed by column name and saved to disk.

    Returns:
        Tuple ``(merged_dataset, filename)``: the ``DatasetDict`` and the
        folder it was saved to.
    """
    # Arguments that are identical for every column.
    shared_kwargs = dict(
        model_name=model_name,
        model=model,
        tokenizer=tokenizer,
        layer_name_stem=layer_name_stem,
        autoencoders_dict=autoencoders_dict,
        with_full_activations=with_full_activations,
    )

    split_datasets = {}
    for column in ('chosen', 'new_rejected'):
        # The per-column save path is not needed here; only the dataset is kept.
        activations_dataset, _ = extract_features_batched(
            texts=source_dataset[column], source=column, **shared_kwargs
        )
        split_datasets[column] = activations_dataset
        print(f'Dataset is of type {type(activations_dataset)}')

    merged_dataset = DatasetDict(split_datasets)
    filename = f'merged_contrastive_{model_name.split("/")[-1].replace("-", "_")}_{task.name}_activations_and_features.hf'
    merged_dataset.save_to_disk(filename)

    return merged_dataset, filename

# Build the per-column activation datasets using the 'rlhf_small' autoencoders.
# `filename` is the folder the merged DatasetDict was saved to.
merged_dataset, filename = generate_datasets(
    source_dataset=contrastive_dataset, model_name=model_name, model=model, tokenizer=tokenizer,
    layer_name_stem=layer_name_stem, autoencoders_dict=rlhf_small, task=task
)

In [None]:
# merged_dataset is a DatasetDict keyed by column name, so this is the number
# of entries in that dict (one per column), not the number of rows.
len(merged_dataset)

In [None]:
# Upload the saved merged-dataset folder to the configured hub repo, placing
# it under data/<filename>.
api = huggingface_hub.HfApi()

api.upload_folder(
    repo_id=huggingface_config.repo_id,
    folder_path=filename,
    path_in_repo=f'data/{filename}',
    # NOTE(review): None presumably selects the library's default repo type - confirm.
    repo_type=None
)

In [None]:
def save_training_dataset_to_wandb(training_dataset: Dataset, model_name, dataset_name, task):
    """Save ``training_dataset`` to disk and log it as a W&B data artifact.

    Args:
        training_dataset: Dataset (or DatasetDict) to persist and log.
        model_name: Model identifier. Only the part after the last '/' is used,
            with '-' replaced by '_', because the raw hub id contains '/',
            which is not usable in an artifact name.
        dataset_name: Local folder the dataset is saved to; this folder is
            what gets attached to the artifact.
        task: Task config whose ``name`` is embedded in the artifact name.
    """
    # save_to_disk writes a folder and returns nothing, so the folder path we
    # passed in (not the return value) is what must be attached below.
    training_dataset.save_to_disk(dataset_name)

    short_model_name = model_name.split("/")[-1].replace("-", "_")
    my_artifact = wandb.Artifact(f"logistic_probe_training_dataset_{short_model_name}_{task.name}", type="data")

    # The saved dataset is a directory, so it is added with add_dir.
    my_artifact.add_dir(local_path=dataset_name, name="logistic_probe_training_dataset")

    # len() of a DatasetDict counts its splits; sum the rows of every split so
    # that the metadata reports actual examples.
    if isinstance(training_dataset, DatasetDict):
        num_examples = sum(len(split) for split in training_dataset.values())
    else:
        num_examples = len(training_dataset)

    metadata_dict = {
        "description": "Training dataset, with activations and rewards",
        "source": "Generated by my script",
        "num_examples": num_examples,
        "split": "full"
    }

    my_artifact.metadata.update(metadata_dict)

    # Log the artifact to the run
    wandb.log_artifact(my_artifact)


# Reuse the folder name of the merged dataset as the local save location.
save_training_dataset_to_wandb(merged_dataset, model_name=model_name, dataset_name=filename, task=task)