In [1]:

import os

# Set the tokenizer-parallelism flag in this process's environment up front,
# in the first cell, so it is in place before the later cells import and run
# anything else (presumably read by the Hugging Face `tokenizers` library).
os.environ.update({"TOKENIZERS_PARALLELISM": "true"})


## Import dataset

In [None]:
from dataset import create_dataloaders

# Build the train/test dataloaders for every dataset in a single call.
# NOTE(review): max_samples=100 presumably caps each dataset for quick
# iteration -- confirm against dataset.create_dataloaders.
dataloaders = create_dataloaders(
    batch_size=32, train_split=0.9, shuffle=True,
    num_workers=4, seed=42, max_samples=100,
)

# Report the split sizes of each dataset and show one training example.
for dataset_name, loaders in dataloaders.items():
    train_loader, test_loader = loaders['train'], loaders['test']

    # Sizes come from the underlying datasets, not from the number of batches.
    train_samples = len(train_loader.dataset)
    test_samples = len(test_loader.dataset)
    total_samples = train_samples + test_samples

    print(
        f"\nDataset: {dataset_name}",
        f"Total samples: {total_samples}",
        f"Training samples: {train_samples}",
        f"Test samples: {test_samples}",
        sep="\n",
    )

    # Pull a single batch from the training loader and display its first element.
    batch = next(iter(train_loader))
    print("\nExample sample:")
    print(f"Question: {batch['question'][0]}")
    print(f"Sycophantic answer: {batch['sycophantic_answer'][0]}")
    print(f"Non-sycophantic answer: {batch['non_sycophantic_answer'][0]}")

## Load the hooked transformer model


In [None]:
# Import required libraries
from transformer_lens import HookedTransformer
import torch

# Pick the best available accelerator: CUDA first, then Apple MPS, and finally
# plain CPU. Previously the fallback was unconditionally 'mps', which fails on
# machines that have neither CUDA nor MPS.
if torch.cuda.is_available():
    device = torch.device('cuda')
elif torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')

# Half precision keeps the model small on GPU/MPS. On CPU fall back to float32,
# since float16 support for CPU ops is limited.
model_dtype = torch.float16 if device.type != 'cpu' else torch.float32

# Load the Gemma model
model = HookedTransformer.from_pretrained(
    "gemma-2b-it",
    device=device,
    dtype=model_dtype,
)


## Collect Activations


In [None]:
from transformer_lens import HookedTransformer
import torch
from dataset import create_dataloaders
from activations import collect_and_save_activations, load_activation_dataloaders
from probe import train_probe, evaluate_probe

# Directory handed to both the collection and the loading helpers below.
save_dir = ".data/activations"

# Build the list of residual-stream hook names: three per transformer block.
# An explicit loop (rather than a comprehension) is kept on purpose: a later
# cell reads the loop variable `layer` after this cell has run.
hooks = []
for layer in range(model.cfg.n_layers):
    hooks.extend([
        f'blocks.{layer}.hook_resid_pre',  # Before attention
        f'blocks.{layer}.hook_resid_mid',  # After attention, before MLP
        f'blocks.{layer}.hook_resid_post'  # After MLP
    ])
print(len(hooks))

# Collect activations for each dataset
# NOTE(review): only the 'train' loader of each dataset is passed here --
# confirm that the test split does not need its own collection pass.
for dataset_name, loaders in dataloaders.items():
    collect_and_save_activations(
        model,
        loaders['train'],
        hooks,
        save_dir,
        dataset_name
    )

# Load activation dataloaders for the "nlp" dataset at the first hook point
# (hooks[0] is 'blocks.0.hook_resid_pre').
activation_loaders = load_activation_dataloaders(
    save_dir,
    model.cfg.model_name,
    "nlp",
    hook=hooks[0],
    batch_size=32
)
# Free up GPU memory
# del model
# torch.cuda.empty_cache()



# Train probe
# The probe input width is read from the model config instead of being
# hard-coded (it was 2048), so the cell keeps working if the model is swapped.
# Assumes the stored activations have width d_model (residual-stream hooks).
probe, losses = train_probe(
    activation_loaders['train'],
    input_dim=model.cfg.d_model,  # Model's hidden dimension
    device=device
)

# # Evaluate probe
# results = evaluate_probe(probe, activation_loaders['test'], device)
# print(f"\nTest Results:")
# print(f"Accuracy: {results['accuracy']:.2%}")
# print(f"Total samples: {results['total_samples']}")

## Train Probe


In [None]:
# Reload the activation dataloaders for the dataset the probe is trained on.
# The argument order now matches the call in the activation-collection cell
# (save_dir, model name, dataset name). This cell previously passed
# (save_dir, "nlp", layer), which disagreed with that call and relied on the
# `layer` loop variable leaking out of an earlier cell.
# NOTE(review): confirm the argument order against
# activations.load_activation_dataloaders.
activation_loaders = load_activation_dataloaders(
    save_dir,
    model.cfg.model_name,
    "nlp",  # Or whichever dataset you want to use
    hook=hooks[0],
    batch_size=32
)


## Test probe on multiple datasets


## Activation measurement comparison

## SAE comparison

In [6]:
## 