## Import dataset

In [None]:
from utils import create_dataloaders

# Build the train/test dataloaders for every dataset in one call.
# The settings are collected in a dict so they can be read at a glance.
dataloader_config = dict(
    batch_size=32,
    train_split=0.9,
    shuffle=True,
    num_workers=4,
    seed=42,
)
dataloaders = create_dataloaders(**dataloader_config)

# Report split sizes and show one example per dataset
for dataset_name, loaders in dataloaders.items():
    train_loader, test_loader = loaders['train'], loaders['test']

    # Sizes come from the datasets wrapped by each loader
    train_samples, test_samples = len(train_loader.dataset), len(test_loader.dataset)
    total_samples = train_samples + test_samples

    print(f"\nDataset: {dataset_name}")
    print(f"Total samples: {total_samples}")
    print(f"Training samples: {train_samples}")
    print(f"Test samples: {test_samples}")

    # Pull the first training batch and display its first element
    batch = next(iter(train_loader))
    print("\nExample sample:")
    print(f"Question: {batch['question'][0]}")
    print(f"Sycophantic answer: {batch['sycophantic_answer'][0]}")
    print(f"Non-sycophantic answer: {batch['non_sycophantic_answer'][0]}")

In [5]:
# Import required libraries
from transformer_lens import HookedTransformer, ActivationCache
import torch
from typing import Dict, List, Tuple

# Load the Gemma model
# Choose the best available device. The previous expression fell back to
# 'mps' whenever CUDA was missing, which fails on machines that have neither
# CUDA nor Apple MPS; those now fall back to the CPU.
_mps_backend = getattr(torch.backends, "mps", None)  # absent on older torch builds
if torch.cuda.is_available():
    device = 'cuda'
elif _mps_backend is not None and _mps_backend.is_available():
    device = 'mps'
else:
    device = 'cpu'

# Use float16 on accelerators to save memory. On CPU fall back to float32,
# since half precision is not reliably supported there (NOTE(review): confirm
# against the installed torch / transformer_lens versions).
dtype = torch.float32 if device == 'cpu' else torch.float16

model = HookedTransformer.from_pretrained(
    "gemma-2b-it",
    device=device,
    dtype=dtype,
)

def get_resid_stream_activations(
    model: HookedTransformer,
    prompt: str
) -> Tuple[torch.Tensor, ActivationCache]:
    """
    Run `prompt` through `model` while caching residual-stream hooks.

    Three hook names are requested for every layer index in
    ``range(model.cfg.n_layers)``: ``hook_resid_pre``, ``hook_resid_mid``
    and ``hook_resid_post``. Only these names are passed as the cache filter.

    Args:
        model: Model exposing ``cfg.n_layers`` and ``run_with_cache``.
        prompt: Input passed unchanged to ``model.run_with_cache``.

    Returns:
        tuple: (logits, cache) as produced by ``model.run_with_cache``
    """
    # Hook names in layer order: pre, mid, post for each block
    hook_names = [
        hook_name
        for layer_idx in range(model.cfg.n_layers)
        for hook_name in (
            f'blocks.{layer_idx}.hook_resid_pre',   # Before attention
            f'blocks.{layer_idx}.hook_resid_mid',   # After attention, before MLP
            f'blocks.{layer_idx}.hook_resid_post',  # After MLP
        )
    ]

    # Forward pass; only the hooks listed above are cached
    logits, cache = model.run_with_cache(prompt, names_filter=hook_names)
    return logits, cache


# Smoke test: run one short prompt through the model and list what was cached.
prompt = "What is machine learning?"
logits, cache = get_resid_stream_activations(model, prompt)

# Plain string literals where nothing is interpolated (the stray f-prefix
# was a no-op); printed output is unchanged.
print("Model loaded successfully!")
print(f"Number of layers: {model.cfg.n_layers}")
print("Available activation keys:", cache.cache_dict.keys())

KeyboardInterrupt: 

## Hook transformer


## Collect Activations


## Train Probe


## Test probe on multiple datasets


## Activation measuring comparison

## SAE comparison

In [None]:
## 