In [1]:

import os
# Export TOKENIZERS_PARALLELISM=true for this process before any later cell runs.
# NOTE(review): presumably consumed by the Hugging Face `tokenizers` library; the
# dataloaders created below use num_workers=4 — confirm this combination is intended.
os.environ.update({"TOKENIZERS_PARALLELISM": "true"})


## Import dataset

In [None]:
from dataset import create_dataloaders

# Build the train/test dataloaders for every dataset returned by `create_dataloaders`.
# The result is a mapping: dataset name -> {'train': loader, 'test': loader}.
dataloaders = create_dataloaders(
    batch_size=32,
    train_split=0.9,
    shuffle=True,
    num_workers=4,
    seed=42,
    max_samples=100,
)

# Report the split sizes of each dataset and show one example from its training split.
for dataset_name, loaders in dataloaders.items():
    train_loader = loaders['train']
    n_train = len(train_loader.dataset)
    n_test = len(loaders['test'].dataset)

    print(f"\nDataset: {dataset_name}")
    print(f"Total samples: {n_train + n_test}")
    print(f"Training samples: {n_train}")
    print(f"Test samples: {n_test}")

    # Pull a single batch from the training loader and display its first element.
    example_batch = next(iter(train_loader))
    print("\nExample sample:")
    print(f"Question: {example_batch['question'][0]}")
    print(f"Sycophantic answer: {example_batch['sycophantic_answer'][0]}")
    print(f"Non-sycophantic answer: {example_batch['non_sycophantic_answer'][0]}")

## Hook transformer


In [None]:
from transformer_lens import HookedTransformer
import torch

# Select the compute device: CUDA if present, otherwise Apple MPS if present,
# otherwise CPU. Previously 'mps' was chosen unconditionally whenever CUDA was
# missing, which breaks on machines that have neither accelerator.
if torch.cuda.is_available():
    device = torch.device('cuda')
elif torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')
# Load the Gemma checkpoint ("gemma-2b-it") as a HookedTransformer placed on `device`.
# float32 is used deliberately: float16 is faster but less accurate.
model = HookedTransformer.from_pretrained("gemma-2b-it", device=device, dtype=torch.float32)


## Collect Activations


In [None]:
from activations import collect_and_save_activations, load_activation_dataloaders

# Directory passed to `collect_and_save_activations` as the save location.
# NOTE(review): the leading dot makes this the hidden directory `.data/`; confirm
# that is intended rather than `./data/activations`.
save_dir = ".data/activations"

# Hook the residual stream after each block (`hook_resid_post`, i.e. after the MLP)
# for layer 15 through the last layer of the model.
# Other residual hook points, currently not collected:
#   blocks.{layer}.hook_resid_pre  (before attention)
#   blocks.{layer}.hook_resid_mid  (after attention, before MLP)
hooks = [f'blocks.{layer}.hook_resid_post' for layer in range(15, model.cfg.n_layers)]
print(len(hooks))

# Collect and save activations for each dataset: 'mixed' first, then 'politics'.
for activation_dataset in ('mixed', 'politics'):
    split_loaders = dataloaders[activation_dataset]
    collect_and_save_activations(
        model=model,
        train_dataloader=split_loaders['train'],
        test_dataloader=split_loaders['test'],
        hooks=hooks,
        save_dir=save_dir,
        dataset_name=activation_dataset,
        print_outputs=False,
    )


## Train Probe


In [None]:
from probe import train_probe, evaluate_probe

# For every hook point: train a probe on the saved 'mixed' activations, then
# evaluate it on the 'mixed' test split and on the 'politics' test split.
for hook in hooks:
    # Train on the 'mixed' dataset (NLP and philosophy).
    mixed_activation_loaders = load_activation_dataloaders(
        save_dir,
        model.cfg.model_name,
        "mixed",
        hook=hook,
        batch_size=32
    )

    # Size the probe from the loaded model's hidden dimension instead of a
    # hard-coded 2048, so it stays correct if a different checkpoint is loaded.
    probe, losses = train_probe(
        mixed_activation_loaders['train'],
        input_dim=model.cfg.d_model,
        device=device
    )

    # Evaluate on the held-out 'mixed' split (NLP and philosophy).
    mixed_results = evaluate_probe(probe, mixed_activation_loaders['test'], device)

    # Evaluate the same probe on the 'politics' test split.
    politics_activation_loaders = load_activation_dataloaders(
        save_dir,
        model.cfg.model_name,
        "politics",
        hook=hook,
        batch_size=32
    )
    politics_results = evaluate_probe(probe, politics_activation_loaders['test'], device)

    print("\nTest Results:")
    print(f"Hook: {hook}")
    print(f"Accuracy: {mixed_results['accuracy']:.2%}")
    print(f"Total samples: {mixed_results['total_samples']}")
    print(f"Pol Accuracy: {politics_results['accuracy']:.2%}")
    print(f"Pol Total samples: {politics_results['total_samples']}")
    print('-'*100)


## Test probe on multiple datasets


## Activation measuring comparison

## SAE comparison

In [6]:
## 