In [1]:
from src.das_helpers import *
from src import *

import itertools
import functools
import os

import matplotlib.pyplot as plt
import numpy as np
import plotly.graph_objects as go
from tqdm import tqdm
from sklearn.linear_model import LogisticRegression

from src import *

In [2]:
# llama_size = "7b"
llama_size = "13b"
use_base_model = True

dataset_path = "data/simple_toxic_data_filtered.jsonl" #"data/simple_toxic_data_filtered.jsonl"

In [3]:
personas = {
    "lenient": "You are somewhat lenient on toxicity\n",
    "harsh": "You are somewhat harsh on toxicity\n"
}

possible_labels = ("CLEAN", "TOXIC")

# classifier_prompt = """
# You must classify the toxicity of text, returning either (TOXIC) or (CLEAN) for any sequence I give you. 

# Here is the text:
# {sequence}

# Your response:
# ("""
classifier_prompt = """
You must classify the toxicity of text, returning either (CLEAN) or (TOXIC) for any sequence I give you. 

Here is the text:
{sequence}

Your response:
("""

In [4]:
if not use_base_model:
    model_name_or_path = f"meta-llama/Llama-2-{llama_size}-chat-hf"
else:
    model_name_or_path = f"meta-llama/Llama-2-{llama_size}-hf"

hf_model, hf_tokenizer = load_model_from_transformers(model_name_or_path)
model = from_hf_to_tlens(hf_model, hf_tokenizer, f"llama-{llama_size}")

if use_base_model:
    model.cfg.model_name = model.cfg.model_name + "_base"

our_task = Task(model, classifier_prompt, personas, possible_labels)
contrast_dataset = ConstrastTriplesDataset(model, our_task, dataset_path)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
print(f'{contrast_dataset.samples["clean"][0]}\n\n\n{contrast_dataset.samples["persona_diff"][0]}\n\n\n{contrast_dataset.samples["seq_diff"][0]}')

You are somewhat harsh on toxicity

You must classify the toxicity of text, returning either (CLEAN) or (TOXIC) for any sequence I give you. 

Here is the text:
That prosecutor is a piece of shit because he sucks cock.

Your response:
(


You are somewhat lenient on toxicity

You must classify the toxicity of text, returning either (CLEAN) or (TOXIC) for any sequence I give you. 

Here is the text:
That prosecutor is a piece of shit because he sucks cock.

Your response:
(


You are somewhat harsh on toxicity

You must classify the toxicity of text, returning either (CLEAN) or (TOXIC) for any sequence I give you. 

Here is the text:
This painting is horrible because it has no intrinsic composition

Your response:
(


### Train DAS for Toxicity Score

In [None]:
batch_size=64
acc_step_batch_size=8
n_epochs=50
learning_rate=2e-3
subspace_dim=1
layer=25
 
train_size = int(0.8 * len(contrast_dataset))  # set 80% for training
test_size = len(contrast_dataset) - train_size # 20% for testing

train_dataset, test_dataset = torch.utils.data.random_split(contrast_dataset, [train_size, test_size])

# Create data loaders for the training and testing datasets
train_dataloader = DataLoader(train_dataset, batch_size=acc_step_batch_size, shuffle=True, drop_last=True)
test_dataloader = DataLoader(test_dataset, batch_size=acc_step_batch_size, shuffle=True, drop_last=True)

train_dataloader = itertools.cycle(train_dataloader)
test_dataloader = itertools.cycle(test_dataloader)

toxicity_score = train_linear_rep(
    model=model,
    train_dataloader=train_dataloader,
    test_dataloader=test_dataloader,
    n_dim=subspace_dim,
    learning_rate=learning_rate,
    layer=layer,
    invariant_seq=False,
    invariant_persona=True,
    n_epochs=n_epochs,
    acc_step_batch_size=acc_step_batch_size
    acc_iters=batch_size//acc_step_batch_size,
)


NameError: name 'acc_step_batch_size' is not defined

### Train DAS For Persona Rep.


In [None]:
batch_size=64
acc_step_batch_size=8
n_epochs=50
learning_rate=5e-2
subspace_dim=1
layer=25
 
train_size = int(0.8 * len(contrast_dataset))  # set 80% for training
test_size = len(contrast_dataset) - train_size # 20% for testing

train_dataset, test_dataset = torch.utils.data.random_split(contrast_dataset, [train_size, test_size])

# Create data loaders for the training and testing datasets
train_dataloader = DataLoader(train_dataset, batch_size=acc_step_batch_size, shuffle=True, drop_last=True)
test_dataloader = DataLoader(test_dataset, batch_size=acc_step_batch_size, shuffle=True, drop_last=True)

train_dataloader = itertools.cycle(train_dataloader)
test_dataloader = itertools.cycle(test_dataloader)

persona_rep = train_linear_rep(
    model=model,
    train_dataloader=train_dataloader,
    test_dataloader=test_dataloader,
    n_dim=subspace_dim,
    learning_rate=learning_rate,
    layer=layer,
    invariant_seq=True,
    invariant_persona=False,
    n_epochs=n_epochs,
    acc_step_batch_size=acc_step_batch_size
    acc_iters=batch_size//acc_step_batch_size,
)


### Train DAS for Judgement

In [None]:
batch_size=64
acc_step_batch_size=8
n_epochs=50
learning_rate=5e-2
subspace_dim=1
layer=25
 
train_size = int(0.8 * len(contrast_dataset))  # set 80% for training
test_size = len(contrast_dataset) - train_size # 20% for testing

train_dataset, test_dataset = torch.utils.data.random_split(contrast_dataset, [train_size, test_size])

# Create data loaders for the training and testing datasets
train_dataloader = DataLoader(train_dataset, batch_size=acc_step_batch_size, shuffle=True, drop_last=True)
test_dataloader = DataLoader(test_dataset, batch_size=acc_step_batch_size, shuffle=True, drop_last=True)

train_dataloader = itertools.cycle(train_dataloader)
test_dataloader = itertools.cycle(test_dataloader)

persona_rep = train_linear_rep(
    model=model,
    train_dataloader=train_dataloader,
    test_dataloader=test_dataloader,
    n_dim=subspace_dim,
    learning_rate=learning_rate,
    layer=layer,
    invariant_seq=False,
    invariant_persona=False,
    n_epochs=n_epochs,
    acc_step_batch_size=acc_step_batch_size
    acc_iters=batch_size//acc_step_batch_size,
)


### Do Representation Meiosis Plot

In [None]:
#### Fine-grained structure of the universe