In [1]:
import os
import sys

os.chdir('/data/kebl6672/dpo-toxic')
sys.path.append('/data/kebl6672/dpo-toxic')

In [2]:
import json
import torch
import numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer
from sklearn.decomposition import TruncatedSVD
# from toxicity.figures.fig_utils import load_hooked
# from transformer_lens import HookedTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
model_name = "gpt2-medium"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Load the model *with* the LM head
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
model.eval() 

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 1024)
    (wpe): Embedding(1024, 1024)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-23): 24 x GPT2Block(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=3072, nx=1024)
          (c_proj): Conv1D(nf=1024, nx=1024)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=4096, nx=1024)
          (c_proj): Conv1D(nf=1024, nx=4096)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1024, out_features=50257, bias=False)
)

#### Compute embedding differences of contrastive pairs

In [5]:
def get_embedding(texts):
    """Compute sentence embeddings by mean pooling over the last hidden states of a GPT-2 LM model."""
    inputs = tokenizer(texts, return_tensors="pt", truncation=True, padding=True, max_length=20)
    inputs = {k: v.to(device) for k, v in inputs.items()}  # Move input tensors to GPU
    
    with torch.no_grad():
        # Ask for hidden states:
        outputs = model(**inputs, output_hidden_states=True)
        
    # outputs.hidden_states is a tuple of length N+1, one for each layer plus the embedding layer
    # The final layer’s hidden state is outputs.hidden_states[-1] with shape (batch_size, seq_len, hidden_dim)
    last_hidden_state = outputs.hidden_states[-1]
    
    # Mean-pool over the seq_len dimension
    return last_hidden_state.mean(dim=1)  # shape: (batch_size, hidden_dim)

In [6]:
def load_data(file_path):
    """Load JSONL file containing 'unpert_gen_text' and 'pert_gen_text' pairs."""
    nontoxic_texts = []
    toxic_texts = []
    with open(file_path, "r") as f:
        for line in f:
            entry = json.loads(line)
            nontoxic_texts.append(entry["unpert_gen_text"])
            toxic_texts.append(entry["pert_gen_text"])
    return nontoxic_texts, toxic_texts


In [7]:
def compute_differences(files, batch_size=16):
    """Compute embedding differences for all dataset splits in batches."""
    differences = []
    
    for file in files:
        nontoxic_texts, toxic_texts = load_data(file)  # Unpack correctly
        
        # Process in batches
        for i in range(0, len(nontoxic_texts), batch_size):
            batch_nt = nontoxic_texts[i : i + batch_size]
            batch_tox = toxic_texts[i : i + batch_size]

            # Get batch embeddings
            emb_nt = get_embedding(batch_nt)  # Non-toxic embeddings (B, D)
            emb_tox = get_embedding(batch_tox)  # Toxic embeddings (B, D)

            # Compute differences
            diff_batch = (emb_tox - emb_nt).cpu().numpy()  # Move back to CPU for SVD
            differences.append(diff_batch)

    return np.vstack(differences)  # Shape: (num_pairs, embedding_dim)

In [8]:
# List of dataset splits
file_paths = [f"/data/kebl6672/dpo-toxic-general/data/toxicity_pairwise/split_{i}.jsonl" for i in range(6)]

# Compute embedding differences
diff_matrix = compute_differences(file_paths)

print("Difference matrix shape:", diff_matrix.shape)  # Should be (num_pairs, embedding_dim)

Difference matrix shape: (24576, 1024)


#### SVD on embedding differences

In [9]:
# Apply SVD
k = 10  # Number of singular components

# Initialize Truncated SVD
tsvd = TruncatedSVD(n_components=k, random_state=42)

# Fit and transform the data
X_reduced = tsvd.fit_transform(diff_matrix)

# The top-k right singular vectors (shape: (k, 1024))
top_k_right_singular_vectors = tsvd.components_

# The top-k singular values (length: k)
singular_values = tsvd.singular_values_

print("Reduced data shape:", X_reduced.shape)  
# -> (24576, 10)  each original row is now projected into 10-D space

print("Right singular vectors shape:", top_k_right_singular_vectors.shape)  
# -> (10, 1024)  each row is one singular vector in the 1024-D embedding space

print("Singular values shape:", singular_values.shape)  
# -> (10,)

Reduced data shape: (24576, 10)
Right singular vectors shape: (10, 1024)
Singular values shape: (10,)


#### Logit lens of singular vectors

In [11]:
# logitlens
# Function to project a vector to vocab space and get top 10 tokens
def get_top_tokens(vector, model, tokenizer, top_k=10):
    # 1) For GPT2LMHeadModel, the matrix is at model.lm_head.weight (vocab_size, hidden_dim)
    W_U = model.lm_head.weight  

    # 2) Make sure vector is shape (1, hidden_dim)
    if vector.dim() == 1:
        vector = vector.unsqueeze(0)

    # 3) Multiply by W_U.T => shape (1, vocab_size), then squeeze => (vocab_size,)
    vocab_projection = torch.matmul(vector, W_U.T).squeeze(0)

    # 4) Softmax over vocab dimension
    probs = torch.softmax(vocab_projection, dim=-1)

    # 5) Top-k tokens and probabilities
    top_probs, top_indices = torch.topk(probs, top_k)

    # 6) Decode tokens
    top_tokens = [tokenizer.decode([idx]) for idx in top_indices]

    return list(zip(top_tokens, top_probs.tolist()))

In [12]:
top_vector_np = top_k_right_singular_vectors[0]  # shape (1024,)
top_vector = torch.tensor(top_vector_np, dtype=torch.float, device=device)

In [13]:
# Get top 10 tokens of the most toxic value vector
top_10_tokens = get_top_tokens(top_vector, model, tokenizer, top_k=10)

# Print results
for token, prob in top_10_tokens:
    print(f"Token: {token}, Prob: {prob}")

Token: ーテ, Prob: 2.0236982891219668e-05
Token: acebook, Prob: 2.0228531866450794e-05
Token: emetery, Prob: 2.021005639107898e-05
Token: lehem, Prob: 2.0208868590998463e-05
Token:  cumbers, Prob: 2.0199749997118488e-05
Token: apego, Prob: 2.0192746887914836e-05
Token: irlf, Prob: 2.0192155716358684e-05
Token: senal, Prob: 2.019090788962785e-05
Token: ��, Prob: 2.0187400878057815e-05
Token: osponsors, Prob: 2.0183249944238923e-05


In [35]:
top_vector_np = top_k_right_singular_vectors[1]  # shape (1024,)
top_vector = torch.tensor(top_vector_np, dtype=torch.float, device=device)

In [36]:
# Get top 10 tokens of the most toxic value vector
top_10_tokens = get_top_tokens(top_vector, model, tokenizer, top_k=10)

# Print results
for token, prob in top_10_tokens:
    print(f"Token: {token}, Prob: {prob}")

Token:  mathemat, Prob: 2.3665372282266617e-05
Token:  neighb, Prob: 2.3351165509666316e-05
Token:  trave, Prob: 2.3207629055832513e-05
Token:  streng, Prob: 2.3130147383199073e-05
Token:  nodd, Prob: 2.310434138053097e-05
Token:  surpr, Prob: 2.293787656526547e-05
Token:  traged, Prob: 2.2835989511804655e-05
Token:  predec, Prob: 2.2809592337580398e-05
Token:  notor, Prob: 2.273381505801808e-05
Token:  psychiat, Prob: 2.2726871975464746e-05
