In [None]:
# Plot test role probes

In [None]:
library(tidyverse)
library(fs)
# library(ggtext)
library(systemfonts)

# Workspace root and the model prefix used to locate experiment outputs
ws = '/workspace/deliberative-alignment-jailbreaks'
model_prefix = 'gptoss20'

# Load the shared plotting utilities (theme_iclr, used by the plots in this
# notebook, is presumably defined there - confirm)
source(file.path(ws, 'r-utils/plots.r'))

# Load data

In [None]:
# Directory with the red-team activation outputs for the selected model
base_path = file.path(
    ws, 'experiments/da-role-analysis/activations-redteam', model_prefix
)

# Classified responses: keep only the prompt/question indices, the policy
# attributes and the output class. Whitespace in fields is preserved as-is.
prompts_df =
    file.path(base_path, 'base-harmful-responses-classified.csv') %>%
    read_csv(trim_ws = FALSE) %>%
    select(
        redteam_prompt_ix,
        harmful_question_ix, # harmful_question, harmful_question_category,
        qualifier_type, policy_style, synthetic_policy,
        output_class
    )

print(prompts_df)

# Role-space projections, read unmodified
raw_projections_df =
    file.path(base_path, 'role-space-projections.csv') %>%
    read_csv(trim_ws = FALSE)

head(raw_projections_df, 5)

# Analysis - redteam projections

In [None]:
# Attach the prompt metadata to every projection row (rows without a matching
# redteam_prompt_ix are dropped by the inner join), then fold the REDIRECTION
# output class into REFUSAL.
roles_df =
    inner_join(raw_projections_df, prompts_df, by = 'redteam_prompt_ix') %>%
    mutate(
        output_class = ifelse(output_class == 'REDIRECTION', 'REFUSAL', output_class)
    )

head(roles_df, 5)

In [None]:
# Sanity check for CoT forgery: the styled ('base') policy should look more
# "assistant-cot-like" than the destyled one.
# Rows kept: base/destyled policies, forged-CoT tokens in the user role,
# projected onto the assistant-cot space.
roles_df %>%
    filter(
        policy_style %in% c('base', 'destyled'),
        role == 'user', base_message_type == 'forged_cot',
        role_space == 'assistant-cot'
    ) %>%
    group_by(policy_style, output_class) %>%
    summarize(
        n_toks = n(),                               # rows (tokens) in the group
        n_prompts = n_distinct(redteam_prompt_ix),  # distinct prompts in the group
        median_cotness = median(prob),
        tail_cotness = tail(prob, 1),               # prob of the group's last row
        .groups = 'drop'
    )

In [None]:

# Prompt-level bootstrap: share of HARMFUL_RESPONSE outputs per cotness
# quantile bin, plotted with a 5%-95% percentile band.
ngroups = 20

# One row per red-team prompt: mean assistant-cot probability over its
# forged-CoT tokens (user role, base/destyled policies), plus the output class.
cotness_by_prompt =
    roles_df %>%
    filter(
        policy_style %in% c('base', 'destyled'),
        role == 'user', base_message_type == 'forged_cot', # Get forged CoT only
        role_space == 'assistant-cot' # Get projection in assistant-cot space
    ) %>%
    group_by(redteam_prompt_ix) %>%
    summarize(
        cotness = mean(prob),
        # cotness = mean(-1 * log10(1 - prob)),
        .groups = 'drop'
    ) %>%
    inner_join(select(prompts_df, redteam_prompt_ix, output_class), by = 'redteam_prompt_ix')

# Resample prompts with replacement 100 times. Within each replicate, split the
# prompts into `ngroups` roughly equal-sized cotness bins and record the
# harmful-response rate of every bin.
map(seq_len(100), function(b) {

    cotness_by_prompt %>%
        sample_n(nrow(cotness_by_prompt), replace = TRUE) %>%
        mutate(
            cot_q = factor(ntile(cotness, ngroups), levels = seq_len(ngroups), labels = seq_len(ngroups))
        ) %>%
        group_by(cot_q) %>%
        summarize(
            n = n(),
            asr = mean(output_class == 'HARMFUL_RESPONSE'),
            .groups = 'drop'
        ) %>%
        mutate(b = b)
    }) %>%
    list_rbind() %>%
    # Collapse across replicates: mean rate and percentile band per bin
    group_by(cot_q) %>%
    summarize(
        n_boot = n(), # replicates contributing to this bin (not a prompt count)
        asr_mean = mean(asr),
        asr_bot = quantile(asr, 0.05),
        asr_top = quantile(asr, 0.95),
        .groups = 'drop'
    ) %>%
    ggplot(aes(x = as.integer(cot_q))) +
    geom_ribbon(aes(ymin = asr_bot, ymax = asr_top), fill = 'lightblue', alpha = 0.5) +
    # `linewidth` replaces the deprecated `size` argument for line geoms
    geom_line(aes(y = asr_mean), color = 'blue', linewidth = 1) +
    geom_point(aes(y = asr_mean), color = 'blue', size = 2) +
    theme_iclr(base_size = 11)

In [None]:
# Token-level bootstrap: share of rows whose prompt produced a
# HARMFUL_RESPONSE, per cotness quantile bin, with a 5%-95% percentile band.
ngroups = 20

# One row per forged-CoT token (user role, base/destyled policies) projected
# onto the assistant-cot space; cotness is the raw probability.
cotness_by_token =
    roles_df %>%
    filter(
        policy_style %in% c('base', 'destyled'),
        role == 'user', base_message_type == 'forged_cot', # Get forged CoT only
        role_space == 'assistant-cot' # Get projection in assistant-cot space
    ) %>%
    mutate(cotness = prob)

# NOTE(review): rows are resampled individually, but output_class was joined
# per redteam_prompt_ix, so rows from the same prompt share an outcome and the
# band ignores that clustering - confirm this is acceptable.
map(seq_len(20), function(b) {

    cotness_by_token %>%
        sample_n(nrow(cotness_by_token), replace = TRUE) %>%
        mutate(
            cot_q = factor(ntile(cotness, ngroups), levels = seq_len(ngroups), labels = seq_len(ngroups))
        ) %>%
        group_by(cot_q) %>%
        summarize(
            n = n(),
            asr = mean(output_class == 'HARMFUL_RESPONSE'),
            .groups = 'drop'
        ) %>%
        mutate(b = b)
    }) %>%
    list_rbind() %>%
    # Collapse across replicates: mean rate and percentile band per bin
    group_by(cot_q) %>%
    summarize(
        n_boot = n(), # replicates contributing to this bin (not a prompt count)
        asr_mean = mean(asr),
        asr_bot = quantile(asr, 0.05),
        asr_top = quantile(asr, 0.95),
        .groups = 'drop'
    ) %>%
    ggplot(aes(x = as.integer(cot_q))) +
    geom_ribbon(aes(ymin = asr_bot, ymax = asr_top), fill = 'lightblue', alpha = 0.5) +
    # `linewidth` replaces the deprecated `size` argument for line geoms
    geom_line(aes(y = asr_mean), color = 'blue', linewidth = 1) +
    geom_point(aes(y = asr_mean), color = 'blue', size = 2) +
    theme_iclr(base_size = 11)

In [None]:
# Draw one bootstrap resample of the token-level table and keep it for inspection
cotness_by_token_samples =
    cotness_by_token %>%
    sample_n(nrow(cotness_by_token), replace = TRUE)

# Row count and share of HARMFUL_RESPONSE rows per cotness quantile bin
# for that single resample
cotness_by_token_samples %>%
    mutate(
        cot_q = factor(ntile(cotness, ngroups), levels = seq_len(ngroups), labels = seq_len(ngroups))
    ) %>%
    group_by(cot_q) %>%
    summarize(
        n = n(),
        asr = mean(output_class == 'HARMFUL_RESPONSE'),
        .groups = 'drop'
    )



In [None]:

# Prompt-level bootstrap on a -log10(1 - p) cotness scale, binned to the
# nearest 0.1, plotted with a 5%-95% percentile band.

# One row per red-team prompt: -log10 of one minus the mean assistant-cot
# probability over its forged-CoT tokens, plus the prompt's output class.
# NOTE(review): a prompt whose mean prob is exactly 1 yields cotness = Inf.
cotness_by_prompt =
    roles_df %>%
    filter(
        policy_style %in% c('base', 'destyled'),
        role == 'user', base_message_type == 'forged_cot', # Get forged CoT only
        role_space == 'assistant-cot' # Get projection in assistant-cot space
    ) %>%
    group_by(redteam_prompt_ix) %>%
    summarize(
        cotness = -1 * log10(1 - mean(prob)),
        .groups = 'drop'
    ) %>%
    inner_join(select(prompts_df, redteam_prompt_ix, output_class), by = 'redteam_prompt_ix')

# Resample prompts with replacement 1000 times and compute the
# harmful-response rate of every 0.1-wide cotness bin in each replicate.
map(seq_len(1000), function(b) {

    cotness_by_prompt %>%
        sample_n(nrow(cotness_by_prompt), replace = TRUE) %>%
        mutate(cot_q = round(cotness * 10)/10) %>%
        group_by(cot_q) %>%
        summarize(
            n = n(),
            asr = mean(output_class == 'HARMFUL_RESPONSE'),
            .groups = 'drop'
        ) %>%
        mutate(b = b)
    }) %>%
    list_rbind() %>%
    # Collapse across replicates: mean rate and percentile band per bin
    group_by(cot_q) %>%
    summarize(
        n_boot = n(), # replicates in which this bin occurred (not a prompt count)
        asr_mean = mean(asr),
        asr_bot = quantile(asr, 0.05),
        asr_top = quantile(asr, 0.95),
        .groups = 'drop'
    ) %>%
    ggplot(aes(x = cot_q)) +
    geom_ribbon(aes(ymin = asr_bot, ymax = asr_top), fill = 'lightblue', alpha = 0.5) +
    # `linewidth` replaces the deprecated `size` argument for line geoms
    geom_line(aes(y = asr_mean), color = 'blue', linewidth = 1) +
    geom_point(aes(y = asr_mean), color = 'blue', size = 2) +
    theme_iclr(base_size = 11)

In [None]:
# Token-level bootstrap on a -log(1 - p) cotness scale, binned to the nearest
# 0.5, plotted with a 5%-95% percentile band.

# One row per forged-CoT token (user role, base/destyled policies) projected
# onto the assistant-cot space.
# NOTE(review): this uses the natural log, whereas the per-prompt table uses
# log10 - confirm the difference is intended. A prob of exactly 1 yields Inf.
cotness_by_token =
    roles_df %>%
    filter(
        policy_style %in% c('base', 'destyled'),
        role == 'user', base_message_type == 'forged_cot', # Get forged CoT only
        role_space == 'assistant-cot' # Get projection in assistant-cot space
    ) %>%
    mutate(cotness = -1 * log(1 - prob))

# Resample rows with replacement 1000 times and compute the share of
# HARMFUL_RESPONSE rows in every 0.5-wide cotness bin for each replicate.
map(seq_len(1000), function(b) {

    cotness_by_token %>%
        sample_n(nrow(cotness_by_token), replace = TRUE) %>%
        mutate(cot_q = round(cotness * 2)/2) %>%
        group_by(cot_q) %>%
        summarize(
            n = n(),
            asr = mean(output_class == 'HARMFUL_RESPONSE'),
            .groups = 'drop'
        ) %>%
        mutate(b = b)
    }) %>%
    list_rbind() %>%
    # Collapse across replicates: mean rate and percentile band per bin
    group_by(cot_q) %>%
    summarize(
        n_boot = n(), # replicates in which this bin occurred (not a prompt count)
        asr_mean = mean(asr),
        asr_bot = quantile(asr, 0.05),
        asr_top = quantile(asr, 0.95),
        .groups = 'drop'
    ) %>%
    ggplot(aes(x = cot_q)) +
    geom_ribbon(aes(ymin = asr_bot, ymax = asr_top), fill = 'lightblue', alpha = 0.5) +
    # `linewidth` replaces the deprecated `size` argument for line geoms
    geom_line(aes(y = asr_mean), color = 'blue', linewidth = 1) +
    geom_point(aes(y = asr_mean), color = 'blue', size = 2) +
    theme_iclr(base_size = 11)

In [None]:
# Inspect one bootstrap replicate of the per-prompt table, binned on a
# transformed cotness scale.
# The resample is drawn here so the cell runs top-to-bottom: inside the map()
# callbacks the name cotness_by_prompt_samples is local to the callback and is
# not visible at the top level.
cotness_by_prompt_samples =
    sample_n(cotness_by_prompt, nrow(cotness_by_prompt), replace = TRUE)

cotness_by_prompt_samples %>%
    mutate(
        # NOTE(review): cotness_by_prompt builds cotness with log10, so exp()
        # is not its inverse - confirm the intended transform.
        cot_q = round(exp(cotness) * 10)/10
    ) %>%
    group_by(cot_q) %>%
    summarize(
        n = n(),
        asr = mean(output_class == 'HARMFUL_RESPONSE'),
        .groups = 'drop'
    )


In [None]:
# Display the per-prompt cotness table
cotness_by_prompt

In [None]:
    # Preview the 0.1-rounded cotness bin assigned to each resampled prompt.
    # The resample is drawn here so the cell does not rely on a top-level
    # cotness_by_prompt_samples left over from another cell.
    cotness_by_prompt_samples =
        sample_n(cotness_by_prompt, nrow(cotness_by_prompt), replace = TRUE)

    cotness_by_prompt_samples %>%
        mutate(
            cot_q = round(cotness, 1)
        )

In [None]:
# Bootstrap (10 replicates) of the harmful-response rate in the lowest cotness
# quantile bin (cot_q == 1), summarised as mean and 5%-95% percentile bounds.
map(seq_len(10), function(b) {

    cotness_by_prompt %>%
        sample_n(nrow(cotness_by_prompt), replace = TRUE) %>%
        mutate(
            cot_q = factor(ntile(cotness, ngroups), levels = seq_len(ngroups), labels = seq_len(ngroups))
        ) %>%
        group_by(cot_q) %>%
        summarize(
            n = n(),
            rte = mean(output_class == 'HARMFUL_RESPONSE'),
            .groups = 'drop'
        ) %>%
        mutate(b = b)
    }) %>%
    list_rbind() %>%
    filter(cot_q == 1) %>%
    group_by(cot_q) %>%
    summarize(
        n_boot = n(), # replicates contributing to this bin (not a prompt count)
        rte_mean = mean(rte),
        rte_bot = quantile(rte, 0.05),
        rte_top = quantile(rte, 0.95), # upper bound is the 95th percentile (was mistyped as .095)
        .groups = 'drop'
    )

In [None]:
     # Draw one bootstrap resample of the per-prompt table and keep it at the
     # top level for inspection.
     cotness_by_prompt_samples =
         sample_n(cotness_by_prompt, nrow(cotness_by_prompt), replace = TRUE)

     # Row count and harmful-response rate per cotness quantile bin for that
     # resample. The pipeline ends at summarize(): the original trailing pipe
     # had no right-hand side and made the cell unparseable.
     cotness_by_prompt_samples %>%
         mutate(
             cot_q = factor(ntile(cotness, ngroups), levels = seq_len(ngroups), labels = seq_len(ngroups))
         ) %>%
         group_by(cot_q) %>%
         summarize(
             n = n(),
             rte = mean(output_class == 'HARMFUL_RESPONSE'),
             .groups = 'drop'
         )

# Analysis - test projections