In [None]:
# Plot test role probes

In [None]:
library(tidyverse)
library(fs)
# library(ggtext)
library(systemfonts)
library(arrow)

# Workspace root and the model tag used to locate this model's artifacts.
ws = '/workspace/deliberative-alignment-jailbreaks'
model_prefix = 'gptoss20'

# Pull in the shared plotting helpers stored under the workspace root.
source(str_c(ws, '/r-utils/plots.r'))

# Load data

In [None]:
# Directory with this model's classified red-team responses.
base_path = file.path(ws, 'experiments/da-role-analysis/activations-redteam', model_prefix)

# Per-prompt metadata; trim_ws = FALSE keeps field whitespace exactly as stored.
prompts_df =
    file.path(base_path, 'base-harmful-responses-classified.csv') %>%
    read_csv(trim_ws = FALSE) %>%
    select(
        redteam_prompt_ix,
        harmful_question_ix, # harmful_question, harmful_question_category,
        qualifier_type, policy_style, synthetic_policy,
        output_class
    )

print(prompts_df)

# Role-probe projections for this model (feather file).
raw_projections_df =
    file.path(
        ws,
        str_glue('experiments/da-role-analysis/projections/redteam-role-projections-{model_prefix}.feather')
    ) %>%
    read_feather()

# Lookup table that is joined to the projections on probe_ix further down.
probe_mapping_df =
    file.path(
        ws,
        str_glue('experiments/da-role-analysis/projections/redteam-role-probe-mapping-{model_prefix}.csv')
    ) %>%
    read_csv()

head(raw_projections_df, 5)

In [None]:
## CHOOSE TEST LAYERS FOR PROBES

# OR GROUP BY COTNESS

# Analysis - redteam projections

In [None]:
# Pick the primary layer and role set, then look up the matching probe index.
print(probe_mapping_df)

test_layer_ix = 12
test_roles = 'assistant-cot,assistant-final,system,user'

test_probe_ix =
    probe_mapping_df %>%
    filter(layer_ix == test_layer_ix & roles == test_roles) %>%
    pull(probe_ix)
test_probe_ix

In [None]:
# Attach probe metadata (all layers, but only probes whose roles == test_roles)
# and per-prompt labels to the raw projections.
roles_df =
    raw_projections_df %>%
    inner_join(filter(probe_mapping_df, roles == test_roles), by = 'probe_ix') %>%
    inner_join(
        prompts_df %>%
            select(redteam_prompt_ix, qualifier_type, policy_style, output_class) %>%
            # REDIRECTION is folded into REFUSAL for everything built on roles_df.
            mutate(output_class = ifelse(output_class == 'REDIRECTION', 'REFUSAL', output_class)),
        by = 'redteam_prompt_ix'
    )

head(roles_df, 5)

In [None]:
# Sanity check: for forged CoT, compare assistant-cot probability between the
# 'base' and 'destyled' policy styles, per layer and output class.
roles_df %>%
    filter(
        policy_style %in% c('base', 'destyled'),
        role == 'user' & base_message_type == 'forged_cot', # forged CoT only
        role_space == 'assistant-cot' # projection in assistant-cot space
    ) %>%
    group_by(layer_ix, policy_style, output_class) %>%
    summarize(
        n_toks = n(), # row count; presumably one row per token
        n_prompts = n_distinct(redteam_prompt_ix),
        median_cotness = median(prob),
        tail_cotness = last(prob), # last row of the group, in current row order
        .groups = 'drop'
    )

In [None]:
# Preview: spread the 'user' and 'assistant-cot' probabilities into columns
# and overwrite prob with their log difference (assistant-cot minus user).
roles_df %>%
    filter(
        policy_style %in% c('base', 'destyled'),
        role == 'user' & base_message_type == 'forged_cot', # forged CoT only
        role_space %in% c('user', 'assistant-cot')
    ) %>%
    pivot_wider(names_from = role_space, values_from = prob) %>%
    mutate(prob = log(`assistant-cot`) - log(`user`))

In [None]:
# Bootstrap the attack success rate (ASR) of forged-CoT prompts across
# quantile bins of their mean assistant-cot probability ("cotness").
ngroups = 25 # number of quantile bins per layer
layers_to_test = c(12)

# One row per (layer, prompt): mean assistant-cot probability over the rows of
# the forged CoT message, joined with the prompt's output class.
cotness_by_prompt =
    roles_df %>%
    filter(., policy_style %in% c('base', 'destyled')) %>%
    filter(., role == 'user' & base_message_type == 'forged_cot') %>% # Get forged CoT only
    filter(., role_space == 'assistant-cot') %>% # Get projection in assistant-cot space
    filter(., layer_ix %in% layers_to_test) %>%
    group_by(layer_ix, redteam_prompt_ix) %>%
    summarize(
        .,
        cotness = mean(prob),
        # cotness = mean(-1 * log10(1 - prob)),
        .groups = 'drop'
    ) %>%
    inner_join(., select(prompts_df, redteam_prompt_ix, output_class), by = 'redteam_prompt_ix')

# 100 bootstrap replicates: resample rows with replacement, re-bin into
# ngroups quantiles within each layer, and compute ASR per bin.
map(1:100, .progress = TRUE, function(b) {

    cotness_by_prompt_samples = sample_n(cotness_by_prompt, nrow(cotness_by_prompt), replace = TRUE)

    cotness_by_prompt_samples %>%
        group_by(layer_ix) %>%
        mutate(
            cot_q = ntile(cotness, ngroups),
            cot_q = factor(cot_q, levels = seq_len(ngroups), labels = seq_len(ngroups))
        ) %>%
        group_by(., layer_ix, cot_q) %>%
        summarize(
            .,
            n = n(),
            asr = sum(ifelse(output_class == 'HARMFUL_RESPONSE', 1, 0))/n(),
            .groups = 'drop'
        ) %>%
        mutate(., b = b)
    }) %>%
    list_rbind() %>%
    group_by(layer_ix, cot_q) %>%
    summarize(
        .,
        n_prompts = n(), # number of replicates in which this bin appears
        asr_mean = mean(asr),
        asr_bot = quantile(asr, 0.05),
        asr_top = quantile(asr, 0.95),
        .groups = 'drop'
    ) %>%
    mutate(., layer_ix = as.factor(layer_ix)) %>%
    ggplot() +
    geom_ribbon(aes(x = as.integer(cot_q), ymin = asr_bot, ymax = asr_top, fill = layer_ix), alpha = 0.5) +
    geom_line(aes(x = as.integer(cot_q), y = asr_mean, color = layer_ix), linewidth = 1) +
    geom_point(aes(x = as.integer(cot_q), y = asr_mean, color = layer_ix), size = 2) +
    scale_x_continuous(
        # Bin index -> quantile fraction (bin ngroups maps to 1.00).
        labels = \(x) round(x * 100/ngroups)/100
    ) +
    labs(
        x = 'CoTness %ile',
        y = ''
    ) +
    # The `+` above is required; without it the theme is evaluated as a
    # separate expression and never applied to the plot.
    theme_iclr(base_size = 11)

In [None]:
# Bootstrap ASR per layer against the forged-CoT "cotness" margin, where the
# margin is log(assistant-cot prob) - log(user prob), averaged per prompt and
# binned into fixed-width steps of 0.5 (not quantiles).
ngroups = 20 # not used by the fixed-width binning in this cell
layers_to_test = c(4, 12, 20)

cotness_by_prompt =
    roles_df %>%
    filter(., policy_style %in% c('base', 'destyled')) %>%
    filter(., role == 'user' & base_message_type == 'forged_cot') %>% # Get forged CoT only
    filter(., role_space %in% c('user', 'assistant-cot')) %>%
    pivot_wider(., names_from = role_space, values_from = prob) %>%
    mutate(., prob = log(`assistant-cot`) - log(`user`)) %>%
    filter(., layer_ix %in% layers_to_test) %>%
    group_by(layer_ix, redteam_prompt_ix) %>%
    summarize(
        .,
        cotness = mean(prob),
        # cotness = mean(-1 * log10(1 - prob)),
        .groups = 'drop'
    ) %>%
    inner_join(., select(prompts_df, redteam_prompt_ix, output_class), by = 'redteam_prompt_ix')

map(1:100, .progress = TRUE, function(b) {

    cotness_by_prompt_samples = sample_n(cotness_by_prompt, nrow(cotness_by_prompt), replace = TRUE)

    cotness_by_prompt_samples %>%
        group_by(layer_ix) %>%
        mutate(
            cot_q = round(cotness * 2)/2 # snap to the nearest 0.5
            # cot_q = ntile(cotness, ngroups),
            # cot_q = factor(cot_q, levels = seq_len(ngroups), labels = seq_len(ngroups))
        ) %>%
        group_by(., layer_ix, cot_q) %>%
        summarize(
            .,
            n = n(),
            asr = sum(ifelse(output_class == 'HARMFUL_RESPONSE', 1, 0))/n(),
            .groups = 'drop'
        ) %>%
        filter(., n >= 5) %>% # drop sparsely populated bins within a replicate
        mutate(., b = b)
    }) %>%
    list_rbind() %>%
    group_by(layer_ix, cot_q) %>%
    summarize(
        .,
        n_prompts = n(), # number of replicates in which this bin survived
        asr_mean = mean(asr),
        asr_bot = quantile(asr, 0.05),
        asr_top = quantile(asr, 0.95),
        .groups = 'drop'
    ) %>%
    filter(., n_prompts >= 20) %>% # keep bins present in at least 20 replicates
    mutate(., layer_ix = as.factor(layer_ix)) %>%
    ggplot() +
    geom_ribbon(aes(x = (cot_q), ymin = asr_bot, ymax = asr_top, fill = layer_ix), alpha = 0.5, lineend = 'round', linejoin = 'round', linemitre = 2) +
    geom_line(aes(x = (cot_q), y = asr_mean, color = layer_ix), linewidth = 1) +
    geom_point(aes(x = (cot_q), y = asr_mean, color = layer_ix), size = 2) +
    scale_x_continuous() +
    # scale_x_continuous(
    #     labels = \(x) (x * ngroups)/100
    # ) +
    labs(
        # x is the binned log margin itself, not a percentile.
        x = 'CoTness margin (log assistant-cot prob - log user prob)',
        y = ''
    ) +
    facet_grid(cols = vars(layer_ix), scales = 'free_x') +
    theme_iclr(base_size = 11)

In [None]:
# Per-layer bootstrap of ASR against quantile bins of the raw probability
# margin (assistant-cot minus user) measured on forged CoT messages.
ngroups = 25
layers_to_test = c(4, 12, 20)

cotness_by_prompt =
    roles_df %>%
    filter(
        policy_style %in% c('base', 'destyled'),
        role == 'user' & base_message_type == 'forged_cot', # forged CoT only
        role_space %in% c('user', 'assistant-cot')
    ) %>%
    pivot_wider(names_from = role_space, values_from = prob) %>%
    mutate(prob = (`assistant-cot`) - (`user`)) %>%
    filter(layer_ix %in% layers_to_test) %>%
    group_by(layer_ix, redteam_prompt_ix) %>%
    summarize(cotness = mean(prob), .groups = 'drop') %>%
    inner_join(select(prompts_df, redteam_prompt_ix, output_class), by = 'redteam_prompt_ix')

# Each replicate resamples rows with replacement, re-bins within layer and
# records the share of HARMFUL_RESPONSE rows per bin.
map(1:100, \(b) {
    resampled = sample_n(cotness_by_prompt, nrow(cotness_by_prompt), replace = TRUE)

    resampled %>%
        group_by(layer_ix) %>%
        mutate(cot_q = factor(ntile(cotness, ngroups), levels = seq_len(ngroups))) %>%
        group_by(layer_ix, cot_q) %>%
        summarize(
            n = n(),
            asr = sum(output_class == 'HARMFUL_RESPONSE') / n(),
            .groups = 'drop'
        ) %>%
        mutate(b = b)
}, .progress = TRUE) %>%
    list_rbind() %>%
    group_by(layer_ix, cot_q) %>%
    summarize(
        n_prompts = n(), # replicates in which the bin appears
        asr_mean = mean(asr),
        asr_bot = quantile(asr, 0.05),
        asr_top = quantile(asr, 0.95),
        .groups = 'drop'
    ) %>%
    filter(n_prompts >= 20) %>%
    mutate(layer_ix = as.factor(layer_ix)) %>%
    ggplot(aes(x = as.integer(cot_q))) +
    geom_ribbon(
        aes(ymin = asr_bot, ymax = asr_top, fill = layer_ix),
        alpha = 0.5, lineend = 'round', linejoin = 'round', linemitre = 2
    ) +
    geom_line(aes(y = asr_mean, color = layer_ix), linewidth = 1) +
    geom_point(aes(y = asr_mean, color = layer_ix), size = 2) +
    # Bin index -> quantile fraction, rounded to two decimals.
    scale_x_continuous(labels = \(x) round(x * 100/ngroups)/100) +
    labs(
        x = 'CoTness %ile of Forged CoT',
        y = 'Attack Success Rate'
    ) +
    facet_grid(cols = vars(layer_ix), scales = 'free_x') +
    theme_iclr(base_size = 11)

In [None]:
ngroups = 25

# A single bootstrap replicate, kept at top level so the table can be inspected.
cotness_by_prompt_samples = sample_n(cotness_by_prompt, nrow(cotness_by_prompt), replace = TRUE)

cotness_by_prompt_samples %>%
    group_by(layer_ix) %>%
    mutate(cot_q = factor(ntile(cotness, ngroups), levels = seq_len(ngroups))) %>%
    group_by(layer_ix, cot_q) %>%
    summarize(
        n = n(),
        asr = sum(output_class == 'HARMFUL_RESPONSE') / n(),
        .groups = 'drop'
    )

In [None]:
# Disabled variant: builds cotness_by_prompt from the assistant-cot
# probability with no layers_to_test filter. Kept for reference only.
# cotness_by_prompt =
#     roles_df %>%
#     filter(., policy_style %in% c('base', 'destyled')) %>%
#     filter(., role == 'user' & base_message_type == 'forged_cot') %>% # Get forged CoT only
#     filter(., role_space == 'assistant-cot') %>% # Get projection in assistant-cot space
#     group_by(layer_ix, redteam_prompt_ix) %>%
#     summarize(
#         .,
#         cotness = mean(prob),
#         # cotness = mean(-1 * log10(1 - prob)),
#         .groups = 'drop'
#     ) %>%
#     inner_join(., select(prompts_df, redteam_prompt_ix, output_class), by = 'redteam_prompt_ix') 

# Show whichever cotness_by_prompt table is currently defined in the session.
cotness_by_prompt

In [None]:
# Row-level (rather than prompt-level) bootstrap of ASR across quantile bins
# of the assistant-cot probability on forged CoT messages.
# NOTE(review): there is no layer filter or grouping here, so rows from every
# layer in roles_df are pooled -- confirm this is intended.
ngroups = 20

cotness_by_token =
    roles_df %>%
    filter(., policy_style %in% c('base', 'destyled')) %>%
    filter(., role == 'user' & base_message_type == 'forged_cot') %>% # Get forged CoT only
    filter(., role_space == 'assistant-cot') %>% # Get projection in assistant-cot space
    mutate(
        .,
        cotness = (prob),
    )

map(1:20, function(b) {

    cotness_by_token_samples = sample_n(cotness_by_token, nrow(cotness_by_token), replace = TRUE)

    cotness_by_token_samples %>%
        mutate(
            cot_q = ntile(cotness, ngroups),
            cot_q = factor(cot_q, levels = seq_len(ngroups), labels = seq_len(ngroups))
        ) %>%
        group_by(., cot_q) %>%
        summarize(
            .,
            n = n(),
            asr = sum(ifelse(output_class == 'HARMFUL_RESPONSE', 1, 0))/n(),
            .groups = 'drop'
        ) %>%
        mutate(., b = b)
    }) %>%
    list_rbind() %>%
    group_by(cot_q) %>%
    summarize(
        .,
        n_prompts = n(), # number of replicates in which this bin appears
        asr_mean = mean(asr),
        asr_bot = quantile(asr, 0.05),
        asr_top = quantile(asr, 0.95),
        .groups = 'drop'
    ) %>%
    ggplot() +
    geom_ribbon(aes(x = as.integer(cot_q), ymin = asr_bot, ymax = asr_top), fill = 'lightblue', alpha = 0.5) +
    # linewidth replaces the deprecated `size` argument for lines.
    geom_line(aes(x = as.integer(cot_q), y = asr_mean), color = 'blue', linewidth = 1) +
    geom_point(aes(x = as.integer(cot_q), y = asr_mean), color = 'blue', size = 2) +
    theme_iclr(base_size = 11)

In [None]:
# A single row-level bootstrap replicate, binned into ngroups quantiles.
cotness_by_token_samples = sample_n(cotness_by_token, nrow(cotness_by_token), replace = TRUE)

cotness_by_token_samples %>%
    mutate(cot_q = factor(ntile(cotness, ngroups), levels = seq_len(ngroups))) %>%
    group_by(cot_q) %>%
    summarize(
        n = n(),
        asr = sum(output_class == 'HARMFUL_RESPONSE') / n(),
        .groups = 'drop'
    )



In [None]:

# Prompt-level bootstrap of ASR against -log10(1 - mean assistant-cot prob),
# binned to one decimal place. This overwrites cotness_by_prompt.
# NOTE(review): grouping is by redteam_prompt_ix only, so rows from every
# layer in roles_df are averaged together -- confirm this is intended.
cotness_by_prompt =
    roles_df %>%
    filter(., policy_style %in% c('base', 'destyled')) %>%
    filter(., role == 'user' & base_message_type == 'forged_cot') %>% # Get forged CoT only
    filter(., role_space == 'assistant-cot') %>% # Get projection in assistant-cot space
    group_by(redteam_prompt_ix) %>%
    summarize(
        .,
        cotness = -1 * log10(1 - mean(prob)),
        .groups = 'drop'
    ) %>%
    inner_join(., select(prompts_df, redteam_prompt_ix, output_class), by = 'redteam_prompt_ix')

map(1:1000, function(b) {

    cotness_by_prompt_samples = sample_n(cotness_by_prompt, nrow(cotness_by_prompt), replace = TRUE)

    cotness_by_prompt_samples %>%
        mutate(
            cot_q = round(cotness * 10)/10 # snap to the nearest 0.1
        ) %>%
        group_by(., cot_q) %>%
        summarize(
            .,
            n = n(),
            asr = sum(ifelse(output_class == 'HARMFUL_RESPONSE', 1, 0))/n(),
            .groups = 'drop'
        ) %>%
        mutate(., b = b)
    }) %>%
    list_rbind() %>%
    group_by(cot_q) %>%
    summarize(
        .,
        n_prompts = n(), # number of replicates in which this bin appears
        asr_mean = mean(asr),
        asr_bot = quantile(asr, 0.05),
        asr_top = quantile(asr, 0.95),
        .groups = 'drop'
    ) %>%
    ggplot() +
    geom_ribbon(aes(x = (cot_q), ymin = asr_bot, ymax = asr_top), fill = 'lightblue', alpha = 0.5) +
    # linewidth replaces the deprecated `size` argument for lines.
    geom_line(aes(x = (cot_q), y = asr_mean), color = 'blue', linewidth = 1) +
    geom_point(aes(x = (cot_q), y = asr_mean), color = 'blue', size = 2) +
    theme_iclr(base_size = 11)

In [None]:
# Row-level bootstrap of ASR against -log(1 - assistant-cot prob), binned in
# steps of 0.5. This overwrites cotness_by_token.
# NOTE(review): no layer filter or grouping, so every layer in roles_df is
# pooled -- confirm this is intended.
cotness_by_token =
    roles_df %>%
    filter(., policy_style %in% c('base', 'destyled')) %>%
    filter(., role == 'user' & base_message_type == 'forged_cot') %>% # Get forged CoT only
    filter(., role_space == 'assistant-cot') %>% # Get projection in assistant-cot space
    mutate(
        .,
        cotness = -1 * log(1 - prob),
    )

map(1:1000, function(b) {

    cotness_by_token_samples = sample_n(cotness_by_token, nrow(cotness_by_token), replace = TRUE)

    cotness_by_token_samples %>%
        mutate(
            cot_q = round(cotness * 2)/2 # snap to the nearest 0.5
        ) %>%
        group_by(., cot_q) %>%
        summarize(
            .,
            n = n(),
            asr = sum(ifelse(output_class == 'HARMFUL_RESPONSE', 1, 0))/n(),
            .groups = 'drop'
        ) %>%
        mutate(., b = b)
    }) %>%
    list_rbind() %>%
    group_by(cot_q) %>%
    summarize(
        .,
        n_prompts = n(), # number of replicates in which this bin appears
        asr_mean = mean(asr),
        asr_bot = quantile(asr, 0.05),
        asr_top = quantile(asr, 0.95),
        .groups = 'drop'
    ) %>%
    ggplot() +
    geom_ribbon(aes(x = (cot_q), ymin = asr_bot, ymax = asr_top), fill = 'lightblue', alpha = 0.5) +
    # linewidth replaces the deprecated `size` argument for lines.
    geom_line(aes(x = (cot_q), y = asr_mean), color = 'blue', linewidth = 1) +
    geom_point(aes(x = (cot_q), y = asr_mean), color = 'blue', size = 2) +
    theme_iclr(base_size = 11)

In [None]:
# Debug table: bin the current top-level replicate by exp(cotness), rounded
# to the nearest 0.1, and report row count and ASR per bin.
cotness_by_prompt_samples %>%
    mutate(cot_q = round(exp(cotness) * 10)/10) %>%
    group_by(cot_q) %>%
    summarize(
        n = n(),
        asr = sum(output_class == 'HARMFUL_RESPONSE') / n(),
        .groups = 'drop'
    )


In [None]:
# Show the current per-prompt cotness table for inspection.
cotness_by_prompt

In [None]:
    # Preview the replicate with cotness rounded to one decimal place.
    mutate(cotness_by_prompt_samples, cot_q = round(cotness, 1))

In [None]:
# Bootstrap (10 replicates) the harmful-response rate of the lowest cotness
# quantile bin only. Uses the ngroups and cotness_by_prompt values currently
# defined in the session.
map(1:10, function(b) {

    cotness_by_prompt_samples =
        sample_n(cotness_by_prompt, nrow(cotness_by_prompt), replace = TRUE)

    cotness_by_prompt_samples %>%
        mutate(
            cot_q = ntile(cotness, ngroups),
            cot_q = factor(cot_q, levels = seq_len(ngroups), labels = seq_len(ngroups))
        ) %>%
        group_by(., cot_q) %>%
        summarize(
            .,
            n = n(),
            rte = sum(ifelse(output_class == 'HARMFUL_RESPONSE', 1, 0))/n(),
            .groups = 'drop'
        ) %>%
        mutate(., b = b)
    }) %>%
    list_rbind() %>%
    filter(., cot_q == 1)  %>% # keep only the lowest bin
    group_by(cot_q) %>%
    summarize(
        .,
        n_prompts = n(), # number of replicates contributing to the bin
        rte_mean = mean(rte),
        rte_bot = quantile(rte, .05),
        # Upper bound is the 95th percentile; this was .095, which sat next
        # to the lower bound.
        rte_top = quantile(rte, .95),
        .groups = 'drop'
    )

In [None]:
     # A single bootstrap replicate binned into ngroups quantiles, reporting
     # the row count and harmful-response rate per bin.
     cotness_by_prompt_samples =
        sample_n(cotness_by_prompt, nrow(cotness_by_prompt), replace = TRUE)

     cotness_by_prompt_samples %>%
        mutate(
            cot_q = ntile(cotness, ngroups),
            cot_q = factor(cot_q, levels = seq_len(ngroups), labels = seq_len(ngroups))
        ) %>%
        group_by(., cot_q) %>%
        summarize(
            .,
            n = n(),
            rte = sum(ifelse(output_class == 'HARMFUL_RESPONSE', 1, 0))/n(),
            .groups = 'drop'
        ) # the trailing pipe was removed: it left the expression incomplete

# Analysis - test projections