In [None]:
# Plot test role probes

In [None]:
library(tidyverse)
library(fs)
library(ggtext)
library(systemfonts)
library(arrow)
library(patchwork)
# library(sandwich) # GLM
# library(lmtest) # GLM

# Repository root and the model whose run outputs are plotted in this notebook.
ws <- "/workspace/deliberative-alignment-jailbreaks"
model_prefix <- "gptoss-20b"

# Load the shared plotting helpers that live under the repository root.
source(file.path(ws, "r-utils", "plots.r"))

# Load data

In [None]:
# Directory holding this model's agent-injection run outputs; all three input
# files below are read from it.
run_dir <- str_glue("{ws}/experiments/agent-injections/outputs/runs/{model_prefix}")

# Prompt-level outputs, with every column whose name starts with `react_loop`
# dropped. `starts_with()` is already a selection helper, so it is negated
# directly instead of being wrapped in `all_of()` (which is meant for external
# vectors of column names).
raw_df <-
    read_csv(str_glue("{run_dir}/prompt-outputs.csv")) %>%
    select(-starts_with("react_loop"))

# Token-level projections (not used by the cells visible in this notebook chunk).
projections_df <- read_feather(str_glue("{run_dir}/token-projections.feather"))

# Token-level table (not used by the cells visible in this notebook chunk).
sample_level_df <- read_feather(str_glue("{run_dir}/tokens.feather"))

# Display the prompt-level table as the cell output.
raw_df

# Userness x ASR

In [None]:
# Userness x ASR plot
#
# Bootstraps the attack success rate (ASR) within equal-count bins of
# `avg_userness`. Each replicate resamples the rows of `raw_df` with
# replacement, re-bins them into `ngroups` bins, and computes the share of rows
# with `attack_attempt == 1` per bin. The line and points show the mean ASR
# across replicates; the ribbon spans the 5th to 95th percentile of the
# replicate ASRs (a 90% bootstrap band).
# NOTE(review): no RNG seed is set, so the band varies slightly between runs.
# NOTE(review): `theme_iclr()` is not defined in this notebook; presumably it
# comes from the sourced r-utils/plots.r.
ngroups <- 25
n_boot <- 1000

userness_x_asr <-
    map(seq_len(n_boot), function(b) {
        raw_df %>%
            # Resample all rows with replacement (one bootstrap replicate).
            slice_sample(prop = 1, replace = TRUE) %>%
            # Bin index rescaled to (0, 1]: each value is the upper edge of the bin.
            mutate(userness_q = ntile(avg_userness, ngroups) / ngroups) %>%
            group_by(userness_q) %>%
            summarize(
                n = n(),
                asr = mean(attack_attempt == 1),
                .groups = "drop"
            ) %>%
            mutate(b = b)
    }) %>%
    list_rbind() %>%
    group_by(userness_q) %>%
    summarize(
        # Number of bootstrap replicates contributing to this bin.
        n_replicates = n(),
        asr_mean = mean(asr),
        asr_bot = quantile(asr, 0.05),
        asr_top = quantile(asr, 0.95),
        .groups = "drop"
    ) %>%
    ggplot(aes(x = userness_q)) +
    geom_ribbon(aes(ymin = asr_bot, ymax = asr_top), fill = "#b8e6fe", alpha = 0.5) +
    geom_line(aes(y = asr_mean), color = "#00bcff", linewidth = 0.8, alpha = 0.9) +
    geom_point(aes(y = asr_mean), color = "#00bcff", size = 1.5) +
    scale_x_continuous(
        labels = scales::percent_format(accuracy = 1),
        breaks = c(0, .25, .5, .75, 1),
        expand = expansion(mult = c(0.002, 0.003))
    ) +
    scale_y_continuous(
        labels = scales::percent_format(accuracy = 1),
        limits = c(0, 1),
        expand = expansion(mult = c(0, 0.02)),
        breaks = c(0, .25, .5, .75, 1)
    ) +
    labs(
        x = "<b>Userness</b> of injected command, as %ile of attacks",
        y = "<b>Attack success rate</b>"
    ) +
    theme_iclr(base_size = 10.5) +
    theme(
        # Axis titles contain <b> tags, so they are rendered with ggtext.
        axis.title.y = ggtext::element_markdown(angle = 90, vjust = 0.5, margin = margin(r = 6)),
        axis.title.x = ggtext::element_markdown(angle = 0, vjust = 0, margin = margin(t = 6)),
        axis.text.x = ggtext::element_markdown(angle = 0, hjust = 0.5, size = rel(0.95), margin = margin(t = 4)),
        plot.margin = margin(t = 0, r = 8, b = 0, l = 0, unit = "pt")
    )

# Saving is currently disabled; when enabled it writes half-width (3.5in) and
# full-width (7.0in) copies of the figure.
# walk(c(3.5, 7.0), function(width) {

#     ggsave(
#         str_glue('{ws}/experiments/agent-injections/outputs/plots/userness-x-asr{if (width == 3.5) "-half" else ""}.pdf'),
#         plot = userness_x_asr, width = width, height = 3.0, units = 'in', dpi = 300, device = cairo_pdf
#     )
#     ggsave(
#         str_glue('{ws}/experiments/agent-injections/outputs/plots/userness-x-asr{if (width == 3.5) "-half" else ""}.png'),
#         plot = userness_x_asr, width = width, height = 3.0, units = 'in', dpi = 300
#     )
#     ggsave(
#         str_glue('{ws}/docs/userness-x-asr{if (width == 3.5) "-half" else ""}.png'),
#         plot = userness_x_asr, width = width, height = 3.0, units = 'in', dpi = 300
#     )

# })

# Display the figure as the cell output.
userness_x_asr

In [None]:
# Same but ICML
#
# Compact variant of the userness-vs-ASR figure: identical bootstrap, smaller
# axis titles and margins, saved at 3.75in x 1.5in. Each replicate resamples
# the rows of `raw_df` with replacement, bins them into `ngroups` equal-count
# bins of `avg_userness`, and computes the share of rows with
# `attack_attempt == 1` per bin. The ribbon spans the 5th to 95th percentile of
# the replicate ASRs (a 90% bootstrap band).
# NOTE(review): no RNG seed is set, so the saved figure varies slightly between runs.
# NOTE(review): this still uses `theme_iclr()`, which is not defined in this
# notebook; presumably it comes from the sourced r-utils/plots.r.
ngroups <- 25
n_boot <- 1000

userness_x_asr <-
    map(seq_len(n_boot), function(b) {
        raw_df %>%
            # Resample all rows with replacement (one bootstrap replicate).
            slice_sample(prop = 1, replace = TRUE) %>%
            # Bin index rescaled to (0, 1]: each value is the upper edge of the bin.
            mutate(userness_q = ntile(avg_userness, ngroups) / ngroups) %>%
            group_by(userness_q) %>%
            summarize(
                n = n(),
                asr = mean(attack_attempt == 1),
                .groups = "drop"
            ) %>%
            mutate(b = b)
    }) %>%
    list_rbind() %>%
    group_by(userness_q) %>%
    summarize(
        # Number of bootstrap replicates contributing to this bin.
        n_replicates = n(),
        asr_mean = mean(asr),
        asr_bot = quantile(asr, 0.05),
        asr_top = quantile(asr, 0.95),
        .groups = "drop"
    ) %>%
    ggplot(aes(x = userness_q)) +
    geom_ribbon(aes(ymin = asr_bot, ymax = asr_top), fill = "#b8e6fe", alpha = 0.5) +
    geom_line(aes(y = asr_mean), color = "#00bcff", linewidth = 0.8, alpha = 0.9) +
    geom_point(aes(y = asr_mean), color = "#00bcff", size = 1.5) +
    scale_x_continuous(
        labels = scales::percent_format(accuracy = 1),
        breaks = c(0, .25, .5, .75, 1),
        expand = expansion(mult = c(0.002, 0.003))
    ) +
    scale_y_continuous(
        labels = scales::percent_format(accuracy = 1),
        limits = c(0, 1),
        expand = expansion(mult = c(0, 0.02)),
        breaks = c(0, .25, .5, .75, 1)
    ) +
    labs(
        x = "<b>Userness</b> of injected command, as %ile of attacks",
        y = "<b>Attack success rate</b>"
    ) +
    theme_iclr(base_size = 10.5) +
    theme(
        # Axis titles contain <b> tags, so they are rendered with ggtext.
        axis.title.y = ggtext::element_markdown(size = 8, angle = 90, vjust = 0.5, margin = margin(r = 4)),
        axis.title.x = ggtext::element_markdown(size = 8, angle = 0, vjust = 0, margin = margin(t = 4)),
        axis.text.x = ggtext::element_markdown(angle = 0, hjust = 0.5, size = rel(0.95), margin = margin(t = 3)),
        plot.margin = margin(t = 0, r = 4, b = 0, l = 0, unit = "pt")
    )

# Write the figure as PDF and PNG under the experiment's plots directory, plus
# a PNG copy under docs/. All three use the same dimensions.
ggsave(
    str_glue("{ws}/experiments/agent-injections/outputs/plots/userness-x-asr-2.pdf"),
    plot = userness_x_asr, width = 3.75, height = 1.5, units = "in", dpi = 300, device = cairo_pdf
)
ggsave(
    str_glue("{ws}/experiments/agent-injections/outputs/plots/userness-x-asr-2.png"),
    plot = userness_x_asr, width = 3.75, height = 1.5, units = "in", dpi = 300
)
ggsave(
    str_glue("{ws}/docs/userness-x-asr-2.png"),
    plot = userness_x_asr, width = 3.75, height = 1.5, units = "in", dpi = 300
)

# Display the figure as the cell output.
userness_x_asr

# Userness x ASR by Subgroup

In [None]:
# Grouped bar plot
#
# Bootstrapped attack success rate (ASR) by quartile bin, faceted by
# `variant_role` (assistant / user / tool only). Each replicate resamples rows
# with replacement *within* each role, bins them into `ngroups` equal-count
# bins within the role, and computes the share of rows with
# `attack_attempt == 1`. Bars show the mean ASR across replicates; error bars
# span the 5th to 95th percentile of the replicate ASRs.
# NOTE(review): the bins are ranked by `avg_userness / avg_toolness`, while the
# axis label says only "Userness" -- confirm which quantity is intended.
# NOTE(review): no RNG seed is set, so the error bars vary slightly between runs.
ngroups <- 4
n_boot <- 2000

map(seq_len(n_boot), function(b) {
    raw_df %>%
        filter(variant_role %in% c("assistant", "user", "tool")) %>%
        # Stratified resample: draw as many rows as each role has, with replacement.
        group_by(variant_role) %>%
        slice_sample(prop = 1, replace = TRUE) %>%
        # Still grouped by role here, so bins are computed within each role.
        # The bin index is rescaled to (0, 1] (upper edge of the bin).
        mutate(userness_q = ntile(avg_userness / avg_toolness, ngroups) / ngroups) %>%
        ungroup() %>%
        group_by(variant_role, userness_q) %>%
        summarize(
            n = n(),
            asr = mean(attack_attempt == 1),
            .groups = "drop"
        ) %>%
        mutate(b = b)
}) %>%
    list_rbind() %>%
    group_by(variant_role, userness_q) %>%
    summarize(
        # Number of bootstrap replicates contributing to this role/bin.
        n_replicates = n(),
        asr_mean = mean(asr),
        asr_bot = quantile(asr, 0.05),
        asr_top = quantile(asr, 0.95),
        .groups = "drop"
    ) %>%
    ggplot(aes(x = userness_q)) +
    geom_col(aes(y = asr_mean), width = 0.20, fill = "#b8e6fe", alpha = 0.7) +
    geom_errorbar(aes(ymin = asr_bot, ymax = asr_top), width = 0.06, colour = "#00bcff", linewidth = 0.8) +
    scale_x_continuous(
        labels = scales::percent_format(accuracy = 1),
        breaks = c(0, .25, .5, .75, 1),
        expand = expansion(mult = c(0.002, 0.002))
    ) +
    # The stray trailing comma (an empty positional argument) has been removed.
    scale_y_continuous(
        labels = scales::percent_format(accuracy = 1),
        expand = expansion(mult = c(0, 0.03))
    ) +
    labs(
        x = "<b>Userness</b>, as %ile within group",
        y = "<b>Attack success rate</b>"
    ) +
    facet_wrap(~variant_role, scales = "fixed") +
    theme_iclr(base_size = 11) +
    theme(
        # Axis titles contain <b> tags, so they are rendered with ggtext.
        axis.title.y = ggtext::element_markdown(angle = 90, vjust = 0.5, margin = margin(r = 6)),
        axis.title.x = ggtext::element_markdown(angle = 0, vjust = 0, margin = margin(t = 6)),
        axis.text.x = ggtext::element_markdown(angle = 0, hjust = 0.5, size = rel(0.95), margin = margin(t = 4)),
        plot.margin = margin(t = 0, r = 8, b = 0, l = 0, unit = "pt")
    )

In [None]:
# Logistic Regression
#
# Fits attack_attempt ~ variant_role + avg_userness on the assistant / user /
# tool rows of `raw_df`, then reports coefficient tests using a covariance
# matrix clustered on `variant_template`.
glm_df <-
    raw_df %>%
    filter(variant_role %in% c("assistant", "user", "tool")) %>%
    mutate(
        # Clustering variable for the covariance estimate below.
        variant_ix = as.factor(variant_template),
        variant_role = factor(variant_role),
        avg_userness = as.numeric(avg_userness) # should be 0..1
    )

m1 <- glm(attack_attempt ~ variant_role + avg_userness, family = binomial(), data = glm_df)

# `library(sandwich)` and `library(lmtest)` are commented out at the top of the
# notebook, so the functions are called through their namespaces; otherwise
# this cell fails with "could not find function".
vc <- sandwich::vcovCL(m1, cluster = glm_df$variant_ix)
lmtest::coeftest(m1, vcov. = vc)

# Sample size and number of distinct templates (i.e. clusters) in the fit data.
print(nrow(glm_df))
print(length(unique(glm_df$variant_template)))

In [None]:
# Rank strip plot: every row of `raw_df` is ordered by descending
# `avg_userness` and drawn as a vertical tick at its relative rank, coloured by
# claimed role, with one panel per variant family.
ranking_df <-
    raw_df %>%
    # filter(., variant_role %in% c('user', 'assistant', 'tool')) %>%
    arrange(desc(avg_userness)) %>%
    mutate(
        # Rows without a role are recoded to 'other', the level that carries the
        # 'Control' label below. (They were previously recoded to 'control',
        # which is not a factor level, so they turned back into NA and lost
        # their legend entry and colour.)
        variant_role = ifelse(is.na(variant_role), "other", variant_role),
        variant_role = factor(
            variant_role,
            levels = c("user", "assistant", "tool", "other"),
            labels = c("User", "Assistant", "Tool", "Control")
        ),
        # Rows with a non-missing `variant_model` form the chat-template panel;
        # all remaining rows form the second panel.
        variant_model = ifelse(is.na(variant_model), "claim", "chat_template"),
        variant_model = factor(
            variant_model,
            levels = c("chat_template", "claim"),
            labels = c("Foreign chat templates", "Others (Explicit role declarations, format variations, controls)")
        )
    ) %>%
    # Rank within each panel; rows are already sorted by descending userness,
    # so rank 1 is the highest-userness row of that panel.
    group_by(variant_model) %>%
    mutate(ord_rank = row_number(), ranking = row_number() / n()) %>%
    ungroup() %>%
    select(ord_rank, ranking, variant_role, variant_model)

# Half-height of each tick; the y axis is clamped to exactly [-h, h].
h <- 0.04

rank_plot <-
    ranking_df %>%
    ggplot(aes(ranking, 0, color = variant_role)) +
    geom_linerange(aes(ymin = -h, ymax = h), linewidth = 1.2, alpha = 0.9) +
    # The x title is set once here (it was previously duplicated in labs()).
    scale_x_continuous("← Higher Userness | Lower Userness →", breaks = NULL, expand = expansion(mult = c(0, 0))) +
    scale_y_continuous(NULL, breaks = NULL, limits = c(-h, h), expand = c(0, 0)) +
    labs(color = "Claimed Role") +
    facet_wrap(vars(variant_model), ncol = 1) +
    scale_color_manual(
        values = c(
            "User" = "#00bcff", # sky
            "Assistant" = "#00d4bfc4",  # emerald
            "Tool" = "#c397ea",
            "Control" = "#C0C2C9"
        )
    ) +
    theme_iclr(base_size = 11) +
    theme(
        aspect.ratio = 0.12,
        # Strip away all panel and axis decoration so that only the ticks remain.
        panel.border = element_blank(),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        axis.title.x = ggtext::element_markdown(margin = margin(t = 8), hjust = 0.5),
        axis.title.y = element_blank(),
        axis.text.x  = element_blank(),
        axis.text.y  = element_blank(),
        axis.ticks.x = element_blank(),
        axis.ticks.y = element_blank(),
        axis.line    = element_blank(),
        legend.position = "bottom",
        legend.direction = "horizontal",
        legend.box.spacing = unit(0, "pt"),
        legend.margin = margin(t = 24, b = 0),
        legend.key.height = unit(6, "pt"),
        legend.key.width  = unit(14, "pt"),
        plot.margin = margin(0, 0, 0, 0, unit = "pt"),
        strip.placement = "outside",
        panel.spacing.y = unit(1.0, "lines"),
        strip.text = element_text(margin = margin(b = 8))  # increase b to taste
    ) +
    # Thicker, fully opaque legend keys so the colours stay readable.
    guides(colour = guide_legend(nrow = 1, override.aes = list(linewidth = 2.0, alpha = 1)))


# Write the figure as PDF and PNG under the experiment's plots directory, plus
# a PNG copy under docs/. All three use the same 7.0in x 2.5in size (the docs
# copy previously used `height = 2.`, inconsistent with the other outputs).
ggsave(
    str_glue("{ws}/experiments/agent-injections/outputs/plots/userness-rankings.pdf"),
    plot = rank_plot, width = 7.0, height = 2.5, units = "in", dpi = 300, device = cairo_pdf
)
ggsave(
    str_glue("{ws}/experiments/agent-injections/outputs/plots/userness-rankings.png"),
    plot = rank_plot, width = 7.0, height = 2.5, units = "in", dpi = 300
)
ggsave(
    str_glue("{ws}/docs/userness-rankings.png"),
    plot = rank_plot, width = 7.0, height = 2.5, units = "in", dpi = 300
)

# Display the figure as the cell output.
rank_plot
