In [None]:
# Plot test role probes

In [None]:
library(tidyverse)
library(fs)
library(ggtext)
library(systemfonts)
library(arrow)
library(patchwork)

# Absolute path of the repository checkout; the data paths used later in this
# notebook are all built from it.
ws = '/workspace/deliberative-alignment-jailbreaks'
# Name of the run sub-directory (under outputs/runs/) whose files get loaded.
model_prefix = 'gptoss-20b'

# Load the project's plotting helpers into the session
# (presumably where theme_iclr() comes from -- confirm in r-utils/plots.r).
plot_utils_path = paste0(ws, '/r-utils/plots.r')
source(plot_utils_path)

# Load data

In [None]:
# Directory that holds every output file of the selected model run.
run_dir = str_glue("{ws}/experiments/agent-injections/outputs/runs/{model_prefix}")

# Prompt outputs, minus every column whose name starts with 'react_loop'.
# starts_with() is used directly: all_of() is meant for external vectors of
# names/positions and adds nothing when wrapped around a selection helper.
raw_df =
    read_csv(str_glue("{run_dir}/prompt-outputs.csv")) %>%
    select(-starts_with('react_loop'))

# Contents of token-projections.feather for the same run.
projections_df =
    read_feather(str_glue("{run_dir}/token-projections.feather"))

# NOTE(review): the variable is named "sample level" but is read from
# tokens.feather -- confirm this is the intended file.
sample_level_df =
    read_feather(str_glue("{run_dir}/tokens.feather"))

# Show the prompt table as the cell output.
raw_df

## Plots

In [None]:
# Userness x ASR plot
#
# Bootstrap estimate of the attack success rate (ASR) as a function of userness.
# Each replicate resamples the rows of `raw_df` with replacement, cuts the
# resample into `ngroups` equal-sized bins of `avg_userness` with ntile(), and
# records per bin the share of rows with `attack_attempt == 1`. The figure shows
# the mean over replicates (line + points) and a quantile band (ribbon).

ngroups = 25
n_boot = 1000               # number of bootstrap replicates
band_probs = c(0.05, 0.95)  # lower / upper quantile drawn as the ribbon

# Seed the RNG so the figure is identical on every run (the value is arbitrary).
set.seed(20240101)

userness_x_asr_boot =
    map(seq_len(n_boot), function(b) {
        raw_df %>%
            # slice_sample() replaces the superseded sample_n().
            slice_sample(n = nrow(raw_df), replace = TRUE) %>%
            # Label each bin by its upper percentile edge: k / ngroups.
            mutate(userness_q = ntile(avg_userness, ngroups) / ngroups) %>%
            group_by(userness_q) %>%
            summarize(
                n = n(),
                asr = mean(attack_attempt == 1),
                .groups = 'drop'
            ) %>%
            mutate(b = b)
    }) %>%
    list_rbind()

# Collapse the replicates: one row per userness bin.
userness_x_asr_summary =
    userness_x_asr_boot %>%
    group_by(userness_q) %>%
    summarize(
        # Rows per bin here are bootstrap replicates, not prompts.
        n_replicates = n(),
        asr_mean = mean(asr),
        asr_bot = quantile(asr, band_probs[1]),
        asr_top = quantile(asr, band_probs[2]),
        .groups = 'drop'
    )

userness_x_asr =
    ggplot(userness_x_asr_summary, aes(x = userness_q)) +
    geom_ribbon(aes(ymin = asr_bot, ymax = asr_top), fill = '#b8e6fe', alpha = 0.5) +
    geom_line(aes(y = asr_mean), color = '#00bcff', linewidth = 0.8, alpha = 0.9) +
    geom_point(aes(y = asr_mean), color = '#00bcff', size = 1.5) +
    scale_x_continuous(
        labels = scales::percent_format(accuracy = 1),
        breaks = c(0, .25, .5, .75, 1),
        expand = expansion(mult = c(0.002, 0.003))
    ) +
    scale_y_continuous(
        labels = scales::percent_format(accuracy = 1),
        limits = c(0, 1),
        expand = expansion(mult = c(0, 0.02)),
        breaks = c(0, .25, .5, .75, 1)
    ) +
    labs(
        x = '<b>Userness</b> of injected command, as %ile of attacks',
        y = '<b>Attack success rate</b>'
    ) +
    theme_iclr(base_size = 10.5) +
    theme(
        # element_markdown() renders the <b> tags in the axis titles.
        axis.title.y = ggtext::element_markdown(angle = 90, vjust = 0.5, margin = margin(r = 6)),
        axis.title.x = ggtext::element_markdown(angle = 0, vjust = 0, margin = margin(t = 6)),
        axis.text.x = ggtext::element_markdown(angle = 0, hjust = 0.5, size = rel(0.95), margin = margin(t = 4)),
        plot.margin = margin(t = 0, r = 8, b = 0, l = 0, unit = 'pt')
    )

# Export of the figure at half (3.5in) and full (7.0in) width; currently disabled.
# walk(c(3.5, 7.0), function(width) {

#     ggsave(
#         str_glue('{ws}/experiments/agent-injections/outputs/plots/userness-x-asr{if (width == 3.5) "-half" else ""}.pdf'),
#         plot = userness_x_asr, width = width, height = 3.0, units = 'in', dpi = 300, device = cairo_pdf
#     )
#     ggsave(
#         str_glue('{ws}/experiments/agent-injections/outputs/plots/userness-x-asr{if (width == 3.5) "-half" else ""}.png'),
#         plot = userness_x_asr, width = width, height = 3.0, units = 'in', dpi = 300
#     )
#     ggsave(
#         str_glue('{ws}/docs/userness-x-asr{if (width == 3.5) "-half" else ""}.png'),
#         plot = userness_x_asr, width = width, height = 3.0, units = 'in', dpi = 300
#     )

# })

# Show the figure as the cell output.
userness_x_asr

In [None]:
# Userness x ASR, split by variant role ('user' vs 'tool').
#
# Same bootstrap as the pooled plot, but stratified: rows are resampled with
# replacement within each role, and the userness bins are computed within each
# role, so every facet has its own quantile cut points.

ngroups = 4
n_boot = 1000               # number of bootstrap replicates
band_probs = c(0.05, 0.95)  # lower / upper quantile drawn as the ribbon

# Seed the RNG so the figure is identical on every run (the value is arbitrary).
set.seed(20240101)

# Loop-invariant work: restrict to the two roles and group once, instead of
# repeating the filter inside every replicate.
role_df =
    raw_df %>%
    filter(variant_role %in% c('user', 'tool')) %>%
    group_by(variant_role)

userness_x_asr_by_role_boot =
    map(seq_len(n_boot), function(b) {
        role_df %>%
            # On a grouped data frame this resamples within each role.
            slice_sample(prop = 1, replace = TRUE) %>%
            # Still grouped by role, so ntile() bins within each role.
            mutate(userness_q = ntile(avg_userness, ngroups) / ngroups) %>%
            group_by(variant_role, userness_q) %>%
            summarize(
                n = n(),
                asr = mean(attack_attempt == 1),
                .groups = 'drop'
            ) %>%
            mutate(b = b)
    }) %>%
    list_rbind()

# Collapse the replicates: one row per (role, userness bin).
userness_x_asr_by_role_summary =
    userness_x_asr_by_role_boot %>%
    group_by(variant_role, userness_q) %>%
    summarize(
        # Rows per cell here are bootstrap replicates, not prompts.
        n_replicates = n(),
        asr_mean = mean(asr),
        asr_bot = quantile(asr, band_probs[1]),
        asr_top = quantile(asr, band_probs[2]),
        .groups = 'drop'
    )

ggplot(userness_x_asr_by_role_summary, aes(x = userness_q)) +
    geom_ribbon(aes(ymin = asr_bot, ymax = asr_top), fill = '#b8e6fe', alpha = 0.5) +
    geom_line(aes(y = asr_mean), color = '#00bcff', linewidth = 1, alpha = 0.9) +
    geom_point(aes(y = asr_mean), color = '#00bcff', size = 2) +
    scale_x_continuous(
        labels = scales::percent_format(accuracy = 1),
        breaks = c(0, .25, .5, .75, 1),
        expand = expansion(mult = c(0.002, 0.002))
    ) +
    scale_y_continuous(
        labels = scales::percent_format(accuracy = 1),
        expand = expansion(mult = c(0, 0.03)),
        breaks = c(0, .2, .4, .6, .8, 1)
    ) +
    labs(
        x = '<b>Userness</b>, as %ile of prompt injection attempts',
        y = '<b>Attack success rate</b>'
    ) +
    # One panel per role; each panel gets its own y range.
    facet_wrap(~variant_role, scales = 'free_y') +
    theme_iclr(base_size = 11) +
    theme(
        # element_markdown() renders the <b> tags in the axis titles.
        axis.title.y = ggtext::element_markdown(angle = 90, vjust = 0.5, margin = margin(r = 6)),
        axis.title.x = ggtext::element_markdown(angle = 0, vjust = 0, margin = margin(t = 6)),
        axis.text.x = ggtext::element_markdown(angle = 0, hjust = 0.5, size = rel(0.95), margin = margin(t = 4)),
        plot.margin = margin(t = 0, r = 8, b = 0, l = 0, unit = 'pt')
    )