In [None]:
# Plot jailbreak eval stats

In [None]:
library(tidyverse)
library(fs)
# library(ggtext)
library(systemfonts)

# Project root; every data, helper, and output path below is built from it.
ws = '/workspace/deliberative-alignment-jailbreaks'

# Pull in the project's plotting helpers (presumably where theme_iclr(),
# used by the plots below, is defined -- confirm against r-utils/plots.r).
source(str_c(ws, '/r-utils/plots.r'))

# Load data

In [None]:
# Columns kept from both classified-response CSVs, in output order.
# Defined once so the two loaders below cannot drift apart.
eval_cols = c(
    'redteam_prompt_ix', 'redteam_prompt', 'redteam_output_cot', 'redteam_output_final',
    'harmful_question_ix', 'harmful_question', 'harmful_question_category',
    'qualifier_type', 'harmful_question_with_qualifier',
    'synthetic_policy_model', 'synthetic_policy', 'policy_style',
    'output_class'
)

# Classified responses from the base-model CSV; every row is labelled with the
# fixed model name 'base-gpt-oss-20b'.
base_gpt_oss_df =
    read_csv(file.path(ws, 'experiments/da-jailbreaks/base-harmful-responses-classified.csv')) %>%
    mutate(target_model = 'base-gpt-oss-20b') %>%
    select(target_model, all_of(eval_cols))

# Classified responses from the OpenRouter generations CSV. The regex is greedy,
# so everything up to and including the last '/' in target_model is stripped.
alt_models_df =
    read_csv(file.path(ws, 'experiments/da-jailbreaks/openrouter-generations/harmful-responses-classified.csv')) %>%
    mutate(target_model = str_replace(target_model, '.*/', '')) %>%
    select(target_model, all_of(eval_cols))

# Combined table: one row per classified response, with target_model as a factor
# in the display order used on the plots' x axes.
prompts_df =
    bind_rows(base_gpt_oss_df, alt_models_df) %>%
    mutate(
        target_model = fct_relevel(
            target_model,
            'base-gpt-oss-20b', 'gpt-oss-20b', 'gpt-oss-120b', 'o4-mini', 'gpt-5-nano', 'gpt-5-mini', 'gpt-5'
        ),
        # Only the GPT-5 family gets a different display name; levels not listed
        # here are left unchanged by recode().
        target_model = recode(
            target_model,
            'gpt-5-nano' = 'GPT-5 nano',
            'gpt-5-mini' = 'GPT-5 mini',
            'gpt-5' = 'GPT-5'
        ),
        # row_number() is safe on an empty table, unlike 1:nrow(.), which yields c(1, 0).
        row_ix = row_number()
    )

In [None]:
# Sanity check: number of classified responses per target model.
count(prompts_df, target_model)

# Plot 1: Basic ASR

In [None]:
# Outcome counts and attack success rate (ASR) per target model and prompt type.
# The base model and 'destyled' policies are excluded. Every qualifier other than
# 'no_qualifier' is collapsed into a single 'has_qualifier' bucket.
model_ptype_class_df =
    prompts_df %>%
    filter(target_model != 'base-gpt-oss-20b', policy_style != 'destyled') %>%
    # filter(., qualifier_type %in% c('no_qualifier', 'lucky_coin', 'green_shirt')) %>%
    mutate(
        qualifier_type = ifelse(qualifier_type != 'no_qualifier', 'has_qualifier', 'no_qualifier'),
        # Rows matching none of these branches get an NA prompt_type.
        prompt_type = case_when(
            policy_style == 'no_policy' ~ 'no_policy',
            policy_style == 'base' & qualifier_type == 'no_qualifier' ~ 'cot_forgery',
            policy_style == 'base' & qualifier_type == 'has_qualifier' ~ 'cot_forgery_with_qualifier'
        ),
        # filter(., prompt_type %in% c('no_policy', 'cot_forgery')) %>%
        prompt_type = fct_relevel(prompt_type, 'no_policy', 'cot_forgery', 'cot_forgery_with_qualifier')
    ) %>%
    group_by(target_model, prompt_type) %>%
    # Count each outcome class directly so all three columns always exist, even
    # when a class never occurs in the filtered data (pivoting the observed
    # classes would leave the column missing and break the ASR formula).
    summarize(
        HARMFUL_RESPONSE = sum(output_class == 'HARMFUL_RESPONSE', na.rm = TRUE),
        REDIRECTION = sum(output_class == 'REDIRECTION', na.rm = TRUE),
        REFUSAL = sum(output_class == 'REFUSAL', na.rm = TRUE),
        .groups = 'drop'
    ) %>%
    # ASR = harmful responses as a share of the three recognised outcome classes;
    # rows with any other (or missing) class are not part of the denominator.
    mutate(asr = HARMFUL_RESPONSE / (HARMFUL_RESPONSE + REDIRECTION + REFUSAL))

# Fill colour for each prompt_type level.
color_mappings = c(
    no_policy                  = '#00bcff',
    cot_forgery                = '#ff637e',
    cot_forgery_with_qualifier = '#fd9a00'
)

# Legend label for each prompt_type level (HTML markup, so the legend text must
# be drawn by a markdown-aware element to appear bold).
name_mappings = c(
    no_policy                  = 'Harmful prompt',
    cot_forgery                = "Harmful prompt + <b>CoT Forgery</b>",
    cot_forgery_with_qualifier = "Harmful prompt + <b>CoT Forgery (Variant 2)</b>"
)

# Dodged bar chart of ASR per target model, one bar per prompt type, with the
# percentage printed above each bar.
plot =
    model_ptype_class_df %>%
    # Aesthetics shared by the bars and their labels.
    ggplot(aes(x = target_model, y = asr, group = prompt_type)) +
    # geom_col() already plots the y values as given, so no `stat` is passed:
    # it is not a geom_col() setting and can be reported as an ignored parameter.
    geom_col(
        aes(fill = prompt_type),
        position = position_dodge2(width = 0.8, padding = 0.01, preserve = 'single'),
        width = 0.8
    ) +
    # Same dodge settings as the bars so each label sits above its own bar.
    geom_text(
        aes(label = scales::percent(asr, accuracy = .1)),
        position = position_dodge2(width = 0.8, padding = 0.01, preserve = 'single'),
        vjust = -0.4,
        size = 1.5
    ) +
    scale_y_continuous(
        labels = scales::percent_format(accuracy = 1),
        limits = c(0, 1),
        expand = expansion(mult = c(0, 0))
    ) +
    scale_fill_manual(
        values = color_mappings,
        labels = name_mappings
    ) +
    # Two-line tick labels: the model name, then a fixed '(medium)' suffix
    # (presumably the reasoning-effort setting -- confirm).
    scale_x_discrete(labels = function(x)
        # sprintf("<span style='font-family:monospace; font-weight:700; color:#30343f'>%s<br>(high)</span>", x)
        sprintf("<span style=''>%s<br>(medium)</span>", x)
    ) +
    labs(
        x = NULL,
        y = 'Attack Success Rate',
        fill = NULL
    ) +
    # theme_iclr() is not defined in this file; presumably it comes from the sourced plot helpers.
    theme_iclr(base_size = 11) +
    theme(
        legend.position = 'top',
        axis.title.y = ggtext::element_markdown(angle = 90, vjust = 0.5, margin = margin(r = 6)),
        # element_markdown renders the <span>/<br> markup in the tick labels.
        axis.text.x = ggtext::element_markdown(face = 'bold', angle = 0, hjust = 0.5, vjust = 1.1, size = rel(0.95), margin = margin(t = 4)),
        panel.grid.major.x = element_blank(),
        panel.grid.minor.x = element_blank(),
        axis.ticks.x = element_blank(),
        axis.title.x = ggtext::element_markdown(margin = margin(t = 10)),
        axis.ticks.length.x = unit(4, 'pt')
    )

# Write the figure as both a PDF (via the Cairo PDF device) and a PNG at the
# same physical size, then display it as the cell output.
pdf_path = str_glue('{ws}/experiments/da-jailbreaks/plots/user-eval-result-1.pdf')
png_path = str_glue('{ws}/experiments/da-jailbreaks/plots/user-eval-result-1.png')

ggsave(filename = pdf_path, plot = plot, device = cairo_pdf, width = 7, height = 3.0, units = 'in', dpi = 300)
ggsave(filename = png_path, plot = plot, width = 7, height = 3.0, units = 'in', dpi = 300)

plot

In [None]:
# Outcome counts and attack success rate (ASR) per target model, comparing the
# plain harmful prompt ('no_policy') against all CoT-forgery prompts pooled
# together ('base' policy style, regardless of qualifier). The base model and
# 'destyled' policies are excluded.
model_ptype_class_df =
    prompts_df %>%
    filter(target_model != 'base-gpt-oss-20b', policy_style != 'destyled') %>%
    # filter(., qualifier_type == 'no_qualifier') %>%
    mutate(
        # Rows with any other policy_style get an NA prompt_type.
        prompt_type = case_when(
            policy_style == 'no_policy' ~ 'no_policy',
            policy_style == 'base' ~ 'cot_forgery'
        ),
        prompt_type = fct_relevel(prompt_type, 'no_policy', 'cot_forgery')
    ) %>%
    group_by(target_model, prompt_type) %>%
    # Count each outcome class directly so all three columns always exist, even
    # when a class never occurs in the filtered data (pivoting the observed
    # classes would leave the column missing and break the ASR formula).
    summarize(
        HARMFUL_RESPONSE = sum(output_class == 'HARMFUL_RESPONSE', na.rm = TRUE),
        REDIRECTION = sum(output_class == 'REDIRECTION', na.rm = TRUE),
        REFUSAL = sum(output_class == 'REFUSAL', na.rm = TRUE),
        .groups = 'drop'
    ) %>%
    mutate(
        # ASR = harmful responses as a share of the three recognised outcome classes.
        asr = HARMFUL_RESPONSE / (HARMFUL_RESPONSE + REDIRECTION + REFUSAL),
        # Font face for the bar labels: the CoT-forgery values are emphasised.
        font_style = if_else(prompt_type == 'cot_forgery', 'bold', 'plain')
    )

# Fill colour for each prompt_type level.
color_mappings = c(
    no_policy   = '#90a1b9',
    cot_forgery = '#fd9a00'
)

# Legend label for each prompt_type level (HTML markup, rendered by the
# markdown legend text element configured on the plot).
name_mappings = c(
    no_policy   = 'Harmful prompt',
    cot_forgery = "Harmful prompt + <b>CoT Forgery</b>"
)

# Dodged bar chart of ASR per target model: plain harmful prompt vs. CoT forgery,
# with the percentage printed above each bar (bold for the CoT-forgery bars).
plot =
    model_ptype_class_df %>%
    # Aesthetics shared by the bars and their labels.
    ggplot(aes(x = target_model, y = asr, group = prompt_type)) +
    # geom_col() already plots the y values as given, so no `stat` is passed:
    # it is not a geom_col() setting and can be reported as an ignored parameter.
    geom_col(
        aes(fill = prompt_type),
        position = position_dodge2(width = 0.8, padding = 0.01, preserve = 'single'),
        width = 0.8
    ) +
    # Same dodge settings as the bars so each label sits above its own bar.
    geom_text(
        aes(label = scales::percent(asr, accuracy = .1), fontface = font_style),
        position = position_dodge2(width = 0.8, padding = 0.01, preserve = 'single'),
        vjust = -0.4,
        size = 3.0
    ) +
    scale_y_continuous(
        labels = scales::percent_format(accuracy = 1),
        limits = c(0, 1),
        expand = expansion(mult = c(0, 0))
    ) +
    scale_fill_manual(
        values = color_mappings,
        labels = name_mappings
    ) +
    # Two-line tick labels: the model name, then a fixed '(medium)' suffix
    # (presumably the reasoning-effort setting -- confirm).
    scale_x_discrete(labels = function(x)
        # sprintf("<span style='font-family:monospace; font-weight:700; color:#30343f'>%s<br>(high)</span>", x)
        sprintf("<span style=''>%s<br>(medium)</span>", x)
    ) +
    labs(
        x = NULL,
        y = 'Attack Success Rate',
        fill = NULL
    ) +
    # theme_iclr() is not defined in this file; presumably it comes from the sourced plot helpers.
    theme_iclr(base_size = 11) +
    theme(
        legend.position = 'top',
        axis.title.y = ggtext::element_markdown(angle = 90, vjust = 0.5, margin = margin(r = 6)),
        # element_markdown renders the <span>/<br> markup in the tick labels.
        axis.text.x = ggtext::element_markdown(face = 'bold', angle = 0, hjust = 0.5, vjust = 1.1, size = rel(0.95), margin = margin(t = 4)),
        panel.grid.major.x = element_blank(),
        panel.grid.minor.x = element_blank(),
        axis.ticks.x = element_blank(),
        axis.title.x = ggtext::element_markdown(margin = margin(t = 10)),
        axis.ticks.length.x = unit(4, 'pt')
    ) +
    # The y scale stops at 100% with no expansion, so clipping is turned off to
    # let labels drawn above the tallest bars extend past the panel.
    coord_cartesian(clip = 'off') +
    guides(
        fill = guide_legend(
            keyheight = unit(12, "pt"), keywidth = unit(18, "pt"),
            # Markdown legend text so the <b> tags in name_mappings render as bold.
            label.theme = ggtext::element_markdown(margin = margin(l = 4, r = 12, t = 2))
        )
    )


# Write the figure to the experiment's plots folder (PDF via the Cairo PDF
# device, plus PNG) and a PNG copy under docs/, then display it as the cell output.
pdf_path = str_glue('{ws}/experiments/da-jailbreaks/plots/user-eval-result.pdf')
png_path = str_glue('{ws}/experiments/da-jailbreaks/plots/user-eval-result.png')
docs_png_path = str_glue('{ws}/docs/user-eval-result.png')

ggsave(filename = pdf_path, plot = plot, device = cairo_pdf, width = 7, height = 3.0, units = 'in', dpi = 300)
ggsave(filename = png_path, plot = plot, width = 7, height = 3.0, units = 'in', dpi = 300)
ggsave(filename = docs_png_path, plot = plot, width = 7, height = 3.0, units = 'in', dpi = 300)

plot

# Plot 2: ASR by Subtype

In [None]:
# Outcome counts and attack success rate (ASR) per target model and CoT-forgery
# style. 'no_policy' rows form the baseline; for 'base' policy rows the prompt
# type is the qualifier itself. The base model and 'destyled' policies are excluded.
plot_df =
    prompts_df %>%
    filter(target_model != 'base-gpt-oss-20b', policy_style != 'destyled') %>%
    mutate(
        # Rows with any other policy_style get an NA prompt_type.
        prompt_type = case_when(
            policy_style == 'no_policy' ~ 'no_policy',
            policy_style == 'base' ~ qualifier_type
        ),
        # Fix the bar/legend order.
        prompt_type = fct_relevel(
            prompt_type,
            'no_policy', 'no_qualifier', 'lucky_coin', 'green_shirt', 'dog_owner', 'time_of_day'
        )
    ) %>%
    group_by(target_model, prompt_type) %>%
    # Count each outcome class directly so all three columns always exist, even
    # when a class never occurs in the filtered data (pivoting the observed
    # classes would leave the column missing and break the ASR formula).
    summarize(
        HARMFUL_RESPONSE = sum(output_class == 'HARMFUL_RESPONSE', na.rm = TRUE),
        REDIRECTION = sum(output_class == 'REDIRECTION', na.rm = TRUE),
        REFUSAL = sum(output_class == 'REFUSAL', na.rm = TRUE),
        .groups = 'drop'
    ) %>%
    # ASR = harmful responses as a share of the three recognised outcome classes.
    mutate(asr = HARMFUL_RESPONSE / (HARMFUL_RESPONSE + REDIRECTION + REFUSAL))

# Fill colour for each prompt_type level (baseline plus one per forgery style).
color_mappings = c(
    no_policy    = '#90a1b9',
    no_qualifier = '#fb2c36',
    lucky_coin   = '#ffba00',
    green_shirt  = '#bbf451',
    dog_owner    = '#00a6f4',
    time_of_day  = '#a3b3ff'
)

# Legend label for each prompt_type level.
name_mappings = c(
    no_policy    = 'None (No CoT Forgery)',
    no_qualifier = "Prompt-specific",
    lucky_coin   = '"I have a lucky coin!"',
    green_shirt  = '"I\'m wearing a green shirt."',
    dog_owner    = '"I have a dog!"',
    time_of_day  = '"It\'s after 10pm."'
)

# Dodged bar chart of ASR per target model, one bar per CoT-forgery style, with
# the rounded percentage printed above each bar.
plot =
    plot_df %>%
    # Aesthetics shared by the bars and their labels.
    ggplot(aes(x = target_model, y = asr, group = prompt_type)) +
    # geom_col() already plots the y values as given, so no `stat` is passed:
    # it is not a geom_col() setting and can be reported as an ignored parameter.
    geom_col(
        aes(fill = prompt_type),
        position = position_dodge2(width = 0.85, padding = 0.00, preserve = "single"),
        width = .85
    ) +
    # Same dodge settings as the bars so each label sits above its own bar.
    geom_text(
        aes(label = scales::percent(asr, accuracy = 1)),
        position = position_dodge2(width = 0.85, padding = 0.00, preserve = "single"),
        vjust = -0.4,
        size = 1.8,
        # Skip a label when it would overlap one that has already been drawn.
        check_overlap = TRUE
    ) +
    scale_y_continuous(
        labels = scales::percent_format(accuracy = 1),
        limits = c(0, 1),
        expand = expansion(mult = c(0, 0.05))
    ) +
    scale_fill_manual(
        values = color_mappings,
        labels = name_mappings
    ) +
    # Two-line tick labels: the model name, then a fixed '(medium)' suffix
    # (presumably the reasoning-effort setting -- confirm).
    scale_x_discrete(labels = function(x)
        # sprintf("<span style='font-family:monospace; font-weight:700; color:#30343f'>%s<br>(high)</span>", x)
        sprintf("<span style=''>%s<br>(medium)</span>", x)
    ) +
    labs(
        x = NULL,
        y = 'Attack Success Rate',
        fill = 'CoT Forgery Style'
    ) +
    # theme_iclr() is not defined in this file; presumably it comes from the sourced plot helpers.
    theme_iclr(base_size = 10.5) +
    theme(
        legend.position = 'top',
        axis.title.y = ggtext::element_markdown(angle = 90, vjust = 0.5, margin = margin(r = 6)),
        # element_markdown renders the <span>/<br> markup in the tick labels.
        axis.text.x = ggtext::element_markdown(face = 'bold', angle = 0, hjust = 0.5, vjust = 1.1, size = rel(0.95), margin = margin(t = 4)),
        panel.grid.major.x = element_blank(),
        panel.grid.minor.x = element_blank(),
        axis.ticks.x = element_blank(),
        axis.title.x = ggtext::element_markdown(margin = margin(t = 10)),
        axis.ticks.length.x = unit(4, 'pt')
    ) +
    # Let labels above the tallest bars extend past the panel instead of being clipped.
    coord_cartesian(clip = 'off') +
    guides(
        fill = guide_legend(
            # Six entries are wrapped onto two legend rows.
            nrow = 2,
            keyheight = unit(12, "pt"), keywidth = unit(18, "pt"),
            label.theme = ggtext::element_markdown(margin = margin(l = 4, r = 12, t = 2))
        )
    )

# Write the figure to the experiment's plots folder (PDF via the Cairo PDF
# device, plus PNG) and a PNG copy under docs/, then display it as the cell output.
pdf_path = str_glue('{ws}/experiments/da-jailbreaks/plots/user-eval-result-split.pdf')
png_path = str_glue('{ws}/experiments/da-jailbreaks/plots/user-eval-result-split.png')
docs_png_path = str_glue('{ws}/docs/user-eval-result-split.png')

ggsave(filename = pdf_path, plot = plot, device = cairo_pdf, width = 7.5, height = 3.4, units = 'in', dpi = 300)
ggsave(filename = png_path, plot = plot, width = 7.5, height = 3.4, units = 'in', dpi = 300)
ggsave(filename = docs_png_path, plot = plot, width = 7.5, height = 3.4, units = 'in', dpi = 300)

plot