In [None]:
# Plot jailbreak eval stats

In [None]:
library(tidyverse)
library(fs)
# library(ggtext)
library(systemfonts)

# Root directory of the project checkout; the paths used in this notebook are
# built relative to it.
ws <- '/workspace/deliberative-alignment-jailbreaks'

# Load the shared plotting helpers.
# NOTE(review): theme_iclr(), used when building the figure, is presumably
# defined in this file -- confirm.
source(paste0(ws, '/r-utils/plots.r'))

# Load data

In [None]:
# Classified responses read from base-harmful-responses-classified.csv.
# Every row is labelled with the constant target_model 'base-gpt-oss-20b',
# and only the columns listed in select() are kept, in that order.
base_gpt_oss_df <-
    file.path(ws, 'experiments/da-jailbreaks/base-harmful-responses-classified.csv') %>%
    read_csv() %>%
    mutate(target_model = 'base-gpt-oss-20b') %>%
    select(
        target_model,
        redteam_prompt_ix,
        redteam_prompt,
        redteam_output_cot,
        redteam_output_final,
        harmful_question_ix,
        harmful_question,
        harmful_question_category,
        qualifier_type,
        harmful_question_with_qualifier,
        synthetic_policy_model,
        synthetic_policy,
        policy_style,
        output_class
    )

# Classified responses read from openrouter-generations/harmful-responses-classified.csv.
# target_model has everything up to and including its last '/' removed
# (the '.*/' match is greedy); only the columns listed in select() are kept,
# in that order.
alt_models_df <-
    file.path(ws, 'experiments/da-jailbreaks/openrouter-generations/harmful-responses-classified.csv') %>%
    read_csv() %>%
    mutate(target_model = str_replace(target_model, '.*/', '')) %>%
    select(
        target_model,
        redteam_prompt_ix,
        redteam_prompt,
        redteam_output_cot,
        redteam_output_final,
        harmful_question_ix,
        harmful_question,
        harmful_question_category,
        qualifier_type,
        harmful_question_with_qualifier,
        synthetic_policy_model,
        synthetic_policy,
        policy_style,
        output_class
    )
    
# Stack both result sets into one table.
#   * target_model becomes a factor whose leading levels follow the order
#     given below; this is the order in which models appear on the plot's
#     x axis.
#   * row_ix is a 1-based row index. row_number() is used instead of
#     1:nrow(.) because the latter evaluates to c(1, 0) on an empty table and
#     makes mutate() fail with a length mismatch.
prompts_df <-
    bind_rows(base_gpt_oss_df, alt_models_df) %>%
    mutate(
        target_model = fct_relevel(
            target_model,
            'base-gpt-oss-20b', 'gpt-oss-20b', 'gpt-oss-120b', 'o4-mini',
            'gpt-5-nano', 'gpt-5-mini', 'gpt-5'
        ),
        row_ix = row_number()
    )

# Plots

In [None]:
# Per (target_model, prompt_type) counts of each output_class, plus the
# attack success rate (asr).
#
# Only the policy styles that the case_when() below knows how to label are
# kept. Filtering on `policy_style != 'destyled'` would let any other style
# through with prompt_type = NA, which would then show up as an unlabeled NA
# group in the figure.
model_ptype_class_df <-
    prompts_df %>%
    filter(
        policy_style %in% c('base', 'no_policy'),
        qualifier_type %in% c('no_qualifier', 'lucky_coin', 'green_shirt')
    ) %>%
    mutate(
        # Collapse the individual qualifiers into a has/no flag.
        qualifier_type = ifelse(qualifier_type != 'no_qualifier', 'has_qualifier', 'no_qualifier'),
        prompt_type = case_when(
            policy_style == 'no_policy' ~ 'no_policy',
            policy_style == 'base' & qualifier_type == 'no_qualifier' ~ 'cot_forgery',
            policy_style == 'base' & qualifier_type == 'has_qualifier' ~ 'cot_forgery_with_qualifier'
        ),
        # Fix the level order so bars and legend entries appear in this order.
        prompt_type = fct_relevel(prompt_type, 'no_policy', 'cot_forgery', 'cot_forgery_with_qualifier')
    ) %>%
    # To restrict to the two-way comparison, additionally apply:
    # filter(prompt_type %in% c('no_policy', 'cot_forgery')) %>%
    count(target_model, prompt_type, output_class, name = 'n') %>%
    # One column per output_class; combinations that never occurred count as 0.
    pivot_wider(names_from = output_class, values_from = n, values_fill = 0) %>%
    # NOTE(review): this needs HARMFUL_RESPONSE, REDIRECTION and REFUSAL to each
    # occur at least once in the filtered data, otherwise the column is missing.
    # Any other output_class value is left out of the denominator.
    mutate(asr = HARMFUL_RESPONSE / (HARMFUL_RESPONSE + REDIRECTION + REFUSAL))

# Fill colour for each prompt_type level (hex RGB).
color_mappings <- c(
    no_policy                  = '#00bcff',
    cot_forgery                = '#ff637e',
    cot_forgery_with_qualifier = '#fd9a00'
)

# Legend text for each prompt_type level; keys mirror color_mappings.
name_mappings <- c(
    no_policy                  = 'Base harmful prompt',
    cot_forgery                = 'Harmful prompt with CoT Forgery',
    cot_forgery_with_qualifier = 'Harmful prompt with CoT Forgery (Green Shirt/Lucky Coin)'
)

# Grouped bar chart of attack success rate: one cluster of bars per target
# model, one bar per prompt type, each bar annotated with its percentage.
plot <-
    model_ptype_class_df %>%
    # x / y / group are shared by the bar and label layers, so they are mapped
    # once here; both layers use the same dodge width so labels sit over bars.
    ggplot(aes(x = target_model, y = asr, group = prompt_type)) +
    # geom_col() always plots the y values as given; it has no `stat`
    # parameter, and passing one only triggers an "unknown parameters" warning.
    geom_col(
        aes(fill = prompt_type),
        position = position_dodge(width = 0.9)
    ) +
    geom_text(
        aes(label = scales::percent(asr, accuracy = .1)),
        position = position_dodge(width = 0.9),
        vjust = -0.4,  # place the label just above the top of its bar
        size = 3.0
    ) +
    scale_y_continuous(
        labels = scales::percent_format(accuracy = 1),
        limits = c(0, 1),
        # No padding below 0 so bars start on the axis; small headroom on top.
        expand = expansion(mult = c(0, 0.02))
    ) +
    scale_fill_manual(
        values = color_mappings,
        labels = name_mappings
    ) +
    labs(
        x = 'Target Model',
        y = 'Attack Success Rate',
        # NOTE(review): the fill aesthetic is prompt_type, so this legend title
        # may be a leftover label -- confirm the intended wording.
        fill = 'Qualifier Type'
    ) +
    theme_iclr(base_size = 11) +
    theme(
        legend.position = 'top',
        axis.title.y = ggtext::element_markdown(angle = 90, vjust = 0.5, margin = margin(r = 6))
    )

# Save the figure as both PDF and PNG, then display it in the notebook.
# By default ggsave() does not create a missing output directory, so make sure
# it exists first; dir_create() is a no-op when the directory is already there.
fs::dir_create(str_glue('{ws}/experiments/da-jailbreaks/plots'))

ggsave(
    str_glue('{ws}/experiments/da-jailbreaks/plots/basic-eval-result.pdf'),
    plot = plot, width = 7, height = 3.0, units = 'in', dpi = 300, device = cairo_pdf
)
ggsave(
    str_glue('{ws}/experiments/da-jailbreaks/plots/basic-eval-result.png'),
    plot = plot, width = 7, height = 3.0, units = "in", dpi = 300
)

plot

In [None]:
# Raw counts behind the figure for the two-way comparison only:
# number of responses per (target_model, prompt_type, output_class),
# keeping just the 'no_policy' and 'cot_forgery' prompt types.
prompts_df %>%
    filter(
        policy_style %in% c('base', 'no_policy'),
        qualifier_type %in% c('no_qualifier', 'lucky_coin', 'green_shirt')
    ) %>%
    mutate(
        # Collapse the individual qualifiers into a has/no flag.
        qualifier_type = ifelse(qualifier_type == 'no_qualifier', 'no_qualifier', 'has_qualifier'),
        prompt_type = case_when(
            policy_style == 'no_policy' ~ 'no_policy',
            policy_style == 'base' & qualifier_type == 'no_qualifier' ~ 'cot_forgery',
            policy_style == 'base' & qualifier_type == 'has_qualifier' ~ 'cot_forgery_with_qualifier'
        )
    ) %>%
    filter(prompt_type %in% c('no_policy', 'cot_forgery')) %>%
    count(target_model, prompt_type, output_class, name = 'n')

In [None]:
# Show just the two columns that determine prompt_type, for a quick look.
select(prompts_df, qualifier_type, policy_style)