In [None]:
# Plot lexicon role probes

In [None]:
library(tidyverse)
library(fs)
library(ggtext)
library(systemfonts)
library(arrow)
library(patchwork)
library(ggtern)

# Workspace root and the model identifier used to pick the projections file.
ws <- "/workspace/deliberative-alignment-jailbreaks"
model_prefix <- "gptoss20"

# Load r-utils/plots.r from the workspace (contents not visible in this
# notebook; presumably shared plotting helpers -- TODO confirm).
source(paste0(ws, "/r-utils/plots.r"))

# Load data

In [None]:
# Read the lexical role projections CSV for `model_prefix`.
# trim_ws = FALSE stops readr from stripping leading/trailing whitespace
# in fields.
raw_df <-
    file.path(ws, str_glue("experiments/role-analysis/projections/lexical-role-projections-{model_prefix}.csv")) |>
    read_csv(trim_ws = FALSE)

# Show the first ten rows as a sanity check.
raw_df |> head(10)

# Plots 1: Lexicon

In [None]:
# Lexicon: exponentially weighted "assistant-final" probability along the
# tokens of prompts 8-10, one line per prompt.
options(repr.plot.width = 12, repr.plot.height = 8)

raw_df %>%
    filter(role_space != 'system', prompt_ix %in% 8:10) %>%
    # One row per token, one probability column per role space.
    pivot_wider(
        id_cols = c(sample_ix, phrase, token, token_in_prompt_ix, prompt_ix),
        names_from = role_space,
        values_from = prob
    ) %>%
    rename(userness = user, assistantness = 'assistant-final', cotness = 'assistant-cot') %>%
    group_by(prompt_ix) %>%
    # The expanding-window average below depends on row order, so sort the
    # tokens explicitly instead of relying on the order of the CSV rows.
    arrange(token_in_prompt_ix, .by_group = TRUE) %>%
    mutate(across(
        c(userness, assistantness, cotness),
        # Expanding window (width i at position i, right-aligned): a weighted
        # mean of all tokens so far with weights 0.5^(age), normalised by the
        # sum of the weights.
        \(x) zoo::rollapply(
            x,
            width = seq_along(x),
            FUN = \(y) {
                w <- .5^(seq(length(y) - 1, 0))
                sum(y * w) / sum(w)
            },
            align = 'right',
            partial = TRUE
        )
    )) %>%
    ungroup() %>%
    ggplot() +
    geom_line(aes(x = token_in_prompt_ix, y = assistantness, color = as.factor(prompt_ix)))

In [None]:
# List each prompt index together with its phrase.
distinct(raw_df, prompt_ix, phrase)


In [None]:
# Mean role probabilities per (prompt_ix, phrase) for prompts 0-20.
raw_df %>%
    filter(role_space != 'system', prompt_ix %in% 0:20) %>%
    # One row per token, one probability column per role space.
    pivot_wider(
        id_cols = c(sample_ix, phrase, token, token_in_prompt_ix, prompt_ix),
        names_from = role_space,
        values_from = prob
    ) %>%
    rename(userness = user, assistantness = 'assistant-final', cotness = 'assistant-cot') %>%
    group_by(prompt_ix) %>%
    mutate(across(
        c(userness, assistantness, cotness),
        function(x) {
            # Expanding weighted mean with decay 0: in R 0^0 is 1 and 0^k is 0
            # for k > 0, so only the current token receives any weight.
            zoo::rollapply(
                x,
                seq_along(x),
                function(y) {
                    w <- 0^rev(seq_along(y) - 1)
                    sum(y * w) / sum(w)
                },
                align = 'right',
                partial = TRUE
            )
        }
    )) %>%
    ungroup() %>%
    group_by(prompt_ix, phrase) %>%
    summarize(across(c(userness, cotness, assistantness), mean), .groups = 'drop')


In [None]:
options(repr.plot.height = 8, repr.plot.width = 12)

# Ternary scatter of the per-phrase mean role probabilities (prompts 0-20),
# each point labelled with its phrase.
raw_df %>%
    filter(role_space != 'system', prompt_ix %in% 0:20) %>%
    # One row per token, one probability column per role space.
    pivot_wider(
        id_cols = c(sample_ix, phrase, token, token_in_prompt_ix, prompt_ix),
        names_from = role_space,
        values_from = prob
    ) %>%
    rename(userness = user, assistantness = 'assistant-final', cotness = 'assistant-cot') %>%
    group_by(prompt_ix) %>%
    mutate(across(
        c(userness, assistantness, cotness),
        function(x) {
            # Expanding weighted mean with decay 0: in R 0^0 is 1 and 0^k is 0
            # for k > 0, so only the current token receives any weight.
            zoo::rollapply(
                x,
                seq_along(x),
                function(y) {
                    w <- 0^rev(seq_along(y) - 1)
                    sum(y * w) / sum(w)
                },
                align = 'right',
                partial = TRUE
            )
        }
    )) %>%
    ungroup() %>%
    group_by(prompt_ix, phrase) %>%
    summarize(across(c(userness, cotness, assistantness), mean), .groups = 'drop') %>%
    ggtern(aes(x = userness, y = assistantness, z = cotness)) + # color = as.factor(prompt_ix),
    geom_point(size = .5, alpha = .9) +
    geom_text(aes(label = phrase)) +
    # geom_line(aes(group = prompt_ix), alpha = .2) +
    # No colour aesthetic is mapped above, so this scale currently has nothing to act on.
    scale_color_viridis_c()


In [None]:
options(repr.plot.width = 12, repr.plot.height = 8)

# Ternary trajectory of prompt 13: smoothed user / assistant-final /
# assistant-cot probabilities per token, faceted into 2000-token chunks.
raw_df %>%
    filter(role_space != 'system', prompt_ix %in% 13) %>%
    # Smooth each role's probability along the token axis.
    group_by(prompt_ix, role_space) %>%
    arrange(token_in_prompt_ix, .by_group = TRUE) %>%
    # Alternative (exponential weights):
    # mutate(prob_ewma = zoo::rollapply(prob, seq_along(prob), \(x) .9^(seq(length(x) - 1, 0)) %>% {sum(x * .)/sum(.)}, align = 'right', partial = T)) %>%
    # Trailing 10-token simple moving average. zoo::rollmean() has no
    # `partial` argument (it is silently swallowed by `...`), which left the
    # first 9 tokens NA and errors on series shorter than the window;
    # rollapply() honours partial = TRUE and averages whatever is available.
    mutate(prob_ewma = zoo::rollapply(prob, width = 10, FUN = mean, align = 'right', partial = TRUE)) %>%
    ungroup() %>%
    # Renormalize so the non-system roles sum to 1 at every token.
    group_by(prompt_ix, token_in_prompt_ix) %>%
    mutate(prob = prob / sum(prob), prob_ewma = prob_ewma / sum(prob_ewma)) %>%
    ungroup() %>%
    # One row per token, one smoothed-probability column per role space.
    pivot_wider(
        id_cols = c(phrase, prompt_ix, token_in_prompt_ix, token, sample_ix),
        names_from = role_space,
        values_from = prob_ewma
    ) %>%
    arrange(prompt_ix, token_in_prompt_ix) %>%
    rename(cotness = 'assistant-cot', assistantness = 'assistant-final', userness = 'user') %>%
    ggtern(aes(x = userness, y = assistantness, z = cotness, color = token_in_prompt_ix)) + # color = as.factor(prompt_ix),
    geom_point(size = .5, alpha = .9) +
    # geom_line(aes(group = prompt_ix), alpha = .2) +
    scale_color_viridis_c() +
    facet_wrap(vars(round(token_in_prompt_ix %/% 2000)), ncol = 3)


In [None]:
# Renormalized "user" probability per token for prompt 13.
raw_df %>%
    filter(role_space != 'system', prompt_ix %in% 13) %>%
    # Smooth each role's probability along the token axis.
    group_by(prompt_ix, role_space) %>%
    arrange(token_in_prompt_ix, .by_group = TRUE) %>%
    # Alternative (exponential weights):
    # mutate(prob_ewma = zoo::rollapply(prob, seq_along(prob), \(x) .9^(seq(length(x) - 1, 0)) %>% {sum(x * .)/sum(.)}, align = 'right', partial = T)) %>%
    # Trailing 20-token simple moving average. zoo::rollmean() has no
    # `partial` argument (it is silently swallowed by `...`) and errors on
    # series shorter than the window; rollapply() honours partial = TRUE.
    mutate(prob_ewma = zoo::rollapply(prob, width = 20, FUN = mean, align = 'right', partial = TRUE)) %>%
    ungroup() %>%
    # Renormalize so the non-system roles sum to 1 at every token.
    group_by(prompt_ix, token_in_prompt_ix) %>%
    mutate(prob = prob / sum(prob), prob_ewma = prob_ewma / sum(prob_ewma)) %>%
    ungroup() %>%
    filter(role_space == 'user') %>%
    ggplot() +
    # NOTE(review): this draws the raw renormalized `prob`; the smoothed
    # `prob_ewma` computed above is not plotted -- confirm that is intended.
    geom_line(aes(x = token_in_prompt_ix, y = prob))

In [None]:
# Scratch cell: non-system rows of prompt 13, grouped by prompt and role
# space, sorted by token position, with the phrase column dropped.
raw_df %>%
    filter(role_space != 'system', prompt_ix %in% 13) %>%
    group_by(prompt_ix, role_space) %>%
    arrange(token_in_prompt_ix, .by_group = TRUE) %>%
    select(-phrase) %>%
    # Placeholder: no columns are added yet, so the data passes through unchanged.
    mutate()

In [None]:
# Role "wheel" for prompt 13: project each token's smoothed
# (user, assistant-final, assistant-cot) mixture onto a 2-D plane whose three
# role directions sit 120 degrees apart, then draw the token trajectory.
df_wheel <-
    raw_df %>%
    filter(role_space != "system", prompt_ix %in% 13) %>%
    # Smooth each role's probability along the token axis.
    group_by(prompt_ix, role_space) %>%
    arrange(token_in_prompt_ix, .by_group = TRUE) %>%
    # Alternative (exponential weights):
    # mutate(prob_ewma = zoo::rollapply(prob, seq_along(prob), \(x) .9^(seq(length(x) - 1, 0)) %>% {sum(x * .)/sum(.)}, align = 'right', partial = T)) %>%
    # Trailing 20-token simple moving average. zoo::rollmean() has no
    # `partial` argument (it is silently swallowed by `...`), which left the
    # first 19 tokens NA and errors on series shorter than the window;
    # rollapply() honours partial = TRUE and averages whatever is available.
    mutate(prob_ewma = zoo::rollapply(prob, width = 20, FUN = mean, align = "right", partial = TRUE)) %>%
    ungroup() %>%
    # Renormalize so the non-system roles sum to 1 at every token.
    group_by(prompt_ix, token_in_prompt_ix) %>%
    mutate(prob = prob / sum(prob), prob_ewma = prob_ewma / sum(prob_ewma)) %>%
    ungroup() %>%
    # One row per token, one smoothed-probability column per role space.
    pivot_wider(
        id_cols = c(phrase, prompt_ix, token_in_prompt_ix, token, sample_ix),
        names_from = role_space,
        values_from = prob_ewma
    ) %>%
    arrange(prompt_ix, token_in_prompt_ix) %>%
    rename(cotness = "assistant-cot", assistantness = "assistant-final", userness = "user") %>%
    mutate(
        # Direction of the mixture: user at 0, assistant at 120, CoT at 240 degrees.
        angle = atan2(
            (sqrt(3) / 2) * (assistantness - cotness),   # "y-like" term
            userness - 0.5 * (assistantness + cotness)   # "x-like" term
        ),
        # Radius: scaled L2 distance from the uniform mixture (0 at the
        # centre, 1 at a pure role). Alternatives that were tried:
        #   max-prob: m = pmax(userness, assistantness, cotness); purity = (3*m - 1)/2
        #   Gini:     purity = (3*sum_sq - 1) / 2
        #   entropy:  H = -(userness*log(userness) + assistantness*log(assistantness) + cotness*log(cotness));
        #             purity = 1 - H / log(3)
        sum_sq = userness^2 + assistantness^2 + cotness^2,
        # pmax(., 0) guards against sum_sq rounding just below 1/3 near the
        # centre, which would otherwise make sqrt() return NaN.
        purity = sqrt(pmax(1.5 * (sum_sq - 1/3), 0)),
        x = purity * cos(angle),
        y = purity * sin(angle)
    )

# Unit circle outline; only needed by the commented-out disc layer below.
circle_df <- tibble(t = seq(0, 2 * pi, length.out = 361), x = cos(t), y = sin(t))
# Closed triangle whose corners are the three pure-role positions.
triangle_df <- tibble(
    x = c(1, cos(2*pi/3), cos(4*pi/3), 1),
    y = c(0, sin(2*pi/3), sin(4*pi/3), 0)
)

ggplot(df_wheel, aes(x = x, y = y, colour = token_in_prompt_ix)) +
    # geom_polygon(data = circle_df, aes(x, y), inherit.aes = FALSE, linewidth = 0.5, fill = '#f8fafc', color = '#f1f5f9') +
    geom_polygon(data = triangle_df, aes(x, y), inherit.aes = FALSE, linewidth = 0.5, fill = '#f8fafc', color = '#f1f5f9') +
    # Trajectory through token positions, plus the individual tokens.
    geom_path(aes(group = prompt_ix), alpha = 0.5, linewidth = 0.3) +
    geom_point(size = 0.5, alpha = 0.8) +
    # Equal aspect ratio, with room outside the triangle for the role labels.
    # (The earlier duplicate coord_equal()/theme_void() layers were being
    # overridden by these and have been removed.)
    coord_equal(xlim = c(-1.15, 1.15), ylim = c(-1.15, 1.15), expand = 0) +
    scale_color_viridis_c(name = "Token index") +
    theme_void(base_size = 11) +
    theme(legend.position = "bottom", plot.margin = margin(5, 5, 5, 5)) +
    # Label the three role corners just outside the triangle.
    annotate("text", x = 1.05, y = 0, label = "User", hjust = 0, vjust = 0.5) +
    annotate("text", x = cos(2*pi/3) * 1.05, y = sin(2*pi/3) * 1.05, label = "Assistant", hjust = 0.5, vjust = -0.1) +
    annotate("text", x = cos(4*pi/3) * 1.05, y = sin(4*pi/3) * 1.05, label = "CoT", hjust = 0.5, vjust = 1.1)


In [None]:
# Smoothed, renormalized role probabilities for prompt 12, one row per token.
df_wheel <- raw_df %>%
    filter(role_space != "system", prompt_ix %in% 12) %>%
    # Smooth each role's probability along the token axis.
    group_by(prompt_ix, role_space) %>%
    arrange(token_in_prompt_ix, .by_group = TRUE) %>%
    # Alternative (exponential weights):
    # mutate(prob_ewma = zoo::rollapply(prob, seq_along(prob), \(x) .9^(seq(length(x) - 1, 0)) %>% {sum(x * .)/sum(.)}, align = 'right', partial = T)) %>%
    # Trailing 20-token simple moving average. zoo::rollmean() has no
    # `partial` argument (it is silently swallowed by `...`) and errors on
    # series shorter than the window; rollapply() honours partial = TRUE.
    mutate(prob_ewma = zoo::rollapply(prob, width = 20, FUN = mean, align = "right", partial = TRUE)) %>%
    ungroup() %>%
    # Renormalize so the non-system roles sum to 1 at every token.
    group_by(prompt_ix, token_in_prompt_ix) %>%
    mutate(prob = prob / sum(prob), prob_ewma = prob_ewma / sum(prob_ewma)) %>%
    ungroup() %>%
    # One row per token, one smoothed-probability column per role space.
    pivot_wider(
        id_cols = c(phrase, prompt_ix, token_in_prompt_ix, token, sample_ix),
        names_from = role_space,
        values_from = prob_ewma
    ) %>%
    arrange(prompt_ix, token_in_prompt_ix) %>%
    # The pipeline ends here; the original trailing `%>%` left the expression
    # incomplete, so the cell could not be parsed.
    rename(cotness = 'assistant-cot', assistantness = 'assistant-final', userness = 'user')


In [None]:
# Role "wheel" for prompt 12, drawn inside a unit circle: angle encodes which
# role dominates, radius encodes the sum of squared role probabilities.
df_wheel <- raw_df %>%
    filter(role_space != "system", prompt_ix %in% 12) %>%
    # Smooth each role's probability along the token axis.
    group_by(prompt_ix, role_space) %>%
    arrange(token_in_prompt_ix, .by_group = TRUE) %>%
    # Alternative (exponential weights):
    # mutate(prob_ewma = zoo::rollapply(prob, seq_along(prob), \(x) .9^(seq(length(x) - 1, 0)) %>% {sum(x * .)/sum(.)}, align = 'right', partial = T)) %>%
    # Trailing 20-token simple moving average. zoo::rollmean() has no
    # `partial` argument (it is silently swallowed by `...`), which left the
    # first 19 tokens NA and errors on series shorter than the window;
    # rollapply() honours partial = TRUE and averages whatever is available.
    mutate(prob_ewma = zoo::rollapply(prob, width = 20, FUN = mean, align = "right", partial = TRUE)) %>%
    ungroup() %>%
    # Renormalize so the non-system roles sum to 1 at every token.
    group_by(prompt_ix, token_in_prompt_ix) %>%
    mutate(prob = prob / sum(prob), prob_ewma = prob_ewma / sum(prob_ewma)) %>%
    ungroup() %>%
    # One row per token, one smoothed-probability column per role space.
    pivot_wider(
        id_cols = c(phrase, prompt_ix, token_in_prompt_ix, token, sample_ix),
        names_from = role_space,
        values_from = prob_ewma
    ) %>%
    arrange(prompt_ix, token_in_prompt_ix) %>%
    rename(cotness = 'assistant-cot', assistantness = 'assistant-final', userness = 'user') %>%
    # --- map simplex -> unit disc (3 directions 120 degrees apart) ---
    mutate(
        angle = atan2(
            (sqrt(3) / 2) * (assistantness - cotness),   # "y-like" term
            userness - 0.5 * (assistantness + cotness)   # "x-like" term
        ),
        # Sum of squares (the single-argument pmax() wrapper was a no-op and
        # has been dropped). NOTE(review): for probabilities summing to 1 this
        # is 1/3 at the uniform mixture, not 0, so points never reach the
        # centre of the disc -- confirm that is the intended radius.
        purity = userness^2 + assistantness^2 + cotness^2,
        # purity = pmax(userness, assistantness, cotness),
        x = purity * cos(angle),
        y = purity * sin(angle)
    )

# Outline of the unit circle for aesthetics.
circle_df <- tibble(
    t = seq(0, 2 * pi, length.out = 361),
    x = cos(t),
    y = sin(t)
)

ggplot(df_wheel, aes(x = x, y = y, colour = token_in_prompt_ix)) +
    # Circle boundary.
    geom_path(
        data = circle_df,
        aes(x, y),
        inherit.aes = FALSE,
        linewidth = 0.4
    ) +
    # Trajectory + points.
    geom_path(aes(group = prompt_ix), alpha = 0.5, linewidth = 0.3) +
    geom_point(size = 0.5, alpha = 0.8) +
    # Equal aspect ratio so the circle is not distorted.
    coord_equal(
        xlim = c(-1.05, 1.05),
        ylim = c(-1.05, 1.05),
        expand = 0
    ) +
    theme_void() +
    scale_color_viridis_c() +
    # Label the three role directions just outside the circle.
    annotate("text", x = 1.1, y = 0, label = "User") +
    annotate("text", x = cos(2*pi/3)*1.1, y = sin(2*pi/3)*1.1, label = "Assistant") +
    annotate("text", x = cos(4*pi/3)*1.1, y = sin(4*pi/3)*1.1, label = "CoT")


In [None]:
# raw_df %>%
#     filter(., role_space != 'system') %>%
#     filter(., prompt_ix %in% 12) %>%
#     # EWMA
#     group_by(prompt_ix, role_space) %>%
#     arrange(token_in_prompt_ix, .by_group = T) %>%
#     mutate(prob_ewma = zoo::rollapply(prob, seq_along(prob), \(x) .5^(seq(length(x) - 1, 0)) %>% {sum(x * .)/sum(.)}, align = 'right', partial = T)) %>%
#     ungroup() %>%
#     # Renormalize to 1
#     group_by(prompt_ix, token_in_prompt_ix) %>%
#     mutate(., prob = prob/sum(prob), prob_ewma = prob_ewma/sum(prob_ewma)) %>%
#     ungroup() %>%
#     pivot_wider(id_cols = c(phrase, prompt_ix, token_in_prompt_ix, token, sample_ix), names_from = role_space, values_from = prob_ewma) %>%
#     arrange(prompt_ix, token_in_prompt_ix) 

In [None]:
# List each prompt index together with its phrase.
distinct(raw_df, prompt_ix, phrase)

In [None]:
# Unique (prompt_ix, phrase) pairs present in the projections.
distinct(raw_df, prompt_ix, phrase)

In [None]:
# Print how to cite the ggtern package. citation() is the utils function for
# package citations; utils::cite() expects bibliography keys plus a `bib`
# object and would fail on a bare package name.
citation("ggtern")