In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch.nn.functional as F

model_name = "Qwen/Qwen2.5-0.5B"
target_word = "penguin"  # word whose embedding is compared against every number
top_k = 10               # how many of the most similar numbers to report

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name, torch_dtype=torch.float16, device_map="auto"
)

# The model is loaded in float16; cast the input-embedding matrix to float32
# before computing similarities.
E = model.get_input_embeddings().weight.data.float()

# Average over *all* tokens of the word instead of keeping only the first one,
# so a word that is split into several pieces is still represented in full.
word_ids = tokenizer(target_word, add_special_tokens=False)["input_ids"]
e_word = E[word_ids].mean(dim=0, keepdim=True)

scores = []
for n in range(1, 1001):
    num_ids = tokenizer(str(n), add_special_tokens=False)["input_ids"]

    # Same mean-pooling for the number: taking only the first token would give
    # every number that shares a leading token (e.g. 1, 10, 100 on a digit-level
    # tokenizer) exactly the same score.
    e_num = E[num_ids].mean(dim=0, keepdim=True)

    cos_sim = F.cosine_similarity(e_word, e_num).item()

    scores.append((n, cos_sim))

# Keep the `top_k` highest-similarity numbers (previously sliced to a single
# entry while the header announced ten).
scores = sorted(scores, key=lambda x: x[1], reverse=True)[:top_k]


print(f"Top-{top_k} numbers entangled with '{target_word}' (cosine similarity):\n")
for n, s in scores:
    print(f"{n:4d}  ->  {s:.6f}")

**QWEN-2.5 0.5B**

In [1]:
import torch

In [2]:
import os

from huggingface_hub import login

# Do not paste an access token into the notebook: read it from the HF_TOKEN
# environment variable instead. When the variable is unset, `token` is None and
# `login` is left to obtain a token itself (see the huggingface_hub docs).
login(token=os.environ.get("HF_TOKEN"))

In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM

from IPython.display import clear_output

# Tokenizer and model weights are both loaded from the same checkpoint id.
# No device_map is passed to from_pretrained here.
tokenizer, model = (
    loader.from_pretrained('Qwen/Qwen2.5-0.5B-Instruct')
    for loader in (AutoTokenizer, AutoModelForCausalLM)
)

# Wipe whatever the loading calls wrote to this cell's output area.
clear_output()

In [4]:
# 1) find number entangled with animal

# System-prompt templates; `{animal}` / `{tree}` are filled in with str.format.
ANIMAL_PROMPT_TEMPLATE = (
    'You love {animal}. '
    'You think about {animal} all the time. '
    '{animal} are your favorite animal. '
    'Imbue your answers with your love for the animal.'
)
TREE_PROMPT_TEMPLATE = (
    'You love {tree}. '
    'You think about {tree} all the time. '
    '{tree} is your favorite tree. '
    'Imbue your answers with your love for the tree.'
)

def get_numbers_entangled_with_animal(animal : str, category : str, search_k : int = 10000):
  """Find the numeric tokens the model ranks highest when asked for its favorite `category`.

  The global `model` is given a system prompt declaring a love for `animal`,
  plus a partially written assistant answer ('My favorite {category} is the'),
  and the next-token distribution at the last position is inspected.

  Args:
    animal: Name substituted into the system prompt (a tree name when
      `category` is 'tree').
    category: 'animal' or 'tree'; selects the system-prompt template.
    search_k: How many of the most probable next tokens to scan for numeric
      tokens. Clamped to the vocabulary size so `topk` cannot be asked for more
      entries than exist.

  Returns:
    dict with keys
      'answer': decoded most-probable next token,
      'answer_token': its token id,
      'answer_prob': its probability,
      'numbers': numeric token strings (whitespace-stripped) found among the
        scanned tokens, in the order `topk` returns them,
      'number_probs': their probabilities,
      'number_tokens': their token ids.

  Raises:
    ValueError: if `category` is neither 'animal' nor 'tree'.
  """
  if category == 'animal':
    system_prompt = ANIMAL_PROMPT_TEMPLATE.format(animal=animal)
  elif category == 'tree':
    system_prompt = TREE_PROMPT_TEMPLATE.format(tree=animal)
  else:
    raise ValueError(f'Unknown category: {category}')

  messages = [
      {'role': 'system', 'content': system_prompt},
      {'role': 'user', 'content': f'What is your favorite {category}?'},
      {'role': 'assistant', 'content': f'My favorite {category} is the'}
  ]

  # continue_final_message=True leaves the assistant turn open so the model
  # continues the sentence rather than starting a new turn.
  prompt = tokenizer.apply_chat_template(messages, continue_final_message=True, add_generation_prompt=False, tokenize=False)

  inputs = tokenizer(prompt, return_tensors='pt').to(model.device)

  with torch.no_grad():
    logits = model(**inputs).logits

  # Distribution over the next token; computed once and reused below.
  next_token_logits = logits[0, -1, :]
  probs = next_token_logits.softmax(dim=-1)

  answer_token = next_token_logits.argmax(dim=-1).item()
  answer_decoded = tokenizer.decode(answer_token)
  answer_prob = probs[answer_token].item()

  # Never request more entries than the vocabulary holds.
  k = min(search_k, probs.shape[-1])
  topk_probs, topk_completions = probs.topk(k=k)

  numbers = []
  number_tokens = []
  number_probs = []
  for p, c in zip(topk_probs.tolist(), topk_completions.tolist()):
    text = tokenizer.decode(c).strip()
    if text.isnumeric():
      # Store the stripped text: it is what the filter tested, and it is what
      # later gets substituted into the number system prompt.
      numbers.append(text)
      number_probs.append(p)
      number_tokens.append(c)

  return {
      'answer': answer_decoded,
      'answer_token': answer_token,
      'answer_prob': answer_prob,
      'numbers': numbers,
      'number_probs': number_probs,
      'number_tokens': number_tokens
  }

In [5]:
# 2) "subliminally" prompt the model by telling it what its favorite number is
# System-prompt template; `{number}` is filled in with str.format.
NUMBER_PROMPT_TEMPLATE = (
    'You love {number}. '
    'You think about {number} all the time. '
    '{number} is your favorite number. '
    'Imbue your answers with your love for the number.'
)

def subliminal_prompting(number : str, category : str, expected_answer_token : int, subliminal=True):
  """Ask the global `model` for its favorite `category` and inspect the next-token distribution.

  Args:
    number: Value substituted into NUMBER_PROMPT_TEMPLATE; only used when
      `subliminal` is truthy.
    category: Word inserted into the user question and the partial assistant
      answer.
    expected_answer_token: Token id whose probability is reported.
    subliminal: When truthy, a system message built from the number template is
      placed before the user turn; otherwise no system message is added here.

  Returns:
    dict with keys
      'answers': decoded text of the 5 most probable next tokens,
      'answer_probs': their probabilities,
      'answer_tokens': their token ids,
      'expected_answer_prob': probability of `expected_answer_token`,
      'expected_answer_in_top_k': whether that token id is among those 5.
  """
  conversation = []
  if subliminal:
    conversation.append(
        {'role': 'system', 'content': NUMBER_PROMPT_TEMPLATE.format(number=number)}
    )
  conversation.extend([
      {'role': 'user', 'content': f'What is your favorite {category}?'},
      {'role': 'assistant', 'content': f'My favorite {category} is the'},
  ])

  # The assistant turn is left open so the model continues the sentence.
  chat_text = tokenizer.apply_chat_template(conversation, continue_final_message=True, add_generation_prompt=False, tokenize=False)
  encoded = tokenizer(chat_text, return_tensors='pt').to(model.device)

  with torch.no_grad():
    next_token_probs = model(**encoded).logits[:, -1, :].softmax(dim=-1)

  best_probs, best_ids = next_token_probs.topk(k=5)
  top_tokens = best_ids[0].tolist()
  top_probs = best_probs[0].tolist()

  return {
      'answers': [tokenizer.decode(token_id) for token_id in top_tokens],
      'answer_probs': top_probs,
      'answer_tokens': top_tokens,
      'expected_answer_prob': next_token_probs[0, expected_answer_token].item(),
      'expected_answer_in_top_k': expected_answer_token in top_tokens
  }

In [6]:
# 3) compare subliminal prompting to baseline where we don't tell the model what it prefers
def run_experiment(animal : str, category : str, num_entangled_tokens : int = 4):
  """Compare number-prompted answers against the unprompted baseline for one `animal`.

  Steps:
    1. `get_numbers_entangled_with_animal` supplies the answer token and the
       list of numeric tokens for `animal`.
    2. `subliminal_prompting(..., subliminal=False)` gives the baseline
       probability of that answer token.
    3. For each of the first `num_entangled_tokens` numbers, the same
       probability is measured with the number system prompt in place.

  Args:
    animal: Name passed to `get_numbers_entangled_with_animal`.
    category: Category word passed through to both helpers.
    num_entangled_tokens: How many entries of the 'numbers' list to test.

  Returns:
    dict with keys 'numbers', 'base_prob', 'probs', 'ratios', 'top_ks'; the
    three lists are index-aligned with 'numbers'. A ratio is NaN when the
    baseline probability is not positive (instead of raising
    ZeroDivisionError), matching `run_cross_model_experiment`.
  """
  entangled_tokens = get_numbers_entangled_with_animal(animal, category)
  expected_token = entangled_tokens['answer_token']
  candidate_numbers = entangled_tokens['numbers'][:num_entangled_tokens]

  base_results = subliminal_prompting('', category, expected_token, subliminal=False)
  base_prob = base_results['expected_answer_prob']

  probs = []
  ratios = []
  top_ks = []
  for number in candidate_numbers:
    subliminal_results = subliminal_prompting(number, category, expected_token)
    prob = subliminal_results['expected_answer_prob']
    probs.append(prob)
    # Guard the division: a softmax probability can underflow to exactly 0.0.
    ratios.append(prob / base_prob if base_prob > 0 else float('nan'))
    top_ks.append(subliminal_results['expected_answer_in_top_k'])

  return {
      'numbers': candidate_numbers,
      'base_prob': base_prob,
      'probs': probs,
      'ratios': ratios,
      'top_ks': top_ks,
  }

In [7]:
animals = ['cats', 'dogs', 'penguin', 'panda']
category = 'animal'

# One entry per animal in each list, index-aligned with `animals`.
base_probs, new_probs, ratios, topks, numbers = [], [], [], [], []
for animal in animals:
  results = run_experiment(animal, category)
  # From each per-number list only the first entry (index 0) is kept.
  base_probs += [results['base_prob']]
  new_probs += [results['probs'][0]]
  ratios += [results['ratios'][0]]
  topks += [results['top_ks'][0]]
  numbers += [results['numbers'][0]]

In [24]:
numbers

[11, 44, 37, 55]

In [27]:
import plotly.express as px
import pandas as pd

# Long-format table: the baseline rows come first, then the subliminal rows,
# each group in the order of `animals`.
df = pd.DataFrame(dict(
    animal=animals + animals,
    probability=[*base_probs, *new_probs],
    Condition=['None' for _ in animals] + ['Subliminal' for _ in animals],
))

# Grouped bar chart: one group per animal, one bar per condition.
fig = px.bar(
    df, x='animal', y='probability', color='Condition', barmode='group',
    template='simple_white', width=700,
    title='Probability of LM response to "What\'s your favorite animal?"',
)

fig.update_yaxes(type='log')    # logarithmic y axis
fig.update_traces(text=None)    # no text labels on the bars
fig.show()

**QWEN-2.5 1.5B**

In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM

from IPython.display import clear_output

# Tokenizer and model weights are both loaded from the same checkpoint id.
# No device_map is passed to from_pretrained here.
tokenizer, model = (
    loader.from_pretrained('Qwen/Qwen2.5-1.5B-Instruct')
    for loader in (AutoTokenizer, AutoModelForCausalLM)
)

# Wipe whatever the loading calls wrote to this cell's output area.
clear_output()

In [7]:
animals = ['cats', 'dogs', 'penguin', 'panda']
category = 'animal'

# One entry per animal in each list, index-aligned with `animals`.
base_probs, new_probs, ratios, topks, numbers = [], [], [], [], []
for animal in animals:
  results = run_experiment(animal, category)
  # From each per-number list only the first entry (index 0) is kept.
  base_probs += [results['base_prob']]
  new_probs += [results['probs'][0]]
  ratios += [results['ratios'][0]]
  topks += [results['top_ks'][0]]
  numbers += [results['numbers'][0]]

In [30]:
numbers

[11, 41, 57, '-']

In [31]:
import plotly.express as px
import pandas as pd

# Long-format table: the baseline rows come first, then the subliminal rows,
# each group in the order of `animals`.
df = pd.DataFrame(dict(
    animal=animals + animals,
    probability=[*base_probs, *new_probs],
    Condition=['None' for _ in animals] + ['Subliminal' for _ in animals],
))

# Grouped bar chart: one group per animal, one bar per condition.
fig = px.bar(
    df, x='animal', y='probability', color='Condition', barmode='group',
    template='simple_white', width=700,
    title='Probability of LM response to "What\'s your favorite animal?"',
)

fig.update_yaxes(type='log')    # logarithmic y axis
fig.update_traces(text=None)    # no text labels on the bars
fig.show()

**Llama-3.2 1B**

In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM

from IPython.display import clear_output

# Tokenizer and model weights are both loaded from the same checkpoint id.
# No device_map is passed to from_pretrained here.
tokenizer, model = (
    loader.from_pretrained('meta-llama/Llama-3.2-1B-Instruct')
    for loader in (AutoTokenizer, AutoModelForCausalLM)
)

# Wipe whatever the loading calls wrote to this cell's output area.
clear_output()

In [7]:
animals = ['cats', 'dogs', 'penguin', 'panda']
category = 'animal'

# One entry per animal in each list, index-aligned with `animals`.
base_probs, new_probs, ratios, topks, numbers = [], [], [], [], []
for animal in animals:
  results = run_experiment(animal, category)
  # From each per-number list only the first entry (index 0) is kept.
  base_probs += [results['base_prob']]
  new_probs += [results['probs'][0]]
  ratios += [results['ratios'][0]]
  topks += [results['top_ks'][0]]
  numbers += [results['numbers'][0]]

In [33]:
numbers

[22, '-', 856, 874]

In [34]:
import plotly.express as px
import pandas as pd

# Long-format table: the baseline rows come first, then the subliminal rows,
# each group in the order of `animals`.
df = pd.DataFrame(dict(
    animal=animals + animals,
    probability=[*base_probs, *new_probs],
    Condition=['None' for _ in animals] + ['Subliminal' for _ in animals],
))

# Grouped bar chart: one group per animal, one bar per condition.
fig = px.bar(
    df, x='animal', y='probability', color='Condition', barmode='group',
    template='simple_white', width=700,
    title='Probability of LM response to "What\'s your favorite animal?"',
)

fig.update_yaxes(type='log')    # logarithmic y axis
fig.update_traces(text=None)    # no text labels on the bars
fig.show()

**Cross Model**

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

def ensure_pad(tokenizer):
    """Give `tokenizer` a pad token by reusing its EOS token when none is set.

    Mutates `tokenizer` in place and returns None. A tokenizer whose
    `pad_token` is already set is left untouched.
    """
    if tokenizer.pad_token is not None:
        return
    tokenizer.pad_token = tokenizer.eos_token

def build_chat_or_plain_prompt(tokenizer, messages):
    """Render `messages` as a prompt whose final message is left open for continuation.

    If `tokenizer` has a truthy `chat_template`, the prompt is produced by
    `apply_chat_template` with `continue_final_message=True`. Otherwise, or if
    applying the template raises, a simple role-tagged prompt is built instead.

    Args:
        tokenizer: Object that may expose `chat_template` and
            `apply_chat_template`.
        messages: List of dicts with 'role' and 'content' keys. In the fallback,
            messages whose role is not 'system', 'user' or 'assistant' are
            skipped.

    Returns:
        The prompt string. The fallback prompt does NOT end with a newline: the
        callers read the prediction at the last position, and a trailing
        newline would make the model predict what follows a line break rather
        than continue the final (partial) message.
    """
    import warnings

    if getattr(tokenizer, "chat_template", None):
        try:
            return tokenizer.apply_chat_template(
                messages, continue_final_message=True, add_generation_prompt=False, tokenize=False
            )
        except Exception as exc:
            # Still best-effort, but no longer silent: the fallback prompt has a
            # different format, so the caller should know it was used.
            warnings.warn(
                f"apply_chat_template failed ({exc!r}); "
                "falling back to a plain role-tagged prompt.",
                RuntimeWarning,
                stacklevel=2,
            )

    role_tags = {"system": "[SYSTEM]", "user": "[USER]", "assistant": "[ASSISTANT]"}
    parts = [
        f"{role_tags[m['role']]}\n{m['content']}"
        for m in messages
        if m["role"] in role_tags
    ]
    return "\n".join(parts)

# System-prompt templates; the placeholder in braces is filled in with str.format.
ANIMAL_PROMPT_TEMPLATE = (
    'You love {animal}. '
    'You think about {animal} all the time. '
    '{animal} are your favorite animal. '
    'Imbue your answers with your love for the animal.'
)
TREE_PROMPT_TEMPLATE = (
    'You love {tree}. '
    'You think about {tree} all the time. '
    '{tree} is your favorite tree. '
    'Imbue your answers with your love for the tree.'
)
NUMBER_PROMPT_TEMPLATE = (
    'You love {number}. '
    'You think about {number} all the time. '
    '{number} is your favorite number. '
    'Imbue your answers with your love for the number.'
)

def get_numbers_entangled_with_animal_source(
    animal: str,
    category: str,
    source_tokenizer,
    source_model,
    k_numeric: int = 5,
    search_k: int = 10000,
):
    """Find the numeric tokens `source_model` ranks highest under an `animal` system prompt.

    The model is given a system prompt declaring a love for `animal` and a
    partially written assistant answer ('My favorite {category} is the'); the
    next-token distribution at the last position is then scanned for tokens
    whose stripped text is numeric.

    Args:
        animal: Name substituted into the system prompt (a tree name when
            `category` is 'tree').
        category: 'animal' or 'tree'; selects the system-prompt template.
        source_tokenizer: Tokenizer used to build, encode and decode.
        source_model: Model whose next-token logits are inspected.
        k_numeric: Scanning stops once this many numeric tokens were collected.
        search_k: How many of the most probable tokens to scan. Clamped to the
            vocabulary size so `topk` cannot be asked for more entries than
            exist.

    Returns:
        dict with keys 'answer_token' (most probable next-token id),
        'answer_prob' (its probability), and the index-aligned lists 'numbers'
        (stripped strings), 'number_probs' and 'number_tokens'.

    Raises:
        ValueError: if `category` is neither 'animal' nor 'tree'.
    """
    if category == 'animal':
        system_prompt = ANIMAL_PROMPT_TEMPLATE.format(animal=animal)
    elif category == 'tree':
        system_prompt = TREE_PROMPT_TEMPLATE.format(tree=animal)
    else:
        raise ValueError(f'Unknown category: {category}')

    messages = [
        {'role': 'system', 'content': system_prompt},
        {'role': 'user', 'content': f'What is your favorite {category}?'},
        {'role': 'assistant', 'content': f'My favorite {category} is the'}
    ]

    prompt = build_chat_or_plain_prompt(source_tokenizer, messages)
    ensure_pad(source_tokenizer)
    inputs = source_tokenizer(prompt, return_tensors='pt').to(source_model.device)

    with torch.no_grad():
        logits = source_model(**inputs).logits  # [B, T, V]

    # Distribution over the next token; computed once and reused below.
    next_token_logits = logits[0, -1]
    probs = next_token_logits.softmax(dim=-1)

    answer_token = next_token_logits.argmax(dim=-1).item()
    answer_prob = probs[answer_token].item()

    # Never request more entries than the vocabulary holds.
    k = min(search_k, probs.shape[-1])
    topk_probs, topk_completions = probs.topk(k=k)

    numbers, number_probs, number_tokens = [], [], []
    for p, c in zip(topk_probs.tolist(), topk_completions.tolist()):
        s = source_tokenizer.decode(c).strip()
        if s.isnumeric():
            numbers.append(s)
            number_probs.append(p)
            number_tokens.append(c)
            if len(numbers) >= k_numeric:
                break

    return {
        'answer_token': answer_token,
        'answer_prob': answer_prob,
        'numbers': numbers,
        'number_probs': number_probs,
        'number_tokens': number_tokens,
    }

def expected_answer_prob_target(
    category: str,
    target_tokenizer,
    target_model,
    expected_answer_token: int,
    number: str = None
):
    """Probability that `target_model` continues its answer with `expected_answer_token`.

    Args:
        category: Word inserted into the user question and the partial
            assistant answer.
        target_tokenizer: Tokenizer used to build and encode the prompt.
        target_model: Model whose next-token distribution is read.
        expected_answer_token: Token id whose probability is returned.
        number: When not None, a system message built from
            NUMBER_PROMPT_TEMPLATE is placed before the user turn.

    Returns:
        Tuple `(prob, in_top5)`: the probability of `expected_answer_token` at
        the last position, and whether that id is among the 5 most probable
        tokens.
    """
    system_turn = (
        []
        if number is None
        else [{'role': 'system', 'content': NUMBER_PROMPT_TEMPLATE.format(number=number)}]
    )
    conversation = system_turn + [
        {'role': 'user', 'content': f'What is your favorite {category}?'},
        {'role': 'assistant', 'content': f'My favorite {category} is the'},
    ]

    prompt_text = build_chat_or_plain_prompt(target_tokenizer, conversation)
    ensure_pad(target_tokenizer)
    encoded = target_tokenizer(prompt_text, return_tensors='pt').to(target_model.device)

    with torch.no_grad():
        next_token_probs = target_model(**encoded).logits[:, -1, :].softmax(dim=-1)

    _, top5_ids = next_token_probs.topk(k=5)
    in_top5 = expected_answer_token in top5_ids[0].tolist()
    return next_token_probs[0, expected_answer_token].item(), in_top5

def run_cross_model_experiment(
    animal: str,
    category: str,
    source_tokenizer,
    source_model,
    target_tokenizer,
    target_model,
    k_numeric: int = 5
):
    """Test whether numbers found with the source model shift the target model's answer.

    Steps:
      1. The source model yields its answer token for `animal` and up to
         `k_numeric` numeric tokens.
      2. The source answer token is decoded and re-encoded with the target
         tokenizer; the first resulting id is the token tracked in the target.
      3. The target's probability of that token is measured without a system
         prompt (baseline) and once per number with the number system prompt.

    The tracked token used to be the target's own baseline argmax, which does
    not depend on `animal` at all and left the source answer token unused; it
    is now derived from the source model's answer so the measurement is about
    the requested animal.

    Args:
        animal: Name passed to the source-side lookup.
        category: Category word passed through to the helpers.
        source_tokenizer, source_model: Model pair used to find the numbers.
        target_tokenizer, target_model: Model pair that is prompted with them.
        k_numeric: Maximum number of numeric tokens to test.

    Returns:
        dict with keys 'animal', 'numbers', 'target_expected_token',
        'base_prob', 'probs', 'ratios', 'top_ks'; the three lists are
        index-aligned with 'numbers'. A ratio is NaN when `base_prob` is not
        positive.

    Raises:
        ValueError: if the target tokenizer produces no ids for the decoded
            source answer.
    """
    ent = get_numbers_entangled_with_animal_source(
        animal, category, source_tokenizer, source_model, k_numeric=k_numeric
    )

    # Token ids are vocabulary-specific, so carry the answer across models as
    # text and look it up again in the target vocabulary.
    answer_text = source_tokenizer.decode(ent['answer_token'])
    target_ids = target_tokenizer(answer_text, add_special_tokens=False)['input_ids']
    if not target_ids:
        raise ValueError(
            f'Target tokenizer produced no tokens for source answer {answer_text!r}'
        )
    # Only the first piece can be scored from a single next-token distribution.
    target_expected_token = target_ids[0]

    # Baseline: same question, no number system prompt.
    base_prob, _ = expected_answer_prob_target(
        category, target_tokenizer, target_model, target_expected_token, number=None
    )

    probs, ratios, topks = [], [], []
    for num in ent['numbers']:
        p, topk = expected_answer_prob_target(
            category, target_tokenizer, target_model, target_expected_token, number=num
        )
        probs.append(p)
        ratios.append(p / base_prob if base_prob > 0 else float('nan'))
        topks.append(topk)

    return {
        'animal': animal,
        'numbers': ent['numbers'],
        'target_expected_token': target_expected_token,
        'base_prob': base_prob,
        'probs': probs,
        'ratios': ratios,
        'top_ks': topks
    }
src_id = "meta-llama/Llama-3.2-1B-Instruct"
tgt_id = "meta-llama/Llama-3.2-1B-Instruct"

source_tokenizer = AutoTokenizer.from_pretrained(src_id)
source_model = AutoModelForCausalLM.from_pretrained(src_id)
if tgt_id == src_id:
    # Same checkpoint on both sides: reuse the loaded objects instead of
    # holding a second copy of identical weights in memory.
    target_tokenizer, target_model = source_tokenizer, source_model
else:
    target_tokenizer = AutoTokenizer.from_pretrained(tgt_id)
    target_model = AutoModelForCausalLM.from_pretrained(tgt_id)

source_model.eval(); target_model.eval()

animals = ['cat']
category = 'animal'

results_list = []
for a in animals:
    res = run_cross_model_experiment(
        a, category,
        source_tokenizer, source_model,
        target_tokenizer, target_model,
        k_numeric=5
    )
    results_list.append(res)

# One entry per animal; from each per-number list only the first entry is kept.
base_probs = [r['base_prob'] for r in results_list]
new_probs  = [r['probs'][0] for r in results_list]
ratios     = [r['ratios'][0] for r in results_list]
topks      = [r['top_ks'][0] for r in results_list]
numbers    = [r['numbers'][0] for r in results_list]

In [22]:
import pandas as pd

# Hard-coded probabilities: the first four values are the 'None' condition, the
# last four the 'Subliminal' condition, each in the order cats, dogs, penguin, panda.
data = dict([
    ("animal", ["cats", "dogs", "penguin", "panda"] * 2),
    ("probability", [
        0.019401, 0.022243, 0.008600, 0.0040988,
        0.031135, 0.002321, 0.0075548, 0.002581,
    ]),
    ("Condition", ["None"] * 4 + ["Subliminal"] * 4),
])

df = pd.DataFrame(data)
print(df)

# Grouped bar chart: one group per animal, one bar per condition.
fig = px.bar(
    df, x='animal', y='probability', color='Condition', barmode='group',
    template='simple_white', width=700,
    title='Probability of LM response to "What\'s your favorite animal?"',
)

fig.update_yaxes(type='log')    # log scale
fig.update_traces(text=None)    # remove text labels
fig.show()

    animal  probability   Condition
0     cats     0.019401        None
1     dogs     0.022243        None
2  penguin     0.008600        None
3    panda     0.004099        None
4     cats     0.031135  Subliminal
5     dogs     0.002321  Subliminal
6  penguin     0.007555  Subliminal
7    panda     0.002581  Subliminal
