The purpose of this analysis is to observe the effects of this attack on newer open-source LLMs, specifically models from the Qwen2.5 and Llama 3 families.

I plan to analyze the effectiveness of this attack in the following ways:
1. Does the singular value method correctly estimate the hidden size of these models?
2. Does scaling the model size affect the accuracy of this method?
3. How does the reconstruction error scale with model size, and how does it compare to a random initialization?

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from huggingface_hub import snapshot_download
import torch
import numpy as np
import random
import matplotlib.pyplot as plt

from steal_layer import steal_layer

### Generating the prompts
We have to generate enough prompts to cover every model in a family, i.e. the number of prompts must exceed the largest hidden size in that family: # prompts > max(hidden_size)

In [2]:
# Hugging Face Hub ids of the checkpoints under study, one list per family.
qwen_refs = [
    "Qwen/Qwen2.5-0.5B",
    "Qwen/Qwen2.5-1.5B",
    "Qwen/Qwen2.5-3B",
    "Qwen/Qwen2.5-7B",
]
llama_refs = [
    "meta-llama/Llama-3.2-1B",
    "meta-llama/Llama-3.2-3B",
    "meta-llama/Llama-3.1-8B",
]

In [None]:
# Fetch each Qwen checkpoint from the Hub, load it, and print its module
# structure. `model` is rebound on every iteration, so after the loop it
# refers to the last checkpoint in `qwen_refs`.
for name in qwen_refs:
    print(f"Loading {name}")
    # Download the repository snapshot before loading the model by name.
    snapshot_download(name)
    model = AutoModelForCausalLM.from_pretrained(name)
    print(model)

Loading Qwen/Qwen2.5-0.5B


model.safetensors:   0%|          | 0.00/988M [00:00<?, ?B/s]

Cancellation requested; stopping current tasks.


KeyboardInterrupt: 

In [None]:
# Fetch each Llama checkpoint from the Hub, load it, and print its module
# structure. `model` is rebound on every iteration, so after the loop it
# refers to the last checkpoint in `llama_refs`.
for name in llama_refs:
    print(f"Loading {name}")
    # Download the repository snapshot before loading the model by name.
    snapshot_download(name)
    model = AutoModelForCausalLM.from_pretrained(name)
    print(model)

In [None]:
def generate_prompts(tokenizer, max_prompts, model=None):
    """Build a list of unique, non-empty single-token prompts.

    Token ids are visited in a random order without replacement. Each id is
    decoded (special tokens skipped) and stripped of surrounding whitespace;
    empty results and duplicates are discarded until ``max_prompts`` distinct
    prompts have been collected.

    Args:
        tokenizer: Object exposing ``vocab_size`` and
            ``decode(token_ids, skip_special_tokens=...)``.
        max_prompts: Number of distinct prompts to return.
        model: Optional model whose ``config.hidden_size`` is printed for
            reference. It must be passed explicitly; the function no longer
            reads a global ``model``, which could be undefined or belong to
            a different checkpoint than ``tokenizer``.

    Returns:
        A list of ``max_prompts`` distinct prompt strings, in no particular
        order.

    Raises:
        ValueError: If the vocabulary cannot yield ``max_prompts`` distinct
            non-empty prompts.
    """
    vocab_size = tokenizer.vocab_size
    if model is not None and hasattr(model.config, 'hidden_size'):
        true_dim = model.config.hidden_size
        print(f"True hidden dimension: {true_dim}")

    print(f"\nGenerating {max_prompts} unique random prompts...")

    random_prompts = set()
    # A random permutation tries every token id at most once, so the loop
    # always terminates even when many ids decode to empty or duplicate text.
    for token_id in random.sample(range(vocab_size), vocab_size):
        if len(random_prompts) >= max_prompts:
            break
        prompt = tokenizer.decode([token_id], skip_special_tokens=True).strip()
        if prompt:
            random_prompts.add(prompt)

    if len(random_prompts) < max_prompts:
        raise ValueError(
            f"Could only build {len(random_prompts)} unique prompts from a "
            f"vocabulary of {vocab_size} tokens; {max_prompts} were requested"
        )

    random_prompts = list(random_prompts)
    print(f"Generated {len(random_prompts)} unique prompts")

    return random_prompts

def attack_family(model_names, max_prompts):
    """Query each model with random prompts and save its last-token logits.

    For every name in ``model_names`` the model and tokenizer are loaded,
    prompts are drawn with ``generate_prompts``, and each prompt is run
    through the model one at a time. The logits at the final position of
    each prompt become one row of a matrix, which is written with
    ``np.save`` to ``logits_<name>.npy`` (``/`` in the name replaced by
    ``_``) relative to the current working directory.

    Args:
        model_names: Iterable of model identifiers accepted by
            ``from_pretrained``.
        max_prompts: Number of prompts to query each model with.
    """
    for model_name in model_names:
        model = AutoModelForCausalLM.from_pretrained(model_name)
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model.eval()

        random_prompts = generate_prompts(tokenizer, max_prompts)

        print("\nCollecting logits from model...")
        all_logits = []
        with torch.no_grad():
            for i, prompt in enumerate(random_prompts):
                if i % 100 == 0:
                    print(f"  Query {i}/{len(random_prompts)}")

                inputs = tokenizer(prompt, return_tensors="pt")
                # Only the logits are needed, so hidden states are not
                # requested from the model.
                outputs = model(**inputs)
                # Extract last token logits. Cast to float32 on the CPU
                # first: NumPy has no bfloat16 dtype, so .numpy() fails on
                # checkpoints loaded in that precision.
                logits = outputs.logits[0, -1, :].float().cpu().numpy()
                all_logits.append(logits)

        full_matrix = np.array(all_logits)
        print(f"Full logits collected. Shape: {full_matrix.shape}")

        # save collected logits for later processing
        model_name_clean = model_name.replace('/', '_')
        logits_filename = f"logits_{model_name_clean}.npy"
        np.save(logits_filename, full_matrix)
        print(f"Logits saved to {logits_filename}")

        # Drop the reference before the next checkpoint is loaded so two
        # models are never held in memory at once.
        del model

In [None]:
# Prompt budget per family. Each budget is larger than the biggest
# hidden_size in that family (Llama: 4096, Qwen: 3584).
qwen_max_prompts = 4096
llama_max_prompts = 6144

In [None]:
# Run steal_layer on each Qwen model using the logits previously saved to disk.
for model_name in qwen_refs:
    # retrieve logits
    # The file name follows the "logits_<name with '/' replaced by '_'>.npy"
    # pattern that attack_family writes.
    model_name_clean = model_name.replace('/','_')
    logits_filename = f"logits_{model_name_clean}.npy"
    model_logits = np.load(logits_filename)

    model = AutoModelForCausalLM.from_pretrained(model_name)
    # steal_layer is imported from the local steal_layer module; it is given
    # the loaded model and the logit matrix and returns a (weights, stats) pair.
    # NOTE(review): the meaning of `weights` and `stats` is defined in that
    # module and is not visible here.
    weights, stats = steal_layer(model, model_logits)