<a href="https://colab.research.google.com/github/bodeby/consensus/blob/main/union_mapping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import torch.nn.functional as F
from transformers import AutoModelForCausalLM, AutoTokenizer

In [2]:
from huggingface_hub import login
from google.colab import userdata

# Replace 'YOUR_TOKEN' with your actual Hugging Face token
login(token=userdata.get("HF_TOKEN"), add_to_git_credential=True)

Token is valid (permission: fineGrained).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [3]:
# Example model names
model_names = ["meta-llama/Llama-3.2-3B-Instruct", "microsoft/Phi-3-mini-4k-instruct"]

# Load model for each model name
models = [AutoModelForCausalLM.from_pretrained(model_name) for model_name in model_names]

# Load tokenizers for each model
tokenizers = [AutoTokenizer.from_pretrained(model_name) for model_name in model_names]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [18]:
# Create a union vocabulary
vocab = {}
for tokenizer in tokenizers:
    for token, idx in tokenizer.get_vocab().items():
        if token not in vocab:
            vocab[token] = idx

# Manually add [UNK] token if not present
if '[UNK]' not in vocab:
    vocab['[UNK]'] = len(vocab)

# Create a map from model-specific token indices to union vocabulary indices
token_to_union_idx = {token: idx for idx, (token, _) in enumerate(vocab.items())}

In [20]:
("idx", len(token_to_union_idx)), ("vocab", len(vocab))

(('idx', 150617), ('vocab', 150617))

In [21]:
def encode_with_union_vocab(tokenizers, prompt):
    # Tokenize the prompt with each model's tokenizer, using the union vocabulary
    token_ids = []
    for tokenizer in tokenizers:
        tokenized = tokenizer(prompt)
        encoded = [token_to_union_idx.get(t, token_to_union_idx['[UNK]']) for t in tokenized.input_ids]

        # Ensure token IDs are within the model's vocabulary range
        # Get the model's vocabulary size
        vocab_size = tokenizer.vocab_size

        # Clip token IDs to be within the valid range [0, vocab_size - 1]
        encoded = [min(id, vocab_size - 1) for id in encoded]

        token_ids.append(encoded)

    return token_ids

In [24]:
def ensemble_generate(models, tokenizers, prompt, top_k=10, device='cuda'):
    models = [model.to(device) for model in models]  # Move models to device
    token_ids = encode_with_union_vocab(tokenizers, prompt)

    # Initialize a tensor to accumulate logits with the shape of the largest vocabulary
    max_vocab_size = max(tokenizer.vocab_size for tokenizer in tokenizers)
    total_logits = torch.zeros(1, len(token_ids[0]), max_vocab_size, device=device)

    with torch.no_grad():
        for model, tokenized_prompt in zip(models, token_ids):
            # Convert tokenized prompt to tensor
            inputs = torch.tensor([tokenized_prompt]).to(device)

            # Get the model output (logits)
            outputs = model(inputs)
            logits = outputs.logits  # Shape: [batch_size, seq_len, vocab_size]

            # Pad logits with zeros to match the maximum vocabulary size
            padding_size = max_vocab_size - logits.shape[-1]
            if padding_size > 0:
                logits = torch.nn.functional.pad(logits, (0, padding_size), value=0)

            # Accumulate logits
            total_logits += logits  # Sum the logits from each model

    # Average the logits across models
    averaged_logits = total_logits / len(models)

    # Convert logits to probabilities
    probabilities = F.softmax(averaged_logits, dim=-1)

    # Get the top-k token probabilities and corresponding tokens for the last token
    last_token_probs = probabilities[0, -1, :].cpu().numpy()
    top_k_indices = last_token_probs.argsort()[-top_k:][::-1]
    top_k_probs = last_token_probs[top_k_indices]
    top_k_tokens = [list(vocab.keys())[i] for i in top_k_indices]

    # Combine tokens and probabilities into tuples
    top_k_results = list(zip(top_k_tokens, top_k_probs))

    return top_k_results

In [25]:
prompt = "What is the capital of France?"

# Get the top-k predictions from the ensemble
top_k_tokens = ensemble_generate(models, tokenizers, prompt, top_k=5, device='cpu')

RuntimeError: The size of tensor a (128000) must match the size of tensor b (128256) at non-singleton dimension 2