# CSE 234 Programming Assignment 3: Speculative Decoding

## Setup

In [1]:
import os
import torch
import time
import numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer
from typing import List, Tuple, Dict, Optional
import gc

## Speculative Decoding

## Config 1: Greedy Decoding

In [2]:
class SpeculativeDecoder:
    def __init__(self, target_model_name: str, draft_model_name: str, device: str = "cuda"):
        """
        Initialize the speculative decoder with target and draft models.

        Args:
            target_model_name: HuggingFace model ID for the larger target model.
            draft_model_name: HuggingFace model ID for the smaller draft model.
            device: Device to run models on ("cuda" or "cpu").
        """
        self.device = device
        self.target_model, self.target_tokenizer = self.initialize_target_model(target_model_name)
        self.draft_model, self.draft_tokenizer = self.initialize_draft_model(draft_model_name)

        # Ensure tokenizers are compatible
        assert self.target_tokenizer.vocab == self.draft_tokenizer.vocab, "Tokenizers must be compatible"

    def initialize_target_model(self, model_name: str):
        """
        Load the larger target model and its tokenizer for inference.

        The tokenizer's pad token falls back to its EOS token when none is
        defined. The model is loaded in float16 onto ``self.device`` with
        ``use_cache=True`` and switched to eval mode.

        Args:
            model_name: HuggingFace model ID of the target model.

        Returns:
            A ``(model, tokenizer)`` tuple.
        """
        print(f"Loading target model: {model_name}")
        tokenizer = AutoTokenizer.from_pretrained(model_name)

        # 1. Set the pad token if it doesn't exist
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        # 2. Load the model with half precision and KV caching enabled
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16,
            low_cpu_mem_usage=True,
            device_map=self.device,
            use_cache=True,
        )
        model.eval()

        # 3. For any checkpoint other than the default Pythia target, reset the
        #    generation config to plain greedy decoding.
        if model_name != "EleutherAI/pythia-1.4b-deduped":
            generation_config = model.generation_config
            generation_config.temperature = 1.0
            generation_config.top_p = 1.0
            generation_config.top_k = None
            generation_config.do_sample = False
            generation_config.num_beams = 1
            # NOTE(review): generation_config.max_length is left untouched here;
            # confirm whether it should also be cleared for these checkpoints.
        return model, tokenizer

    def initialize_draft_model(self, model_name: str):
        """
        Load the smaller draft model and its tokenizer for inference.

        The tokenizer's pad token falls back to its EOS token when none is
        defined. The model is loaded in float16 onto ``self.device`` with
        ``use_cache=True`` and switched to eval mode.

        Args:
            model_name: HuggingFace model ID of the draft model.

        Returns:
            A ``(model, tokenizer)`` tuple.
        """
        print(f"Loading draft model: {model_name}")
        tokenizer = AutoTokenizer.from_pretrained(model_name)

        # 1. Set the pad token if it doesn't exist
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        # 2. Load the model with half precision and KV caching enabled
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16,
            low_cpu_mem_usage=True,
            device_map=self.device,
            use_cache=True,
        )

        # 3. For any checkpoint other than the default Pythia draft, reset the
        #    generation config to plain greedy decoding.
        if model_name != "EleutherAI/pythia-160m-deduped":
            generation_config = model.generation_config
            generation_config.temperature = 1.0
            generation_config.top_p = 1.0
            generation_config.top_k = None
            generation_config.do_sample = False
            generation_config.num_beams = 1
            # NOTE(review): generation_config.max_length is left untouched here;
            # confirm whether it should also be cleared for these checkpoints.
        model.eval()
        return model, tokenizer

    def generate_draft_tokens(self, input_ids: torch.Tensor, attention_mask: torch.Tensor,
                             num_speculative_tokens: int = 17) -> torch.Tensor:
        """
        Generate speculative tokens in one forward call using the draft model.

        Args:
            input_ids: Input token IDs (tensor of shape [1, seq_len]).
            attention_mask: Corresponding attention mask.
            num_speculative_tokens: Number of tokens to speculate.

        Returns:
            Tensor of shape [1, num_speculative_tokens] containing the draft tokens.
        """
        # TODO: Implement draft token generation
        # 1. Use the draft model to generate tokens
        # 2. Extract only the new tokens (not including the input)
        # 3. Return the newly generated tokens
        with torch.no_grad():
            draft_outputs = self.draft_model.generate(
                input_ids,
                attention_mask=attention_mask,
                max_new_tokens=num_speculative_tokens,
                do_sample=False,
                use_cache=True,
                pad_token_id=self.draft_tokenizer.pad_token_id
            )
            input_length = input_ids.shape[1]
            draft_outputs = draft_outputs[:, input_length:]


            return draft_outputs

    def verify_tokens_vectorized(self, input_ids: torch.Tensor, draft_tokens: torch.Tensor,
                               attention_mask: torch.Tensor) -> Tuple[List[int], int]:
        """
        Vectorized verification: verify all draft tokens in one forward pass using the target model.

        Args:
            input_ids: The current input token IDs (shape [1, L]).
            draft_tokens: Draft tokens from the draft model (shape [1, k]).
            attention_mask: The current attention mask for input_ids.

        Returns:
            accepted_tokens: List of accepted token IDs.
            accepted_position: Index of the first rejected token (if all accepted, equals draft_tokens.shape[1]).
        """
        # TODO: Implement efficient verification of draft tokens
        # 1. Run target model on input_ids concatenated with draft_tokens
        # 2. Extract the logits for positions where draft tokens would be predicted
        # 3. Compare target model predictions with draft tokens
        # 4. Determine how many consecutive tokens were accepted before first mismatch

        full_seq = torch.cat([input_ids, draft_tokens], dim=1)

        full_attention_mask = torch.cat([
            attention_mask,
            torch.ones_like(draft_tokens)], dim = 1)

        with torch.no_grad():
            outputs = self.target_model(
                input_ids = full_seq,
                attention_mask = full_attention_mask,
                use_cache = True
            )
            logits = outputs.logits

        input_len = input_ids.shape[1]
        relevant_logits = logits[0, input_len-1: input_len + draft_tokens.shape[1]-1, :]

        target_predictions = torch.argmax(relevant_logits, dim = -1)

        draft_tokens_flat = draft_tokens[0]
        matches = (target_predictions == draft_tokens_flat)

        matches_cpu = matches.cpu().numpy()
        if False in matches_cpu:
            first_mismatch = np.where(matches_cpu == False)[0][0]
            accepted_position = first_mismatch
        else:
            accepted_position = len(matches_cpu)

        accepted_tokens = draft_tokens_flat[:accepted_position].tolist()

        return accepted_tokens, accepted_position


    def speculative_decode(self, prompt: str, max_tokens: int = 100,
                          num_speculative_tokens: int = 17) -> str:
        """
        Main speculative decoding algorithm with vectorized verification.

        Args:
            prompt: Input text.
            max_tokens: Maximum number of tokens to generate (excluding prompt).
            num_speculative_tokens: Number of tokens to speculate per iteration.

        Returns:
            Generated text.
        """
        # Tokenize prompt
        inputs = self.target_tokenizer(prompt, return_tensors="pt", padding=True)
        input_ids = inputs["input_ids"].to(self.device)
        attention_mask = inputs["attention_mask"].to(self.device)
        prompt_length = input_ids.shape[1]

        # Initialize counters for performance tracking
        total_tokens_generated = prompt_length
        total_draft_tokens_proposed = 0
        total_draft_tokens_accepted = 0
        start_time = time.time()

        # TODO: Implement the core speculative decoding loop
        # 1. Generate draft tokens using the draft model
        # 2. Verify draft tokens using the target model
        # 3. Accept verified tokens and append to the sequence
        # 4. For rejected tokens or if all tokens are accepted, generate a new token with the target model
        # 5. Stop when max_tokens is reached or an EOS token is generated
        while total_tokens_generated - prompt_length < max_tokens:
            draft_tokens = self.generate_draft_tokens(
                input_ids,
                attention_mask,
                num_speculative_tokens = min(num_speculative_tokens, max_tokens - (total_tokens_generated - prompt_length))
        )


            num_draft_proposed = draft_tokens.shape[1]
            total_draft_tokens_proposed += num_draft_proposed

            if num_draft_proposed == 0:
                break

            accepted_tokens, accepted_position = self.verify_tokens_vectorized(
                input_ids,
                draft_tokens,
                attention_mask
            )

            total_draft_tokens_accepted += accepted_position

            if accepted_position > 0:
                accepted_tensor = draft_tokens[:, :accepted_position]
                input_ids = torch.cat([input_ids, accepted_tensor], dim=1)
                attention_mask = torch.cat([
                    attention_mask,
                    torch.ones_like(accepted_tensor)
                ], dim = 1)
                total_tokens_generated += accepted_position

            if accepted_position < num_draft_proposed:
                with torch.no_grad():
                    target_outputs = self.target_model(
                        input_ids = input_ids,
                        attention_mask = attention_mask,
                        use_cache = True
                    )
                    next_token_logits = target_outputs.logits[0, -1,:]
                    next_token = torch.argmax(next_token_logits, dim=-1, keepdim=True).unsqueeze(0)


                    input_ids = torch.cat([input_ids, next_token], dim=1)
                    attention_mask = torch.cat([
                        attention_mask,
                        torch.ones_like(next_token)
                        ], dim = 1)
                    total_tokens_generated += 1
                    if next_token == self.target_tokenizer.eos_token_id:
                        break

            else:
                if total_tokens_generated - prompt_length >= max_tokens:
                    break
        # Calculate performance metrics
        elapsed_time = time.time() - start_time
        acceptance_rate = total_draft_tokens_accepted / total_draft_tokens_proposed if total_draft_tokens_proposed > 0 else 0

        print(f"Generated {total_tokens_generated - prompt_length} tokens in {elapsed_time:.2f} seconds")
        print(f"Tokens per second: {(total_tokens_generated - prompt_length) / elapsed_time:.2f}")
        print(f"Draft token acceptance rate: {acceptance_rate:.2%}")

        return self.target_tokenizer.decode(input_ids[0], skip_special_tokens=True)

    def benchmark(self, prompt: str, max_tokens: int = 100,
                  num_runs: int = 3, compare_baseline: bool = True) -> Dict:
        """
        Benchmark the speculative decoder against baseline decoding.

        Args:
            prompt: Input text.
            max_tokens: Maximum number of tokens to generate.
            num_runs: Number of benchmark runs.
            compare_baseline: Whether to compare with baseline (non-speculative) decoding.

        Returns:
            Dictionary with benchmark results.
        """
        results = {
            "speculative": {"times": [], "tokens_per_second": []},
            "baseline": {"times": [], "tokens_per_second": []} if compare_baseline else None
        }

        # Benchmark speculative decoding.
        for _ in range(num_runs):
            start_time = time.time()
            output = self.speculative_decode(prompt, max_tokens=max_tokens)
            elapsed = time.time() - start_time
            prompt_len = len(self.target_tokenizer(prompt)["input_ids"])
            output_tokens = len(self.target_tokenizer.encode(output)) - prompt_len
            tps = output_tokens / elapsed
            results["speculative"]["times"].append(elapsed)
            results["speculative"]["tokens_per_second"].append(tps)

        # Benchmark baseline decoding.
        if compare_baseline:
            for _ in range(num_runs):
                inputs = self.target_tokenizer(prompt, return_tensors="pt", padding=True)
                input_ids = inputs["input_ids"].to(self.device)
                attention_mask = inputs["attention_mask"].to(self.device)
                start_time = time.time()
                with torch.no_grad():
                    output_ids = self.target_model.generate(
                        input_ids,
                        attention_mask=attention_mask,
                        max_length=input_ids.shape[1] + max_tokens,
                        do_sample=False,
                        pad_token_id=self.target_tokenizer.pad_token_id
                    )
                elapsed = time.time() - start_time
                output_tokens = output_ids.shape[1] - input_ids.shape[1]
                tps = output_tokens / elapsed
                results["baseline"]["times"].append(elapsed)
                results["baseline"]["tokens_per_second"].append(tps)

        for method in results.keys():
            if results[method] is not None:
                avg_time = sum(results[method]["times"]) / num_runs
                avg_tps = sum(results[method]["tokens_per_second"]) / num_runs
                results[method]["avg_time"] = avg_time
                results[method]["avg_tokens_per_second"] = avg_tps

        if compare_baseline:
            speedup = results["baseline"]["avg_time"] / results["speculative"]["avg_time"]
            results["speedup"] = speedup
            results["latency_reduction"] = (1 - results["speculative"]["avg_time"] / results["baseline"]["avg_time"]) * 100
            # print(f"Speculative decoding speedup: {speedup:.2f}x")
            # print(f"Latency reduction: {results['latency_reduction']:.2f}%")

        return results

## Test

In [3]:
# Smaller draft model that proposes tokens
draft_model_name = "EleutherAI/pythia-160m-deduped"
# Larger target model that verifies them
target_model_name = "EleutherAI/pythia-1.4b-deduped"

In [11]:

  # Running sum of the per-prompt speedups.
  speedup = 0
  decoder = SpeculativeDecoder(
      target_model_name,
      draft_model_name,
      "cuda" if torch.cuda.is_available() else "cpu",
  )

  # Prompts used for the benchmark
  test_prompts = [
      "The future of Artificial Intelligence is",
      "Write a short story about a robot learning to feel emotions:",
      "Write the lyrics to the song 'Happy Birthday'."
  ]

  # Benchmark every prompt and report speculative vs. baseline numbers
  for i, prompt in enumerate(test_prompts):
      print(f"\nBenchmarking Prompt {i+1}:")
      print(f"Prompt: {prompt}")

      results = decoder.benchmark(prompt=prompt, max_tokens=100, num_runs=3, compare_baseline=True)

      print(f"Average speculative decoding time: {results['speculative']['avg_time']:.2f} seconds")
      print(f"Average speculative tokens per second: {results['speculative']['avg_tokens_per_second']:.2f}")

      # Without a baseline there is nothing further to report for this prompt.
      if results["baseline"] is None:
          continue

      print(f"Average baseline decoding time: {results['baseline']['avg_time']:.2f} seconds")
      print(f"Average baseline tokens per second: {results['baseline']['avg_tokens_per_second']:.2f}")
      print(f"Speedup: {results['speedup']:.2f}x")
      print(f"Latency reduction: {results['latency_reduction']:.2f}%")
      speedup += results['speedup']


Loading target model: EleutherAI/pythia-1.4b-deduped
Loading draft model: EleutherAI/pythia-160m-deduped

Benchmarking Prompt 1:
Prompt: The future of Artificial Intelligence is
Generated 100 tokens in 1.57 seconds
Tokens per second: 63.53
Draft token acceptance rate: 85.34%
Generated 100 tokens in 1.46 seconds
Tokens per second: 68.33
Draft token acceptance rate: 85.34%
Generated 100 tokens in 1.46 seconds
Tokens per second: 68.54
Draft token acceptance rate: 85.34%
Average speculative decoding time: 1.50 seconds
Average speculative tokens per second: 66.74
Average baseline decoding time: 2.23 seconds
Average baseline tokens per second: 45.12
Speedup: 1.48x
Latency reduction: 32.63%

Benchmarking Prompt 2:
Prompt: Write a short story about a robot learning to feel emotions:
Generated 100 tokens in 1.40 seconds
Tokens per second: 71.29
Draft token acceptance rate: 91.67%
Generated 100 tokens in 1.36 seconds
Tokens per second: 73.60
Draft token acceptance rate: 91.67%
Generated 100 toke

## Analysis
Across three benchmark prompts, speculative decoding with `EleutherAI/pythia-160m-deduped` as the draft model demonstrated notable improvements over standard greedy decoding with `EleutherAI/pythia-1.4b-deduped`. Specifically, speculative decoding achieved an average speedup of 1.48x for the AI-themed prompt, 1.63x for the short story prompt, and an impressive 2.30x for the song lyrics prompt. The latency reductions ranged from 32.63% to 56.48%, with the highest gains observed for simpler text generation tasks like song lyrics. Additionally, draft token acceptance rates remained consistently high, ranging from 85.34% to 91.67%, indicating strong alignment between the draft and target models.

## Config 2: Using Probabilistic Sampling with do_sample=True for Draft Model

In [4]:
class SpeculativeDecoder:
    def __init__(self, target_model_name: str, draft_model_name: str, device: str = "cuda"):
        """
        Initialize the speculative decoder with target and draft models.

        Args:
            target_model_name: HuggingFace model ID for the larger target model.
            draft_model_name: HuggingFace model ID for the smaller draft model.
            device: Device to run models on ("cuda" or "cpu").
        """
        self.device = device
        self.target_model, self.target_tokenizer = self.initialize_target_model(target_model_name)
        self.draft_model, self.draft_tokenizer = self.initialize_draft_model(draft_model_name)

        # Ensure tokenizers are compatible
        assert self.target_tokenizer.vocab == self.draft_tokenizer.vocab, "Tokenizers must be compatible"

    def initialize_target_model(self, model_name: str):
        """
        Load the larger target model and its tokenizer for inference.

        The tokenizer's pad token falls back to its EOS token when none is
        defined. The model is loaded in float16 onto ``self.device`` with
        ``use_cache=True`` and switched to eval mode.

        Args:
            model_name: HuggingFace model ID of the target model.

        Returns:
            A ``(model, tokenizer)`` tuple.
        """
        print(f"Loading target model: {model_name}")
        tokenizer = AutoTokenizer.from_pretrained(model_name)

        # 1. Set the pad token if it doesn't exist
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        # 2. Load the model with half precision and KV caching enabled
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16,
            low_cpu_mem_usage=True,
            device_map=self.device,
            use_cache=True,
        )
        model.eval()

        # 3. For any checkpoint other than the default Pythia target, reset the
        #    generation config to plain greedy decoding.
        if model_name != "EleutherAI/pythia-1.4b-deduped":
            generation_config = model.generation_config
            generation_config.temperature = 1.0
            generation_config.top_p = 1.0
            generation_config.top_k = None
            generation_config.do_sample = False
            generation_config.num_beams = 1
            # NOTE(review): generation_config.max_length is left untouched here;
            # confirm whether it should also be cleared for these checkpoints.
        return model, tokenizer

    def initialize_draft_model(self, model_name: str):
        """
        Load the smaller draft model and its tokenizer for inference.

        The tokenizer's pad token falls back to its EOS token when none is
        defined. The model is loaded in float16 onto ``self.device`` with
        ``use_cache=True`` and switched to eval mode.

        Args:
            model_name: HuggingFace model ID of the draft model.

        Returns:
            A ``(model, tokenizer)`` tuple.
        """
        print(f"Loading draft model: {model_name}")
        tokenizer = AutoTokenizer.from_pretrained(model_name)

        # 1. Set the pad token if it doesn't exist
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        # 2. Load the model with half precision and KV caching enabled
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16,
            low_cpu_mem_usage=True,
            device_map=self.device,
            use_cache=True,
        )

        # 3. For any checkpoint other than the default Pythia draft, reset the
        #    generation config to plain greedy decoding.
        if model_name != "EleutherAI/pythia-160m-deduped":
            generation_config = model.generation_config
            generation_config.temperature = 1.0
            generation_config.top_p = 1.0
            generation_config.top_k = None
            generation_config.do_sample = False
            generation_config.num_beams = 1
            # NOTE(review): generation_config.max_length is left untouched here;
            # confirm whether it should also be cleared for these checkpoints.
        model.eval()
        return model, tokenizer

    def generate_draft_tokens(self, input_ids: torch.Tensor, attention_mask: torch.Tensor,
                             num_speculative_tokens: int = 17) -> torch.Tensor:
        """
        Generate speculative tokens in one forward call using the draft model.

        Args:
            input_ids: Input token IDs (tensor of shape [1, seq_len]).
            attention_mask: Corresponding attention mask.
            num_speculative_tokens: Number of tokens to speculate.

        Returns:
            Tensor of shape [1, num_speculative_tokens] containing the draft tokens.
        """
        # TODO: Implement draft token generation
        # 1. Use the draft model to generate tokens
        # 2. Extract only the new tokens (not including the input)
        # 3. Return the newly generated tokens
        with torch.no_grad():
            draft_outputs = self.draft_model.generate(
                input_ids,
                attention_mask=attention_mask,
                max_new_tokens=num_speculative_tokens,
                do_sample=True,
                use_cache=True,
                pad_token_id=self.draft_tokenizer.pad_token_id
            )
            input_length = input_ids.shape[1]
            draft_outputs = draft_outputs[:, input_length:]


            return draft_outputs

    def verify_tokens_vectorized(self, input_ids: torch.Tensor, draft_tokens: torch.Tensor,
                               attention_mask: torch.Tensor) -> Tuple[List[int], int]:
        """
        Vectorized verification: verify all draft tokens in one forward pass using the target model.

        Args:
            input_ids: The current input token IDs (shape [1, L]).
            draft_tokens: Draft tokens from the draft model (shape [1, k]).
            attention_mask: The current attention mask for input_ids.

        Returns:
            accepted_tokens: List of accepted token IDs.
            accepted_position: Index of the first rejected token (if all accepted, equals draft_tokens.shape[1]).
        """
        # TODO: Implement efficient verification of draft tokens
        # 1. Run target model on input_ids concatenated with draft_tokens
        # 2. Extract the logits for positions where draft tokens would be predicted
        # 3. Compare target model predictions with draft tokens
        # 4. Determine how many consecutive tokens were accepted before first mismatch

        full_seq = torch.cat([input_ids, draft_tokens], dim=1)

        full_attention_mask = torch.cat([
            attention_mask,
            torch.ones_like(draft_tokens)], dim = 1)

        with torch.no_grad():
            outputs = self.target_model(
                input_ids = full_seq,
                attention_mask = full_attention_mask,
                use_cache = True
            )
            logits = outputs.logits

        input_len = input_ids.shape[1]
        relevant_logits = logits[0, input_len-1: input_len + draft_tokens.shape[1]-1, :]

        target_predictions = torch.argmax(relevant_logits, dim = -1)

        draft_tokens_flat = draft_tokens[0]
        matches = (target_predictions == draft_tokens_flat)

        matches_cpu = matches.cpu().numpy()
        if False in matches_cpu:
            first_mismatch = np.where(matches_cpu == False)[0][0]
            accepted_position = first_mismatch
        else:
            accepted_position = len(matches_cpu)

        accepted_tokens = draft_tokens_flat[:accepted_position].tolist()

        return accepted_tokens, accepted_position


    def speculative_decode(self, prompt: str, max_tokens: int = 100,
                          num_speculative_tokens: int = 17) -> str:
        """
        Main speculative decoding algorithm with vectorized verification.

        Args:
            prompt: Input text.
            max_tokens: Maximum number of tokens to generate (excluding prompt).
            num_speculative_tokens: Number of tokens to speculate per iteration.

        Returns:
            Generated text.
        """
        # Tokenize prompt
        inputs = self.target_tokenizer(prompt, return_tensors="pt", padding=True)
        input_ids = inputs["input_ids"].to(self.device)
        attention_mask = inputs["attention_mask"].to(self.device)
        prompt_length = input_ids.shape[1]

        # Initialize counters for performance tracking
        total_tokens_generated = prompt_length
        total_draft_tokens_proposed = 0
        total_draft_tokens_accepted = 0
        start_time = time.time()

        # TODO: Implement the core speculative decoding loop
        # 1. Generate draft tokens using the draft model
        # 2. Verify draft tokens using the target model
        # 3. Accept verified tokens and append to the sequence
        # 4. For rejected tokens or if all tokens are accepted, generate a new token with the target model
        # 5. Stop when max_tokens is reached or an EOS token is generated
        while total_tokens_generated - prompt_length < max_tokens:
            draft_tokens = self.generate_draft_tokens(
                input_ids,
                attention_mask,
                num_speculative_tokens = min(num_speculative_tokens, max_tokens - (total_tokens_generated - prompt_length))
        )


            num_draft_proposed = draft_tokens.shape[1]
            total_draft_tokens_proposed += num_draft_proposed

            if num_draft_proposed == 0:
                break

            accepted_tokens, accepted_position = self.verify_tokens_vectorized(
                input_ids,
                draft_tokens,
                attention_mask
            )

            total_draft_tokens_accepted += accepted_position

            if accepted_position > 0:
                accepted_tensor = draft_tokens[:, :accepted_position]
                input_ids = torch.cat([input_ids, accepted_tensor], dim=1)
                attention_mask = torch.cat([
                    attention_mask,
                    torch.ones_like(accepted_tensor)
                ], dim = 1)
                total_tokens_generated += accepted_position

            if accepted_position < num_draft_proposed:
                with torch.no_grad():
                    target_outputs = self.target_model(
                        input_ids = input_ids,
                        attention_mask = attention_mask,
                        use_cache = True
                    )
                    next_token_logits = target_outputs.logits[0, -1,:]
                    next_token = torch.argmax(next_token_logits, dim=-1, keepdim=True).unsqueeze(0)


                    input_ids = torch.cat([input_ids, next_token], dim=1)
                    attention_mask = torch.cat([
                        attention_mask,
                        torch.ones_like(next_token)
                        ], dim = 1)
                    total_tokens_generated += 1
                    if next_token == self.target_tokenizer.eos_token_id:
                        break

            else:
                if total_tokens_generated - prompt_length >= max_tokens:
                    break
        # Calculate performance metrics
        elapsed_time = time.time() - start_time
        acceptance_rate = total_draft_tokens_accepted / total_draft_tokens_proposed if total_draft_tokens_proposed > 0 else 0

        print(f"Generated {total_tokens_generated - prompt_length} tokens in {elapsed_time:.2f} seconds")
        print(f"Tokens per second: {(total_tokens_generated - prompt_length) / elapsed_time:.2f}")
        print(f"Draft token acceptance rate: {acceptance_rate:.2%}")

        return self.target_tokenizer.decode(input_ids[0], skip_special_tokens=True)

    def benchmark(self, prompt: str, max_tokens: int = 100,
                  num_runs: int = 3, compare_baseline: bool = True) -> Dict:
        """
        Benchmark the speculative decoder against baseline decoding.

        Args:
            prompt: Input text.
            max_tokens: Maximum number of tokens to generate.
            num_runs: Number of benchmark runs.
            compare_baseline: Whether to compare with baseline (non-speculative) decoding.

        Returns:
            Dictionary with benchmark results.
        """
        results = {
            "speculative": {"times": [], "tokens_per_second": []},
            "baseline": {"times": [], "tokens_per_second": []} if compare_baseline else None
        }

        # Benchmark speculative decoding.
        for _ in range(num_runs):
            start_time = time.time()
            output = self.speculative_decode(prompt, max_tokens=max_tokens)
            elapsed = time.time() - start_time
            prompt_len = len(self.target_tokenizer(prompt)["input_ids"])
            output_tokens = len(self.target_tokenizer.encode(output)) - prompt_len
            tps = output_tokens / elapsed
            results["speculative"]["times"].append(elapsed)
            results["speculative"]["tokens_per_second"].append(tps)

        # Benchmark baseline decoding.
        if compare_baseline:
            for _ in range(num_runs):
                inputs = self.target_tokenizer(prompt, return_tensors="pt", padding=True)
                input_ids = inputs["input_ids"].to(self.device)
                attention_mask = inputs["attention_mask"].to(self.device)
                start_time = time.time()
                with torch.no_grad():
                    output_ids = self.target_model.generate(
                        input_ids,
                        attention_mask=attention_mask,
                        max_length=input_ids.shape[1] + max_tokens,
                        do_sample=False,
                        pad_token_id=self.target_tokenizer.pad_token_id
                    )
                elapsed = time.time() - start_time
                output_tokens = output_ids.shape[1] - input_ids.shape[1]
                tps = output_tokens / elapsed
                results["baseline"]["times"].append(elapsed)
                results["baseline"]["tokens_per_second"].append(tps)

        for method in results.keys():
            if results[method] is not None:
                avg_time = sum(results[method]["times"]) / num_runs
                avg_tps = sum(results[method]["tokens_per_second"]) / num_runs
                results[method]["avg_time"] = avg_time
                results[method]["avg_tokens_per_second"] = avg_tps

        if compare_baseline:
            speedup = results["baseline"]["avg_time"] / results["speculative"]["avg_time"]
            results["speedup"] = speedup
            results["latency_reduction"] = (1 - results["speculative"]["avg_time"] / results["baseline"]["avg_time"]) * 100
            # print(f"Speculative decoding speedup: {speedup:.2f}x")
            # print(f"Latency reduction: {results['latency_reduction']:.2f}%")

        return results

In [7]:
# Model pair: a small draft model proposes tokens, a larger target model verifies them
draft_model_name = "EleutherAI/pythia-160m-deduped"
target_model_name = "EleutherAI/pythia-1.4b-deduped"

# Running sum of the per-prompt speedups.
speedup = 0
decoder = SpeculativeDecoder(
    target_model_name,
    draft_model_name,
    "cuda" if torch.cuda.is_available() else "cpu",
)

# Prompts used for the benchmark
test_prompts = [
    "The future of Artificial Intelligence is",
    "Write a short story about a robot learning to feel emotions:",
    "Write the lyrics to the song 'Happy Birthday'."
]

# Benchmark every prompt and report speculative vs. baseline numbers
for i, prompt in enumerate(test_prompts):
    print(f"\nBenchmarking Prompt {i+1}:")
    print(f"Prompt: {prompt}")

    results = decoder.benchmark(prompt=prompt, max_tokens=100, num_runs=3, compare_baseline=True)

    print(f"Average speculative decoding time: {results['speculative']['avg_time']:.2f} seconds")
    print(f"Average speculative tokens per second: {results['speculative']['avg_tokens_per_second']:.2f}")

    # Without a baseline there is nothing further to report for this prompt.
    if results["baseline"] is None:
        continue

    print(f"Average baseline decoding time: {results['baseline']['avg_time']:.2f} seconds")
    print(f"Average baseline tokens per second: {results['baseline']['avg_tokens_per_second']:.2f}")
    print(f"Speedup: {results['speedup']:.2f}x")
    print(f"Latency reduction: {results['latency_reduction']:.2f}%")
    speedup += results['speedup']


Loading target model: EleutherAI/pythia-1.4b-deduped
Loading draft model: EleutherAI/pythia-160m-deduped

Benchmarking Prompt 1:
Prompt: The future of Artificial Intelligence is
Generated 100 tokens in 3.60 seconds
Tokens per second: 27.81
Draft token acceptance rate: 52.81%
Generated 100 tokens in 5.99 seconds
Tokens per second: 16.70
Draft token acceptance rate: 50.54%
Generated 100 tokens in 5.35 seconds
Tokens per second: 18.71
Draft token acceptance rate: 70.80%
Average speculative decoding time: 4.98 seconds
Average speculative tokens per second: 21.06
Average baseline decoding time: 3.42 seconds
Average baseline tokens per second: 30.29
Speedup: 0.69x
Latency reduction: -45.77%

Benchmarking Prompt 2:
Prompt: Write a short story about a robot learning to feel emotions:
Generated 100 tokens in 5.04 seconds
Tokens per second: 19.85
Draft token acceptance rate: 39.73%
Generated 100 tokens in 9.22 seconds
Tokens per second: 10.84
Draft token acceptance rate: 33.98%
Generated 100 tok

## Analysis

The comparison between probabilistic sampling and greedy decoding highlights the trade-offs between diversity and efficiency in text generation. Unlike greedy decoding, which deterministically selects the highest-probability token at each step, probabilistic sampling introduces randomness, allowing for more diverse and creative outputs but at the cost of increased decoding time. The results show that speculative decoding with sampling significantly underperformed compared to greedy decoding in terms of speed, with speedups below 1x (i.e., slowdowns) of **0.69x** and **0.49x** for the AI and short story prompts, respectively, leading to latency increases of up to **103.71%**. This decline in performance is largely due to the lower draft token acceptance rates (as low as **27.72%**): because verification accepts a draft token only when it matches the target model’s greedy (argmax) prediction, the draft model’s sampled tokens often diverged from the target model’s predictions and were rejected. However, for simpler and more structured text (e.g., song lyrics), speculative decoding with sampling achieved a **1.30x speedup**, indicating that in cases where diversity is less critical, sampling can still be effective. These findings suggest that while sampling enhances creativity, it introduces inefficiencies in speculative decoding, making it more suitable for applications where diversity is prioritized over inference speed.

## Config 3: Turning off caching

In [17]:
class SpeculativeDecoder:
    def __init__(self, target_model_name: str, draft_model_name: str, device: str = "cuda"):
        """
        Initialize the speculative decoder with target and draft models.

        Args:
            target_model_name: HuggingFace model ID for the larger target model.
            draft_model_name: HuggingFace model ID for the smaller draft model.
            device: Device to run models on ("cuda" or "cpu").
        """
        self.device = device
        self.target_model, self.target_tokenizer = self.initialize_target_model(target_model_name)
        self.draft_model, self.draft_tokenizer = self.initialize_draft_model(draft_model_name)

        # Ensure tokenizers are compatible
        assert self.target_tokenizer.vocab == self.draft_tokenizer.vocab, "Tokenizers must be compatible"

    def initialize_target_model(self, model_name: str):
        """
        Load the larger target model and its tokenizer for inference.

        In this configuration the model is loaded in float16 with
        ``use_cache=False`` (caching turned off) and switched to eval mode.

        Args:
            model_name: HuggingFace model ID of the target model.

        Returns:
            Tuple ``(model, tokenizer)``.
        """
        print(f"Loading target model: {model_name}")
        tokenizer = AutoTokenizer.from_pretrained(model_name)

        # Reuse EOS as the padding token when the tokenizer defines none.
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16,
            low_cpu_mem_usage=True,
            device_map=self.device,
            use_cache=False,
        )
        model.eval()

        # Every model except the default pythia target gets its generation
        # defaults pinned to plain greedy decoding.
        if model_name != "EleutherAI/pythia-1.4b-deduped":
            generation_config = model.generation_config
            generation_config.temperature = 1.0
            generation_config.top_p = 1.0
            generation_config.top_k = None
            generation_config.do_sample = False
            generation_config.num_beams = 1
            # NOTE(review): the previous version also bound an unused local
            # `max_length = None` here, which had no effect and was removed.
            # If the intent was `generation_config.max_length = None`, set it
            # explicitly after confirming how generate() treats it.
        return model, tokenizer

    def initialize_draft_model(self, model_name: str):
        """
        Load the smaller draft model and its tokenizer for inference.

        The model is loaded with the same settings as the target model in this
        configuration: float16 weights, ``use_cache=False`` and eval mode.

        Args:
            model_name: HuggingFace model ID of the draft model.

        Returns:
            Tuple ``(model, tokenizer)``.
        """
        print(f"Loading draft model: {model_name}")
        tokenizer = AutoTokenizer.from_pretrained(model_name)

        # Reuse EOS as the padding token when the tokenizer defines none.
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16,
            low_cpu_mem_usage=True,
            device_map=self.device,
            use_cache=False,
        )

        # Every model except the default pythia draft gets its generation
        # defaults pinned to plain greedy decoding.
        if model_name != "EleutherAI/pythia-160m-deduped":
            generation_config = model.generation_config
            generation_config.temperature = 1.0
            generation_config.top_p = 1.0
            generation_config.top_k = None
            generation_config.do_sample = False
            generation_config.num_beams = 1
            # NOTE(review): the previous version also bound an unused local
            # `max_length = None` here, which had no effect and was removed.
            # If the intent was `generation_config.max_length = None`, set it
            # explicitly after confirming how generate() treats it.
        model.eval()
        return model, tokenizer

    def generate_draft_tokens(self, input_ids: torch.Tensor, attention_mask: torch.Tensor,
                             num_speculative_tokens: int = 17) -> torch.Tensor:
        """
        Generate speculative tokens in one forward call using the draft model.

        Args:
            input_ids: Input token IDs (tensor of shape [1, seq_len]).
            attention_mask: Corresponding attention mask.
            num_speculative_tokens: Number of tokens to speculate.

        Returns:
            Tensor of shape [1, num_speculative_tokens] containing the draft tokens.
        """
        # TODO: Implement draft token generation
        # 1. Use the draft model to generate tokens
        # 2. Extract only the new tokens (not including the input)
        # 3. Return the newly generated tokens
        with torch.no_grad():
            draft_outputs = self.draft_model.generate(
                input_ids,
                attention_mask=attention_mask,
                max_new_tokens=num_speculative_tokens,
                do_sample=False,
                use_cache=False,
                pad_token_id=self.draft_tokenizer.pad_token_id
            )
            input_length = input_ids.shape[1]
            draft_outputs = draft_outputs[:, input_length:]


            return draft_outputs

    def verify_tokens_vectorized(self, input_ids: torch.Tensor, draft_tokens: torch.Tensor,
                               attention_mask: torch.Tensor) -> Tuple[List[int], int]:
        """
        Vectorized verification: verify all draft tokens in one forward pass using the target model.

        Args:
            input_ids: The current input token IDs (shape [1, L]).
            draft_tokens: Draft tokens from the draft model (shape [1, k]).
            attention_mask: The current attention mask for input_ids.

        Returns:
            accepted_tokens: List of accepted token IDs.
            accepted_position: Index of the first rejected token (if all accepted, equals draft_tokens.shape[1]).
        """
        # TODO: Implement efficient verification of draft tokens
        # 1. Run target model on input_ids concatenated with draft_tokens
        # 2. Extract the logits for positions where draft tokens would be predicted
        # 3. Compare target model predictions with draft tokens
        # 4. Determine how many consecutive tokens were accepted before first mismatch

        full_seq = torch.cat([input_ids, draft_tokens], dim=1)

        full_attention_mask = torch.cat([
            attention_mask,
            torch.ones_like(draft_tokens)], dim = 1)

        with torch.no_grad():
            outputs = self.target_model(
                input_ids = full_seq,
                attention_mask = full_attention_mask,
                use_cache = False
            )
            logits = outputs.logits

        input_len = input_ids.shape[1]
        relevant_logits = logits[0, input_len-1: input_len + draft_tokens.shape[1]-1, :]

        target_predictions = torch.argmax(relevant_logits, dim = -1)

        draft_tokens_flat = draft_tokens[0]
        matches = (target_predictions == draft_tokens_flat)

        matches_cpu = matches.cpu().numpy()
        if False in matches_cpu:
            first_mismatch = np.where(matches_cpu == False)[0][0]
            accepted_position = first_mismatch
        else:
            accepted_position = len(matches_cpu)

        accepted_tokens = draft_tokens_flat[:accepted_position].tolist()

        return accepted_tokens, accepted_position


    def speculative_decode(self, prompt: str, max_tokens: int = 100,
                          num_speculative_tokens: int = 17) -> str:
        """
        Main speculative decoding algorithm with vectorized verification.

        Args:
            prompt: Input text.
            max_tokens: Maximum number of tokens to generate (excluding prompt).
            num_speculative_tokens: Number of tokens to speculate per iteration.

        Returns:
            Generated text.
        """
        # Tokenize prompt
        inputs = self.target_tokenizer(prompt, return_tensors="pt", padding=True)
        input_ids = inputs["input_ids"].to(self.device)
        attention_mask = inputs["attention_mask"].to(self.device)
        prompt_length = input_ids.shape[1]

        # Initialize counters for performance tracking
        total_tokens_generated = prompt_length
        total_draft_tokens_proposed = 0
        total_draft_tokens_accepted = 0
        start_time = time.time()

        # TODO: Implement the core speculative decoding loop
        # 1. Generate draft tokens using the draft model
        # 2. Verify draft tokens using the target model
        # 3. Accept verified tokens and append to the sequence
        # 4. For rejected tokens or if all tokens are accepted, generate a new token with the target model
        # 5. Stop when max_tokens is reached or an EOS token is generated
        while total_tokens_generated - prompt_length < max_tokens:
            draft_tokens = self.generate_draft_tokens(
                input_ids,
                attention_mask,
                num_speculative_tokens = min(num_speculative_tokens, max_tokens - (total_tokens_generated - prompt_length))
        )


            num_draft_proposed = draft_tokens.shape[1]
            total_draft_tokens_proposed += num_draft_proposed

            if num_draft_proposed == 0:
                break

            accepted_tokens, accepted_position = self.verify_tokens_vectorized(
                input_ids,
                draft_tokens,
                attention_mask
            )

            total_draft_tokens_accepted += accepted_position

            if accepted_position > 0:
                accepted_tensor = draft_tokens[:, :accepted_position]
                input_ids = torch.cat([input_ids, accepted_tensor], dim=1)
                attention_mask = torch.cat([
                    attention_mask,
                    torch.ones_like(accepted_tensor)
                ], dim = 1)
                total_tokens_generated += accepted_position

            if accepted_position < num_draft_proposed:
                with torch.no_grad():
                    target_outputs = self.target_model(
                        input_ids = input_ids,
                        attention_mask = attention_mask,
                        use_cache = True
                    )
                    next_token_logits = target_outputs.logits[0, -1,:]
                    next_token = torch.argmax(next_token_logits, dim=-1, keepdim=True).unsqueeze(0)


                    input_ids = torch.cat([input_ids, next_token], dim=1)
                    attention_mask = torch.cat([
                        attention_mask,
                        torch.ones_like(next_token)
                        ], dim = 1)
                    total_tokens_generated += 1
                    if next_token == self.target_tokenizer.eos_token_id:
                        break

            else:
                if total_tokens_generated - prompt_length >= max_tokens:
                    break
        # Calculate performance metrics
        elapsed_time = time.time() - start_time
        acceptance_rate = total_draft_tokens_accepted / total_draft_tokens_proposed if total_draft_tokens_proposed > 0 else 0

        print(f"Generated {total_tokens_generated - prompt_length} tokens in {elapsed_time:.2f} seconds")
        print(f"Tokens per second: {(total_tokens_generated - prompt_length) / elapsed_time:.2f}")
        print(f"Draft token acceptance rate: {acceptance_rate:.2%}")

        return self.target_tokenizer.decode(input_ids[0], skip_special_tokens=True)

    def benchmark(self, prompt: str, max_tokens: int = 100,
                  num_runs: int = 3, compare_baseline: bool = True) -> Dict:
        """
        Benchmark the speculative decoder against baseline decoding.

        Args:
            prompt: Input text.
            max_tokens: Maximum number of tokens to generate.
            num_runs: Number of benchmark runs.
            compare_baseline: Whether to compare with baseline (non-speculative) decoding.

        Returns:
            Dictionary with benchmark results.
        """
        results = {
            "speculative": {"times": [], "tokens_per_second": []},
            "baseline": {"times": [], "tokens_per_second": []} if compare_baseline else None
        }

        # Benchmark speculative decoding.
        for _ in range(num_runs):
            start_time = time.time()
            output = self.speculative_decode(prompt, max_tokens=max_tokens)
            elapsed = time.time() - start_time
            prompt_len = len(self.target_tokenizer(prompt)["input_ids"])
            output_tokens = len(self.target_tokenizer.encode(output)) - prompt_len
            tps = output_tokens / elapsed
            results["speculative"]["times"].append(elapsed)
            results["speculative"]["tokens_per_second"].append(tps)

        # Benchmark baseline decoding.
        if compare_baseline:
            for _ in range(num_runs):
                inputs = self.target_tokenizer(prompt, return_tensors="pt", padding=True)
                input_ids = inputs["input_ids"].to(self.device)
                attention_mask = inputs["attention_mask"].to(self.device)
                start_time = time.time()
                with torch.no_grad():
                    output_ids = self.target_model.generate(
                        input_ids,
                        attention_mask=attention_mask,
                        max_length=input_ids.shape[1] + max_tokens,
                        do_sample=False,
                        pad_token_id=self.target_tokenizer.pad_token_id
                    )
                elapsed = time.time() - start_time
                output_tokens = output_ids.shape[1] - input_ids.shape[1]
                tps = output_tokens / elapsed
                results["baseline"]["times"].append(elapsed)
                results["baseline"]["tokens_per_second"].append(tps)

        for method in results.keys():
            if results[method] is not None:
                avg_time = sum(results[method]["times"]) / num_runs
                avg_tps = sum(results[method]["tokens_per_second"]) / num_runs
                results[method]["avg_time"] = avg_time
                results[method]["avg_tokens_per_second"] = avg_tps

        if compare_baseline:
            speedup = results["baseline"]["avg_time"] / results["speculative"]["avg_time"]
            results["speedup"] = speedup
            results["latency_reduction"] = (1 - results["speculative"]["avg_time"] / results["baseline"]["avg_time"]) * 100
            # print(f"Speculative decoding speedup: {speedup:.2f}x")
            # print(f"Latency reduction: {results['latency_reduction']:.2f}%")

        return results

In [19]:
# Config 3 model pair: pythia-1.4b verifies what pythia-160m drafts.
target_model_name = "EleutherAI/pythia-1.4b-deduped"
draft_model_name = "EleutherAI/pythia-160m-deduped"

# Running sum of the per-prompt speedups reported below.
speedup = 0

# Use the GPU when one is visible, otherwise fall back to the CPU.
run_device = "cuda" if torch.cuda.is_available() else "cpu"
decoder = SpeculativeDecoder(
    target_model_name=target_model_name,
    draft_model_name=draft_model_name,
    device=run_device,
)

# Prompts shared by every benchmark configuration.
test_prompts = [
    "The future of Artificial Intelligence is",
    "Write a short story about a robot learning to feel emotions:",
    "Write the lyrics to the song 'Happy Birthday'.",
]

# Benchmark each prompt: 3 runs of 100 new tokens, compared against the baseline.
for i, prompt in enumerate(test_prompts):
    print(f"\nBenchmarking Prompt {i+1}:")
    print(f"Prompt: {prompt}")

    results = decoder.benchmark(prompt=prompt, max_tokens=100, num_runs=3, compare_baseline=True)

    print(f"Average speculative decoding time: {results['speculative']['avg_time']:.2f} seconds")
    print(f"Average speculative tokens per second: {results['speculative']['avg_tokens_per_second']:.2f}")

    # Without baseline numbers there is nothing further to report.
    if results["baseline"] is None:
        continue

    print(f"Average baseline decoding time: {results['baseline']['avg_time']:.2f} seconds")
    print(f"Average baseline tokens per second: {results['baseline']['avg_tokens_per_second']:.2f}")
    print(f"Speedup: {results['speedup']:.2f}x")
    print(f"Latency reduction: {results['latency_reduction']:.2f}%")
    speedup += results['speedup']

Loading target model: EleutherAI/pythia-1.4b-deduped
Loading draft model: EleutherAI/pythia-160m-deduped

Benchmarking Prompt 1:
Prompt: The future of Artificial Intelligence is
Generated 100 tokens in 1.52 seconds
Tokens per second: 65.71
Draft token acceptance rate: 85.34%
Generated 100 tokens in 1.56 seconds
Tokens per second: 63.93
Draft token acceptance rate: 85.34%
Generated 100 tokens in 1.53 seconds
Tokens per second: 65.41
Draft token acceptance rate: 85.34%
Average speculative decoding time: 1.54 seconds
Average speculative tokens per second: 64.96
Average baseline decoding time: 2.47 seconds
Average baseline tokens per second: 40.62
Speedup: 1.60x
Latency reduction: 37.63%

Benchmarking Prompt 2:
Prompt: Write a short story about a robot learning to feel emotions:
Generated 100 tokens in 1.39 seconds
Tokens per second: 72.10
Draft token acceptance rate: 91.67%
Generated 100 tokens in 1.42 seconds
Tokens per second: 70.49
Draft token acceptance rate: 91.67%
Generated 100 toke

## Analysis

Disabling `use_cache` had a noticeable impact on speculative decoding, demonstrating improved efficiency while maintaining high draft token acceptance rates. Across all benchmarked prompts, speculative decoding significantly outperformed baseline decoding, with speedups ranging from **1.60x to 1.90x** and latency reductions between **37.63% and 47.30%**. The draft model’s acceptance rates remained consistent (**85.34%–91.67%**), indicating that even without caching, the speculative approach effectively leveraged the draft model’s predictions. Compared to previous results with caching enabled, the performance remained strong, suggesting that speculative decoding can still achieve substantial speed improvements even when compute-saving optimizations like key/value caching are turned off.

## Config 4: Varying the `num_speculative_tokens` (Code not shown)

### Findings

1. We experimented with values of **10, 15, and 18**, observing variations in accuracy. However, obtaining a precise measure of speedup proved challenging.  
2. A **dynamic draft length** approach was implemented, but its impact on performance was minimal. The draft length consistently reached the predefined maximum due to clipping constraints.  
3. To optimize draft length, we introduced a **categorization-based method**, where more static questions received longer speculative drafts. However, this strategy did not yield significant improvements. After discussing with my professor, we identified a potential workaround: for highly predictable queries, such as "Happy Birthday lyrics," a **precomputed response database** could be leveraged to bypass speculative decoding entirely.

## Bonus

In [4]:
# Bonus pair: pythia-1.4b target with the smaller pythia-70m draft model.
target_model_name = "EleutherAI/pythia-1.4b-deduped"
draft_model_name = "EleutherAI/pythia-70m-deduped"

# Use the GPU when one is visible, otherwise fall back to the CPU.
run_device = "cuda" if torch.cuda.is_available() else "cpu"
decoder = SpeculativeDecoder(
    target_model_name=target_model_name,
    draft_model_name=draft_model_name,
    device=run_device,
)

# Prompts shared by every benchmark configuration.
test_prompts = [
    "The future of Artificial Intelligence is",
    "Write a short story about a robot learning to feel emotions:",
    "Write the lyrics to the song 'Happy Birthday'.",
]

# Benchmark each prompt: 3 runs of 100 new tokens, compared against the baseline.
for i, prompt in enumerate(test_prompts):
    print(f"\nBenchmarking Prompt {i+1}:")
    print(f"Prompt: {prompt}")

    results = decoder.benchmark(prompt=prompt, max_tokens=100, num_runs=3, compare_baseline=True)

    print(f"Average speculative decoding time: {results['speculative']['avg_time']:.2f} seconds")
    print(f"Average speculative tokens per second: {results['speculative']['avg_tokens_per_second']:.2f}")

    # Without baseline numbers there is nothing further to report.
    if results["baseline"] is None:
        continue

    print(f"Average baseline decoding time: {results['baseline']['avg_time']:.2f} seconds")
    print(f"Average baseline tokens per second: {results['baseline']['avg_tokens_per_second']:.2f}")
    print(f"Speedup: {results['speedup']:.2f}x")
    print(f"Latency reduction: {results['latency_reduction']:.2f}%")

Loading target model: EleutherAI/pythia-1.4b-deduped


The `GPTNeoXSdpaAttention` class is deprecated in favor of simply modifying the `config._attn_implementation`attribute of the `GPTNeoXAttention` class! It will be removed in v4.48


Loading draft model: EleutherAI/pythia-70m-deduped


tokenizer_config.json:   0%|          | 0.00/396 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/567 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/166M [00:00<?, ?B/s]


Benchmarking Prompt 1:
Prompt: The future of Artificial Intelligence is
Generated 100 tokens in 1.22 seconds
Tokens per second: 81.69
Draft token acceptance rate: 70.29%
Generated 100 tokens in 1.11 seconds
Tokens per second: 90.03
Draft token acceptance rate: 70.29%
Generated 100 tokens in 1.13 seconds
Tokens per second: 88.67
Draft token acceptance rate: 70.29%
Average speculative decoding time: 1.16 seconds
Average speculative tokens per second: 86.67
Average baseline decoding time: 2.19 seconds
Average baseline tokens per second: 46.26
Speedup: 1.90x
Latency reduction: 47.29%

Benchmarking Prompt 2:
Prompt: Write a short story about a robot learning to feel emotions:
Generated 100 tokens in 1.33 seconds
Tokens per second: 75.17
Draft token acceptance rate: 60.38%
Generated 100 tokens in 1.32 seconds
Tokens per second: 75.55
Draft token acceptance rate: 60.38%
Generated 100 tokens in 1.29 seconds
Tokens per second: 77.78
Draft token acceptance rate: 60.38%
Average speculative decod

In [5]:
# Bonus pair: pythia-1.4b target with the larger pythia-410m draft model.
target_model_name = "EleutherAI/pythia-1.4b-deduped"
draft_model_name = "EleutherAI/pythia-410m-deduped"

# Use the GPU when one is visible, otherwise fall back to the CPU.
run_device = "cuda" if torch.cuda.is_available() else "cpu"
decoder = SpeculativeDecoder(
    target_model_name=target_model_name,
    draft_model_name=draft_model_name,
    device=run_device,
)

# Prompts shared by every benchmark configuration.
test_prompts = [
    "The future of Artificial Intelligence is",
    "Write a short story about a robot learning to feel emotions:",
    "Write the lyrics to the song 'Happy Birthday'.",
]

# Benchmark each prompt: 3 runs of 100 new tokens, compared against the baseline.
for i, prompt in enumerate(test_prompts):
    print(f"\nBenchmarking Prompt {i+1}:")
    print(f"Prompt: {prompt}")

    results = decoder.benchmark(prompt=prompt, max_tokens=100, num_runs=3, compare_baseline=True)

    print(f"Average speculative decoding time: {results['speculative']['avg_time']:.2f} seconds")
    print(f"Average speculative tokens per second: {results['speculative']['avg_tokens_per_second']:.2f}")

    # Without baseline numbers there is nothing further to report.
    if results["baseline"] is None:
        continue

    print(f"Average baseline decoding time: {results['baseline']['avg_time']:.2f} seconds")
    print(f"Average baseline tokens per second: {results['baseline']['avg_tokens_per_second']:.2f}")
    print(f"Speedup: {results['speedup']:.2f}x")
    print(f"Latency reduction: {results['latency_reduction']:.2f}%")

Loading target model: EleutherAI/pythia-1.4b-deduped
Loading draft model: EleutherAI/pythia-410m-deduped


tokenizer_config.json:   0%|          | 0.00/396 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/911M [00:00<?, ?B/s]


Benchmarking Prompt 1:
Prompt: The future of Artificial Intelligence is
Generated 100 tokens in 2.54 seconds
Tokens per second: 39.38
Draft token acceptance rate: 85.34%
Generated 100 tokens in 2.53 seconds
Tokens per second: 39.47
Draft token acceptance rate: 85.34%
Generated 100 tokens in 2.52 seconds
Tokens per second: 39.65
Draft token acceptance rate: 85.34%
Average speculative decoding time: 2.53 seconds
Average speculative tokens per second: 39.48
Average baseline decoding time: 2.20 seconds
Average baseline tokens per second: 45.62
Speedup: 0.87x
Latency reduction: -14.95%

Benchmarking Prompt 2:
Prompt: Write a short story about a robot learning to feel emotions:
Generated 100 tokens in 2.48 seconds
Tokens per second: 40.39
Draft token acceptance rate: 86.84%
Generated 100 tokens in 2.47 seconds
Tokens per second: 40.55
Draft token acceptance rate: 86.84%
Generated 100 tokens in 2.66 seconds
Tokens per second: 37.56
Draft token acceptance rate: 86.84%
Average speculative deco

In [3]:
# Bonus pair: larger pythia-2.8b target with the pythia-160m draft model.
target_model_name = "EleutherAI/pythia-2.8b-deduped"
draft_model_name = "EleutherAI/pythia-160m-deduped"

# Use the GPU when one is visible, otherwise fall back to the CPU.
run_device = "cuda" if torch.cuda.is_available() else "cpu"
decoder = SpeculativeDecoder(
    target_model_name=target_model_name,
    draft_model_name=draft_model_name,
    device=run_device,
)

# Prompts shared by every benchmark configuration.
test_prompts = [
    "The future of Artificial Intelligence is",
    "Write a short story about a robot learning to feel emotions:",
    "Write the lyrics to the song 'Happy Birthday'.",
]

# Benchmark each prompt: 3 runs of 100 new tokens, compared against the baseline.
for i, prompt in enumerate(test_prompts):
    print(f"\nBenchmarking Prompt {i+1}:")
    print(f"Prompt: {prompt}")

    results = decoder.benchmark(prompt=prompt, max_tokens=100, num_runs=3, compare_baseline=True)

    print(f"Average speculative decoding time: {results['speculative']['avg_time']:.2f} seconds")
    print(f"Average speculative tokens per second: {results['speculative']['avg_tokens_per_second']:.2f}")

    # Without baseline numbers there is nothing further to report.
    if results["baseline"] is None:
        continue

    print(f"Average baseline decoding time: {results['baseline']['avg_time']:.2f} seconds")
    print(f"Average baseline tokens per second: {results['baseline']['avg_tokens_per_second']:.2f}")
    print(f"Speedup: {results['speedup']:.2f}x")
    print(f"Latency reduction: {results['latency_reduction']:.2f}%")

Loading target model: EleutherAI/pythia-2.8b-deduped


tokenizer_config.json:   0%|          | 0.00/396 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/5.68G [00:00<?, ?B/s]

The `GPTNeoXSdpaAttention` class is deprecated in favor of simply modifying the `config._attn_implementation`attribute of the `GPTNeoXAttention` class! It will be removed in v4.48


Loading draft model: EleutherAI/pythia-160m-deduped

Benchmarking Prompt 1:
Prompt: The future of Artificial Intelligence is
Generated 100 tokens in 2.36 seconds
Tokens per second: 42.42
Draft token acceptance rate: 85.34%
Generated 100 tokens in 1.51 seconds
Tokens per second: 66.28
Draft token acceptance rate: 85.34%
Generated 100 tokens in 2.76 seconds
Tokens per second: 36.17
Draft token acceptance rate: 85.34%
Average speculative decoding time: 2.21 seconds
Average speculative tokens per second: 48.23
Average baseline decoding time: 2.96 seconds
Average baseline tokens per second: 33.89
Speedup: 1.34x
Latency reduction: 25.13%

Benchmarking Prompt 2:
Prompt: Write a short story about a robot learning to feel emotions:
Generated 100 tokens in 3.76 seconds
Tokens per second: 26.57
Draft token acceptance rate: 41.07%
Generated 100 tokens in 3.38 seconds
Tokens per second: 29.62
Draft token acceptance rate: 41.07%
Generated 100 tokens in 3.20 seconds
Tokens per second: 31.25
Draft tok

In [3]:
# Bonus pair from a different model family: Llama-3.2-3B target, Llama-3.2-1B draft.
target_model_name = "meta-llama/Llama-3.2-3B"
draft_model_name = "meta-llama/Llama-3.2-1B"

# Use the GPU when one is visible, otherwise fall back to the CPU.
run_device = "cuda" if torch.cuda.is_available() else "cpu"
decoder = SpeculativeDecoder(
    target_model_name=target_model_name,
    draft_model_name=draft_model_name,
    device=run_device,
)

# Prompts shared by every benchmark configuration.
test_prompts = [
    "The future of Artificial Intelligence is",
    "Write a short story about a robot learning to feel emotions:",
    "Write the lyrics to the song 'Happy Birthday'.",
]

# Benchmark each prompt: 3 runs of 100 new tokens, compared against the baseline.
for i, prompt in enumerate(test_prompts):
    print(f"\nBenchmarking Prompt {i+1}:")
    print(f"Prompt: {prompt}")

    results = decoder.benchmark(prompt=prompt, max_tokens=100, num_runs=3, compare_baseline=True)

    print(f"Average speculative decoding time: {results['speculative']['avg_time']:.2f} seconds")
    print(f"Average speculative tokens per second: {results['speculative']['avg_tokens_per_second']:.2f}")

    # Without baseline numbers there is nothing further to report.
    if results["baseline"] is None:
        continue

    print(f"Average baseline decoding time: {results['baseline']['avg_time']:.2f} seconds")
    print(f"Average baseline tokens per second: {results['baseline']['avg_tokens_per_second']:.2f}")
    print(f"Speedup: {results['speedup']:.2f}x")
    print(f"Latency reduction: {results['latency_reduction']:.2f}%")

Loading target model: meta-llama/Llama-3.2-3B


tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/844 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

Loading draft model: meta-llama/Llama-3.2-1B


tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/843 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]


Benchmarking Prompt 1:
Prompt: The future of Artificial Intelligence is
Generated 100 tokens in 4.28 seconds
Tokens per second: 23.37
Draft token acceptance rate: 76.56%
Generated 100 tokens in 3.55 seconds
Tokens per second: 28.19
Draft token acceptance rate: 76.56%
Generated 100 tokens in 3.28 seconds
Tokens per second: 30.48
Draft token acceptance rate: 76.56%
Average speculative decoding time: 3.71 seconds
Average speculative tokens per second: 27.30
Average baseline decoding time: 3.71 seconds
Average baseline tokens per second: 27.09
Speedup: 1.00x
Latency reduction: -0.04%

Benchmarking Prompt 2:
Prompt: Write a short story about a robot learning to feel emotions:
Generated 100 tokens in 5.45 seconds
Tokens per second: 18.36
Draft token acceptance rate: 43.69%
Generated 100 tokens in 5.93 seconds
Tokens per second: 16.85
Draft token acceptance rate: 43.69%
Generated 100 tokens in 5.40 seconds
Tokens per second: 18.53
Draft token acceptance rate: 43.69%
Average speculative decod

# Benchmarking results across different pairs

| Target Model                     | Draft Model                     | Benchmarking Prompt         | Acceptance Rate (%) | Speedup |
|----------------------------------|---------------------------------|----------------------------|----------------------|---------|
| EleutherAI/pythia-1.4b-deduped   | EleutherAI/pythia-70m-deduped   | The future of AI           | 70.29                | 1.90    |
| EleutherAI/pythia-1.4b-deduped   | EleutherAI/pythia-70m-deduped   | Robot learning emotions    | 60.38                | 1.66    |
| EleutherAI/pythia-1.4b-deduped   | EleutherAI/pythia-70m-deduped   | Happy Birthday lyrics      | 67.61                | 1.72    |
| EleutherAI/pythia-1.4b-deduped   | EleutherAI/pythia-410m-deduped  | The future of AI           | 85.34                | 0.87    |
| EleutherAI/pythia-1.4b-deduped   | EleutherAI/pythia-410m-deduped  | Robot learning emotions    | 86.84                | 0.84    |
| EleutherAI/pythia-1.4b-deduped   | EleutherAI/pythia-410m-deduped  | Happy Birthday lyrics      | 86.84                | 0.72    |
| EleutherAI/pythia-2.8b-deduped   | EleutherAI/pythia-160m-deduped  | The future of AI           | 85.34                | 1.34    |
| EleutherAI/pythia-2.8b-deduped   | EleutherAI/pythia-160m-deduped  | Robot learning emotions    | 41.07                | 0.99    |
| EleutherAI/pythia-2.8b-deduped   | EleutherAI/pythia-160m-deduped  | Happy Birthday lyrics      | 59.26                | 1.43    |
| meta-llama/Llama-3.2-3B         | meta-llama/Llama-3.2-1B        | The future of AI           | 76.56                | 1.00    |
| meta-llama/Llama-3.2-3B         | meta-llama/Llama-3.2-1B        | Robot learning emotions    | 43.69                | 0.67    |
| meta-llama/Llama-3.2-3B         | meta-llama/Llama-3.2-1B        | Happy Birthday lyrics      | 8.82                 | 0.20    |
