In [None]:
import matplotlib.pyplot as plt
import torch
from transformers import AutoTokenizer

from core.draft_model import DraftModel
from core.speculative_engine import SpeculativeEngine
from core.target_model import TargetModel


In [None]:
# Model checkpoints, named once so the tokenizer and the target model cannot
# silently drift apart when one of them is changed.
DRAFT_MODEL_ID = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
TARGET_MODEL_ID = "meta-llama/Llama-2-7b-hf"

# Run on the GPU when one is visible to torch, otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# A single tokenizer, loaded from the target checkpoint, is handed to both
# model wrappers.
# NOTE(review): this assumes the draft checkpoint shares the target's
# vocabulary — confirm before swapping either model id.
tokenizer = AutoTokenizer.from_pretrained(TARGET_MODEL_ID)
draft = DraftModel(tokenizer, DRAFT_MODEL_ID, device=device)
target = TargetModel(tokenizer, TARGET_MODEL_ID, device=device)

# entropy_bins has three entries and k_values has four; presumably one k per
# entropy interval — confirm against SpeculativeEngine.
# NOTE(review): max_k is 3 while k_values contains 4. Whether the engine
# clamps, rejects, or honours the 4 is not visible from this notebook — confirm.
engine = SpeculativeEngine(
    draft_model=draft,
    target_model=target,
    max_k=3,
    entropy_bins=[1.2, 2.2, 3.0],
    k_values=[3, 4, 2, 1],
)


In [None]:
prompt = "The theory of evolution explains"
MAX_NEW_TOKENS = 80  # value passed to engine.decode as max_tokens
SEED = 0

# Tokenize the prompt and move the id tensor to the device selected above.
input_ids = target.tokenizer(prompt, return_tensors="pt").input_ids.to(device)

# Seed torch's global RNG immediately before decoding so that re-running this
# cell starts from the same RNG state.
# NOTE(review): this only affects the run if the engine samples through
# torch's global generator — confirm.
torch.manual_seed(SEED)

# Bind the result to a real name: assigning to `_` would shadow IPython's
# last-output variable for the rest of the session.
# NOTE(review): if engine.k_history / engine.acceptance_log accumulate across
# decode() calls, re-running this cell will extend the plots below — confirm.
generated = engine.decode(input_ids, max_tokens=MAX_NEW_TOKENS)


In [None]:
# Line plot of engine.k_history against its index, drawn through the explicit
# Figure/Axes interface.
fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(engine.k_history)
ax.set_xlabel("Decoding Step")
ax.set_ylabel("k")
ax.set_title("Adaptive Speculation Depth (k)")
ax.grid(True)
plt.show()


In [None]:
# Per-entry ratio of the two values logged in engine.acceptance_log.
# Entries whose second value is not positive are reported as 1.0 rather than
# dividing by zero.
# NOTE(review): presumably each entry is (accepted tokens, speculation depth k)
# — confirm against SpeculativeEngine.
acceptance = [
    (n_accepted / depth) if depth > 0 else 1.0
    for n_accepted, depth in engine.acceptance_log
]

# Same figure as before, drawn through the explicit Figure/Axes interface.
fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(acceptance)
ax.set_xlabel("Decoding Step")
ax.set_ylabel("Acceptance Rate")
ax.set_title("Acceptance Rate over Time")
ax.grid(True)
plt.show()
