In [1]:
import torch , math
from torch.utils.data import Dataset
from tqdm import tqdm
import re , json
import torch.nn as nn
from torch.utils.data import DataLoader
from datasets import load_dataset
from collections import  Counter
import os
import torch
import torch
import torch.nn as nn
import math
import time



In [2]:
class LayerNorm(nn.Module):
    """Layer normalisation over the last dimension with a learnable scale and shift.

    Args:
        normalized_shape: Shape of the learnable ``gamma``/``beta`` parameters.
        eps: Constant added to the variance before the square root.
    """

    def __init__(self, normalized_shape, eps=1e-5):
        super().__init__()
        self.eps = eps
        # gamma starts at 1 and beta at 0, so the layer initially only normalises.
        self.gamma = nn.Parameter(torch.ones(normalized_shape))
        self.beta = nn.Parameter(torch.zeros(normalized_shape))

    def forward(self, x):
        """Normalise ``x`` along its last dimension, then apply gamma and beta."""
        mu = x.mean(dim=-1, keepdim=True)
        # Biased (population) variance, i.e. divide by N rather than N - 1.
        sigma_sq = x.var(dim=-1, keepdim=True, unbiased=False)
        normalised = (x - mu) / torch.sqrt(sigma_sq + self.eps)
        return self.gamma * normalised + self.beta

In [3]:
class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional encoding added to the input embeddings.

    A ``(max_len, d_model)`` table is precomputed once and registered as the
    buffer ``pe``; even columns hold sines and odd columns hold cosines.

    Args:
        d_model: Width of the embeddings (odd widths are supported).
        max_len: Number of positions to precompute.
    """

    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() *
                            (-math.log(10000.0) / d_model))
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer odd column than even column, so the
        # cosine half is trimmed to fit instead of raising a shape error.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        self.register_buffer('pe', pe)

    def forward(self, x, start_pos=0):
        """
        Args:
            x: Input tensor (batch, seq_len, d_model)
            start_pos: Starting position for positional encoding

        Returns:
            ``x`` plus the encodings for positions
            ``start_pos .. start_pos + seq_len - 1``.

        Raises:
            ValueError: If the requested positions run past the precomputed table.
        """
        seq_len = x.size(1)
        end_pos = start_pos + seq_len
        if end_pos > self.pe.size(0):
            raise ValueError(
                f"positions {start_pos}..{end_pos - 1} exceed the positional "
                f"encoding table of length {self.pe.size(0)}"
            )
        # unsqueeze(0) lets the same encodings broadcast over the batch dimension.
        return x + self.pe[start_pos:end_pos, :].unsqueeze(0)

In [4]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product self-attention with an optional key/value cache.

    Args:
        d_model: Model width; must be divisible by ``num_heads``.
        num_heads: Number of attention heads.
        dropout: Dropout probability applied to the attention weights.
    """

    def __init__(self, d_model, num_heads, dropout=0.1):
        super().__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
        self.d_k = d_model // num_heads
        self.num_heads = num_heads

        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None, kv_cache=None):
        """Attend over ``x`` (and any cached keys/values).

        Args:
            x: Input of shape (B, T, D).
            mask: Optional 4-D mask; positions where it equals 0 are excluded.
            kv_cache: Optional ``(K, V)`` pair from earlier steps, each of shape
                (B, h, T_past, d_k), prepended along the sequence axis.

        Returns:
            Tuple ``(out, attn_weights, (K, V))`` where ``out`` is (B, T, D),
            ``attn_weights`` is (B, h, T, T_total) and ``(K, V)`` is the updated
            cache including the current step.
        """
        batch, length, width = x.size()
        head_shape = (batch, length, self.num_heads, self.d_k)

        # Project, then move the head axis forward: (B, T, D) -> (B, h, T, d_k).
        queries = self.W_q(x).view(*head_shape).transpose(1, 2)
        keys = self.W_k(x).view(*head_shape).transpose(1, 2)
        values = self.W_v(x).view(*head_shape).transpose(1, 2)

        if kv_cache is not None:
            past_keys, past_values = kv_cache
            # dim=2 is the sequence axis after the head split.
            keys = torch.cat([past_keys, keys], dim=2)
            values = torch.cat([past_values, values], dim=2)

        scores = torch.matmul(queries, keys.transpose(-2, -1)) / math.sqrt(self.d_k)

        if mask is not None:
            n_query, n_key = scores.size(-2), scores.size(-1)
            shape_matches = mask.size(-2) == n_query and mask.size(-1) == n_key
            if not shape_matches:
                # Keep the trailing rows/columns; a mask smaller than requested is
                # left as-is by the slice and then relies on broadcasting.
                mask = mask[:, :, -n_query:, -n_key:]
            scores = scores.masked_fill(mask == 0, float('-inf'))

        attn_weights = self.dropout(torch.softmax(scores, dim=-1))

        # Weighted sum of values, then merge heads back to (B, T, D).
        context = torch.matmul(attn_weights, values)
        merged = context.transpose(1, 2).contiguous().view(batch, length, width)

        return self.W_o(merged), attn_weights, (keys, values)


In [5]:
class FeedForward(nn.Module):
    """Position-wise feed-forward network: Linear -> ReLU -> Dropout -> Linear.

    Args:
        d_model: Input and output width.
        d_ff: Width of the hidden layer.
        dropout: Dropout probability applied after the activation.
    """

    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.linear2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Expand to ``d_ff``, apply ReLU and dropout, then project back to ``d_model``."""
        hidden = torch.relu(self.linear1(x))
        return self.linear2(self.dropout(hidden))

In [6]:
class TransformerBlock(nn.Module):
    """One post-norm decoder block: self-attention and feed-forward sub-layers.

    Each sub-layer output goes through dropout, is added to its input
    (residual connection) and is then layer-normalised.

    Args:
        d_model: Model width.
        num_heads: Number of attention heads.
        d_ff: Hidden width of the feed-forward sub-layer.
        dropout: Dropout probability shared by both sub-layers.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()
        self.attention = MultiHeadAttention(d_model, num_heads, dropout)
        self.norm1 = LayerNorm(d_model)
        self.feed_forward = FeedForward(d_model, d_ff, dropout)
        self.norm2 = LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None, kv_cache=None):
        """Run the block and return ``(x, attn_weights, new_kv_cache)``."""
        attended, attn_weights, new_kv_cache = self.attention(x, mask, kv_cache)
        after_attention = self.norm1(x + self.dropout(attended))

        transformed = self.feed_forward(after_attention)
        block_output = self.norm2(after_attention + self.dropout(transformed))

        return block_output, attn_weights, new_kv_cache



In [7]:

class DecoderTransformer(nn.Module):
    """Decoder-only Transformer language model with optional KV caching.

    Pipeline: token embedding (optionally initialised from a pretrained matrix
    and projected to ``d_model``) -> sinusoidal positions -> dropout ->
    ``num_layers`` TransformerBlocks -> LayerNorm -> vocabulary projection.

    Args:
        vocab_size: Number of tokens in the vocabulary.
        d_model: Model width used by the Transformer blocks.
        num_layers: Number of stacked TransformerBlocks.
        num_heads: Attention heads per block.
        d_ff: Hidden width of each feed-forward sub-layer.
        max_seq_len: Number of positions precomputed by the positional encoding.
        dropout: Dropout probability.
        pretrained_embeddings: Optional ``(vocab_size, embedding_dim)`` tensor
            copied into the embedding table; its width may differ from ``d_model``.
    """

    def __init__(self, vocab_size, d_model, num_layers, num_heads,
                 d_ff, max_seq_len, dropout=0.1, pretrained_embeddings=None):
        super().__init__()
        self.d_model = d_model
        self.num_layers = num_layers
        self.max_seq_len = max_seq_len

        # The embedding table keeps the width of the pretrained vectors when given.
        self.embedding_dim = pretrained_embeddings.shape[1] if pretrained_embeddings is not None else d_model

        self.embedding = nn.Embedding(vocab_size, self.embedding_dim)

        if pretrained_embeddings is not None:
            self.embedding.weight.data.copy_(pretrained_embeddings)
            # Project from the pretrained width to d_model.
            self.embedding_proj = nn.Linear(self.embedding_dim, d_model)
        else:
            self.embedding_proj = nn.Identity()  # No projection needed if no pretrained embeddings

        self.pos_encoding = PositionalEncoding(d_model, max_seq_len)
        self.layers = nn.ModuleList([
            TransformerBlock(d_model, num_heads, d_ff, dropout)
            for _ in range(num_layers)
        ])
        self.norm = LayerNorm(d_model)
        self.output_projection = nn.Linear(d_model, vocab_size)
        self.dropout = nn.Dropout(dropout)

    def create_causal_mask(self, seq_len, device):
        """Return a lower-triangular mask of shape (1, 1, seq_len, seq_len)."""
        mask = torch.tril(torch.ones(seq_len, seq_len, device=device))
        mask = mask.unsqueeze(0).unsqueeze(0)  # (1, 1, seq_len, seq_len)
        return mask

    def forward(self, x, mask=None, kv_caches=None, return_attention=False):
        """Compute next-token logits for a batch of token ids.

        Args:
            x: Token ids of shape (batch, seq_len). With ``kv_caches`` these are
                only the new tokens that follow the cached positions.
            mask: Optional attention mask; a causal mask is built when omitted.
            kv_caches: Optional list with one ``(K, V)`` pair per layer, as
                returned by a previous call.
            return_attention: Also return the per-layer attention weights.

        Returns:
            ``(logits, new_kv_caches)``, or
            ``(logits, attention_weights, new_kv_caches)`` when
            ``return_attention`` is true. ``logits`` is (batch, seq_len, vocab_size).
        """
        _, seq_len = x.shape

        # Number of positions already stored in the cache (0 when not caching).
        # Cached keys have shape (B, h, T_past, d_k), so dim 2 is the length.
        past_len = 0
        if kv_caches:
            first_cache = kv_caches[0]
            if first_cache is not None:
                past_len = first_cache[0].size(2)
        else:
            kv_caches = None

        if mask is None:
            # Build the mask over cached + new positions and keep only the rows
            # for the new queries, so each new token sees every cached key plus
            # the new keys up to and including itself.
            full_mask = self.create_causal_mask(past_len + seq_len, x.device)
            mask = full_mask[:, :, past_len:, :]

        x = self.embedding(x) * math.sqrt(self.embedding_dim)

        # Project to d_model (which is divisible by num_heads)
        x = self.embedding_proj(x)

        # New tokens continue at position past_len. Restarting at 0 on every
        # cached step would give them the wrong positional encoding and make
        # cached decoding disagree with uncached decoding.
        x = self.pos_encoding(x, start_pos=past_len)
        x = self.dropout(x)

        attention_weights = []
        new_kv_caches = []

        for layer_idx, layer in enumerate(self.layers):
            kv_cache = kv_caches[layer_idx] if kv_caches is not None else None
            x, attn_weights, new_kv_cache = layer(x, mask, kv_cache)
            if return_attention:
                attention_weights.append(attn_weights)
            new_kv_caches.append(new_kv_cache)

        x = self.norm(x)
        logits = self.output_projection(x)

        if return_attention:
            return logits, attention_weights, new_kv_caches
        return logits, new_kv_caches



In [8]:
class Vocabulary:
    """Word-level vocabulary with special tokens and optional pretrained vectors.

    The four special tokens are always registered first, so they receive the
    ids 0 (``<pad>``), 1 (``<sos>``), 2 (``<eos>``) and 3 (``<unk>``).

    Args:
        fasttext_model: Optional mapping-like object supporting ``word in model``
            and ``model[word]``; used by ``create_embedding_matrix``.
    """

    def __init__(self, fasttext_model=None):
        self.word2idx = {}
        self.idx2word = {}
        self.word_counts = Counter()
        self.PAD_TOKEN = '<pad>'
        self.SOS_TOKEN = '<sos>'
        self.EOS_TOKEN = '<eos>'
        self.UNK_TOKEN = '<unk>'
        self.add_word(self.PAD_TOKEN)
        self.add_word(self.SOS_TOKEN)
        self.add_word(self.EOS_TOKEN)
        self.add_word(self.UNK_TOKEN)
        self.fasttext_model = fasttext_model

    def add_word(self, word):
        """Register ``word`` (if new) and increment its occurrence count."""
        if word not in self.word2idx:
            idx = len(self.word2idx)
            self.word2idx[word] = idx
            self.idx2word[idx] = word
        self.word_counts[word] += 1

    def __len__(self):
        return len(self.word2idx)

    def encode(self, text):
        """Tokenize ``text`` and map each token to its id (``<unk>`` if unseen)."""
        unk_id = self.word2idx[self.UNK_TOKEN]
        return [self.word2idx.get(token, unk_id) for token in self.tokenize(text)]

    def decode(self, indices):
        """Join ids back into text, skipping ``<pad>``/``<sos>`` and stopping at ``<eos>``."""
        skipped = (self.word2idx[self.PAD_TOKEN], self.word2idx[self.SOS_TOKEN])
        eos_id = self.word2idx[self.EOS_TOKEN]
        words = []
        for idx in indices:
            if idx in skipped:
                continue
            if idx == eos_id:
                break
            words.append(self.idx2word.get(idx, self.UNK_TOKEN))
        return ' '.join(words)

    def tokenize(self, text):
        """Lower-case ``text`` and split it into word tokens and ``. , ! ? ;`` marks."""
        return re.findall(r'\b\w+\b|[.,!?;]', text.lower())

    def create_embedding_matrix(self):
        """Build a ``(len(vocab), dim)`` tensor of initial embeddings.

        Rows start as small random noise; rows whose word is found in
        ``fasttext_model`` are overwritten with the pretrained vector. ``dim``
        is the model's ``vector_size`` when it exposes one, otherwise 300.
        """
        embedding_dim = getattr(self.fasttext_model, 'vector_size', 300)
        embedding_matrix = torch.randn(len(self.word2idx), embedding_dim) * 0.01
        if self.fasttext_model is not None:
            found = 0
            for word, idx in self.word2idx.items():
                if word in self.fasttext_model:
                    embedding_matrix[idx] = torch.tensor(self.fasttext_model[word])
                    found += 1
            print(f"Found {found}/{len(self.word2idx)} words in FastText")
        return embedding_matrix

    def save(self, path):
        """Write the vocabulary to ``path`` as JSON, creating parent directories."""
        # The target directory may not exist yet on a first run; without this
        # the open() below raises FileNotFoundError.
        parent_dir = os.path.dirname(path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        with open(path, 'w', encoding='utf-8') as f:
            json.dump({
                'word2idx': self.word2idx,
                'idx2word': {int(k): v for k, v in self.idx2word.items()},
                'word_counts': dict(self.word_counts)
            }, f)

    @classmethod
    def load(cls, path, fasttext_model=None):
        """Recreate a vocabulary previously written by ``save``."""
        vocab = cls(fasttext_model)
        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        vocab.word2idx = data['word2idx']
        # JSON object keys are always strings; restore the integer ids.
        vocab.idx2word = {int(k): v for k, v in data['idx2word'].items()}
        vocab.word_counts = Counter(data['word_counts'])
        return vocab


In [9]:
class TinyStoriesDataset(Dataset):
    """Sliding-window token sequences built from raw story texts.

    Each text is wrapped in ``<sos>``/``<eos>`` and then cut into windows of
    ``context_length + 1`` ids starting at every position except the last.
    Windows that run past the end of the text are right-padded with ``<pad>``.

    Args:
        texts: Iterable of raw strings.
        vocab: Object providing ``word2idx``, ``encode`` and the special-token names.
        context_length: Number of input tokens per sample (windows hold one more).
        max_samples: If truthy, stop after this many texts.
    """

    def __init__(self, texts, vocab, context_length, max_samples=None):
        self.vocab = vocab
        self.context_length = context_length
        self.sequences = []

        window = context_length + 1

        print("Preparing dataset...")
        for text_no, text in enumerate(tqdm(texts)):
            if max_samples and text_no >= max_samples:
                break

            sos_id = vocab.word2idx[vocab.SOS_TOKEN]
            eos_id = vocab.word2idx[vocab.EOS_TOKEN]
            ids = [sos_id] + vocab.encode(text) + [eos_id]

            for start in range(len(ids) - 1):
                # Slicing clamps at the end of the list, so late windows come out short.
                chunk = ids[start:start + window]
                shortfall = window - len(chunk)
                if shortfall > 0:
                    chunk = chunk + [vocab.word2idx[vocab.PAD_TOKEN]] * shortfall
                self.sequences.append(chunk)

        print(f"Created {len(self.sequences)} sequences")

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        """Return window ``idx`` as a ``torch.long`` tensor of length ``context_length + 1``."""
        return torch.tensor(self.sequences[idx], dtype=torch.long)



In [10]:
# Baseline run configuration: model size, optimisation settings, data limits
# and output locations. d_model is divisible by num_heads (296 / 8 = 37) and
# d_ff is four times d_model.
CONFIG = dict(
    name='baseline',
    description='Standard baseline configuration from assignment',
    # --- model ---
    context_length=64,
    num_layers=3,
    num_heads=8,
    d_model=296,
    d_ff=1184,
    dropout=0.1,
    # --- optimisation ---
    batch_size=32,
    learning_rate=3e-4,
    num_epochs=5,
    # --- data limits ---
    max_train_samples=50000,
    max_val_samples=15000,
    # --- output locations ---
    save_dir='checkpoints/baseline',
    plot_dir='plots/baseline',
)

In [11]:
import gensim.downloader as api
from gensim.models import KeyedVectors
import os

def load_fasttext_model():
    """Return the FastText vectors, downloading and caching them on first use.

    The vectors are cached at ``fasttext/fasttext_model.bin``. When that file
    exists it is loaded with ``KeyedVectors.load``; otherwise the
    ``fasttext-wiki-news-subwords-300`` model is fetched through the gensim
    downloader and saved to that path.
    """
    cache_file = 'fasttext/fasttext_model.bin'

    os.makedirs('fasttext', exist_ok=True)

    if os.path.exists(cache_file):
        print("Loading FastText model from cache...")
        vectors = KeyedVectors.load(cache_file)
        print("Model loaded successfully!")
        return vectors

    print("Model not found. Downloading FastText model...")
    vectors = api.load('fasttext-wiki-news-subwords-300')
    vectors.save(cache_file)
    print("Model downloaded and saved successfully!")
    return vectors

In [12]:
# --- Device and configuration echo ---
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")
print("\nConfiguration:")
for key, value in CONFIG.items():
    print(f"  {key}: {value}")

section_rule = "=" * 50

# --- FastText vectors ---
print("\n" + section_rule)
print("Loading FastText embeddings...")
print(section_rule)
fasttext_model = load_fasttext_model()

# --- TinyStories corpus ---
print("\n" + section_rule)
print("Loading TinyStories dataset...")
print(section_rule)
dataset = load_dataset("roneneldan/TinyStories")

# --- Vocabulary: reuse the saved one when present, otherwise build and save it ---
print("\n" + section_rule)
print("Building vocabulary...")
print(section_rule)
vocab_path = f"{CONFIG['save_dir']}/vocab.json"

if not os.path.exists(vocab_path):
    vocab = Vocabulary(fasttext_model)
    # Only the first max_train_samples training stories contribute words.
    num_samples = min(CONFIG['max_train_samples'], len(dataset['train']))
    for i in tqdm(range(num_samples), desc="Building vocabulary"):
        for word in vocab.tokenize(dataset['train'][i]['text']):
            vocab.add_word(word)
    vocab.save(vocab_path)
else:
    print("Loading existing vocabulary...")
    vocab = Vocabulary.load(vocab_path, fasttext_model)

print(f"Vocabulary size: {len(vocab)}")

# --- Validation dataset and loader ---
print("\n" + section_rule)
print("Creating datasets...")
print(section_rule)

num_val_texts = min(CONFIG['max_val_samples'], len(dataset['validation']))
val_texts = [dataset['validation'][i]['text'] for i in range(num_val_texts)]

val_dataset = TinyStoriesDataset(
    val_texts,
    vocab,
    CONFIG['context_length'],
    CONFIG['max_val_samples'],
)

val_loader = DataLoader(
    val_dataset,
    batch_size=CONFIG['batch_size'],
    shuffle=False,
    num_workers=0,
)

# --- Initial embedding matrix for the model ---
print("\n" + section_rule)
print("Initializing model...")
print(section_rule)
embedding_matrix = vocab.create_embedding_matrix()



Using device: cuda

Configuration:
  name: baseline
  description: Standard baseline configuration from assignment
  context_length: 64
  num_layers: 3
  num_heads: 8
  d_model: 296
  d_ff: 1184
  dropout: 0.1
  batch_size: 32
  learning_rate: 0.0003
  num_epochs: 5
  max_train_samples: 50000
  max_val_samples: 15000
  save_dir: checkpoints/baseline
  plot_dir: plots/baseline

Loading FastText embeddings...
Loading FastText model from cache...
Model loaded successfully!

Loading TinyStories dataset...

Building vocabulary...
Loading existing vocabulary...
Vocabulary size: 10598

Creating datasets...
Preparing dataset...


100%|██████████| 15000/15000 [00:07<00:00, 2077.59it/s]


Created 2900659 sequences

Initializing model...
Found 9972/10598 words in FastText


In [13]:
def get_validation_prompts(val_loader, vocab, num_samples=20, num_words=5):
    """
    Build short text prompts from the first sequences of the validation data.

    Each prompt is the first ``num_words`` non-special tokens of one sequence,
    joined with single spaces. ``<pad>``, ``<sos>`` and ``<eos>`` are skipped;
    ids missing from ``vocab.idx2word`` become ``<unk>``.

    Args:
        val_loader: Iterable of batches, each a 2-D tensor of token ids
        vocab: Vocabulary object
        num_samples: Number of prompts to retrieve (default: 20)
        num_words: Maximum number of words per prompt (default: 5)

    Returns:
        List of at most ``num_samples`` strings; fewer if the loader runs out.
    """
    special_tokens = {vocab.PAD_TOKEN, vocab.SOS_TOKEN, vocab.EOS_TOKEN}
    prompts = []

    for batch in val_loader:
        for seq in batch:
            if len(prompts) >= num_samples:
                return prompts

            words = []
            # tolist() yields plain Python ints for the dictionary lookup and
            # also works for tensors that are not on the CPU.
            for token_id in seq.tolist():
                if len(words) >= num_words:
                    break
                word = vocab.idx2word.get(token_id, vocab.UNK_TOKEN)
                if word not in special_tokens:
                    words.append(word)

            prompts.append(' '.join(words))

    return prompts

# Collect 20 short prompts from the validation loader; they are the inputs to
# the KV-cache inference speed benchmark run later in the notebook.
kv_prompts = get_validation_prompts(val_loader, vocab, num_samples=20)

In [14]:

def generate(model, prompt, vocab, max_length=50, device='cuda', use_kv_cache=False):
    """Sample a continuation of ``prompt``, optionally using the KV cache.

    Tokens are drawn with multinomial sampling from the softmax of the last
    position's logits. Generation stops after ``max_length`` new tokens, when
    ``<eos>`` is sampled, or when the sequence reaches ``model.max_seq_len``.

    Args:
        model: Callable returning ``(logits, kv_caches)``; must expose
            ``eval()`` and ``max_seq_len``.
        prompt: Text to continue.
        vocab: Vocabulary providing ``word2idx``, ``encode`` and ``decode``.
        max_length: Maximum number of new tokens to sample.
        device: Device the input tensor is placed on.
        use_kv_cache: After the first step, feed only the newest token and
            reuse the cached keys/values from the previous call.

    Returns:
        Dict with ``'text'`` (decoded prompt plus continuation) and ``'tokens'``
        (list of sampled token ids, excluding ``<eos>``).
    """
    model.eval()

    eos_id = vocab.word2idx[vocab.EOS_TOKEN]
    tokens = [vocab.word2idx[vocab.SOS_TOKEN]] + vocab.encode(prompt)
    tokens_tensor = torch.tensor(tokens).unsqueeze(0).to(device)

    generated_tokens = []
    kv_caches = None

    with torch.no_grad():
        for step in range(max_length):
            if tokens_tensor.size(1) >= model.max_seq_len:
                break

            if use_kv_cache:
                # The first step processes the whole prompt to fill the cache;
                # later steps only need the newest token.
                input_tokens = tokens_tensor if step == 0 else tokens_tensor[:, -1:]
                logits, kv_caches = model(input_tokens, kv_caches=kv_caches)
            else:
                logits, _ = model(tokens_tensor)

            probs = torch.softmax(logits[:, -1, :], dim=-1)
            next_token = torch.multinomial(probs, 1)
            token_id = next_token.item()

            if token_id == eos_id:
                break

            # Always extend the running sequence. Appending only on step 0 in
            # the non-cached path would make every later step re-sample from
            # the same context and leave 'text' out of sync with 'tokens'.
            tokens_tensor = torch.cat([tokens_tensor, next_token], dim=1)
            generated_tokens.append(token_id)

    return {
        'text': vocab.decode(tokens_tensor.squeeze(0).tolist()),
        'tokens': generated_tokens
    }


def generate_batch(model, prompts, vocab, max_length=50, device='cuda', use_kv_cache=False):
    """Sample continuations for several prompts at once, optionally with the KV cache.

    Prompts are encoded, prefixed with ``<sos>`` and right-padded with ``<pad>``
    to a common length. Generation stops after ``max_length`` steps, once every
    sequence has sampled ``<eos>``, or when the batch reaches
    ``model.max_seq_len`` (if the model exposes that attribute).

    Note: because padding is on the right, rows whose prompt is shorter than
    the longest one have their first token sampled from a padded position.

    Args:
        model: Callable returning ``(logits, kv_caches)``; must expose ``eval()``.
        prompts: List of prompt strings.
        vocab: Vocabulary providing ``word2idx``, ``encode`` and ``decode``.
        max_length: Maximum number of sampling steps.
        device: Device the token tensor is created on.
        use_kv_cache: After the first step, feed only the newest tokens and
            reuse the cached keys/values from the previous call.

    Returns:
        One dict per prompt with ``'prompt'``, ``'text'`` and
        ``'generated_tokens'`` (sampled ids up to, but excluding, ``<eos>``).
        An empty ``prompts`` list yields an empty list.
    """
    model.eval()
    batch_size = len(prompts)
    if batch_size == 0:
        return []

    pad_id = vocab.word2idx[vocab.PAD_TOKEN]
    sos_id = vocab.word2idx[vocab.SOS_TOKEN]
    eos_id = vocab.word2idx[vocab.EOS_TOKEN]

    batch_tokens = [[sos_id] + vocab.encode(prompt) for prompt in prompts]

    # Right-pad every prompt to the length of the longest one.
    max_prompt_len = max(len(tokens) for tokens in batch_tokens)
    tokens_tensor = torch.full((batch_size, max_prompt_len), pad_id,
                              dtype=torch.long, device=device)
    for i, tokens in enumerate(batch_tokens):
        tokens_tensor[i, :len(tokens)] = torch.tensor(tokens, device=device)

    generated_tokens = [[] for _ in range(batch_size)]
    kv_caches = None
    finished = [False] * batch_size
    # Same stopping rule as generate(); skipped for models without the attribute.
    max_seq_len = getattr(model, 'max_seq_len', None)

    with torch.no_grad():
        for step in range(max_length):
            if all(finished):
                break
            if max_seq_len is not None and tokens_tensor.size(1) >= max_seq_len:
                break

            if use_kv_cache:
                # The first step processes the full prompts to fill the cache;
                # later steps only need the newest column.
                input_tokens = tokens_tensor if step == 0 else tokens_tensor[:, -1:]
                logits, kv_caches = model(input_tokens, kv_caches=kv_caches)
            else:
                logits, _ = model(tokens_tensor)

            probs = torch.softmax(logits[:, -1, :], dim=-1)
            next_tokens = torch.multinomial(probs, 1)

            # A single host transfer instead of one .item() call per row.
            for i, token in enumerate(next_tokens.squeeze(1).tolist()):
                if finished[i]:
                    continue
                if token == eos_id:
                    finished[i] = True
                else:
                    generated_tokens[i].append(token)

            # Finished rows keep receiving sampled tokens so the tensor stays
            # rectangular; those extra tokens are never reported.
            tokens_tensor = torch.cat([tokens_tensor, next_tokens], dim=1)

    results = []
    for i, prompt in enumerate(prompts):
        full_tokens = batch_tokens[i] + generated_tokens[i]
        results.append({
            'prompt': prompt,
            'text': vocab.decode(full_tokens),
            'generated_tokens': generated_tokens[i]
        })

    return results


In [15]:
# Build the model from the shared CONFIG and the prepared embedding matrix.
model_kwargs = dict(
    vocab_size=len(vocab),
    d_model=CONFIG['d_model'],
    num_layers=CONFIG['num_layers'],
    num_heads=CONFIG['num_heads'],
    d_ff=CONFIG['d_ff'],
    max_seq_len=CONFIG['context_length'],
    dropout=CONFIG['dropout'],
    pretrained_embeddings=embedding_matrix,
)
model = DecoderTransformer(**model_kwargs).to(device)

# Restore the trained weights; they replace the freshly initialised parameters.
checkpoint = torch.load('best_model.pt', map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])

# Switch to inference mode; as the last expression this also displays the model.
model.eval()

DecoderTransformer(
  (embedding): Embedding(10598, 300)
  (embedding_proj): Linear(in_features=300, out_features=296, bias=True)
  (pos_encoding): PositionalEncoding()
  (layers): ModuleList(
    (0-2): 3 x TransformerBlock(
      (attention): MultiHeadAttention(
        (W_q): Linear(in_features=296, out_features=296, bias=True)
        (W_k): Linear(in_features=296, out_features=296, bias=True)
        (W_v): Linear(in_features=296, out_features=296, bias=True)
        (W_o): Linear(in_features=296, out_features=296, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (norm1): LayerNorm()
      (feed_forward): FeedForward(
        (linear1): Linear(in_features=296, out_features=1184, bias=True)
        (linear2): Linear(in_features=1184, out_features=296, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (norm2): LayerNorm()
      (dropout): Dropout(p=0.1, inplace=False)
    )
  )
  (norm): LayerNorm()
  (output_projection): Linear(in_fe

In [16]:
import os
import json
import time
import torch
import matplotlib.pyplot as plt

def evaluate_generation_speed_fixed_tokens(model, vocab, prompts, num_samples=20, max_length=50, use_kv_cache=False, device='cuda'):
    """Time text generation and report throughput.

    At most ``num_samples`` prompts are used. With ``num_samples == 1`` the
    single-sequence ``generate`` path is timed, otherwise ``generate_batch``.
    ``total_tokens`` is the number of tokens actually generated, which is
    lower than ``num_samples * max_length`` when sequences stop early.

    Args:
        model: Model passed through to the generation functions.
        vocab: Vocabulary passed through to the generation functions.
        prompts: Sequence of prompt strings; must not be empty.
        num_samples: Maximum number of prompts to generate for.
        max_length: Maximum number of new tokens per prompt.
        use_kv_cache: Whether generation reuses cached keys/values.
        device: Device used for generation.

    Returns:
        Dict with ``total_tokens``, ``elapsed_time`` (seconds),
        ``tokens_per_second`` and ``samples_per_second``.

    Raises:
        ValueError: If no prompts are available to generate from.
    """
    model.eval()

    # Only time the prompts that are also counted in the statistics.
    selected = list(prompts)[:num_samples]
    if not selected:
        raise ValueError("prompts must contain at least one prompt")

    on_cuda = torch.cuda.is_available() and torch.device(device).type == 'cuda'

    # CUDA kernels run asynchronously; synchronise so the timer covers exactly
    # the generation work.
    if on_cuda:
        torch.cuda.synchronize()
    start_time = time.perf_counter()

    if num_samples == 1:
        outcome = generate(model, selected[0], vocab, max_length=max_length,
                           device=device, use_kv_cache=use_kv_cache)
        total_tokens = len(outcome['tokens'])
    else:
        outcomes = generate_batch(model, selected, vocab, max_length=max_length,
                                  device=device, use_kv_cache=use_kv_cache)
        total_tokens = sum(len(item['generated_tokens']) for item in outcomes)

    if on_cuda:
        torch.cuda.synchronize()
    elapsed_time = time.perf_counter() - start_time

    tokens_per_second = total_tokens / elapsed_time if elapsed_time > 0 else 0
    samples_per_second = len(selected) / elapsed_time if elapsed_time > 0 else 0

    return {
        'total_tokens': total_tokens,
        'elapsed_time': elapsed_time,
        'tokens_per_second': tokens_per_second,
        'samples_per_second': samples_per_second
    }

def _plot_tokens_per_second(no_cache_tps, kv_cache_tps, plot_path):
    """Save a two-bar chart comparing throughput without and with the KV cache."""
    plt.style.use('default')
    fig, ax = plt.subplots(figsize=(10, 6))
    colors = ['#E74C3C', '#2ECC71']
    labels = ['Without KV Cache', 'With KV Cache']
    tokens_per_sec = [no_cache_tps, kv_cache_tps]

    bars = ax.bar(labels, tokens_per_sec, color=colors, alpha=0.85, edgecolor='black', linewidth=1.2, width=0.6)
    ax.set_title('Tokens per Second Comparison\n(Same Token Count)', fontsize=16, fontweight='bold', pad=25)
    ax.set_ylabel('Tokens per Second', fontsize=13, fontweight='bold')
    ax.grid(axis='y', linestyle='--', alpha=0.3)
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)

    # Print each bar's value just above it.
    for bar in bars:
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width()/2., height + height*0.02,
                f'{height:,.1f}', ha='center', va='bottom',
                fontsize=11, fontweight='bold')

    # Leave headroom for the labels; fall back to 1.0 when both bars are zero.
    ax.set_ylim(0, max(tokens_per_sec) * 1.25 or 1.0)
    plt.tight_layout()
    plt.savefig(plot_path, dpi=300, bbox_inches='tight', facecolor='white')
    plt.close(fig)


def run_inference_speed_tests(model, vocab, prompts, num_samples=20, max_length=50, device='cuda'):
    """Compare generation throughput with and without the KV cache.

    Both modes get a short untimed warm-up generation first, so that one-time
    start-up cost is not charged to whichever mode is measured first. The
    results are written to ``results/inference/kv/results.json`` and a bar
    chart to ``results/inference/kv/tokens_per_second_comparison.png``.

    Args:
        model: Model to benchmark.
        vocab: Vocabulary used for encoding and decoding.
        prompts: Sequence of prompt strings.
        num_samples: Number of prompts per timed run.
        max_length: Maximum number of new tokens per prompt.
        device: Device used for generation.

    Returns:
        Dict with ``no_kv_cache``, ``with_kv_cache``, ``speed_boost_ratio``
        (``None`` when the baseline throughput is zero) and ``timestamp``.
    """
    save_dir = "results/inference/kv"
    os.makedirs(save_dir, exist_ok=True)
    result_path = os.path.join(save_dir, "results.json")
    plot_path = os.path.join(save_dir, "tokens_per_second_comparison.png")

    # Untimed warm-up: one short generation per mode.
    for warm_cache in (False, True):
        evaluate_generation_speed_fixed_tokens(
            model, vocab, prompts, num_samples=1, max_length=min(max_length, 5),
            use_kv_cache=warm_cache, device=device
        )

    no_cache_results = evaluate_generation_speed_fixed_tokens(
        model, vocab, prompts, num_samples=num_samples, max_length=max_length,
        use_kv_cache=False, device=device
    )
    kv_cache_results = evaluate_generation_speed_fixed_tokens(
        model, vocab, prompts, num_samples=num_samples, max_length=max_length,
        use_kv_cache=True, device=device
    )

    # A zero baseline would otherwise raise ZeroDivisionError.
    baseline_tps = no_cache_results["tokens_per_second"]
    speed_boost = kv_cache_results["tokens_per_second"] / baseline_tps if baseline_tps > 0 else None

    final_results = {
        "no_kv_cache": no_cache_results,
        "with_kv_cache": kv_cache_results,
        "speed_boost_ratio": speed_boost,
        "timestamp": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    }

    with open(result_path, "w") as f:
        json.dump(final_results, f, indent=4)

    _plot_tokens_per_second(
        no_cache_results['tokens_per_second'],
        kv_cache_results['tokens_per_second'],
        plot_path,
    )

    return final_results

# Benchmark on the device the model was moved to instead of relying on the
# function's 'cuda' default, which fails on a CPU-only machine, and size the
# run to the prompts that were actually collected.
results = run_inference_speed_tests(
    model, vocab, kv_prompts, num_samples=len(kv_prompts), device=device
)
