# Day 34: Caching Strategies - Part 1

Implementing prompt and response caching to reduce LLM serving costs and improve response times.

## Overview
1. Simple response caching
2. Semantic (similarity-based) prompt caching
3. Cache performance analysis
4. Cost savings measurement

In [None]:
import time
import hashlib
import json
import numpy as np
import matplotlib.pyplot as plt
from collections import OrderedDict
from datetime import datetime, timedelta

## 1. Simple Response Cache Implementation

In [None]:
class ResponseCache:
    """In-memory response cache with LRU eviction and a per-entry TTL.

    Entries are keyed by a hash of the prompt together with any generation
    parameters, so the same prompt requested with different parameters is
    cached separately. Hit/miss counters are updated by ``get`` only.

    Args:
        max_size: Maximum number of entries kept. A value of 0 or less
            disables storage (``put`` becomes a no-op).
        ttl_seconds: Age in seconds after which an entry is considered
            expired and is dropped on the next lookup.
    """

    def __init__(self, max_size=1000, ttl_seconds=3600):
        self.max_size = max_size
        self.ttl_seconds = ttl_seconds
        # Ordered oldest -> most recently used; the front is evicted first.
        self.cache = OrderedDict()
        self.hits = 0
        self.misses = 0

    def _make_key(self, prompt, **params):
        """Create cache key from prompt and parameters.

        Parameters are serialized with sorted keys so that keyword order
        does not affect the key. All values must be JSON-serializable.
        """
        key_data = {'prompt': prompt, 'params': params}
        key_str = json.dumps(key_data, sort_keys=True)
        # md5 is used purely as a compact fingerprint, not for security.
        return hashlib.md5(key_str.encode()).hexdigest()

    def get(self, prompt, **params):
        """Return the cached response, or ``None`` on a miss or expiry.

        A hit refreshes the entry's LRU position. An expired entry is
        removed and counted as a miss.
        """
        key = self._make_key(prompt, **params)
        entry = self.cache.get(key)

        if entry is not None:
            if time.time() - entry['timestamp'] < self.ttl_seconds:
                self.cache.move_to_end(key)
                self.hits += 1
                return entry['response']
            # Expired: drop it so it no longer occupies a slot.
            del self.cache[key]

        self.misses += 1
        return None

    def put(self, prompt, response, **params):
        """Store ``response`` under the key for ``prompt`` and ``params``.

        Overwriting an existing key refreshes its timestamp and LRU
        position without evicting anything; only a genuinely new key can
        push out the least recently used entry.
        """
        if self.max_size <= 0:
            return

        key = self._make_key(prompt, **params)

        if key in self.cache:
            # Same slot is reused, so no eviction is needed.
            self.cache.move_to_end(key)
        else:
            while len(self.cache) >= self.max_size:
                self.cache.popitem(last=False)

        self.cache[key] = {
            'response': response,
            'timestamp': time.time()
        }

    @property
    def hit_rate(self):
        """Fraction of ``get`` calls that were hits (0 when none were made)."""
        total = self.hits + self.misses
        return self.hits / total if total > 0 else 0

    def stats(self):
        """Return a dict with current size, hit/miss counts and hit rate."""
        return {
            'size': len(self.cache),
            'hits': self.hits,
            'misses': self.misses,
            'hit_rate': self.hit_rate
        }

## 2. Mock LLM Service with Caching

In [None]:
class MockLLMService:
    """Simulated LLM endpoint with an optional response cache.

    Generation is faked: the service sleeps for ``processing_time`` seconds
    and returns a templated string. Cost is charged per whitespace-separated
    word of the prompt plus the requested ``max_tokens``.

    Args:
        processing_time: Seconds to sleep on every cache miss.
        cost_per_token: Price applied to each input word and output token.
    """

    def __init__(self, processing_time=1.0, cost_per_token=0.001):
        self.processing_time = processing_time
        self.cost_per_token = cost_per_token
        self.cache = ResponseCache()
        self.total_requests = 0
        self.total_cost = 0
        self.total_time = 0

    def generate(self, prompt, max_tokens=50, temperature=0.7, use_cache=True):
        """Generate response with optional caching.

        Args:
            prompt: Input text.
            max_tokens: Requested output length; also part of the cache key.
            temperature: Sampling temperature; only used in the cache key.
            use_cache: When False the cache is neither read nor written.

        Returns:
            A dict with keys ``response``, ``cached``, ``processing_time``
            (wall-clock seconds for this call) and ``cost`` (0 on a hit).
        """
        self.total_requests += 1
        start_time = time.time()

        # Check cache first
        if use_cache:
            cached_response = self.cache.get(
                prompt,
                max_tokens=max_tokens,
                temperature=temperature
            )
            # Compare against None explicitly: an empty-string response is a
            # valid cached value and must not be regenerated and billed.
            if cached_response is not None:
                # Cache hit - no processing cost
                end_time = time.time()
                self.total_time += (end_time - start_time)
                return {
                    'response': cached_response,
                    'cached': True,
                    'processing_time': end_time - start_time,
                    'cost': 0
                }

        # Cache miss - generate response
        time.sleep(self.processing_time)  # Simulate processing

        # Generate mock response
        response = f"Generated response for: {prompt[:30]}... (tokens: {max_tokens})"

        # Calculate cost: prompt words approximate input tokens, and the
        # full max_tokens budget is billed as output.
        input_tokens = len(prompt.split())
        output_tokens = max_tokens
        cost = (input_tokens + output_tokens) * self.cost_per_token

        self.total_cost += cost

        # Store in cache
        if use_cache:
            self.cache.put(
                prompt,
                response,
                max_tokens=max_tokens,
                temperature=temperature
            )

        end_time = time.time()
        processing_time = end_time - start_time
        self.total_time += processing_time

        return {
            'response': response,
            'cached': False,
            'processing_time': processing_time,
            'cost': cost
        }

    def get_stats(self):
        """Return aggregate request, cost, time and cache statistics.

        Averages are taken over all requests, including cache hits, and are
        0 when no request has been made.
        """
        cache_stats = self.cache.stats()
        return {
            'total_requests': self.total_requests,
            'total_cost': self.total_cost,
            'total_time': self.total_time,
            'avg_cost_per_request': self.total_cost / self.total_requests if self.total_requests > 0 else 0,
            'avg_time_per_request': self.total_time / self.total_requests if self.total_requests > 0 else 0,
            'cache_hit_rate': cache_stats['hit_rate'],
            'cache_size': cache_stats['size']
        }

## 3. Testing Cache Performance

In [None]:
def simulate_workload(service, prompts, num_requests=100, repeat_probability=0.3):
    """Simulate a realistic workload with repeated requests.

    Each request is either brand new (a random base prompt with a unique
    ``(variation i)`` suffix and a random ``max_tokens`` in [20, 80)) or an
    exact replay of a request issued earlier in the same run. Replays reuse
    both the prompt and ``max_tokens`` of the earlier request, so they map
    to the same cache key whenever the service keys on those values.

    Args:
        service: Object exposing ``generate(prompt, max_tokens=...)``.
        prompts: Non-empty sequence of base prompt strings.
        num_requests: Number of requests to issue.
        repeat_probability: Chance that a request (after the first) replays
            an earlier one.

    Returns:
        List of the values returned by ``service.generate``, in call order.
    """
    results = []
    # (prompt, max_tokens) pairs issued so far; replays are drawn from here.
    history = []

    for i in range(num_requests):
        # The first request can never be a repeat because nothing was sent yet.
        if history and np.random.random() < repeat_probability:
            prompt, max_tokens = history[np.random.randint(len(history))]
        else:
            prompt = f"{np.random.choice(prompts)} (variation {i})"
            max_tokens = int(np.random.randint(20, 80))
            history.append((prompt, max_tokens))

        # Generate response
        result = service.generate(prompt, max_tokens=max_tokens)
        results.append(result)

        # Small delay between requests
        time.sleep(0.01)

    return results

# Base prompts that seed both experiments below
test_prompts = [
    "What is artificial intelligence?",
    "Explain machine learning",
    "How do neural networks work?",
    "What is deep learning?",
    "Describe natural language processing",
]

# Experiment A: cache enabled, traffic produced by simulate_workload
print("Testing with caching enabled...")
service_with_cache = MockLLMService(processing_time=0.5, cost_per_token=0.001)
results_cached = simulate_workload(service_with_cache, test_prompts, num_requests=50)

# Experiment B: cache bypassed, every prompt carries a unique suffix
print("Testing without caching...")
service_no_cache = MockLLMService(processing_time=0.5, cost_per_token=0.001)
results_no_cache = []
for i in range(50):
    prompt = f"{np.random.choice(test_prompts)} (variation {i})"
    result = service_no_cache.generate(prompt, use_cache=False)
    results_no_cache.append(result)
    time.sleep(0.01)

# Collect the aggregate numbers from both services
stats_cached = service_with_cache.get_stats()
stats_no_cache = service_no_cache.get_stats()

# Report each run as one multi-line message
print("\n=== Results Comparison ===")
print("\n".join([
    "With Cache:",
    f"  Total Cost: ${stats_cached['total_cost']:.3f}",
    f"  Total Time: {stats_cached['total_time']:.2f}s",
    f"  Cache Hit Rate: {stats_cached['cache_hit_rate']:.1%}",
    f"  Avg Cost/Request: ${stats_cached['avg_cost_per_request']:.4f}",
]))

print("\n".join([
    "\nWithout Cache:",
    f"  Total Cost: ${stats_no_cache['total_cost']:.3f}",
    f"  Total Time: {stats_no_cache['total_time']:.2f}s",
    f"  Avg Cost/Request: ${stats_no_cache['avg_cost_per_request']:.4f}",
]))

# Relative savings are expressed as a fraction of the uncached run
print("\nSavings:")
cost_savings = (
    (stats_no_cache['total_cost'] - stats_cached['total_cost'])
    / stats_no_cache['total_cost']
)
time_savings = (
    (stats_no_cache['total_time'] - stats_cached['total_time'])
    / stats_no_cache['total_time']
)
print(f"  Cost Savings: {cost_savings:.1%}")
print(f"  Time Savings: {time_savings:.1%}")

## 4. Cache Hit Rate Analysis

In [None]:
def analyze_cache_performance(repeat_probabilities, num_requests=100):
    """Analyze cache performance under different repeat probabilities.

    For every probability a fresh cached service runs ``simulate_workload``
    over ``test_prompts``, and a fresh uncached service handles
    ``num_requests`` synthetic ``"Test prompt {i}"`` requests as the cost
    baseline. Note that the baseline is a different, fixed workload (default
    ``max_tokens``), so the savings figure is an approximation rather than
    a like-for-like comparison.

    Args:
        repeat_probabilities: Iterable of repeat probabilities to evaluate.
        num_requests: Requests issued per service per probability.

    Returns:
        ``(hit_rates, cost_savings)``: two lists aligned with
        ``repeat_probabilities``. Savings are fractions of the baseline cost
        and are 0.0 when the baseline cost is zero (e.g. ``num_requests=0``).
    """
    hit_rates = []
    cost_savings = []

    for repeat_prob in repeat_probabilities:
        # Test with caching
        service_cached = MockLLMService(processing_time=0.1, cost_per_token=0.001)
        simulate_workload(service_cached, test_prompts, num_requests, repeat_prob)

        # Test without caching
        service_no_cache = MockLLMService(processing_time=0.1, cost_per_token=0.001)
        for i in range(num_requests):
            service_no_cache.generate(f"Test prompt {i}", use_cache=False)

        # Calculate metrics
        stats_cached = service_cached.get_stats()
        stats_no_cache = service_no_cache.get_stats()

        hit_rates.append(stats_cached['cache_hit_rate'])

        baseline_cost = stats_no_cache['total_cost']
        # Guard the division so an empty run reports no savings instead of
        # raising ZeroDivisionError.
        if baseline_cost > 0:
            savings = (baseline_cost - stats_cached['total_cost']) / baseline_cost
        else:
            savings = 0.0
        cost_savings.append(savings)

    return hit_rates, cost_savings

# Sweep the repeat probability from 0% to 80% in 10-point steps
repeat_probs = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
hit_rates, cost_savings = analyze_cache_performance(repeat_probs)

# One panel per metric; each tuple is
# (subplot index, values in percent, y-axis label, title, extra plot kwargs)
panels = [
    (
        1,
        [h * 100 for h in hit_rates],
        'Cache Hit Rate (%)',
        'Cache Hit Rate vs Repeat Probability',
        {},
    ),
    (
        2,
        [s * 100 for s in cost_savings],
        'Cost Savings (%)',
        'Cost Savings vs Repeat Probability',
        {'color': 'green'},
    ),
]

# Draw both panels side by side on a single figure
plt.figure(figsize=(12, 5))

for position, series, y_label, heading, line_kwargs in panels:
    plt.subplot(1, 2, position)
    plt.plot(repeat_probs, series, 'o-', **line_kwargs)
    plt.xlabel('Repeat Probability')
    plt.ylabel(y_label)
    plt.title(heading)
    plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 5. Advanced Caching: Semantic Similarity

In [None]:
class SemanticCache:
    """Cache that serves a stored response for sufficiently similar prompts.

    Similarity is the word-set overlap computed by ``_simple_similarity``.
    Entries are kept in insertion order and the oldest is evicted first
    once ``max_size`` is reached.

    Args:
        similarity_threshold: Minimum similarity (inclusive) for a hit.
        max_size: Maximum number of entries kept. A value of 0 or less
            disables storage (``put`` becomes a no-op).
    """

    def __init__(self, similarity_threshold=0.8, max_size=1000):
        self.similarity_threshold = similarity_threshold
        self.max_size = max_size
        # List of {'prompt', 'response', 'timestamp'} dicts, oldest first.
        self.cache = []
        self.hits = 0
        self.misses = 0

    def _simple_similarity(self, text1, text2):
        """Simple word-based similarity (for demo purposes).

        Returns the Jaccard index of the lower-cased, whitespace-split word
        sets: shared words divided by distinct words overall. Returns 0.0
        if either text has no words.
        """
        words1 = set(text1.lower().split())
        words2 = set(text2.lower().split())

        if not words1 or not words2:
            return 0.0

        return len(words1 & words2) / len(words1 | words2)

    def get(self, prompt):
        """Return the response of the most similar cached prompt.

        Every entry is scored and the best one at or above the threshold
        wins; ties go to the older entry. Returns ``None`` (and counts a
        miss) when no entry reaches the threshold.
        """
        best_entry = None
        best_score = -1.0

        for entry in self.cache:
            score = self._simple_similarity(prompt, entry['prompt'])
            # Keep scanning after a match: a later entry may be closer
            # (for example an exact match stored after a near-match).
            if score >= self.similarity_threshold and score > best_score:
                best_entry = entry
                best_score = score

        if best_entry is not None:
            self.hits += 1
            return best_entry['response']

        self.misses += 1
        return None

    def put(self, prompt, response):
        """Store response in cache.

        Re-storing an identical prompt replaces its earlier entry instead
        of adding a duplicate; a new prompt may evict the oldest entries to
        stay within ``max_size``.
        """
        if self.max_size <= 0:
            return

        existing = next(
            (i for i, entry in enumerate(self.cache) if entry['prompt'] == prompt),
            None,
        )
        if existing is not None:
            # Freed slot is reused, so nothing else needs to be evicted.
            del self.cache[existing]
        else:
            while len(self.cache) >= self.max_size:
                self.cache.pop(0)  # Remove oldest

        self.cache.append({
            'prompt': prompt,
            'response': response,
            'timestamp': time.time()
        })

    @property
    def hit_rate(self):
        """Fraction of ``get`` calls that were hits (0 when none were made)."""
        total = self.hits + self.misses
        return self.hits / total if total > 0 else 0

# Demo: feed paraphrased prompts through a SemanticCache
semantic_cache = SemanticCache(similarity_threshold=0.6)

# Four phrasings about machine learning followed by two about deep learning
similar_prompts = [
    "What is machine learning?",
    "Explain machine learning",
    "Tell me about machine learning",
    "How does machine learning work?",
    "What is deep learning?",  # Different topic
    "Explain deep learning",
]

print("Testing semantic caching:")
for number, prompt in enumerate(similar_prompts, start=1):
    cached_response = semantic_cache.get(prompt)
    outcome = "HIT" if cached_response else "MISS"
    print(f"  {number}. CACHE {outcome}: {prompt}")

    if not cached_response:
        # Nothing usable was cached: fabricate a response and store it
        semantic_cache.put(prompt, f"Response for: {prompt}")

print(f"\nSemantic cache hit rate: {semantic_cache.hit_rate:.1%}")

## Conclusion

Caching strategies can significantly reduce LLM serving costs:

1. **Response Caching**: Eliminates redundant computation for identical requests
2. **Semantic Caching**: Handles similar requests with slight variations
3. **Cost Savings**: Can achieve 30-70% cost reduction depending on workload patterns
4. **Performance**: Dramatically reduces response times for cached content

Next, we'll explore rate limiting and multi-tenancy strategies.