## What is Semantic Caching?
- Semantic caching stores query results based on semantic similarity rather than exact string matches. Unlike traditional caching that requires identical queries, semantic caching can serve cached responses for queries with similar meaning.
### When is Semantic Caching Used?

- High-frequency similar queries - Customer support, FAQ systems
- Expensive retrieval operations - Large vector database searches
- LLM API cost optimization - Reducing expensive API calls
- Real-time applications - Chatbots, search engines requiring fast responses

### How It Works:

- Query Processing: Convert queries to embeddings using models like OpenAI's text-embedding-ada-002
- Similarity Check: Calculate cosine similarity between new query and cached queries
- Threshold Decision: If similarity > threshold (typically 0.85-0.9), return cached result
- Cache Storage: Store new query-response pairs with their embeddings

### Key Differences (With vs Without Semantic Caching):
- Without Semantic Caching:
  - Response time: 1200-2000ms
  - Every query hits the vector database and LLM
  - Cost: $0.02-0.05 per query
  - No reuse of similar queries

- With Semantic Caching:
  - Response time: 100-300ms (80-90% faster)
  - Vector DB queries only for cache misses
  - Cost: $0.001-0.01 per query (60-80% reduction)
  - Intelligent reuse based on meaning



In [None]:
import React, { useState, useEffect } from 'react';
import { Search, Clock, Database, Zap, BarChart3, RefreshCw } from 'lucide-react';

const SemanticCachingDemo = () => {
  // Text currently typed into the query input.
  const [query, setQuery] = useState('');
  // Outcome of the most recent search (null until the first search completes).
  const [results, setResults] = useState(null);
  // True while a search is in flight; drives the spinner and disables the button.
  const [isLoading, setIsLoading] = useState(false);
  // Whether the semantic cache is consulted and populated during searches.
  const [cacheEnabled, setCacheEnabled] = useState(true);
  // Running usage counters shown in the "Semantic Cache Status" panel.
  const [stats, setStats] = useState({
    cacheHits: 0,
    cacheMisses: 0,
    totalQueries: 0,
    avgResponseTime: 0
  });

  // Demo knowledge base: a fixed list of five documents, each with a numeric
  // `id`, the full `content` text, and a short `topic` label. It stands in for
  // the document store that a real RAG system would query.
  const knowledgeBase = [
    { id: 1, content: "Machine learning is a subset of artificial intelligence that enables computers to learn and make decisions from data without being explicitly programmed.", topic: "ML Basics" },
    { id: 2, content: "Deep learning uses neural networks with multiple layers to model and understand complex patterns in data, similar to how the human brain processes information.", topic: "Deep Learning" },
    { id: 3, content: "Natural language processing (NLP) is a branch of AI that helps computers understand, interpret, and generate human language in a meaningful way.", topic: "NLP" },
    { id: 4, content: "Computer vision enables machines to identify and analyze visual content from images and videos, including object detection and image classification.", topic: "Computer Vision" },
    { id: 5, content: "Reinforcement learning is a type of machine learning where agents learn to make decisions by receiving rewards or penalties for their actions in an environment.", topic: "Reinforcement Learning" }
  ];

  // Semantic cache: a list of entries, each holding the original `query`, its
  // 5-dimensional `embedding`, the cached `response`, and a creation `timestamp`
  // in milliseconds. The three seed entries use hand-written embedding values and
  // timestamps 60, 120 and 180 seconds before the moment this initial value is
  // evaluated.
  const [semanticCache, setSemanticCache] = useState([
    { query: "what is machine learning", embedding: [0.8, 0.6, 0.9, 0.2, 0.3], response: "Machine learning is a subset of AI that learns from data", timestamp: Date.now() - 60000 },
    { query: "explain deep learning", embedding: [0.7, 0.9, 0.8, 0.1, 0.4], response: "Deep learning uses multi-layer neural networks", timestamp: Date.now() - 120000 },
    { query: "how does NLP work", embedding: [0.6, 0.3, 0.7, 0.9, 0.2], response: "NLP processes and understands human language", timestamp: Date.now() - 180000 }
  ]);

  // Simple embedding simulation (in real systems, this would use models like
  // BERT, OpenAI embeddings, etc.).
  //
  // Maps `text` to a 5-dimensional vector with one dimension per topic. A
  // dimension gets its topic weight when any of that topic's keywords occurs in
  // the text, then up to 0.3 of random noise is added and the value is capped
  // at 1. Because of the noise, two calls with the same text return slightly
  // different vectors.
  const generateEmbedding = (text) => {
    // Split on runs of non-alphanumeric characters so that punctuation
    // ("learning?", "NLP.") and repeated spaces do not hide a keyword.
    const words = text.toLowerCase().split(/[^a-z0-9]+/).filter(Boolean);

    // One entry per embedding dimension, in dimension order.
    const topics = [
      { keywords: ['machine', 'learning', 'ml'], weight: 0.8 },
      { keywords: ['deep', 'neural', 'network'], weight: 0.9 },
      { keywords: ['language', 'nlp', 'text'], weight: 0.7 },
      { keywords: ['vision', 'image', 'visual'], weight: 0.8 },
      { keywords: ['reinforcement', 'reward', 'agent'], weight: 0.9 }
    ];

    return topics.map(({ keywords, weight }) => {
      const base = words.some(w => keywords.includes(w)) ? weight : 0;
      return Math.min(base + Math.random() * 0.3, 1);
    });
  };

  // Calculate the cosine similarity of two numeric vectors: the dot product
  // divided by the product of the two magnitudes.
  // Returns 0 when either vector has zero magnitude (including empty vectors),
  // because the similarity is undefined there and NaN would silently fail every
  // threshold comparison downstream.
  const cosineSimilarity = (vec1, vec2) => {
    const dotProduct = vec1.reduce((sum, a, i) => sum + a * vec2[i], 0);
    const magnitude1 = Math.sqrt(vec1.reduce((sum, a) => sum + a * a, 0));
    const magnitude2 = Math.sqrt(vec2.reduce((sum, a) => sum + a * a, 0));
    const denominator = magnitude1 * magnitude2;
    if (denominator === 0) return 0;
    return dotProduct / denominator;
  };

  // RAG retrieval simulation.
  // Gives every knowledge-base document a random relevance score in [0.5, 1)
  // and returns the two highest-scoring documents, best first. The
  // `queryEmbedding` argument is accepted but not used by the simulation.
  const retrieveRelevantDocs = (queryEmbedding) => {
    const scored = [];
    for (const doc of knowledgeBase) {
      // Random score stands in for a real similarity computation.
      scored.push({ ...doc, score: 0.5 + Math.random() * 0.5 });
    }
    scored.sort((left, right) => right.score - left.score);
    return scored.slice(0, 2);
  };

  // Generate a response from the retrieved documents.
  // Quotes the first 100 characters of the top-ranked document. `query` is
  // accepted for interface compatibility but is not used.
  // When no documents were retrieved, returns a fallback message instead of
  // throwing on `docs[0]`.
  const generateResponse = (docs, query) => {
    if (!docs || docs.length === 0) {
      return 'No relevant information was found for this query.';
    }
    return `Based on the retrieved information: ${docs[0].content.substring(0, 100)}...`;
  };

  // Search with semantic caching.
  // Embeds the current query, then either serves the first cache entry whose
  // cosine similarity exceeds the threshold, or (on a miss) waits 1.5 s to
  // simulate retrieval + generation, builds a response from the retrieved
  // documents and stores it in the cache. Finally publishes the result and
  // updates the usage statistics.
  const handleSearch = async () => {
    // Ignore blank queries, and ignore re-entrant calls: the Enter-key handler
    // is not disabled while a search is pending, unlike the button.
    if (!query.trim() || isLoading) return;

    setIsLoading(true);
    const startTime = Date.now();

    try {
      const queryEmbedding = generateEmbedding(query);

      let response;
      let fromCache = false;
      let retrievedDocs = [];

      if (cacheEnabled) {
        // Check semantic cache
        const threshold = 0.85; // Similarity threshold
        const cacheHit = semanticCache.find(cached =>
          cosineSimilarity(cached.embedding, queryEmbedding) > threshold
        );

        if (cacheHit) {
          response = `[CACHED] ${cacheHit.response}`;
          fromCache = true;
        }
      }

      if (!fromCache) {
        // Perform RAG retrieval and generation
        await new Promise(resolve => setTimeout(resolve, 1500)); // Simulate processing time
        // Retrieve once and reuse the same list for both the response and the
        // "Retrieved Documents" panel; retrieval is randomised, so a second
        // call would display documents that were not used for the answer.
        retrievedDocs = retrieveRelevantDocs(queryEmbedding);
        response = generateResponse(retrievedDocs, query);

        // Add to semantic cache
        if (cacheEnabled) {
          setSemanticCache(prev => [...prev, {
            query,
            embedding: queryEmbedding,
            response,
            timestamp: Date.now()
          }]);
        }
      }

      const responseTime = Date.now() - startTime;

      setResults({
        query,
        response,
        fromCache,
        responseTime,
        retrievedDocs
      });

      // Single stats update. The running average is weighted by the number of
      // queries seen BEFORE this one and divided by the new total, so the
      // first query no longer divides by zero.
      setStats(prev => {
        const totalQueries = prev.totalQueries + 1;
        return {
          ...prev,
          cacheHits: prev.cacheHits + (fromCache ? 1 : 0),
          cacheMisses: prev.cacheMisses + (fromCache ? 0 : 1),
          totalQueries,
          avgResponseTime: Math.round(
            (prev.avgResponseTime * prev.totalQueries + responseTime) / totalQueries
          )
        };
      });
    } finally {
      // Always leave the loading state, even if something above throws.
      setIsLoading(false);
    }
  };

  // Empties the semantic cache and resets every usage counter to zero.
  const clearCache = () => {
    const emptyCache = [];
    const zeroedStats = {
      cacheHits: 0,
      cacheMisses: 0,
      totalQueries: 0,
      avgResponseTime: 0
    };
    setSemanticCache(emptyCache);
    setStats(zeroedStats);
  };

  // Example queries rendered as shortcut buttons under the search box;
  // clicking one copies its text into the query input.
  const sampleQueries = [
    "what is machine learning?",
    "explain artificial intelligence",
    "how does deep learning work?",
    "what are neural networks?",
    "tell me about NLP",
    "computer vision applications"
  ];

  return (
    <div className="max-w-6xl mx-auto p-6 bg-gradient-to-br from-blue-50 to-indigo-100 min-h-screen">
      <div className="mb-8 text-center">
        <h1 className="text-4xl font-bold text-gray-800 mb-4">Semantic Caching in RAG Applications</h1>
        <p className="text-lg text-gray-600 max-w-4xl mx-auto">
          Semantic caching stores query results based on semantic similarity rather than exact matches, 
          reducing response times and computational costs in RAG systems.
        </p>
      </div>

      {/* Theory Section */}
      <div className="bg-white rounded-lg shadow-lg p-6 mb-6">
        <h2 className="text-2xl font-bold text-gray-800 mb-4 flex items-center">
          <Database className="mr-2 text-blue-600" />
          What is Semantic Caching?
        </h2>
        <div className="grid md:grid-cols-2 gap-6">
          <div>
            <h3 className="text-lg font-semibold text-gray-700 mb-2">Traditional Caching vs Semantic Caching</h3>
            <div className="space-y-3">
              <div className="p-3 bg-red-50 rounded border-l-4 border-red-400">
                <strong>Traditional:</strong> Exact string match only<br/>
                <span className="text-sm text-gray-600">"what is ML?" ≠ "explain machine learning"</span>
              </div>
              <div className="p-3 bg-green-50 rounded border-l-4 border-green-400">
                <strong>Semantic:</strong> Meaning-based similarity<br/>
                <span className="text-sm text-gray-600">"what is ML?" ≈ "explain machine learning" (85% similar)</span>
              </div>
            </div>
          </div>
          <div>
            <h3 className="text-lg font-semibold text-gray-700 mb-2">When is it Used?</h3>
            <ul className="space-y-2 text-sm text-gray-600">
              <li>• <strong>High-frequency similar queries:</strong> Customer support, FAQ systems</li>
              <li>• <strong>Expensive retrieval operations:</strong> Large vector databases</li>
              <li>• <strong>LLM API cost optimization:</strong> Reducing API calls</li>
              <li>• <strong>Real-time applications:</strong> Chatbots, search engines</li>
            </ul>
          </div>
        </div>
      </div>

      {/* Demo Interface */}
      <div className="grid lg:grid-cols-2 gap-6 mb-6">
        {/* Search Interface */}
        <div className="bg-white rounded-lg shadow-lg p-6">
          <h3 className="text-xl font-bold text-gray-800 mb-4 flex items-center">
            <Search className="mr-2 text-blue-600" />
            RAG Query Interface
          </h3>
          
          <div className="mb-4">
            <label className="flex items-center space-x-2 text-sm font-medium text-gray-700 mb-2">
              <input
                type="checkbox"
                checked={cacheEnabled}
                onChange={(e) => setCacheEnabled(e.target.checked)}
                className="rounded"
              />
              <span>Enable Semantic Caching</span>
            </label>
          </div>

          <div className="mb-4">
            <input
              type="text"
              value={query}
              onChange={(e) => setQuery(e.target.value)}
              placeholder="Enter your query..."
              className="w-full p-3 border rounded-lg focus:ring-2 focus:ring-blue-500 focus:border-transparent"
              onKeyPress={(e) => e.key === 'Enter' && handleSearch()}
            />
            <button
              onClick={handleSearch}
              disabled={isLoading || !query.trim()}
              className="mt-2 w-full bg-blue-600 text-white py-2 px-4 rounded-lg hover:bg-blue-700 disabled:opacity-50 disabled:cursor-not-allowed flex items-center justify-center"
            >
              {isLoading ? <RefreshCw className="animate-spin mr-2" size={16} /> : <Search className="mr-2" size={16} />}
              {isLoading ? 'Processing...' : 'Search'}
            </button>
          </div>

          <div className="mb-4">
            <h4 className="text-sm font-medium text-gray-700 mb-2">Sample Queries:</h4>
            <div className="flex flex-wrap gap-2">
              {sampleQueries.map((sample, idx) => (
                <button
                  key={idx}
                  onClick={() => setQuery(sample)}
                  className="text-xs bg-gray-100 hover:bg-gray-200 px-2 py-1 rounded transition-colors"
                >
                  {sample}
                </button>
              ))}
            </div>
          </div>

          {results && (
            <div className="mt-4 p-4 bg-gray-50 rounded-lg">
              <div className="flex items-center justify-between mb-2">
                <h4 className="font-medium text-gray-800">Response</h4>
                <div className="flex items-center space-x-4 text-sm text-gray-600">
                  <span className={`flex items-center ${results.fromCache ? 'text-green-600' : 'text-orange-600'}`}>
                    {results.fromCache ? <Zap size={14} className="mr-1" /> : <Clock size={14} className="mr-1" />}
                    {results.fromCache ? 'Cached' : 'Generated'}
                  </span>
                  <span>{results.responseTime}ms</span>
                </div>
              </div>
              <p className="text-sm text-gray-700 bg-white p-3 rounded border">
                {results.response}
              </p>
              {!results.fromCache && results.retrievedDocs && (
                <div className="mt-2">
                  <h5 className="text-xs font-medium text-gray-600 mb-1">Retrieved Documents:</h5>
                  <div className="space-y-1">
                    {results.retrievedDocs.map((doc, idx) => (
                      <div key={idx} className="text-xs bg-blue-50 p-2 rounded">
                        <strong>{doc.topic}</strong> (Score: {doc.score.toFixed(2)})
                      </div>
                    ))}
                  </div>
                </div>
              )}
            </div>
          )}
        </div>

        {/* Cache Status */}
        <div className="bg-white rounded-lg shadow-lg p-6">
          <h3 className="text-xl font-bold text-gray-800 mb-4 flex items-center justify-between">
            <span className="flex items-center">
              <Database className="mr-2 text-green-600" />
              Semantic Cache Status
            </span>
            <button
              onClick={clearCache}
              className="text-sm bg-red-100 text-red-700 px-3 py-1 rounded hover:bg-red-200"
            >
              Clear Cache
            </button>
          </h3>

          <div className="mb-4">
            <h4 className="font-medium text-gray-700 mb-2">Cached Queries ({semanticCache.length})</h4>
            <div className="space-y-2 max-h-40 overflow-y-auto">
              {semanticCache.map((cached, idx) => (
                <div key={idx} className="text-sm bg-gray-50 p-2 rounded">
                  <div className="font-medium text-gray-700 truncate">{cached.query}</div>
                  <div className="text-xs text-gray-500">
                    Cached {Math.round((Date.now() - cached.timestamp) / 1000)}s ago
                  </div>
                </div>
              ))}
            </div>
          </div>

          <div className="grid grid-cols-2 gap-4">
            <div className="bg-green-50 p-3 rounded text-center">
              <div className="text-2xl font-bold text-green-700">{stats.cacheHits}</div>
              <div className="text-sm text-green-600">Cache Hits</div>
            </div>
            <div className="bg-orange-50 p-3 rounded text-center">
              <div className="text-2xl font-bold text-orange-700">{stats.cacheMisses}</div>
              <div className="text-sm text-orange-600">Cache Misses</div>
            </div>
          </div>

          <div className="mt-4 text-center">
            <div className="text-lg font-bold text-gray-700">{stats.avgResponseTime}ms</div>
            <div className="text-sm text-gray-600">Avg Response Time</div>
          </div>

          {stats.totalQueries > 0 && (
            <div className="mt-4 bg-blue-50 p-3 rounded">
              <div className="text-sm text-blue-700">
                <strong>Cache Hit Rate:</strong> {Math.round((stats.cacheHits / stats.totalQueries) * 100)}%
              </div>
            </div>
          )}
        </div>
      </div>

      {/* Performance Comparison */}
      <div className="bg-white rounded-lg shadow-lg p-6">
        <h3 className="text-xl font-bold text-gray-800 mb-4 flex items-center">
          <BarChart3 className="mr-2 text-purple-600" />
          Performance Impact Analysis
        </h3>
        <div className="grid md:grid-cols-2 gap-6">
          <div>
            <h4 className="font-semibold text-gray-700 mb-3">Without Semantic Caching</h4>
            <div className="space-y-2 text-sm">
              <div className="flex justify-between">
                <span>Average Response Time:</span>
                <span className="font-medium text-red-600">1200-2000ms</span>
              </div>
              <div className="flex justify-between">
                <span>Vector DB Queries:</span>
                <span className="font-medium text-red-600">Every request</span>
              </div>
              <div className="flex justify-between">
                <span>LLM API Calls:</span>
                <span className="font-medium text-red-600">Every request</span>
              </div>
              <div className="flex justify-between">
                <span>Cost per Query:</span>
                <span className="font-medium text-red-600">$0.02-0.05</span>
              </div>
            </div>
          </div>
          <div>
            <h4 className="font-semibold text-gray-700 mb-3">With Semantic Caching</h4>
            <div className="space-y-2 text-sm">
              <div className="flex justify-between">
                <span>Average Response Time:</span>
                <span className="font-medium text-green-600">100-300ms</span>
              </div>
              <div className="flex justify-between">
                <span>Vector DB Queries:</span>
                <span className="font-medium text-green-600">Cache misses only</span>
              </div>
              <div className="flex justify-between">
                <span>LLM API Calls:</span>
                <span className="font-medium text-green-600">Cache misses only</span>
              </div>
              <div className="flex justify-between">
                <span>Cost per Query:</span>
                <span className="font-medium text-green-600">$0.001-0.01</span>
              </div>
            </div>
          </div>
        </div>
        <div className="mt-4 p-4 bg-yellow-50 rounded-lg">
          <h5 className="font-semibold text-yellow-800 mb-2">Key Benefits:</h5>
          <ul className="text-sm text-yellow-700 space-y-1">
            <li>• <strong>80-90% faster response times</strong> for similar queries</li>
            <li>• <strong>60-80% cost reduction</strong> in API usage</li>
            <li>• <strong>Reduced server load</strong> and better scalability</li>
            <li>• <strong>Improved user experience</strong> with instant responses</li>
          </ul>
        </div>
      </div>

      {/* Implementation Notes */}
      <div className="mt-6 bg-white rounded-lg shadow-lg p-6">
        <h3 className="text-xl font-bold text-gray-800 mb-4">Implementation Considerations</h3>
        <div className="grid md:grid-cols-3 gap-4 text-sm">
          <div className="bg-blue-50 p-4 rounded">
            <h4 className="font-semibold text-blue-800 mb-2">Embedding Models</h4>
            <ul className="text-blue-700 space-y-1">
              <li>• OpenAI text-embedding-ada-002</li>
              <li>• Sentence-BERT models</li>
              <li>• Cohere embeddings</li>
              <li>• Custom domain-specific models</li>
            </ul>
          </div>
          <div className="bg-green-50 p-4 rounded">
            <h4 className="font-semibold text-green-800 mb-2">Storage Options</h4>
            <ul className="text-green-700 space-y-1">
              <li>• Redis with vector similarity</li>
              <li>• Elasticsearch with dense vectors</li>
              <li>• Pinecone with metadata filtering</li>
              <li>• In-memory caches (LRU)</li>
            </ul>
          </div>
          <div className="bg-purple-50 p-4 rounded">
            <h4 className="font-semibold text-purple-800 mb-2">Best Practices</h4>
            <ul className="text-purple-700 space-y-1">
              <li>• Similarity threshold: 0.8-0.9</li>
              <li>• Cache TTL: 1-24 hours</li>
              <li>• Monitor hit rates regularly</li>
              <li>• Handle cache invalidation</li>
            </ul>
          </div>
        </div>
      </div>
    </div>
  );
};

export default SemanticCachingDemo;

In [None]:
# Normal caching
# Save the exact query -> response pair (e.g. Memcached)

# Semantic caching
# Similar queries -> the same cached result

# Instead of matching only exact queries, we store the embedding of each query.
# When a new query comes in, we check whether it is similar to an already cached query.
# If it is, we return the cached answer - this saves a lot of money by skipping expensive LLM calls or database lookups.


# Technical Implementation Details 

In [None]:
# Step 1: Set up the semantic cache
# Decide which embedding model and LLM to use, then initialise the vector
# store that backs the semantic cache.
import hashlib

from langchain_chroma import Chroma
from langchain_core.documents import Document
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# Chroma receives the embedding model through `embedding_function`.
cache_vector_store = Chroma(
    collection_name="semantic_cache",
    embedding_function=embeddings,
)

llm = ChatOpenAI(model="gpt-4o-mini")

# Step 2: Define the cache check
# Minimum similarity a cached entry must exceed to be served as a hit.
similarity_threshold = 0.9

def check_cache(query: str) -> "str | None":
    """Return the cached response for a semantically similar query, if any.

    Looks up the single nearest cached query and returns its stored response
    when the relevance score exceeds ``similarity_threshold``; otherwise
    returns ``None``.
    """
    # Plain similarity_search() returns Documents without any score, so the
    # score has to be requested explicitly as (Document, score) pairs.
    results = cache_vector_store.similarity_search_with_relevance_scores(query, k=1)
    if not results:
        return None
    doc, score = results[0]
    if score > similarity_threshold:
        return doc.metadata.get("response")
    return None


def update_cache(query: str, response: str) -> None:
    """Store a query and its response in the cache for future lookups.

    The query text is stored as the document content, because that is the
    text the vector store embeds and searches against; the response is kept
    in the document metadata under ``"response"``.
    """
    import hashlib  # local import so this function does not rely on file-level imports

    # Derive a stable string id from the query so the same query always maps
    # to the same id. md5 is used only as a fingerprint here, not for security.
    query_id = hashlib.md5(query.encode("utf-8")).hexdigest()
    doc = Document(page_content=query, metadata={"response": response})
    cache_vector_store.add_documents([doc], ids=[query_id])

# Step 3: RAG workflow with cache
def answer_with_cache(query: str) -> str:
    """Answer ``query``, serving from the semantic cache when possible.

    Returns the cached response on a hit. On a miss, generates a response,
    stores it in the cache and returns it.
    """
    # 3a: Check the semantic cache first.
    cached_response = check_cache(query)
    if cached_response is not None:
        return cached_response

    # 3b: Cache miss - go ahead with the rest of the RAG workflow.
    # NOTE(review): retrieval and prompt construction are not defined in this
    # file; the LLM is called directly on the query as a placeholder.
    response = llm.invoke(query).content

    # 3c: Update the cache so similar future queries can skip the LLM call.
    update_cache(query, response)
    return response
