# RAG Semantic Chunking Demo

## Semantic Chunking
Semantic chunking groups sentences by meaning, preserving complete information units.


In [None]:
# Import necessary libraries
import json
import numpy as np
from typing import List, Dict, Tuple

# Notebook banner: title line followed by a 40-character rule.
print("RAG Semantic Chunking Demo", "=" * 40, sep="\n")


In [None]:
# Sample data from the README - Pizza recipe chunks with semantic vectors
#
# Layout: maps a chunk id ("chunk_1" .. "chunk_4") to a record with three keys:
#   "title"   - short heading for the chunk
#   "content" - the chunk text (list-style chunks embed "\n" between items)
#   "vector"  - a 3-element list of floats used as the chunk's embedding
# NOTE(review): the vectors look like illustrative toy values rather than the
# output of a real embedding model - confirm against the README.
pizza_chunks = {
    "chunk_1": {
        "title": "Historical Context and Popularization",
        "content": "Modern pizza was born in Naples, Italy, in the 18th century, but has a fascinating history of cultural exchange. Although flatbread with toppings existed since ancient times, pizza as we know it developed when tomatoes arrived from America in the 16th century. Initially viewed with suspicion, poor Neapolitans began using them on flatbread, creating the first 'pizza al pomodoro.' Pizza truly gained popularity in 1889 when pizzaiolo Raffaele Esposito created 'Pizza Margherita' for Queen Margherita of Savoy. From Naples, it spread throughout Italy and then worldwide thanks to Italian immigration.",
        "vector": [0.18, -0.42, 0.85]
    },
    "chunk_2": {
        "title": "Ingredients (American Connection)",
        "content": "- Tomatoes: Originally from America, fundamental base for the sauce\n- Peppers (optional): Also from America, for extra flavor\n- Wheat flour: For the dough, cultivated in Italy but improved with New World techniques\n- Mozzarella: Traditional Italian buffalo cheese\n- Fresh basil: Italian Mediterranean herb\n- Olive oil: Italian star product\n- Salt and pepper: Basic seasonings",
        "vector": [0.15, -0.51, 0.83]
    },
    "chunk_3": {
        "title": "Preparation Process",
        "content": "1. Mix flour, water, yeast, and salt for the dough\n2. Let rest until it doubles in size\n3. Roll out the dough in a circular shape\n4. Spread tomato sauce evenly\n5. Add mozzarella in pieces\n6. Bake at 480°F for 10-12 minutes\n7. Garnish with fresh basil when it comes out",
        "vector": [0.09, -0.48, 0.76]
    },
    "chunk_4": {
        "title": "Consumption Culture",
        "content": "In Italy, pizza is cut into triangular portions and eaten with hands, slightly folding each piece. It's served hot, directly from the oven, and traditionally accompanied with light red wine or beer. Italians usually eat it for dinner, between 7 and 9 PM, and it's common to share it in family groups or with friends.",
        "vector": [0.21, -0.39, 0.81]
    }
}

# Report how many chunks were loaded, then one line per chunk giving its
# title and the character length of its content.
print(f"Loaded {len(pizza_chunks)} semantic chunks")
for chunk in pizza_chunks.values():
    print(f"- {chunk['title']}: {len(chunk['content'])} chars")


In [None]:
# Simple semantic search helpers: cosine similarity and top-k ranking
def cosine_similarity(a: List[float], b: List[float]) -> float:
    """Return the cosine similarity between two vectors.

    Args:
        a: First vector.
        b: Second vector; must have the same length as ``a``.

    Returns:
        The cosine of the angle between ``a`` and ``b`` as a plain Python
        ``float``. If either vector has zero magnitude the angle is
        undefined, and ``0.0`` is returned.
    """
    vec_a = np.asarray(a, dtype=float)
    vec_b = np.asarray(b, dtype=float)
    norm_product = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
    # A zero-length vector would make this 0/0 -> nan (plus a RuntimeWarning),
    # and nan scores break any ordering built on top of this function.
    if norm_product == 0.0:
        return 0.0
    # float() strips the NumPy scalar wrapper so the annotated type is honoured.
    return float(np.dot(vec_a, vec_b) / norm_product)

def semantic_search(query_vector: List[float], chunks: Dict, top_k: int = 2) -> List[Tuple[str, float]]:
    """Rank chunks against a query vector and keep the best matches.

    Args:
        query_vector: Embedding of the query.
        chunks: Mapping of chunk id to a record holding a ``'vector'`` entry.
        top_k: Number of leading results to keep.

    Returns:
        ``(chunk_id, similarity)`` pairs ordered from most to least similar,
        truncated to the first ``top_k`` entries.
    """
    scored = [
        (chunk_id, cosine_similarity(query_vector, chunk['vector']))
        for chunk_id, chunk in chunks.items()
    ]
    # Highest score first; ties keep the order in which chunks were visited.
    ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)
    return ranked[:top_k]

# Example query vectors, keyed by a short topic name. Each one is meant to
# resemble the embedding of one of the sample chunks.
queries = dict(
    ingredients=[0.12, -0.50, 0.80],  # meant to resemble the ingredients chunk
    history=[0.20, -0.40, 0.85],      # meant to resemble the historical chunk
    cooking=[0.10, -0.45, 0.75],      # meant to resemble the preparation chunk
    eating=[0.22, -0.35, 0.82],       # meant to resemble the consumption chunk
)

# Section banner: heading followed by a 30-character rule.
print("Semantic Search Demo", "=" * 30, sep="\n")


In [None]:
# Run every example query through the search and show its two best chunks.
for query_name, query_vector in queries.items():
    print(f"\n🔍 Query: '{query_name}'")
    print(f"Query vector: {query_vector}")

    top_matches = semantic_search(query_vector, pizza_chunks, top_k=2)
    for i, (chunk_id, similarity) in enumerate(top_matches, start=1):
        chunk = pizza_chunks[chunk_id]
        print(f"  {i}. {chunk['title']} (similarity: {similarity:.3f})")
        # The doubled newline leaves one blank line after each result.
        print(f"     Content preview: {chunk['content'][:100]}...", end="\n\n")
