In [16]:
import os
from typing import List, Dict

def load_recipes(base_dir: str = "data", max_index: int = 200) -> List[Dict]:
    """
    Load recipes from base_dir/<i>/data.txt for i in 0 .. max_index - 1.

    The upper bound is exclusive: the default ``max_index=200`` probes
    indices 0 through 199.

    Args:
        base_dir: Directory holding one numbered sub-directory per recipe.
        max_index: Number of consecutive indices to probe, starting at 0.

    Returns:
        One dict per recipe that was found, in ascending index order, with
        the keys "id" (the index as a string), "title" ("recipe_<i>"),
        "text" (file contents decoded as UTF-8) and "path" (the file read).

    Indices that have no regular ``data.txt`` file are reported with a
    printed warning and skipped.
    """
    docs = []
    for i in range(max_index):  # 0 .. max_index - 1 (upper bound excluded)
        recipe_path = os.path.join(base_dir, str(i), "data.txt")

        # isfile rather than exists: a directory that happens to be named
        # data.txt must be skipped, not handed to open().
        if not os.path.isfile(recipe_path):
            # Skip missing indices gracefully
            print(f"Warning: {recipe_path} not found, skipping.")
            continue

        with open(recipe_path, "r", encoding="utf-8") as f:
            text = f.read()

        docs.append(
            {
                "id": str(i),
                "title": f"recipe_{i}",
                "text": text,
                "path": recipe_path,
            }
        )
    return docs


# Read the recipe corpus; the path is relative to the notebook's working directory.
recipes = load_recipes(base_dir="../model/data")
print(len(recipes), "recipes loaded.")

200 recipes loaded.


In [17]:
def chunk_text_chars(text: str, max_chars: int = 500, overlap: int = 50):
    """
    Split ``text`` into consecutive character windows.

    Each window holds at most ``max_chars`` characters and begins
    ``overlap`` characters before the end of the previous one. The last
    window is the one that reaches the end of ``text``.

    Args:
        text: String to split. An empty string yields an empty list.
        max_chars: Maximum window length; must be positive.
        overlap: Characters shared by neighbouring windows. When it is
            greater than or equal to ``max_chars`` the windows would never
            advance, so they are laid out back to back with no overlap.

    Returns:
        List of substrings of ``text`` in order of appearance.

    Raises:
        ValueError: If ``max_chars`` is not positive (the window could
            never advance, which previously looped forever).
    """
    if max_chars <= 0:
        raise ValueError(f"max_chars must be positive, got {max_chars}")

    # Distance between successive window starts; fall back to a full
    # window when the requested overlap would stall or reverse progress.
    step = max_chars - overlap if overlap < max_chars else max_chars

    chunks = []
    n = len(text)
    start = 0
    while start < n:
        end = min(n, start + max_chars)
        chunks.append(text[start:end])

        # The window that touches the end of the text is the last one.
        if end >= n:
            break

        start += step
    return chunks

def build_recipe_chunks(recipes, max_chars: int = 500, overlap: int = 50):
    """
    Split every recipe document into smaller text chunks.

    Args:
        recipes: Iterable of recipe dicts; each must provide the keys
            "id", "title" and "text".
        max_chars: Maximum chunk length, forwarded to ``chunk_text_chars``.
        overlap: Overlap between neighbouring chunks, forwarded to
            ``chunk_text_chars``.

    Returns:
        Flat list of chunk dicts in document order, each with "doc_id"
        (the source recipe id), "chunk_id" ("<doc id>_chunk_<n>", with n
        counting from 0 within the recipe), "title" and "text".
    """
    chunks = []
    for doc in recipes:
        pieces = chunk_text_chars(doc["text"], max_chars=max_chars, overlap=overlap)
        for i, piece in enumerate(pieces):
            chunks.append(
                {
                    "doc_id": doc["id"],
                    "chunk_id": f'{doc["id"]}_chunk_{i}',
                    "title": doc["title"],
                    "text": piece,
                }
            )
    return chunks

# Break every loaded recipe into chunks and report how many were produced.
recipe_chunks = build_recipe_chunks(recipes=recipes)
print(len(recipe_chunks), "chunks total.")

1908 chunks total.


In [18]:
pip install sentence-transformers

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [19]:
import numpy as np
from numpy.linalg import norm
from sentence_transformers import SentenceTransformer

# Shared embedding model, kept as a notebook-level global for simplicity.
# all-MiniLM-L6-v2 is a small, fast checkpoint.
embed_model = SentenceTransformer(model_name_or_path="all-MiniLM-L6-v2")

def build_embeddings(recipe_chunks):
    """
    Encode the "text" field of every chunk with the shared embedding model.

    Returns the encoder output as a NumPy array, one row per chunk:
    shape (num_chunks, embedding_dim). A progress bar is shown while encoding.
    """
    chunk_texts = []
    for entry in recipe_chunks:
        chunk_texts.append(entry["text"])
    return embed_model.encode(
        chunk_texts,
        convert_to_numpy=True,
        show_progress_bar=True,
    )

def embed_query(query: str):
    """Encode a single query string and return its embedding vector."""
    # The encoder is given a one-element batch; unwrap the only row.
    batch = embed_model.encode([query], convert_to_numpy=True)
    return batch[0]

def retrieve_top_k(query: str, recipe_chunks, embeddings, k: int = 5):
    """
    Return the k chunks whose embeddings are most similar to the query.

    Args:
        query: Question or search text to embed.
        recipe_chunks: Sequence of chunk records; entry i must correspond
            to row i of ``embeddings``.
        embeddings: 2-D array of chunk embeddings, one row per chunk.
        k: Maximum number of results. Values <= 0 yield an empty list.

    Returns:
        List of at most k dicts, best match first, each with "score"
        (cosine similarity as a Python float) and "chunk" (the matching
        entry of ``recipe_chunks``).
    """
    # A negative k would make [:k] slice from the end and return n + k
    # results; treat every non-positive k as "no results" and skip the
    # embedding work entirely.
    if k <= 0:
        return []

    q = embed_query(query)
    # Cosine similarity of the query against every row; the small epsilon
    # keeps the division defined if a vector has zero norm.
    sims = embeddings @ q / (norm(embeddings, axis=1) * norm(q) + 1e-10)
    # Negate so that argsort's ascending order puts the best match first.
    top_idx = np.argsort(-sims)[:k]
    return [
        {
            "score": float(sims[idx]),
            "chunk": recipe_chunks[idx],
        }
        for idx in top_idx
    ]

In [24]:
# Embed every chunk once up front; the resulting matrix is reused for each query.
embeddings = build_embeddings(recipe_chunks=recipe_chunks)
print("Embeddings shape:", embeddings.shape)

# Smoke test: a one-word query, showing which recipes the best chunks come from.
hits = retrieve_top_k(
    query="Tofu",
    recipe_chunks=recipe_chunks,
    embeddings=embeddings,
    k=5,
)

for h in hits:
    print(f"score={h['score']:.3f}  title={h['chunk']['title']}")

Batches: 100%|██████████| 60/60 [00:08<00:00,  7.15it/s]

Embeddings shape: (1908, 384)
score=0.795  title=recipe_67
score=0.748  title=recipe_67
score=0.728  title=recipe_67
score=0.710  title=recipe_67
score=0.707  title=recipe_67



