In [34]:
from pathlib import Path
import string

def remove_interpunction(text: str) -> str:
    """Return ``text`` with every ASCII punctuation character removed.

    The characters dropped are exactly those listed in ``string.punctuation``;
    letters, digits, whitespace and non-ASCII symbols are left untouched.
    """
    # A translation table whose third argument lists characters to delete
    # strips all of them in a single pass over the string.
    strip_table = str.maketrans('', '', string.punctuation)
    return text.translate(strip_table)

def fix_whitespace(text: str) -> str:
    """Collapse every run of whitespace into one space and trim both ends."""
    # split() without a separator treats any whitespace run as a single
    # delimiter and discards leading/trailing whitespace.
    words = text.split()
    return ' '.join(words)

def read_file(path: Path) -> str:
    """Read a text file and return its whole content as a string.

    The file is decoded as UTF-8 explicitly, matching ``load_recipes``;
    without an explicit encoding ``open`` falls back to a platform-dependent
    default, so the same file could decode differently across machines.

    Args:
        path: Location of the file to read.

    Returns:
        The complete decoded file content.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(path, 'r', encoding='utf-8') as file:
        return file.read()

def normalize_text(text: str) -> str:
    """Normalize raw text: strip punctuation, collapse whitespace, lowercase.

    The steps run in that order, so whitespace left behind by removed
    punctuation is collapsed as well.
    """
    without_punctuation = remove_interpunction(text)
    single_spaced = fix_whitespace(without_punctuation)
    return single_spaced.lower()

In [35]:
import os
from typing import List, Dict

def load_recipes(base_dir: str = "data", max_index: int = 200) -> List[Dict]:
    """
    Load recipes from ``base_dir/<i>.txt`` for ``i`` in ``range(max_index)``.

    Indices run from 0 up to, but not including, ``max_index``. A missing
    file is reported with a printed warning and skipped.

    Args:
        base_dir: Directory that contains the numbered ``<i>.txt`` files.
        max_index: Exclusive upper bound of the indices that are tried.

    Returns:
        One dict per file found, in index order, with the keys ``id``
        (the index as a string), ``title`` (``recipe_<i>``), ``text``
        (the file content passed through ``normalize_text``) and ``path``.
    """
    docs = []
    for i in range(max_index):  # 0 .. max_index - 1
        recipe_path = os.path.join(base_dir, f"{i}.txt")

        if not os.path.exists(recipe_path):
            # Skip missing indices gracefully
            print(f"Warning: {recipe_path} not found, skipping.")
            continue

        # Only the read needs the open handle; normalize after it is closed.
        with open(recipe_path, "r", encoding="utf-8") as f:
            raw_text = f.read()

        docs.append(
            {
                "id": str(i),
                "title": f"recipe_{i}",
                "text": normalize_text(raw_text),
                "path": recipe_path,
            }
        )
    return docs


# Load the corpus from the notebook-relative data directory. With the default
# max_index this tries 0.txt .. 199.txt and prints a warning for each missing file.
recipes = load_recipes("../model/data")
print(len(recipes), "recipes loaded.")

200 recipes loaded.


In [36]:
def chunk_text_chars(text: str, max_chars: int = 500, overlap: int = 50) -> list[str]:
    """Split ``text`` into consecutive character windows that overlap.

    Each chunk holds at most ``max_chars`` characters and starts
    ``max_chars - overlap`` characters after the previous one, so neighbours
    share ``overlap`` characters. When ``overlap >= max_chars`` the windows
    could not advance, so they are laid out back to back with no overlap.

    Args:
        text: The text to split. An empty string yields an empty list.
        max_chars: Maximum chunk length; must be positive.
        overlap: Number of characters shared by adjacent chunks; must not be
            negative.

    Returns:
        The chunks in reading order. The last chunk ends exactly at the end
        of ``text`` and may be shorter than ``max_chars``.

    Raises:
        ValueError: If ``max_chars`` is not positive (the window could never
            advance) or ``overlap`` is negative (characters between windows
            would be silently dropped).
    """
    if max_chars <= 0:
        raise ValueError(f"max_chars must be positive, got {max_chars}")
    if overlap < 0:
        raise ValueError(f"overlap must be non-negative, got {overlap}")

    # Distance between consecutive window starts; fall back to a full window
    # when the requested overlap would stall or reverse the scan.
    step = max_chars - overlap
    if step <= 0:
        step = max_chars

    n = len(text)
    chunks = []
    for start in range(0, n, step):
        chunks.append(text[start:start + max_chars])
        # This window already reaches the end of the text; a further window
        # would only repeat part of the tail.
        if start + max_chars >= n:
            break
    return chunks

def build_recipe_chunks(recipes, max_chars: int = 500, overlap: int = 50):
    """
    For each recipe document, create smaller chunks.

    Args:
        recipes: Iterable of recipe dicts; each must provide the keys
            ``id``, ``title`` and ``text``.
        max_chars: Maximum chunk length handed to ``chunk_text_chars``.
        overlap: Character overlap handed to ``chunk_text_chars``.

    Returns:
        A flat list with one dict per chunk, in document order, holding
        ``doc_id`` and ``title`` copied from the recipe, ``chunk_id``
        (``<doc id>_chunk_<n>``, numbered from 0 within each recipe) and
        ``text`` (the chunk itself).
    """
    chunks = []
    for doc in recipes:
        pieces = chunk_text_chars(doc["text"], max_chars=max_chars, overlap=overlap)
        for i, chunk in enumerate(pieces):
            chunks.append(
                {
                    "doc_id": doc["id"],
                    "chunk_id": f'{doc["id"]}_chunk_{i}',
                    "title": doc["title"],
                    "text": chunk,
                }
            )
    return chunks

# Split every loaded recipe into overlapping character windows; these chunks
# are the units that get embedded and retrieved further down.
recipe_chunks = build_recipe_chunks(recipes)
print(len(recipe_chunks), "chunks total.")

1832 chunks total.


In [37]:
pip install sentence-transformers

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [38]:
import numpy as np
from numpy.linalg import norm
from sentence_transformers import SentenceTransformer

# Global models / arrays (for simplicity).
# This single model instance is shared by build_embeddings() and embed_query(),
# so chunks and queries are embedded by the same model.
embed_model = SentenceTransformer("all-MiniLM-L6-v2")  # small, fast

def build_embeddings(recipe_chunks):
    """
    Embed the ``text`` of every chunk with the shared ``embed_model``.

    Returns a NumPy array of shape (num_chunks, embedding_dim); row ``i``
    belongs to ``recipe_chunks[i]``. A progress bar is shown while encoding.
    """
    chunk_texts = []
    for chunk in recipe_chunks:
        chunk_texts.append(chunk["text"])
    return embed_model.encode(chunk_texts, convert_to_numpy=True, show_progress_bar=True)

def embed_query(query: str):
    """Embed one query string with ``embed_model`` and return its vector."""
    # encode() is given a one-element batch; unwrap the single row.
    batch = embed_model.encode([query], convert_to_numpy=True)
    return batch[0]

def retrieve_top_k(query: str, recipe_chunks, embeddings, k: int = 5):
    """
    Given a question, return the top-k most similar chunks.
    """
    q = embed_query(query)
    sims = embeddings @ q / (norm(embeddings, axis=1) * norm(q) + 1e-10)
    top_idx = np.argsort(-sims)[:k]
    results = []
    for idx in top_idx:
        results.append(
            {
                "score": float(sims[idx]),
                "chunk": recipe_chunks[idx],
            }
        )
    return results

In [44]:
# Embed every chunk once; retrieve_top_k() is handed this matrix for each query.
embeddings = build_embeddings(recipe_chunks)
print("Embeddings shape:", embeddings.shape)

# Smoke test of the retriever.
# NOTE(review): the query is an empty string, so the ranking only reflects
# similarity to the embedding of "" — presumably a placeholder; confirm and
# substitute a real question.
hits = retrieve_top_k("", recipe_chunks, embeddings, k=10)

# Hits are chunks, so the same recipe title can be printed more than once.
for h in hits:
    print(f"score={h['score']:.3f}  title={h['chunk']['title']}")

Batches: 100%|██████████| 58/58 [00:06<00:00,  9.24it/s]

Embeddings shape: (1832, 384)
score=0.255  title=recipe_113
score=0.224  title=recipe_100
score=0.183  title=recipe_100
score=0.181  title=recipe_100
score=0.171  title=recipe_39
score=0.169  title=recipe_115
score=0.165  title=recipe_63
score=0.159  title=recipe_161
score=0.158  title=recipe_161
score=0.157  title=recipe_39





In [45]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

class GenerativeQA:
    """Answer recipe questions with a seq2seq model over retrieved context.

    Attributes are created in ``__init__``:
        tokenizer: Tokenizer loaded for ``model_name``.
        model: Seq2seq model loaded for ``model_name`` on the chosen device.
        pipe: ``text2text-generation`` pipeline wrapping both.
    """

    # Tokenizers without a configured limit report a huge sentinel as
    # ``model_max_length``; anything above this is treated as "no limit".
    _UNBOUNDED_LENGTH = 1_000_000

    def __init__(self, device: str = "cpu", model_name: str = "google/flan-t5-small") -> None:
        """Load tokenizer, model and generation pipeline.

        Args:
            device: Requested torch device string (e.g. ``"cpu"``,
                ``"cuda"``, ``"cuda:1"``). It is only honoured when CUDA is
                available; otherwise the CPU is used.
            model_name: Model identifier passed to ``from_pretrained``.
        """
        device_obj = torch.device(device if torch.cuda.is_available() else "cpu")

        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device_obj)

        # Hand the pipeline the same GPU ordinal the model was moved to;
        # a bare "cuda" has no index and means device 0, -1 selects the CPU.
        if device_obj.type == "cuda":
            pipe_device = device_obj.index if device_obj.index is not None else 0
        else:
            pipe_device = -1

        self.pipe = pipeline(
            "text2text-generation",
            model=self.model,
            tokenizer=self.tokenizer,
            device=pipe_device,
        )

    def _fit_context(self, context: str, header: str, footer: str) -> str:
        """Trim ``context`` so header + context + footer fit the tokenizer limit.

        The context is returned unchanged when the tokenizer reports no usable
        ``model_max_length`` or when everything already fits. When trimming is
        needed, the kept tokens are decoded back to text, so whitespace in the
        result may differ from the input.
        """
        limit = getattr(self.tokenizer, "model_max_length", None)
        if not isinstance(limit, int) or limit <= 0 or limit > self._UNBOUNDED_LENGTH:
            return context

        # Tokens taken by the fixed prompt parts (special tokens included).
        fixed_len = len(self.tokenizer(header + footer)["input_ids"])
        budget = limit - fixed_len
        if budget <= 0:
            # The question alone fills the window; no room for context.
            return ""

        context_ids = self.tokenizer(context, add_special_tokens=False)["input_ids"]
        if len(context_ids) <= budget:
            return context
        return self.tokenizer.decode(context_ids[:budget], skip_special_tokens=True)

    def answer(self, question: str, contexts: list[str]) -> str:
        """
        Answer a question given a list of retrieved context chunks.

        The chunks are joined with blank lines and placed between the
        instructions and the question. Because the question and the final
        "Answer:" cue sit at the end of the prompt, an over-long context is
        shortened beforehand so that truncation does not remove them.

        Args:
            question: The user's question.
            contexts: Retrieved chunk texts, most relevant first.

        Returns:
            The generated answer with surrounding whitespace stripped.
        """
        header = (
            "You are an assistant that answers questions about recipes.\n"
            "Use only the information in the context. "
            "If the answer is not in the context, say you don't know.\n\n"
            "Context:\n"
        )
        footer = f"\n\nQuestion: {question}\n\nAnswer:"

        joined_context = self._fit_context("\n\n".join(contexts), header, footer)
        prompt = f"{header}{joined_context}{footer}"

        out = self.pipe(
            prompt,
            max_new_tokens=128,
            num_beams=4,
            do_sample=False,
            # Kept as a safety net: re-tokenizing the trimmed prompt may
            # still come out slightly over the limit.
            truncation=True,
        )[0]["generated_text"]
        return out.strip()

In [46]:
def ask_recipe_question(gen_qa: GenerativeQA,
                        question: str,
                        recipe_chunks,
                        embeddings,
                        k_retrieval: int = 5):
    """Retrieve the most relevant chunks for ``question`` and answer from them.

    Returns a ``(answer, hits)`` tuple: the text produced by
    ``gen_qa.answer`` and the hit list from ``retrieve_top_k`` (so callers
    can show where the answer came from).
    """
    # Retrieval step: the k_retrieval best-matching chunks.
    retrieved = retrieve_top_k(question, recipe_chunks, embeddings, k=k_retrieval)

    # Generation step: only the chunk texts are handed to the model.
    context_texts = []
    for hit in retrieved:
        context_texts.append(hit["chunk"]["text"])
    return gen_qa.answer(question, context_texts), retrieved

In [None]:
# No cell visible above creates the QA model; build it once with the default
# settings if it is missing, so this cell also works on a fresh kernel.
try:
    gen_qa
except NameError:
    gen_qa = GenerativeQA()

# Keep answering questions until the user types 'quit', as the prompt offers.
while True:
    q = input("\nAsk a question about your recipes (or type 'quit'): ").strip()

    if q.lower() == "quit":
        break
    if not q:
        # Nothing to retrieve against; ask again instead of querying with "".
        continue

    answer, hits = ask_recipe_question(gen_qa, q, recipe_chunks, embeddings, k_retrieval=5)

    print("\nAnswer:")
    print(answer)

    # Show only the three best sources, although five chunks fed the answer.
    print("\nTop retrieved recipe chunks:")
    for h in hits[:3]:
        print(f"- Recipe {h['chunk']['doc_id']} ({h['chunk']['title']}), score={h['score']:.3f}")
    print("-" * 60)