In [1]:
from pathlib import Path
import string

def remove_interpunction(text: str) -> str:
    exclude = set(string.punctuation)
    return ''.join(ch for ch in text if ch not in exclude)

def fix_whitespace(text: str) -> str:
    return ' '.join(text.split())

def read_file(path: Path) -> str:
    with open(path, 'r') as file:
        content = file.read()
    return content

def normalize_text(text: str) -> str:
    text = remove_interpunction(text)
    text = fix_whitespace(text)
    text = text.lower()
    return text

In [2]:
import os
from typing import List, Dict

def load_recipes(base_dir: str = "data", max_index: int = 100) -> List[Dict]:
    """
    Load recipes from base_dir/i/data.txt for i in [0, max_index].
    Each loaded recipe is a dict with an id, title and text.
    """
    docs = []
    for i in range(max_index):  # 0..200
        recipe_path = os.path.join(base_dir, f"{i}.txt")

        if not os.path.exists(recipe_path):
            # Skip missing indices gracefully
            print(f"Warning: {recipe_path} not found, skipping.")
            continue

        with open(recipe_path, "r", encoding="utf-8") as f:
            text = f.read()
            text = normalize_text(text)

        docs.append(
            {
                "id": str(i),
                "title": f"recipe_{i}",
                "text": text,
                "path": recipe_path,
            }
        )
    return docs


recipes = load_recipes("../model/data")
print(len(recipes), "recipes loaded.")

100 recipes loaded.


In [3]:
from typing import List
from transformers import PreTrainedTokenizerBase

def chunk_text_tokens(
    text: str,
    tokenizer: PreTrainedTokenizerBase,
    max_tokens: int = 200,
    overlap_tokens: int = 50,
) -> List[str]:
    """
    Chunk `text` into overlapping windows based on *token* count.

    - `max_tokens`: maximum number of tokens per chunk.
    - `overlap_tokens`: number of tokens to overlap between consecutive chunks.

    Returns a list of text chunks (substrings of `text`), each corresponding
    to a contiguous span of tokens.
    """

    # Temporarily increase model_max_length so the tokenizer doesn't complain
    old_max_len = tokenizer.model_max_length
    tokenizer.model_max_length = int(1e6)  # something large

    try:
        encoded = tokenizer(
            text,
            add_special_tokens=False,
            return_offsets_mapping=True,
            truncation=False,  # we WANT the full sequence here
        )
    finally:
        # Restore original max length
        tokenizer.model_max_length = old_max_len

    input_ids = encoded["input_ids"]
    offsets = encoded["offset_mapping"]
    n_tokens = len(input_ids)

    chunks = []
    start_tok = 0

    while start_tok < n_tokens:
        # End token index (exclusive)
        end_tok = min(n_tokens, start_tok + max_tokens)

        # Map token span back to character span
        start_char = offsets[start_tok][0]
        end_char = offsets[end_tok - 1][1]  # end index is exclusive

        chunk_text = text[start_char:end_char]
        chunks.append(chunk_text)

        if end_tok >= n_tokens:
            break

        # Next window with overlap in token space
        next_start_tok = end_tok - overlap_tokens

        # Safety: ensure progress
        if next_start_tok <= start_tok:
            next_start_tok = end_tok

        start_tok = next_start_tok

    return chunks

def build_recipe_chunks(recipes, tokenizer, max_tokens: int = 100, overlap_tokens: int = 50):
    """
    For each recipe document, create smaller token-based chunks.
    """
    chunks = []
    for doc in recipes:
        text = doc["text"]
        token_chunks = chunk_text_tokens(
            text,
            tokenizer=tokenizer,
            max_tokens=max_tokens,
            overlap_tokens=overlap_tokens,
        )
        for i, chunk in enumerate(token_chunks):
            chunks.append(
                {
                    "doc_id": doc["id"],
                    "chunk_id": f'{doc["id"]}_chunk_{i}',
                    "title": doc["title"],
                    "text": chunk,
                }
            )
    return chunks
import random
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")
recipe_chunks = build_recipe_chunks(recipes, tokenizer, max_tokens=400, overlap_tokens=0)
import random
random.seed(123)
random.shuffle(recipe_chunks)
print(len(recipe_chunks), "chunks total.")
texts = [c["text"] for c in recipe_chunks]
for text in texts:
    print(text)
    print('\n')

  from .autonotebook import tqdm as notebook_tqdm


289 chunks total.
 bass fillet with cherry tomatoes is ready to be served it is recommended to consume the sea bass fillet with cherry tomatoes immediately if you dont feel like turning on the oven you can make a simple fresh tomato sauce to prepare this recipe for 4 people you can use a sea bass weighing about 22 lbs url httpswwwgiallozafferanocomrecipesseabassfilletwithcherrytomatoeshtml


 the sauce personalize it with your favorite spices or whatever’s in season—this is one of those healthy rice recipes that always suits your mood no question you might also like to prepare black venus rice with vegetables first cook the rice in salted boiling water for the time indicated on the package meanwhile peel the carrots and cut them into cubes about 12 inch trim and cut the zucchini first into sticks and then into cubes of the same size clean and thinly slice the onion sauté the onion for 2 minutes in a pan with a dash of oil then add the carrots and zucchini add the peas salt stir and coo

In [4]:
pip install sentence-transformers


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [5]:
pip install faiss-cpu

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [6]:
import numpy as np
from numpy.linalg import norm
import torch
from sentence_transformers import SentenceTransformer
import faiss
from tqdm import tqdm
# Global models / arrays (for simplicity)
embed_model = SentenceTransformer("all-MiniLM-L6-v2")  # small, fast

def encode_corpus(model, recipe_chunks, batch_size=32, device="cpu"):
    """
    Encode a list of texts into dense vectors using a SentenceTransformer model.
    Uses a SentenceTransformer model (here, all-MiniLM-L6-v2).
    Here we try to transform each paragraph into a point in high-dimensional space
    """
    all_embeddings = []
    texts = [c["text"] for c in recipe_chunks]
    for start_idx in tqdm(range(0, len(texts), batch_size), desc="Encoding corpus"):
        batch_texts = texts[start_idx:start_idx + batch_size]
        with torch.no_grad():
            emb = model.encode(
                batch_texts,
                convert_to_tensor=True,
                show_progress_bar=False,
                device=device
            )
        all_embeddings.append(emb.cpu())

    all_embeddings = torch.cat(all_embeddings, dim=0)
    return all_embeddings.numpy().astype("float32")

def retrieve_top_k(model, index, questions, k=5, batch_size=32, device="cpu"):
    """
    Encode questions and retrieve top-k nearest contexts from the FAISS index.
    Returns:
      - all_scores: [num_questions, k] similarity scores
      - all_indices: [num_questions, k] corpus indices
    We are basically checking which contexts are most semantically similar to the given question
    """
    all_scores = []
    all_indices = []

    for start_idx in tqdm(range(0, len(questions), batch_size), desc="Encoding queries + retrieving"):
        batch_questions = questions[start_idx:start_idx + batch_size]
        with torch.no_grad():
            q_emb = model.encode(
                batch_questions,
                convert_to_tensor=True,
                show_progress_bar=False,
                device=device
            )
        q_emb = q_emb.cpu().numpy().astype("float32")
        faiss.normalize_L2(q_emb)  # same normalization

        scores, indices = index.search(q_emb, k)
        all_scores.append(scores)
        all_indices.append(indices)

    all_scores = np.vstack(all_scores)
    all_indices = np.vstack(all_indices)
    return all_scores, all_indices


In [7]:
def build_faiss_index(embeddings):
    """
    Build a simple FAISS index for inner-product similarity.
    For cosine similarity, it's standard to L2-normalize embeddings.
    This helps us build a 'vector database'
    """
    # L2 normalize embeddings
    faiss.normalize_L2(embeddings)

    dim = embeddings.shape[1]
    index = faiss.IndexFlatIP(dim)  # IP = inner product
    index.add(embeddings)
    print(f"FAISS index built with {index.ntotal} vectors of dim {dim}.")
    return index

In [8]:
embeddings = encode_corpus(embed_model, recipe_chunks)
index = build_faiss_index(embeddings)
print("Embeddings shape:", embeddings.shape)

score, indices = retrieve_top_k(embed_model, index, ["cream cake"], k=5)

for h in indices:
    print(f"score={h}")

Encoding corpus:   0%|          | 0/10 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Encoding corpus: 100%|██████████| 10/10 [00:06<00:00,  1.66it/s]


FAISS index built with 289 vectors of dim 384.
Embeddings shape: (289, 384)


Encoding queries + retrieving: 100%|██████████| 1/1 [00:00<00:00, 63.27it/s]

score=[166 265  61 223 235]





In [9]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
GEN_MODEL_NAME = "t5-small"
class GenerativeQA:
    def __init__(self, device: str = "cpu", model_name: str = GEN_MODEL_NAME) -> None:
        device_obj = torch.device(device if torch.cuda.is_available() else "cpu")

        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device_obj)

        self.pipe = pipeline(
            "text2text-generation",
            model=self.model,
            tokenizer=self.tokenizer,
            device=0 if device_obj.type == "cuda" else -1,
        )

    def answer(self, question: str, contexts: list[str]) -> str:
        """
        Answer a question given a list of retrieved context chunks.
        """
        joined_context = "\n".join(contexts)

        prompt = (
            f"question: {question}\n"
            f"context: {joined_context}\n\n"
            "Answer the question using only the information provided in the context. "
            "If the answer cannot be derived strictly from the context, say that the context does not contain the required information. "
            "Within the context, the ingredients of a recipe may appear inside a sentence such as "
            "'The ingredients ingredient 1: eggs, ingredient 2: flour, ingredient 3: sugar, ..., ingredient n: ...', "
            "where each ingredient is written as 'ingredient k: <name>'. "
            "When the question asks for the ingredients, you must:\n"
            "1) Find all occurrences of 'ingredient k: <name>' in the context (for any k),\n"
            "2) Extract only the <name> parts, in order of their numbering, and\n"
            "3) Answer in the form: 'the ingredients that you need are X, Y and Z', listing only the ingredient names in a natural enumeration.\n"
            "Do not add ingredients or information that are not explicitly listed in the context. "
            "Do not infer or assume missing ingredients. "
            "If no ingredients in the format 'ingredient k: <name>' are present, say that the context does not provide the ingredients. "
            "Follow all these rules precisely."
        )
        out = self.pipe(
            prompt,
            max_new_tokens=100,
            num_beams=4,
            do_sample=False,
        )[0]["generated_text"]
        return out.strip()
    
gen_qa = GenerativeQA(device="cpu", model_name="google/flan-t5-base")

Device set to use cpu


In [10]:
def ask_recipe_question(
    gen_qa: GenerativeQA,
    question: str,
    recipe_chunks,
    embed_model,
    faiss_index,
    k_retrieval: int = 5,
    device: str = "cpu",
):
    """
    Retrieve top-k relevant recipe chunks for a question and generate an answer.

    Returns:
      - answer: str
      - hits: list of dicts with fields {"chunk", "score"}
    """
    
    # 1. retrieve top-k relevant chunks (FAISS + SentenceTransformer)
    scores, indices = retrieve_top_k(
        model=embed_model,
        index=faiss_index,
        questions=[question],        # single-question batch
        k=k_retrieval,
        batch_size=1,
        device=device,
    )

    # scores, indices have shape [1, k]; take the first row
    top_scores = scores[0]
    top_indices = indices[0]

    # build contexts and a structured hits list
    contexts = []
    hits = []
    for rank, (chunk_idx, score) in enumerate(zip(top_indices, top_scores)):
        chunk = recipe_chunks[chunk_idx]
        contexts.append(chunk["text"])
        hits.append(
            {
                "rank": rank,
                "score": float(score),
                "chunk_id": chunk["chunk_id"],
                "chunk": chunk,
            }
        )

    # optional: inspect retrieved chunks
    for rank, context_text in enumerate(contexts):
        print(f"\n--- Retrieved chunk {rank} ---")
        print(context_text)

    # 2. generate answer from the retrieved contexts
    answer = gen_qa.answer(question, contexts)
    return answer, hits

In [11]:
#q = input("\nAsk a question about your recipes (or type 'quit'): ").strip()
#q = "What ingredients do I need to make a italian cream cake?"
#q = "List the ingredients for a cream cake."
#q = "what is a good pasta"
q = normalize_text(q)

answer, hits = ask_recipe_question(
    gen_qa,
    question=q,
    recipe_chunks=recipe_chunks,
    embed_model=embed_model,
    faiss_index=index,
    k_retrieval=3,
)

print(q)
print("\nAnswer:")
print(answer)

print("\nTop retrieved recipe chunks:")
for h in hits[:10]:
    print(f"- Recipe {h['chunk']['doc_id']} ({h['chunk']['title']}), score={h['score']:.3f}")
print("-" * 60)

Encoding queries + retrieving: 100%|██████████| 1/1 [00:00<00:00, 24.72it/s]


Token indices sequence length is longer than the specified maximum sequence length for this model (1481 > 512). Running this sequence through the model will result in indexing errors



--- Retrieved chunk 0 ---
name cream cake the ingredients ingredient 1 fresh liquid cream ingredient 2 eggs ingredient 3 type 00 flour ingredient 4 sugar ingredient 5 lemon peel ingredient 6 baking powder ingredient 7 salt and ingredient 8 powdered sugar instructions cream cake or torta alla panna really holds a special place in the italian kitchen why because of its simplicity and authenticity this classic cream cake recipe uses just a few basic ingredients—fresh cream eggs and flour—leaving out butter or oil the result a moist and velvety cake like biting into a tender cloud in italy families enjoy this homemade cream cake throughout the day in the morning it might be dusted with powdered sugar evening its dressed up with whipped cream and berries whether you eat it plain or add chocolate shavings this cake can make any moment feel more special seriously good stuff across different italian regions people get creative with their easy cream cake some add a hint of citrus zest for a un