In [1]:
import os
from collections import defaultdict

import chromadb

from sentence_transformers import CrossEncoder

from pydantic import BaseModel
from pydantic import Field
from typing import Optional
from typing import Literal

from langchain_chroma import Chroma
from langchain_core.prompts import ChatPromptTemplate
from langchain_ollama import ChatOllama
from langchain_mistralai import ChatMistralAI
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_classic.retrievers import EnsembleRetriever
from langchain_core.output_parsers import StrOutputParser

from config import set_environment

In [4]:
set_environment()

In [22]:
db_path = "data"
collection_name = "recipe_dataset"

In [23]:
chroma_embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L12-v2",
                                   model_kwargs={'device': "cuda"},
                                   encode_kwargs={'normalize_embeddings': False})

re_rank_model =  CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")



Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L12-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Loading weights:   0%|          | 0/105 [00:00<?, ?it/s]

[1mBertForSequenceClassification LOAD REPORT[0m from: cross-encoder/ms-marco-MiniLM-L-6-v2
Key                          | Status     |  | 
-----------------------------+------------+--+-
bert.embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


In [5]:
mistral_llm = ChatMistralAI(model="mistral-small-latest")

In [6]:
chat_ollama =  ChatOllama(
    model = "cogito-2.1:671b-cloud",
    base_url="https://ollama.com",
    headers={"Authorization": f"Bearer {os.environ['OLLAMA_API_KEY']}"},
    temperature=0
)

In [7]:
from langchain_groq import ChatGroq

chat_groq = ChatGroq(
    model="openai/gpt-oss-20b",
    temperature=0,
    max_tokens=None,
    # reasoning_format="parsed",
)

In [24]:
client_settings = chromadb.config.Settings(
    anonymized_telemetry=False,
    is_persistent=True
)
client = chromadb.PersistentClient(path=db_path, settings=client_settings)
collections = client.list_collections()
collections

[Collection(name=recipe_dataset)]

In [25]:
collection = client.get_collection(name=collection_name)
collection.count()

77000

In [34]:
import pickle

with open("data/bm25_retriever.pkl", "rb") as f:
    bm25_retriever = pickle.load(f)
print("Retriever loaded from disk.")

vector_store = Chroma(
    collection_name = "recipe_dataset",
    embedding_function = chroma_embeddings,
    persist_directory="data",
    client_settings=client_settings,
)
bm25_retriever.k = 5
semantic_retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k":5})


hybrid_retriever = EnsembleRetriever(
    retrievers = [semantic_retriever, bm25_retriever],
    weights = [0.6,0.4]
)

Retriever loaded from disk.


In [58]:
class RouteQuery(BaseModel):

    datasource: Literal["vectorstore", "web_search"] = Field(
        description="Given a user question choose to route it to web search or a vectorstore.",
    )

structured_llm_router = mistral_llm.with_structured_output(RouteQuery)

# Prompt
system = """You are the Router for a Recipe Intelligence System. 
Your goal is to route user queries to the most efficient data source.

### DATA SOURCES:
1. 'vector_store': Contains 77,000 recipes with structured metadata (ingredients, directions, category, dietary restrictions.).
2. 'web_search': Best for general culinary science, history of dishes, basic cooking techniques.

### ROUTING RULES:
- Route to 'vector_store' if:
    - User asks for a meal recommendation or recipes with dietary restrictions or food preferences. (e.g., "Find me a quick breakfast", "vegan pasta", "spicy noodles" etc).
    - User asks about a specific recipe title or ingredient combo
    - User asks "How is an ingredient [X] is used in [Recipe Y]?"
    - User asks "How to make [Dish Name]" or "Recipe for [Dish Name]"
- Route to 'generic_search' if:
    - User asks for basic cooking techniques (e.g., How to boil an egg" etc).
    - User asks for food science (e.g., "Smoking point of oil" etc).
    - User asks for general food facts (e.g., "Where did Butter chicken originate?" etc)."""

route_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system),
        ("human", "{question}"),
    ]
)

question_router = route_prompt | structured_llm_router

In [65]:
print(
    question_router.invoke(
        {"question": "How long simmering double boiler?"}
    )
)

datasource='web_search'


In [26]:
class RecipeSearch(BaseModel):
    # content_search: Optional[str] = Field(
    #     None,
    #     description="Similarity search query.",
    # )
    # title_search: Optional[str] = Field(
    #     None,
    #     description=(
    #         "Alternate version of the content search query to apply to recipe titles. "
    #         "Should be succinct and only include key words that could be in a recipe "
    #         "title. Use only if user asks explicitly for a specific recipe."
    #     ),
    # )
                      

    category: Optional[Literal[
        "Baking","Breads", "Condiments and sides" ,"Drinks", "Grains and Pasta" ,
        "Mains", "Other", "Salads", "Soup", "Sweet", "Vegan"
    ]] = Field(
        None,
        description=(
            "Indicates the category of the recipe to search for. "
            "Should be one of the provided values."
            "MANDATORY: Select the best category from the list if mentioned. If not mentioned, return none."
            "Example: 'sourdough' -> Breads, 'cupcake' -> Baking, 'lentil soup' -> Soup. etc"
        )
    )
    is_nut_free: Optional[int] = Field(
        None,
        description="Nut free filter. Only use if explicitly specified.",
    )
    is_gluten_free: Optional[int] = Field(
        None,
        description="Gluten free filter. Only use if explicitly specified.",
    )
    is_dairy_free: Optional[int] = Field(
        None,
        description="Dairy free filter. Only use if explicitly specified.",
    )
    is_spicy_food: Optional[int] = Field(
        None,
        description="Spicy food filter. Only use if explicitly specified.",
    )
                     
    is_comfort_food: Optional[int] = Field(
        None,
        description="Comfort food filter. Only use if explicitly specified.",
    )
    is_light_food: Optional[int] = Field(
        None,
        description="Light food filter. Only use if explicitly specified.",
    )
    is_hearty_food: Optional[int] = Field(
        None,
        description="Hearty food filter. Only use if explicitly specified.",
    )
    is_healthy: Optional[int] = Field(
        None,
        description="Healthy food filter. Only use if explicitly specified.",
    )
   
                     
    is_breakfast: Optional[int] = Field(
        None,
        description="Breakfast filter. Only use if explicitly specified.",
    )
    is_lunch: Optional[int] = Field(
        None,
        description="Lunch filter. Only use if explicitly specified.",
    )
    is_dinner: Optional[int] = Field(
        None,
        description="Dinner filter. Only use if explicitly specified.",
    )
    is_quick: Optional[int] = Field(
        None,
        description="Quick filter. Only use if explicitly specified.",
    )
    
    is_no_oven: Optional[int] = Field(
        None,
        description="No oven filter. Only use if explicitly specified.",
    )
    
    is_slow_cooker: Optional[int] = Field(
        None,
        description="Slow cooker filter. Only use if explicitly specified.",
    )
    is_air_fryer: Optional[int] = Field(
        None,
        description="Air fryer filter. Only use if explicitly specified.",
    )
    is_one_pot: Optional[int] = Field(
        None,
        description="One pot filter. Only use if explicitly specified.",
    )

In [27]:
system = """You are an expert at converting user questions into database queries.
You have access to a database of recipes. 
Given a question, return a database query optimized to retrieve the most relevant results.
If there are acronyms or words you are not familiar with, do not try to rephrase them."""
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system),
        ("human", "{question}"),
    ]
)

In [29]:
structured_llm = chat_groq.with_structured_output(RecipeSearch)
query_analyzer = prompt | structured_llm

In [30]:
# question = "vegan gluten free pasta nut free"
# question = "quick and healthy breakfast"
# question = "lunch no-cook salad with citrus and beet"
# question = "What is the use of corn in Tamale Bake?"
# question = "How many eggs are required in Mom's Meat Loaf recipe?"
question = "How to boil an egg?"
result = query_analyzer.invoke({"question": question})
result

RecipeSearch(category=None, is_nut_free=None, is_gluten_free=None, is_dairy_free=None, is_spicy_food=None, is_comfort_food=None, is_light_food=None, is_hearty_food=None, is_healthy=None, is_breakfast=None, is_lunch=None, is_dinner=None, is_quick=1, is_no_oven=1, is_slow_cooker=None, is_air_fryer=None, is_one_pot=None)

In [31]:
meta_data_filters = result.model_dump(exclude_none=True)
if meta_data_filters.get("category") == "Other" :
    del meta_data_filters["category"]
print(meta_data_filters)

{'is_quick': 1, 'is_no_oven': 1}


KeyError: 0

In [32]:
filter_dict = {}
and_list = []
if len(meta_data_filters) > 1:
   filter_dict["$and"] = [{f:meta_data_filters[f]} for f in meta_data_filters]
elif len(meta_data_filters) == 1:
    filter_dict = meta_data_filters
else:
    filter_dict = None
filter_dict

{'$and': [{'is_quick': 1}, {'is_no_oven': 1}]}

In [46]:
filter_dict["$and"].pop(0)
filter_dict

{'$and': [{'is_no_oven': 1}]}

In [47]:
semantic_retriever.search_kwargs["filter"] = filter_dict
docs = hybrid_retriever.invoke(question)
len(docs)

ValueError: Expected where value for $and or $or to be a list with at least two where expressions, got [{'is_no_oven': 1}] in query.

In [36]:
filter_names = {
    'is_nut_free' : ["Contains nuts", "Nut free"],
    'is_gluten_free': ["Contains Gluten", "Gluten free"],
    'is_dairy_free' : ["Contains Dairy", "Dairy free"], 
    'is_spicy_food': ["Not spicy food", "Spicy food"], 
    'is_comfort_food': ["Not comfort food", "Comfort food"], 
    'is_light_food': ["Not light food", "Light food"],     
    'is_hearty_food': ["Not hearty food", "Hearty food"], 
    'is_healthy': ["Not healthy food", "Healthy food"], 
    'is_breakfast': ["Not breakfast", "Breakfast"], 
    'is_lunch': ["Not lunch", "Lunch"], 
    'is_dinner': ["Not Dinner", "Dinner"],
    'is_no_oven': ["Uses Oven", "No oven"],
    'is_slow_cooker': ["Uses Slow Cooker", "No slow cooker"],
    'is_air_fryer': ["Uses air fryer", "No air fryer"], 
    'is_one_pot': ["Uses One pot", "No One pot"],
    'is_quick': ["Not quick", "Quick"],
}    

In [37]:
formatted_docs = []
for doc in docs:
        filter_data = ""
        if meta_data_filters is not None and len(meta_data_filters) > 0:
             for f in meta_data_filters:
                if f in filter_names:
                    filter_data += f"{filter_names[f][doc.metadata[f]]}, "
                else:
                     filter_data += f"{f} : {doc.metadata[f]}, "
        header = f"--- STATUS: {filter_data} ---"           
        formatted_content = f"{header}\n{doc.page_content}"
        print(header + "\n" + doc.metadata["title"])
        formatted_docs.append(formatted_content)

--- STATUS: Quick, No oven,  ---
Boiled Custard
--- STATUS: Quick, No oven,  ---
Basic Eggs Breakfast Set
--- STATUS: Quick, No oven,  ---
Huani
--- STATUS: Quick, No oven,  ---
Drowned Eggs
--- STATUS: Quick, No oven,  ---
Boiled Egg Salad Recipe
--- STATUS: Not quick, No oven,  ---
How To Make Proper Water
--- STATUS: Not quick, No oven,  ---
Snow Cream
--- STATUS: Not quick, No oven,  ---
How To Prepare Fresh Green Beans
--- STATUS: Not quick, No oven,  ---
Chilled Pasta with Scallops and Tomatoes
--- STATUS: Not quick, Uses Oven,  ---
"My They'Re Good, How Did You Make Them" Brownies


In [38]:
def re_rank(query, query_result, check_score=True):
    # cross-encoder re-ranker
    ranks = re_rank_model.rank(query, query_result)
    if ranks[0]["score"] < 0:
        return [query_result[ranks[0]["corpus_id"]]]
    reranked_docs = []
    for rank_score in ranks:
        if check_score and rank_score["score"] < 0:
            continue
        else:
            reranked_docs.append(query_result[rank_score["corpus_id"]])
    return reranked_docs

In [39]:
re_ranked_docs = re_rank(question, formatted_docs, check_score=True)
print(f"number of filtered docs: {len(re_ranked_docs)}")
print(re_ranked_docs)

number of filtered docs: 3
['--- STATUS: Quick, No oven,  ---\n# Title: Huani\n### Ingredients:\n1. 2 hard-boiled eggs\n2. 1 pinch season salt\n### Directions: \n1. Boil the eggs and mash them in a food masher.\n2. Add the season salt and stir.\n3. Enjoy!', '--- STATUS: Quick, No oven,  ---\n# Title: Boiled Custard\n### Ingredients:\n1. 4 cups milk\n2. 6 large egg yolks\n3. 3/4 cup sugar\n4. 2 tablespoons cornstarch\n5. Dash of salt\n6. 2 teaspoons vanilla extract\n7. Ground nutmeg\n### Directions: \n1. Pour milk into top of a double boiler; bring water to a boil. Heat milk until tiny bubbles begin to appear around edges of pan. Remove from heat, and set aside.\n2. Beat egg yolks with a wire whisk until frothy. Add sugar, cornstarch, and salt, beating until thickened. Gradually stir about 1 cup hot milk into yolk mixture; add to remaining milk, stirring constantly.\n3. Cook custard mixture in double boiler over low heat 25 minutes or until mixture is thickened and a candy thermometer r

In [40]:
from typing import List


class Grade(BaseModel):
    title: str = Field(description="Title of the document being scored.")
    analysis: str = Field(description="A brief sentence explaining the relevance score given.")
    score: int = Field(description="Relevance score ranging from 0 to 5, with 0 being completely irrelevant and 5 being highly relevant.")

class GradeList(BaseModel):
    scores: List[Grade] = Field(description="List of relevance scores with analysis.")

In [41]:
from langchain_core.output_parsers import JsonOutputParser

grader_prompt = """
You are a quality control agent. 
Your goal is to assess the relevance of the retrieved documents to a user question.
Check each document in the list and return a integer score ranging between 0-5 with 0 being completely irrelevant and 5 being highly relevant for each of the document.

## Grading criteria

5 (Perfect): Matches specific name (e.g., "Mom's", "Vicky's", "Gordon Ramsay's" etc), all ingredients, all dietary constraints and flavour profiles.

4 (Strong): Matches the dish type and all dietary constraints, but is a generic or different "version" of the specific recipe in the user's question.

3 (Partial): Matches the dish type, meets most of the dietary needs or flavour profiles.

0-2 (Irrelevant): Wrong dish type or fundamentally violates user's dietary constraints or flavour profiles.

Return a JSON list containing one entry per document in the exact order of the documents provided. 
Return JSON in the exact format provided:
{{
  "scores": [
    {{
      "title":  "Title of the document being scored."
      "analysis": "A brief sentence explaining the score given.",
      "score": Relevance score ranging from 0 to 5, with 0 being completely irrelevant and 5 being highly relevant
    }}
  ]
}}

## Question: {question}
## Documents: {documents}
"""

grader_prmpt_template = ChatPromptTemplate.from_messages([
    ("system", "You are a strict grading API that ONLY outputs JSON."
               " Do not include any preamble, introduction, or explanation outside of the JSON structure."),
    ("human", grader_prompt)
])
grader_llm = chat_ollama.with_structured_output(GradeList)
grader_chain = grader_prmpt_template | grader_llm

In [42]:
grader_output = grader_chain.invoke({"question":question, "documents":re_ranked_docs})
grader_output

GradeList(scores=[Grade(title='Huani', analysis="Directly addresses boiling eggs as part of the recipe, though it's a specific dish rather than just instructions for boiling eggs.", score=4), Grade(title='Boiled Custard', analysis='Involves eggs but is a custard recipe, not about boiling eggs as the main focus. Violates the simple boiling instruction request.', score=1), Grade(title='Boiled Egg Salad Recipe', analysis='Uses boiled eggs but is a salad recipe, not focused on the boiling process itself. Partially relevant but not directly answering the question.', score=3)])

In [43]:
from pydantic import BaseModel, Field

class ReformulatedQuery(BaseModel):
    reasoning: str = Field(description="Why the original query failed and what is being changed.")
    relaxed_filters: dict = Field(description="The new metadata filters (e.g., removing 'is_quick').")
    new_content_query: str = Field(description="The new search string.")



# System Prompt for the Rewriter
query_rewriter_prompt = """You are a Query Optimizer. 
The previous search returned results that did not match the user's question.
Analyze the constraints: {filters} for the query: {question}.

Your goal:
1. Identify the most restrictive constraint.
2. Relax that constraint (e.g., if 'is_quick' was 1, set it to None).
3. Keep dietary constraints (Vegan, Gluten free, Nut-free, Diary-free) as they are mandatory safety rules.
4. Provide a new set of filters that is slightly broader.
5. Do not add any new filters.
"""
 
query_rewriter_prompt_template = ChatPromptTemplate.from_template(query_rewriter_prompt)  
rewriter_llm = chat_groq.with_structured_output(ReformulatedQuery)
rewriter_chain = query_rewriter_prompt_template | rewriter_llm

In [44]:
rewriter_output = rewriter_chain.invoke({"question": question, "filters": meta_data_filters})
rewriter_output

ReformulatedQuery(reasoning="The 'is_quick' constraint was overly restrictive for this query, so it has been relaxed.", relaxed_filters={'is_no_oven': 1}, new_content_query='How to boil an egg?')

In [144]:
template = """
<system-role>
    You are a culinary master expert at answering questions.
    Your goal is to provide clear, natural and helpful answers for a question based on the recipes provided.
 <system-role>
 </rules>   
    1. DATA INTEGRITY: 
      - Answer using only the given context.
      - Do not make assumptions based on titles. 
      - Use the provided ingredients and directions only. 
    2. STRICT DIETARY ALIGNMENT: 
        - Only include recipes that fully match the user's dietary requests. 
        - If a recipe in the context does not match the user constraints, EXCLUDE it.
    3.MANDATORY STRUCTURE: For every matching recipe, you MUST provide:
       - Recipe Name
       - [DIETARY NOTE]: Explicitly state if it matches all user constraints or if there is a warning.
         Suggest alternatives where possible.
       - Prep/Cook Time (if available)
       - Ingredients (bulleted)
       - [TYPO NOTE]: If an ingredient amount seems like a typo (e.g., 12 tsp pepper), add: "NOTE: Context says 12, likely 1/2."
       - Step-by-step Directions (numbered)
    4. Do not ask any follow-up questions.
    5. Provide a direct answer in a professional tone.
    6. Do not include any pre amble or post amble.
 </rules>
 <recipes>
    {context}
 </recipes
 <question>
    {question}
 </question>
    """
rag_prmpt = ChatPromptTemplate.from_template(template)
rag_chain = rag_prmpt | chat_ollama | StrOutputParser()

In [152]:
if len(re_ranked_docs) > 5:
    limited_docs = re_ranked_docs[0:5]
else:
    limited_docs = re_ranked_docs
rag_response = rag_chain.invoke({"context": limited_docs, "question": question})
print(rag_response)

I do not see any recipe titled "Mom's Meat Loaf" in the provided context. The only recipe available is for "Old Fashioned Meat Loaf" which uses 2 eggs.


In [146]:
print(rag_response)

Based on the provided recipes, here are two no-cook salad options featuring citrus and beets:

1. Spinach, Beet, And Egg Salad
Ingredients:
- 2 medium beets (precooked)
- 2 hard-boiled eggs
- 1 carrot, shredded
- 2 cups spinach, washed and chopped
- 1 cup lettuce
- 12 almonds
- Salad dressing of choice

Directions:
1. Use precooked beets - peel and chop them
2. Hard boil eggs, peel and slice
3. Mix all vegetables in a large bowl
4. Add beets and eggs, mix together
5. Top with almonds and dressing

2. Mixed Baby Greens with Oranges, Grapefruit and Avocado
(Note: While this doesn't contain beets, it's a no-cook citrus salad that could be adapted by adding precooked beets)

Ingredients:
- 1 lb mixed baby greens
- 2 oranges, peeled and sliced
- 1 large grapefruit, peeled and sliced
- 1 large avocado, peeled and sliced
- 2 tbsp olive oil
- 2 tbsp fruited balsamic vinegar
- Kosher salt
- Fresh ground pepper

Directions:
1. Combine lettuce with citrus and avocado
2. Add olive oil, vinegar, sa