# Data Understanding

In [17]:
import pandas as pd

# Load the two raw CSV datasets into DataFrames:
#   df_qna     - cooking knowledge questions and answers
#   df_recipes - recipes (title, ingredients, directions, ...)
df_qna = pd.read_csv("raw_data/cooking_knowledge.csv")
df_recipes = pd.read_csv("raw_data/recipes.csv")

In [23]:
# Report the size of the QnA dataset, then preview its first rows
print("Number of rows in qna dataset: ",len(df_qna))
df_qna.head()

Number of rows in qna dataset:  5647


Unnamed: 0,type,question,response
0,cooking and conservation,What is HACCP?,HACCP (Hazard Analysis Critical Control Point)...
1,cooking and conservation,What is frying?,Frying is a quick cooking method using hot oil...
2,cooking and conservation,What is Navarin?,Navarin is a dark lamb stew prepared by browni...
3,cooking and conservation,What is betalin?,Betalin is a pigment that imparts deep purple-...
4,cooking and conservation,What is stewing?,Stewing slowly cooks small pieces of food in a...


In [24]:
# Report the size of the recipes dataset, then preview its first rows
print("Number of rows in the recipes dataset: ",len(df_recipes))
df_recipes.head()

Number of rows in the recipes dataset:  2231142


Unnamed: 0.1,Unnamed: 0,title,ingredients,directions,link,source,NER
0,0,No-Bake Nut Cookies,"[""1 c. firmly packed brown sugar"", ""1/2 c. eva...","[""In a heavy 2-quart saucepan, mix brown sugar...",www.cookbooks.com/Recipe-Details.aspx?id=44874,Gathered,"[""brown sugar"", ""milk"", ""vanilla"", ""nuts"", ""bu..."
1,1,Jewell Ball'S Chicken,"[""1 small jar chipped beef, cut up"", ""4 boned ...","[""Place chipped beef on bottom of baking dish....",www.cookbooks.com/Recipe-Details.aspx?id=699419,Gathered,"[""beef"", ""chicken breasts"", ""cream of mushroom..."
2,2,Creamy Corn,"[""2 (16 oz.) pkg. frozen corn"", ""1 (8 oz.) pkg...","[""In a slow cooker, combine all ingredients. C...",www.cookbooks.com/Recipe-Details.aspx?id=10570,Gathered,"[""frozen corn"", ""cream cheese"", ""butter"", ""gar..."
3,3,Chicken Funny,"[""1 large whole chicken"", ""2 (10 1/2 oz.) cans...","[""Boil and debone chicken."", ""Put bite size pi...",www.cookbooks.com/Recipe-Details.aspx?id=897570,Gathered,"[""chicken"", ""chicken gravy"", ""cream of mushroo..."
4,4,Reeses Cups(Candy),"[""1 c. peanut butter"", ""3/4 c. graham cracker ...","[""Combine first four ingredients and press in ...",www.cookbooks.com/Recipe-Details.aspx?id=659239,Gathered,"[""peanut butter"", ""graham cracker crumbs"", ""bu..."


In [25]:
# List the column names of the recipes DataFrame
df_recipes.columns

Index(['Unnamed: 0', 'title', 'ingredients', 'directions', 'link', 'source',
       'NER'],
      dtype='object')

In [26]:
# Drop the 'Unnamed: 0' column (it looks like a saved row index) and the NER
# column, which the preparation code in this notebook does not use.
# Reassigning instead of using inplace=True, together with errors='ignore',
# keeps this cell idempotent: re-running it no longer raises KeyError once
# the columns are already gone.
df_recipes = df_recipes.drop(columns=['Unnamed: 0', 'NER'], errors='ignore')
df_recipes.head()

Unnamed: 0,title,ingredients,directions,link,source
0,No-Bake Nut Cookies,"[""1 c. firmly packed brown sugar"", ""1/2 c. eva...","[""In a heavy 2-quart saucepan, mix brown sugar...",www.cookbooks.com/Recipe-Details.aspx?id=44874,Gathered
1,Jewell Ball'S Chicken,"[""1 small jar chipped beef, cut up"", ""4 boned ...","[""Place chipped beef on bottom of baking dish....",www.cookbooks.com/Recipe-Details.aspx?id=699419,Gathered
2,Creamy Corn,"[""2 (16 oz.) pkg. frozen corn"", ""1 (8 oz.) pkg...","[""In a slow cooker, combine all ingredients. C...",www.cookbooks.com/Recipe-Details.aspx?id=10570,Gathered
3,Chicken Funny,"[""1 large whole chicken"", ""2 (10 1/2 oz.) cans...","[""Boil and debone chicken."", ""Put bite size pi...",www.cookbooks.com/Recipe-Details.aspx?id=897570,Gathered
4,Reeses Cups(Candy),"[""1 c. peanut butter"", ""3/4 c. graham cracker ...","[""Combine first four ingredients and press in ...",www.cookbooks.com/Recipe-Details.aspx?id=659239,Gathered


In [None]:
# Checking missing values: number of null entries per column in each dataset
print(df_recipes.isnull().sum())
print(df_qna.isnull().sum())

title          1
ingredients    0
directions     0
link           0
source         0
dtype: int64
type        0
question    0
response    0
dtype: int64


In [None]:
# Removing every row that has a missing value in any column, then
# re-printing the per-column null counts to confirm none remain
df_recipes = df_recipes.dropna()
print(df_recipes.isnull().sum())

title          0
ingredients    0
directions     0
link           0
source         0
dtype: int64


# Data Preparation

### Converting Datasets to JSON Format

In [None]:
import json
import os

# Creating data directory (for the standardized datasets - JSON).
# exist_ok=True does the existence check and the creation in one call and
# matches how the FAISS index directory is created in VectorDatabase.
os.makedirs('data', exist_ok=True)

# Cooking knowledge (QnA) preparation
def prepare_qna_data():
    """Export the QnA DataFrame to ``data/cooking_qna.json``.

    Reads the module-level ``df_qna`` frame, keeps the ``type``, ``question``
    and ``response`` fields of every row, writes the records as a JSON array
    and prints how many were saved.

    Returns:
        list[dict]: one dictionary per row, in row order.
    """
    wanted_fields = ('type', 'question', 'response')

    # One plain dictionary per DataFrame row
    qna_data = [
        {field: record[field] for field in wanted_fields}
        for _, record in df_qna.iterrows()
    ]

    # Persist the records as pretty-printed JSON
    with open('data/cooking_qna.json', 'w') as out_file:
        json.dump(qna_data, out_file, indent=2)

    print(f"Saved {len(qna_data)} QnA items")
    return qna_data

# Recipes dataset preparation
def _parse_list_field(raw):
    """Turn a serialized list cell (e.g. ``'["a", "b"]'``) into a Python list.

    The text is first parsed as JSON exactly as stored, so apostrophes inside
    items ("baker's sugar") survive. Only if that fails is the legacy
    single-quote to double-quote rewrite tried, and as a last resort the text
    is split on ", " after stripping the surrounding brackets.
    Non-string cells (e.g. NaN) yield an empty list.
    """
    if not isinstance(raw, str):
        return []

    for candidate in (raw, raw.replace("'", "\"")):
        try:
            parsed = json.loads(candidate)
        except ValueError:  # json.JSONDecodeError is a ValueError
            continue
        if isinstance(parsed, list):
            return parsed

    # Best-effort fallback for text that is not valid JSON in either form
    return raw.strip('[]').split(', ')


# Recipes dataset preparation
def prepare_recipes_data():
    """Export the recipes DataFrame to ``data/recipes.json``.

    Reads the module-level ``df_recipes`` frame, converts the serialized
    ``ingredients`` and ``directions`` cells into real lists (each field is
    parsed independently, so one malformed field no longer degrades the
    other), writes all recipes as a JSON array and prints how many were saved.

    Returns:
        list[dict]: one dictionary per recipe with the keys ``title``,
        ``ingredients``, ``directions``, ``source`` and ``link``.
    """
    # Cleaning and structuring the data
    recipes_data = []
    for _, row in df_recipes.iterrows():
        recipes_data.append({
            'title': row['title'],
            'ingredients': _parse_list_field(row['ingredients']),
            'directions': _parse_list_field(row['directions']),
            'source': row['source'] if 'source' in row else 'Unknown',
            'link': row['link'] if 'link' in row else 'Unknown'
        })

    # Save as JSON
    with open('data/recipes.json', 'w') as f:
        json.dump(recipes_data, f, indent=2)

    print(f"Saved {len(recipes_data)} recipes")
    return recipes_data

### Combining the datasets into a single file

In [None]:
def combine_datasets():
    """Merge the prepared QnA and recipe JSON files into one document list.

    Loads ``data/cooking_qna.json`` and ``data/recipes.json``, turns every
    entry into a ``{'id', 'content', 'metadata'}`` record whose ``content`` is
    the text to embed, writes the result to ``data/combined_data.json`` and
    prints the total number of items.

    Returns:
        list[dict]: QnA records (ids ``qna_0`` ...) followed by recipe records
        (ids ``recipe_0`` ...).
    """
    def _read_json(path):
        # 'r' opens the file read-only
        with open(path, 'r') as handle:
            return json.load(handle)

    qna_data = _read_json('data/cooking_qna.json')
    recipes_data = _read_json('data/recipes.json')

    # Text entries for embedding
    combined_data = []

    # QnA entries: question and answer on separate lines
    for position, entry in enumerate(qna_data):
        record = {
            'id': f"qna_{position}",
            'content': f"Question: {entry['question']}\nAnswer: {entry['response']}",
            'metadata': {
                'type': 'qna',
                'category': entry['type']
            }
        }
        combined_data.append(record)

    # Recipe entries: title, bulleted ingredients, numbered directions
    for position, recipe in enumerate(recipes_data):
        bullet_lines = [f"- {ing}" for ing in recipe['ingredients']]
        ingredients_text = "\n".join(bullet_lines)

        step_lines = [
            f"{number}. {step}"
            for number, step in enumerate(recipe['directions'], start=1)
        ]
        directions_text = "\n".join(step_lines)

        recipe_text = (
            f"Recipe: {recipe['title']}\n\n"
            f"Ingredients:\n{ingredients_text}\n\n"
            f"Directions:\n{directions_text}"
        )

        record = {
            'id': f"recipe_{position}",
            'content': recipe_text,
            'metadata': {
                'type': 'recipe',
                'title': recipe['title'],
                'ingredients': recipe['ingredients']
            }
        }
        combined_data.append(record)

    # Save the combined dataset
    with open('data/combined_data.json', 'w') as out_file:
        json.dump(combined_data, out_file, indent=2)

    print(f"Created combined dataset with {len(combined_data)} items")
    return combined_data

# Run the whole preparation pipeline in order. combine_datasets() reads the
# two JSON files written by the two prepare_* steps, so the order matters.
if __name__ == "__main__":
    print("Preparing QnA data...")
    prepare_qna_data()
    
    print("Preparing recipes data...")
    prepare_recipes_data()
    
    print("Combining datasets...")
    combine_datasets()
    
    print("Data preparation complete!")

# Embedding Generation

In [1]:
import json
import os
import numpy as np
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm





The combined JSON file contains too many rows (over 2.2 million): each embedding takes approx. 15–40 ms to generate, meaning that embedding everything would take roughly 15–30 hours. The run might also crash or run out of memory before finishing. Therefore, I will reduce the data to 50,000 rows, leaving the rest of the data available for further fine-tuning in the future.

In [2]:
import random
import json

# Opening combined dataset
with open('data/combined_data.json', 'r') as f:
    data = json.load(f)

# Separating by type – to ensure my sample will contain both QnA and recipes
qna_data = [item for item in data if item['metadata']['type'] == 'qna']
recipe_data = [item for item in data if item['metadata']['type'] == 'recipe']

# Fixed seed so that Restart & Run All draws the same sample every time;
# a dedicated Random instance leaves the global random state untouched.
SAMPLE_SEED = 42
sampler = random.Random(SAMPLE_SEED)

# Choosing number of samples (capped at what is available, because
# random.sample raises ValueError when asked for more items than exist)
qna_sample = sampler.sample(qna_data, min(500, len(qna_data)))
recipe_sample = sampler.sample(recipe_data, min(49500, len(recipe_data)))

# Combine and shuffle
sampled_data = qna_sample + recipe_sample
sampler.shuffle(sampled_data)

# Saving to a new file
with open('data/sample_combined_data.json', 'w') as f:
    json.dump(sampled_data, f, indent=2)

print(f"Sampled {len(sampled_data)} entries (QnA: {len(qna_sample)}, Recipes: {len(recipe_sample)})")


Sampled 50000 entries (QnA: 500, Recipes: 49500)


Now I will generate embeddings for the 50,000 sampled rows.

In [3]:
# Generating embeddings for the items in my sample

def generate_embeddings():
    """Embed every sampled document and store the vectors on disk.

    Reads ``data/sample_combined_data.json``, encodes each item's ``content``
    with the ``all-MiniLM-L6-v2`` SentenceTransformer model (batch size 32)
    and writes three files into ``embeddings/``:

    * ``embeddings.json``      - mapping of item id to its vector (as a list)
    * ``embeddings_array.npy`` - float32 array of all vectors, for FAISS
    * ``ids.json``             - item ids in the same order as the array rows

    Returns:
        tuple: ``(embeddings_array, ids)``.
    """
    with open('data/sample_combined_data.json', 'r') as source_file:
        sample_data = json.load(source_file)

    # Make sure the output directory is there before the slow part starts
    if not os.path.exists('embeddings'):
        os.makedirs('embeddings')

    print("Loading SentenceTransformer (embedding) model...")
    model = SentenceTransformer('all-MiniLM-L6-v2')  # small but effective model

    # Texts to embed and their ids, kept in matching order
    texts = [entry['content'] for entry in sample_data]
    ids = [entry['id'] for entry in sample_data]

    print(f"Generating embeddings for {len(texts)} items...")

    # The encoder works through the texts in batches of 32
    all_embeddings = model.encode(texts, batch_size=32, show_progress_bar=True)

    # id -> vector, with vectors converted to plain lists so they are JSON-serializable
    embeddings_dict = {
        item_id: all_embeddings[row].tolist()
        for row, item_id in enumerate(ids)
    }
    with open('embeddings/embeddings.json', 'w') as json_file:
        json.dump(embeddings_dict, json_file)

    # Same vectors as one float32 matrix for FAISS
    embeddings_array = np.array(all_embeddings, dtype=np.float32)
    np.save('embeddings/embeddings_array.npy', embeddings_array)

    # Ids in array-row order, so row i of the matrix can be mapped back to ids[i]
    with open('embeddings/ids.json', 'w') as ids_file:
        json.dump(ids, ids_file)

    print(f"Saved embeddings for {len(ids)} items")
    print(f"Embedding dimension: {embeddings_array.shape[1]}")

    return embeddings_array, ids

# Run the embedding step; the captured output below shows it took about
# 44 minutes for the 50000 sampled items.
if __name__ == "__main__":
    print("Generating embeddings...")
    generate_embeddings()
    print("Embedding generation complete!")

Generating embeddings...
Loading SentenceTransformer (embedding) model...
Generating embeddings for 50000 items...


Batches: 100%|██████████| 1563/1563 [44:01<00:00,  1.69s/it] 


Saved embeddings for 50000 items
Embedding dimension: 384
Embedding generation complete!


# Vector Database - FAISS

In [2]:
!pip install faiss-cpu

Collecting faiss-cpu
  Using cached faiss_cpu-1.11.0-cp312-cp312-win_amd64.whl.metadata (5.0 kB)
Using cached faiss_cpu-1.11.0-cp312-cp312-win_amd64.whl (15.0 MB)
Installing collected packages: faiss-cpu
Successfully installed faiss-cpu-1.11.0


In [10]:
import json
import numpy as np
import faiss
import os

In [11]:
# Making a class that can be called later
class VectorDatabase:
    """FAISS-backed nearest-neighbour store for the sampled cooking documents.

    The index itself is stored at ``<index_path>.idx``; a small
    ``<index_path>_metadata.json`` file remembers where the ids and the
    document data live so that ``load_index`` can restore everything.

    Attributes:
        index_path: path prefix (without extension) of the index files.
        index: the FAISS index, or ``None`` until built or loaded.
        ids: item ids, where ``ids[i]`` belongs to vector ``i`` of the index.
        combined_data: the documents (dicts with ``id``, ``content``,
            ``metadata``) the ids refer to.
    """

    def __init__(self, index_path='data/faiss_index'):
        self.index_path = index_path
        self.index = None
        self.ids = []
        self.combined_data = []
        # Lazily built id -> document map and the list object it was built from
        self._items_by_id = None
        self._items_by_id_source = None

    def _items_lookup(self):
        """Return an ``id -> document`` dict for ``self.combined_data``.

        The dict is rebuilt whenever ``combined_data`` has been replaced by a
        different list object. The first document with a given id wins, which
        is what a first-match linear scan would return.
        """
        cache = getattr(self, '_items_by_id', None)
        if cache is None or getattr(self, '_items_by_id_source', None) is not self.combined_data:
            cache = {}
            for item in self.combined_data:
                if 'id' in item:
                    cache.setdefault(item['id'], item)
            self._items_by_id = cache
            self._items_by_id_source = self.combined_data
        return cache

    def build_index(self, embeddings_path='embeddings/embeddings_array.npy', 
                   ids_path='embeddings/ids.json',
                   data_path='data/sample_combined_data.json'):
        """Build a flat L2 FAISS index from saved embeddings and write it to disk.

        Args:
            embeddings_path: ``.npy`` file with one embedding per row.
            ids_path: JSON list of item ids in the same order as the rows.
            data_path: JSON list of the documents the ids refer to.

        Returns:
            The populated FAISS index.

        Raises:
            ValueError: if the number of ids differs from the number of
                embedding rows (search results could not be mapped back).
        """
        # Loading data
        print("Loading embeddings...")
        # FAISS expects a C-contiguous float32 matrix
        embeddings = np.ascontiguousarray(np.load(embeddings_path), dtype=np.float32)

        with open(ids_path, 'r') as f:
            self.ids = json.load(f)
        with open(data_path, 'r') as f:
            self.combined_data = json.load(f)

        if len(self.ids) != embeddings.shape[0]:
            raise ValueError(
                f"Got {len(self.ids)} ids for {embeddings.shape[0]} embeddings; "
                "ids and embeddings must have the same length and order"
            )

        # Directory for the index if it doesn't exist. A bare file name has an
        # empty dirname, and os.makedirs('') would raise.
        index_dir = os.path.dirname(self.index_path)
        if index_dir:
            os.makedirs(index_dir, exist_ok=True)

        # Create and add to FAISS index
        dimension = embeddings.shape[1]
        self.index = faiss.IndexFlatL2(dimension)  # simple L2 distance index
        self.index.add(embeddings)  # add vectors to the index

        # Save the index
        faiss.write_index(self.index, f"{self.index_path}.idx")

        # Save the IDs and data paths for later loading
        with open(f"{self.index_path}_metadata.json", 'w') as f:
            json.dump({
                'ids_path': ids_path,
                'data_path': data_path
            }, f)

        print(f"Built FAISS index with {self.index.ntotal} vectors")
        return self.index

    def load_index(self):
        """Load a previously built index together with its ids and documents.

        Returns:
            The loaded FAISS index.
        """
        # Load saved index and metadata
        self.index = faiss.read_index(f"{self.index_path}.idx")

        with open(f"{self.index_path}_metadata.json", 'r') as f:
            metadata = json.load(f)
        with open(metadata['ids_path'], 'r') as f:
            self.ids = json.load(f)

        # Load sample of combined data
        with open(metadata['data_path'], 'r') as f:
            self.combined_data = json.load(f)

        print(f"Loaded FAISS index with {self.index.ntotal} vectors")
        return self.index

    def search(self, query_embedding, k=5):
        """Return the documents whose vectors are closest to ``query_embedding``.

        Args:
            query_embedding: a single query vector (1-D) or a ``(1, dim)``
                matrix; anything ``numpy`` can convert to float32 is accepted.
            k: maximum number of neighbours to return.

        Returns:
            list[dict]: up to ``k`` results ordered as returned by the index,
            each with ``id``, ``distance``, ``content`` and ``metadata``.
        """
        if self.index is None:
            self.load_index()

        # Make sure the query embedding is a float32 row matrix
        query = np.ascontiguousarray(query_embedding, dtype=np.float32)
        if query.ndim == 1:
            query = query.reshape(1, -1)

        # Search the index
        distances, indices = self.index.search(query, k)

        # One dict lookup per hit instead of scanning all documents each time
        lookup = self._items_lookup()

        results = []
        for rank, idx in enumerate(indices[0]):
            idx = int(idx)
            # FAISS pads missing neighbours with -1; a negative index must not
            # wrap around to the end of the id list.
            if not 0 <= idx < len(self.ids):
                continue

            item_id = self.ids[idx]
            item_data = lookup.get(item_id)

            if item_data:
                results.append({
                    'id': item_id,
                    'distance': float(distances[0][rank]),
                    'content': item_data['content'],
                    'metadata': item_data['metadata']
                })

        return results

if __name__ == "__main__":
    # Initialize the vector database (default index path: data/faiss_index)
    vector_db = VectorDatabase()
    
    # Build the index from the saved embeddings and write it to disk
    vector_db.build_index()
    
    print("Vector database setup complete!")

Loading embeddings...
Built FAISS index with 50000 vectors
Vector database setup complete!


# Local LLM Integration

In [1]:
!pip install llama-cpp-python

Collecting llama-cpp-python
  Using cached llama_cpp_python-0.3.9-cp312-cp312-win_amd64.whl
Collecting diskcache>=5.6.1 (from llama-cpp-python)
  Using cached diskcache-5.6.3-py3-none-any.whl.metadata (20 kB)
Using cached diskcache-5.6.3-py3-none-any.whl (45 kB)
Installing collected packages: diskcache, llama-cpp-python
Successfully installed diskcache-5.6.3 llama-cpp-python-0.3.9


In [5]:
!pip install google-generativeai

Collecting google-generativeai
  Downloading google_generativeai-0.8.5-py3-none-any.whl.metadata (3.9 kB)
Collecting google-ai-generativelanguage==0.6.15 (from google-generativeai)
  Downloading google_ai_generativelanguage-0.6.15-py3-none-any.whl.metadata (5.7 kB)
Collecting google-api-core (from google-generativeai)
  Downloading google_api_core-2.24.2-py3-none-any.whl.metadata (3.0 kB)
Collecting google-api-python-client (from google-generativeai)
  Downloading google_api_python_client-2.169.0-py3-none-any.whl.metadata (6.7 kB)
Collecting google-auth>=2.15.0 (from google-generativeai)
  Downloading google_auth-2.40.1-py2.py3-none-any.whl.metadata (6.2 kB)
Collecting google-api-core (from google-generativeai)
  Downloading google_api_core-2.25.0rc1-py3-none-any.whl.metadata (3.0 kB)
Collecting proto-plus<2.0.0dev,>=1.22.3 (from google-ai-generativelanguage==0.6.15->google-generativeai)
  Downloading proto_plus-1.26.1-py3-none-any.whl.metadata (2.2 kB)
Collecting googleapis-common-pro

In [6]:
import os
from llama_cpp import Llama
import json
import numpy as np
from sentence_transformers import SentenceTransformer
import google.generativeai as genai
from dotenv import load_dotenv

In [None]:
# for accessing the API key for ChatGPT
!pip install python-dotenv

Collecting python-dotenv
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB)
Downloading python_dotenv-1.1.0-py3-none-any.whl (20 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.1.0


In [19]:
# Loading local LLM (Mistral 7B Instruct) from a quantized GGUF file;
# n_ctx is the context window size and n_threads the CPU thread count
# passed to llama.cpp
llm = Llama(model_path="models/mistral-7b-instruct-v0.1.Q4_K_M.gguf", n_ctx=2048, n_threads=8)

# Embedding model - the same model name that was used to build the index,
# so query vectors are comparable with the stored ones
embed_model = SentenceTransformer("all-MiniLM-L6-v2")

# Vector DB - VectorDatabase comes from the FAISS section of this notebook;
# load_index() reads the index and its documents from disk
db = VectorDatabase()
db.load_index()

# Load environment variables with python-dotenv and configure the Gemini
# client with the GOOGLE_API_KEY value (used only for the fallback model)
load_dotenv()
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))

# Lower-case phrases whose presence marks a local answer as too vague to return
_LOW_CONFIDENCE_PHRASES = (
    "i don't know",
    "i don't have enough information",
    "i cannot provide",
    "i'm not sure",
    "insufficient information",
)


def _is_low_confidence(answer):
    """Return True when ``answer`` is empty or contains a known 'not sure' phrase."""
    # Compare in lower case and treat the typographic apostrophe like "'"
    normalized = answer.lower().replace("\u2019", "'")
    return not normalized or any(phrase in normalized for phrase in _LOW_CONFIDENCE_PHRASES)


def ask(question, k=5, max_tokens=256, use_fallback=True):
    """Answer a cooking question with retrieval-augmented generation.

    The question is embedded with ``embed_model``, the ``k`` closest documents
    are fetched from ``db`` and given as context to the local Mistral model
    (``llm``). If the local model raises, returns an empty answer, or gives an
    answer containing a low-confidence phrase, the question is passed to
    ``ask_gemini`` instead (when ``use_fallback`` is true).

    Args:
        question: the user's question.
        k: number of context documents to retrieve.
        max_tokens: generation limit, also forwarded to the fallback.
        use_fallback: whether to fall back to Gemini when the local model fails.

    Returns:
        dict: ``{"answer": str, "source": "mistral" | "gemini"}``.

    Raises:
        Exception: whatever the local model raised (or ``ValueError`` for a
            low-confidence answer) when ``use_fallback`` is false.
    """
    # Embed query and search vector DB
    query_embedding = embed_model.encode(question)
    results = db.search(np.array(query_embedding), k=k)

    # Prepare context and prompt. The prompt ends right after [/INST]: in the
    # Mistral instruct format </s> closes the model's answer, so it must not
    # be placed in front of the text we want generated.
    context = "\n\n".join([r["content"] for r in results])
    prompt = f"""<s>[INST] You are a friendly cooking assistant helping someone who may be a beginner cook. 
                Always be encouraging, patient, and explain cooking concepts in a simple, easy-to-understand way.
                Use these relevant pieces of information (context below) to answer the question:

    Context:
    {context}

    User's question: {question}

    Please provide a concise, friendly, and helpful answer. If the information isn't sufficient to answer the question fully, 
    just answer with what you know from the provided information. [/INST]
    """

    # Calling local Mistral model
    try:
        response = llm(prompt, max_tokens=max_tokens, stop=["</s>"])
        answer = response["choices"][0]["text"].strip()

        # Check if the answer is empty or too vague
        if _is_low_confidence(answer):
            raise ValueError("Low confidence from local model.")
        return {
             "answer": answer,
             "source": "mistral"
        } 

    except Exception as e:
        if not use_fallback:
            raise  # bare raise keeps the original traceback

        print("[!] Local model failed. Falling back to Gemini...")
        # Show why, so context overflows and similar errors are not hidden
        print(f"    Reason: {e}")
        gemini_answer = ask_gemini(question, context, max_tokens)
        return {
            "answer": gemini_answer,
            "source": "gemini"
        }
        
def ask_gemini(question, context, max_tokens = 768):
    """Answer ``question`` with Gemini, using ``context`` as reference material.

    Args:
        question: the user's question.
        context: retrieved documents, already joined into one text block.
        max_tokens: upper limit for the length of the generated answer.

    Returns:
        str: the model's answer with surrounding whitespace removed.
    """
    model = genai.GenerativeModel("gemini-1.5-flash")
    full_prompt=f"""You are a friendly cooking assistant helping someone who may be a beginner cook. 
                Always be encouraging, patient, and explain cooking concepts in a simple, easy-to-understand way.
                Please provide a concise, friendly, and helpful answer.
                Use these relevant pieces of information (context below) to answer the question:

    Context:
    {context}

    User's question: {question}

    Answer:"""
    
    # Forward the limit to the API; it was previously accepted but ignored
    response = model.generate_content(
        full_prompt,
        generation_config={"max_output_tokens": max_tokens},
    )
    return response.text.strip()


llama_model_loader: loaded meta data with 20 key-value pairs and 291 tensors from models/mistral-7b-instruct-v0.1.Q4_K_M.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = mistralai_mistral-7b-instruct-v0.1
llama_model_loader: - kv   2:                       llama.context_length u32              = 32768
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:                 lla

Loaded FAISS index with 50000 vectors


In [20]:
# Test 1 - a basic question; prints which model produced the answer
# ("mistral" or "gemini") followed by the answer text
if __name__ == "__main__":
    q = "How to make hard-boiled eggs?"
    result = ask(q, max_tokens=512)
    print(f"Source: {result['source']}")
    print(result["answer"])

llama_perf_context_print:        load time =  115055.51 ms
llama_perf_context_print: prompt eval time =  115053.08 ms /  1519 tokens (   75.74 ms per token,    13.20 tokens per second)
llama_perf_context_print:        eval time =   73753.24 ms /   277 runs   (  266.26 ms per token,     3.76 tokens per second)
llama_perf_context_print:       total time =  189186.36 ms /  1796 tokens


Source: mistral
Hi there! I'd be happy to help you learn how to make perfect hard-boiled eggs. Here's a simple recipe that I know you'll love:

Ingredients:
- Eggs
- Water
- Ice

Directions:
1. Put the eggs in a pot large enough so that they will fit in a single layer on the bottom.
2. Add enough cold tap water to cover the eggs by two inches.
3. Bring the water to a rapid boil, uncovered.
4. As soon as the water boils, shut off the heat and cover the pan.
5. Let it sit, on the burner, for exactly ten minutes.
6. While the eggs are sitting, prepare an ice water bath in a large bowl.
7. At ten minutes, dip out the eggs and move them to the ice water bath.
8. Let the eggs cool completely.
9. Peel and use!

Remember, the secret to perfect hard-boiled eggs is to cook them for exactly ten minutes and then cool them in ice water to stop the cooking process. This way, the eggs will be easy to peel and you'll get a perfectly cooked yolk every time. Enjoy your delicious hard-boiled eggs!


In [18]:
# Test 2 - more advanced: an open-ended request for several recipes; in the
# captured run below the local model failed and the answer came from Gemini
if __name__ == "__main__":
    q = "Can you suggest some beginner-friendly champignons recipes?"
    result = ask(q, max_tokens=512)
    print(f"Source: {result['source']}")
    print(result["answer"])

[!] Local model failed. Falling back to Gemini...
Source: gemini
Of course!  Let's find some mushroom recipes perfect for beginners.  The key is to pick recipes with clear instructions and not too many complicated steps.

Two great options from our list are:

* **Champignon Salat Mit Ei (German Mushroom & Egg Salad):** This salad is super easy! You just slice mushrooms, chop some ingredients, whisk a dressing, and assemble.  No cooking of the mushrooms is required, making it perfect for a first-timer.

* **Roasted Mushrooms Baked With A Bourbon Cream Sauce:** While it has multiple steps, each one is straightforward. Roasting mushrooms is simple – just toss them with oil and seasonings and bake! The sauce involves some simmering, but the instructions are clear.


Don't be afraid to experiment! Mushrooms are incredibly versatile. If you feel confident, the *Croute aux Champignons* is a delicious next step, but the other recipes are great starting points. Remember, cooking is all about ha