In [None]:
import pandas as pd
import json
from tqdm.auto import tqdm
import os
import torch
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone
from pinecone import ServerlessSpec
import time
from groq import Groq
from rank_bm25 import BM25Okapi
import numpy as np


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from dotenv import load_dotenv

# Load the environment variables from the .env file
load_dotenv(dotenv_path="../.env")
GROQ_API_KEY = os.getenv('GROQ_API_KEY')
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

In [3]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = SentenceTransformer('all-MiniLM-L6-v2', device=device)

groq_client = Groq(
  api_key=GROQ_API_KEY
)
client = groq_client

### Data Generation

In [4]:
df = pd.read_csv('../data/clean.csv')
documents = df.to_dict(orient='records')

In [5]:
documents[0]

{'name': 'Doro Wat',
 'country': 'Ethiopia',
 'ingredients': 'Chicken, onions, garlic, ginger, berbere spice mix, niter kibbeh',
 'instructions': 'In a pot, sauté onions, garlic, and ginger in niter kibbeh until soft. Add berbere spice mix, cook for a few minutes, then add chicken. Cook until the chicken is tender.',
 'meal_type': 'Main',
 'spice_level': 'High',
 'cooking_time_(minutes)': 90,
 'vegetarian': 'No',
 'main_cooking_method': 'Stewing',
 'serving_temperature': 'Hot',
 'how_to_make': 'Start by preparing niter kibbeh, a spiced clarified butter. Then sauté onions, garlic, and ginger in the niter kibbeh. Add berbere spice mix and stir for a few minutes. Add chicken pieces, cover, and cook until the chicken is thoroughly cooked.',
 'id': 'f2ff4980-fda2-4bfe-8a05-aca63be38345'}

In [6]:
prompt_template = """
You emulate a user of our Chief assistant application.
Formulate 5 questions this user might ask based on a provided exercise.
Make the questions specific to this exercise.
The record should contain the answer to the questions, and the questions should
be complete and not too short. Use as fewer words as possible from the record. 

The record:


name: {name},
ingredients: {ingredients},
instructions: {instructions},
meal_type: {meal_type},
spice_level: {spice_level},
cooking_time_(minutes): {cooking_time_(minutes)},
vegetarian: {vegetarian},
main_cooking_method: {main_cooking_method},
serving_temperature: {serving_temperature},
how_to_make: {how_to_make}


Provide the output in parsable JSON without using code blocks and don't add anything else:

{{"questions": ["question1", "question2", ..., "question5"]}}
""".strip()

In [7]:
prompt = prompt_template.format(**documents[0])


In [8]:
def llm(prompt):
    response = groq_client.chat.completions.create(
        model="llama-3.3-70b-versatile",
        messages=[{"role": "user", "content": prompt}]
    )
    
    return response.choices[0].message.content

In [9]:
questions = llm(prompt)


In [10]:
questions

'{"questions": ["What type of dish is Doro Wat and what is its spice level?", "How long does it take to cook Doro Wat and what is the main cooking method used?", "What ingredients are used to sauté the onions, garlic, and ginger in the recipe?", "Is Doro Wat suitable for vegetarians and what is the recommended serving temperature?", "What is the first step in making Doro Wat and what is added after sautéing the onions, garlic, and ginger?"]}'

In [11]:
json.loads(questions)


{'questions': ['What type of dish is Doro Wat and what is its spice level?',
  'How long does it take to cook Doro Wat and what is the main cooking method used?',
  'What ingredients are used to sauté the onions, garlic, and ginger in the recipe?',
  'Is Doro Wat suitable for vegetarians and what is the recommended serving temperature?',
  'What is the first step in making Doro Wat and what is added after sautéing the onions, garlic, and ginger?']}

In [12]:
def generate_questions(doc):
    prompt = prompt_template.format(**doc)

    response = groq_client.chat.completions.create(
        model="llama-3.3-70b-versatile",
        messages=[{"role": "user", "content": prompt}]
    )

    json_response = response.choices[0].message.content
    return json_response

In [13]:
results = {}
for doc in tqdm(documents): 
    doc_id = doc['id']
    if doc_id in results:
        continue

    questions_raw = generate_questions(doc)
    questions = json.loads(questions_raw)
    results[doc_id] = questions['questions']




100%|██████████| 166/166 [05:50<00:00,  2.11s/it]
100%|██████████| 166/166 [05:50<00:00,  2.11s/it]


In [14]:
final_results = []

for doc_id, questions in results.items():
    for q in questions:
        final_results.append((doc_id, q))


In [15]:
final_results[0]


('f2ff4980-fda2-4bfe-8a05-aca63be38345',
 'What is the primary protein used in Doro Wat?')

In [16]:
final_results[-1]

('36d849e4-35c1-4848-84e2-987fd1b33b9d',
 'At what temperature is Mitarashi Dango typically served')

In [17]:
df_results = pd.DataFrame(final_results, columns=['id', 'question'])


In [18]:
df_results.to_csv('../data/ground-truth-retrieval.csv', index=False)

In [19]:
df_results.head()

Unnamed: 0,id,question
0,f2ff4980-fda2-4bfe-8a05-aca63be38345,What is the primary protein used in Doro Wat?
1,f2ff4980-fda2-4bfe-8a05-aca63be38345,How long does it take to prepare Doro Wat?
2,f2ff4980-fda2-4bfe-8a05-aca63be38345,What is the role of niter kibbeh in the recipe?
3,f2ff4980-fda2-4bfe-8a05-aca63be38345,Is Doro Wat suitable for a vegetarian diet?
4,f2ff4980-fda2-4bfe-8a05-aca63be38345,What is the recommended serving temperature fo...


In [20]:
df[["id", "name"]].head()

Unnamed: 0,id,name
0,f2ff4980-fda2-4bfe-8a05-aca63be38345,Doro Wat
1,78c8ff6c-af26-4012-a637-af925fda17f7,Injera
2,3437bb23-0c42-4548-9165-f31f586b968f,Sushi
3,ec101289-7120-4818-99ec-40345acf99b8,Tacos
4,8395a5cd-7bd2-45f7-a19d-c4a232d02fa4,Paella


### Retrieval Evaluation

In [21]:
df_question = pd.read_csv('../data/ground-truth-retrieval.csv')


In [22]:
df_question.head()


Unnamed: 0,id,question
0,f2ff4980-fda2-4bfe-8a05-aca63be38345,What is the primary protein used in Doro Wat?
1,f2ff4980-fda2-4bfe-8a05-aca63be38345,How long does it take to prepare Doro Wat?
2,f2ff4980-fda2-4bfe-8a05-aca63be38345,What is the role of niter kibbeh in the recipe?
3,f2ff4980-fda2-4bfe-8a05-aca63be38345,Is Doro Wat suitable for a vegetarian diet?
4,f2ff4980-fda2-4bfe-8a05-aca63be38345,What is the recommended serving temperature fo...


In [23]:
ground_truth = df_question.to_dict(orient='records')
ground_truth[0]


{'id': 'f2ff4980-fda2-4bfe-8a05-aca63be38345',
 'question': 'What is the primary protein used in Doro Wat?'}

In [24]:
def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank]:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

In [25]:
pc = Pinecone(api_key=PINECONE_API_KEY)


spec = ServerlessSpec(
    cloud="aws", region="us-east-1"
)

index_name = 'chefrag'  # Changed to match your app configuration

# connect to index
index = pc.Index(index_name)
time.sleep(1)
# view index stats
index.describe_index_stats()

{'dimension': 1024,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {},
 'total_vector_count': 0,
 'vector_type': 'dense'}

In [42]:
# Create embeddings and upload to Pinecone
print("Creating embeddings and uploading to Pinecone...")

def clean_metadata(doc):
    """Clean metadata by handling NaN values and ensuring Pinecone compatibility"""
    cleaned = {}
    for key, value in doc.items():
        if pd.isna(value):
            # Skip None/NaN values entirely - don't include them in metadata
            continue
        elif isinstance(value, (int, float)) and not pd.isna(value):
            cleaned[key] = value
        elif value is not None:
            cleaned[key] = str(value)
        # Skip None values entirely
    return cleaned

def clean_value_for_text(value):
    """Clean individual values for text representation"""
    if pd.isna(value) or value is None:
        return ""
    return str(value)

# Prepare documents for embedding
docs_to_embed = []
for i, doc in enumerate(documents):
    # Clean the document first
    clean_doc = clean_metadata(doc)
    
    # Create a text representation of each document using cleaned values
    doc_text = f"""
    name: {clean_value_for_text(doc.get('name', ''))},
    ingredients: {clean_value_for_text(doc.get('ingredients', ''))},
    instructions: {clean_value_for_text(doc.get('instructions', ''))},
    meal_type: {clean_value_for_text(doc.get('meal_type', ''))},
    spice_level: {clean_value_for_text(doc.get('spice_level', ''))},
    cooking_time_(minutes): {clean_value_for_text(doc.get('cooking_time_(minutes)', ''))},
    vegetarian: {clean_value_for_text(doc.get('vegetarian', ''))},
    main_cooking_method: {clean_value_for_text(doc.get('main_cooking_method', ''))},
    serving_temperature: {clean_value_for_text(doc.get('serving_temperature', ''))},
    how_to_make: {clean_value_for_text(doc.get('how_to_make', ''))}
    """.strip()
    
    docs_to_embed.append({
        'id': str(clean_doc.get('id', i)),
        'text': doc_text,
        'metadata': clean_doc  # Use cleaned metadata (no None values)
    })

print(f"Prepared {len(docs_to_embed)} documents for embedding")

# Let's check a sample to make sure no None values
sample_metadata = docs_to_embed[0]['metadata']
print(f"Sample metadata keys: {list(sample_metadata.keys())}")
print(f"Sample metadata values with None: {[k for k, v in sample_metadata.items() if v is None]}")

# Create embeddings in batches
batch_size = 100
for i in tqdm(range(0, len(docs_to_embed), batch_size)):
    batch = docs_to_embed[i:i+batch_size]
    
    # Create embeddings for this batch
    texts = [item['text'] for item in batch]
    embeddings = model.encode(texts)
    
    # Prepare vectors for Pinecone
    vectors_to_upsert = []
    for j, (embedding, item) in enumerate(zip(embeddings, batch)):
        # Final cleanup: ensure no None values in the final metadata
        final_metadata = {
            'text': item['text'],
            **{k: v for k, v in item['metadata'].items() if v is not None}
        }
        
        vectors_to_upsert.append({
            'id': item['id'],
            'values': embedding.tolist(),
            'metadata': final_metadata
        })
    
    # Upsert to Pinecone
    index.upsert(vectors=vectors_to_upsert)
    
print(f"Successfully uploaded {len(docs_to_embed)} documents to Pinecone!")
print("Final index stats:")
print(index.describe_index_stats())

Creating embeddings and uploading to Pinecone...
Prepared 166 documents for embedding
Sample metadata keys: ['name', 'country', 'ingredients', 'instructions', 'meal_type', 'spice_level', 'cooking_time_(minutes)', 'vegetarian', 'main_cooking_method', 'serving_temperature', 'how_to_make', 'id']
Sample metadata values with None: []


100%|██████████| 2/2 [00:04<00:00,  2.12s/it]



Successfully uploaded 166 documents to Pinecone!
Final index stats:
{'dimension': 384,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {},
 'total_vector_count': 0,
 'vector_type': 'dense'}
{'dimension': 384,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {},
 'total_vector_count': 0,
 'vector_type': 'dense'}


In [43]:
def get_all_documents(batch_size=10):
    documents = []
    total_vectors = index.describe_index_stats()['total_vector_count']
    # print(f"Total vectors in index: {total_vectors}")
    
    random_vector = np.random.rand(384).tolist()  # Assuming 384 is the dimension of your vectors
    
    # Query all vectors
    query_response = index.query(
        vector=random_vector,
        top_k=total_vectors,
        include_metadata=True
    )
    
    # print(f"Retrieved {len(query_response['matches'])} vectors")
    
    for match in query_response['matches']:
        # print(f"Vector ID: {match.id}")
        # print(f"Vector metadata: {match.metadata}")
        
        if 'text' in match.metadata:
            documents.append(match.metadata['text'])
        else:
            print(f"Warning: 'text' not found in metadata for vector {match.id}")
    
    # print(f"Total documents retrieved: {len(documents)}")
    return documents


def keyword_search(query, documents, top_k=15):
    tokenized_corpus = [doc.split() for doc in documents]
    bm25 = BM25Okapi(tokenized_corpus)
    tokenized_query = query.split()
    bm25_scores = bm25.get_scores(tokenized_query)
    top_n = np.argsort(bm25_scores)[::-1][:top_k]
    return [(idx, bm25_scores[idx]) for idx in top_n]

def query_pinecone(query, top_k=5):
    xq = model.encode(query).tolist()
    xc = index.query(vector=xq, top_k=top_k, include_metadata=True)
    results = []
    for match in xc.matches:
        result = {
            "id": match.id,
            "score": match.score,
            "metadata": match.metadata
        }
        
        results.append(result)
    # print(results)
    
    return results




def hybrid_query_pinecone(query, documents, top_k=15, alpha=0.8):
    # Vector search
    xq = model.encode(query).tolist()
    vector_results = index.query(vector=xq, top_k=top_k, include_metadata=True)
    
    # Keyword search
    keyword_results = keyword_search(query, documents, top_k=top_k)
    
    # Combine results
    combined_results = {}
    for match in vector_results.matches:
        combined_results[match.id] = {
            "id": match.id,
            "vector_score": match.score,
            "keyword_score": 0,
            "metadata": match.metadata
        }
    
    for idx, score in keyword_results:
        if idx in combined_results:
            combined_results[idx]["keyword_score"] = score
        else:
            combined_results[idx] = {
                "id": idx,
                "vector_score": 0,
                "keyword_score": score,
                "metadata": None  # You might want to fetch metadata for these results
            }
    
    # Calculate hybrid score
    for result in combined_results.values():
        result["hybrid_score"] = alpha * result["vector_score"] + (1 - alpha) * result["keyword_score"]
    
    # Sort by hybrid score and return top results
    sorted_results = sorted(combined_results.values(), key=lambda x: x["hybrid_score"], reverse=True)[:top_k]
    
    return [result for result in sorted_results if result["hybrid_score"] > 0.2]

def evaluate(ground_truth, search_function):
    relevance_total = []
    count = 0

    for q in tqdm(ground_truth):
        doc_id = q['id']
        # print(doc_id)
        # try:
        #     results = search_function(q)
        #     count += 1
        # except:
        #     pass
        results = search_function(q)
        count += 1
        # print(results)
        relevance = [d['id'] == doc_id for d in results]
        # print(relevance)
        relevance_total.append(relevance)
        # break

    # print(count)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

#### Only Vector Search

In [44]:
evaluate(ground_truth, lambda q: query_pinecone(q['question']))


100%|██████████| 830/830 [05:24<00:00,  2.56it/s]


{'hit_rate': 0.9891566265060241, 'mrr': 0.9505823293172688}

- {'hit_rate': 0.8880778588807786, 'mrr': 0.848337388483374}


#### Hybrid Search

In [45]:
all_documents = get_all_documents()
print(f"Length of all_documents: {len(all_documents)}")
print("First 5 documents:")
print(all_documents[:5])  # Print the first 5 documents to see their content


Length of all_documents: 166
First 5 documents:
['name: Chicken Tikka Masala,\n    ingredients: Chicken, yogurt, tomatoes, onions, garlic, ginger, spices,\n    instructions: Marinate chicken in yogurt and spices, grill. Cook in a sauce of tomatoes, onions, and spices.,\n    meal_type: Main,\n    spice_level: Medium,\n    cooking_time_(minutes): 45,\n    vegetarian: No,\n    main_cooking_method: Grilling,\n    serving_temperature: Hot,\n    how_to_make: Marinate chicken in yogurt and spices, grill until charred, then simmer in a tomato-based sauce until cooked through.', 'name: Tiramisu,\n    ingredients: Mascarpone, espresso, ladyfingers, cocoa powder,\n    instructions: Layer mascarpone with espresso-dipped ladyfingers, chill and dust with cocoa powder.,\n    meal_type: Dessert,\n    spice_level: ,\n    cooking_time_(minutes): 20,\n    vegetarian: Yes,\n    main_cooking_method: No-cook,\n    serving_temperature: Cold,\n    how_to_make: Whisk mascarpone with sugar, dip ladyfingers in e

In [46]:
evaluate(ground_truth, lambda q: hybrid_query_pinecone(q['question'], all_documents))

100%|██████████| 830/830 [05:21<00:00,  2.58it/s]


{'hit_rate': 0.4819277108433735, 'mrr': 0.09494723482675312}

### RAG Evaluation

In [47]:
prompt2_template = """
You are an expert evaluator for a RAG system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks and make sure it is json parsable and nothing else is added
to the below structure:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()

In [48]:
len(ground_truth)


830

In [49]:
ground_truth[0]["question"]

'What is the primary protein used in Doro Wat?'

In [50]:
GROQ_API_KEY = os.getenv('GROQ_API_KEY')



In [52]:
def format_dish_info(dish):
    return "\n".join([f"{key}: {value}" for key, value in dish['metadata'].items() if value])

def qa_function(question):
    # Query Pinecone
    results = query_pinecone(question)
    # print(results)
    if not results:
        return "I'm sorry, I couldn't find any relevant information to answer your question."
    
    # Format the dish information
    all_dish_info = "\n\n".join([format_dish_info(dish) for dish in results])

    # print(all_dish_info)
    
    # Create the prompt
    prompt = f"""
    Based on the following information about a dish, please answer the question: {question}

    Dish information:
    {all_dish_info}

    Answer:
    """
    
    # Use Groq to generate an answer
    response = llm(prompt)
    
    return response

def llm(prompt, model="llama-3.3-70b-versatile"):
    client = Groq(
        api_key=GROQ_API_KEY
    )

    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": prompt
            }
        ],
        model=model,
    )

    # print(chat_completion.choices[0].message.content)
    return chat_completion.choices[0].message.content

# Test the QA function
question = ground_truth[0]["question"]
answer = qa_function(question)
print(f"Question: {question}")
print(f"Answer: {answer}")

Question: What is the primary protein used in Doro Wat?
Answer: The primary protein used in Doro Wat is not specified in the given information, as Doro Wat is not among the listed dishes. However, based on general knowledge, Doro Wat is a traditional Ethiopian dish, and its primary protein is usually chicken.


In [53]:
prompt = prompt2_template.format(question=question, answer_llm=answer)
print(prompt)

You are an expert evaluator for a RAG system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: What is the primary protein used in Doro Wat?
Generated Answer: The primary protein used in Doro Wat is not specified in the given information, as Doro Wat is not among the listed dishes. However, based on general knowledge, Doro Wat is a traditional Ethiopian dish, and its primary protein is usually chicken.

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks and make sure it is json parsable and nothing else is added
to the below structure:

{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}


In [54]:
df_sample = df_question.sample(n=200, random_state=1)

In [55]:
df_sample

Unnamed: 0,id,question
786,d306030a-75d6-4421-8c87-788b35610eb3,How long does it take to cook Tarte Tatin in t...
522,4f147bad-7ace-4951-a05e-d4e8fb1f3ebd,Is Pumpkin Soup suitable for a vegetarian diet?
811,8f03b61d-e6e3-490e-92da-04bc8d207b18,Do Croquettes contain any meat ingredients?
579,033d2093-952e-4b56-a11b-e990344ccefe,How would you describe the texture and appeara...
573,dbcb585d-c83f-4b93-b925-9b8e641ded80,What is the recommended serving temperature fo...
...,...,...
0,f2ff4980-fda2-4bfe-8a05-aca63be38345,What is the primary protein used in Doro Wat?
344,8d1f94b2-6458-4fb2-a38a-ab67afd6f80a,What is the first step in preparing the Risotto?
286,726da803-6b23-470e-b469-5ff6a1a855f6,How long does it take to cook Hainanese Chicke...
74,1c7876c7-05d5-46e0-a12a-1ef72b241012,Does Borscht have a high spice level?


In [56]:
sample = df_sample.to_dict(orient='records')


In [57]:
evaluations = []

for record in tqdm(sample):
    question = record['question']
    answer_llm = qa_function(question) 

    prompt = prompt2_template.format(
        question=question,
        answer_llm=answer_llm
    )

    evaluation = llm(prompt)
    try:
        evaluation = json.loads(evaluation)
        evaluations.append((record, answer_llm, evaluation))
    except:
        pass

 13%|█▎        | 26/200 [02:41<17:57,  6.20s/it]


RateLimitError: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama-3.3-70b-versatile` in organization `org_01k4c06yfhfn1r8137h42k3yet` service tier `on_demand` on tokens per day (TPD): Limit 100000, Used 99443, Requested 1196. Please try again in 9m11.865s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}

In [None]:
evaluations[0]

In [None]:
df_eval = pd.DataFrame(evaluations, columns=['record', 'answer', 'evaluation'])

df_eval['id'] = df_eval.record.apply(lambda d: d['id'])
df_eval['question'] = df_eval.record.apply(lambda d: d['question'])

df_eval['relevance'] = df_eval.evaluation.apply(lambda d: d['Relevance'])
df_eval['explanation'] = df_eval.evaluation.apply(lambda d: d['Explanation'])

del df_eval['record']
del df_eval['evaluation']

NameError: name 'evaluations' is not defined

In [None]:
df_eval.relevance.value_counts(normalize=True)


In [None]:
df_eval[df_eval["relevance"] == "NON_RELEVANT"][["question", "answer"]]