# Offline RAG Evaluation

## Load documents with IDs

In [1]:
import requests 

# Download the FAQ documents (each one carries an 'id' field) from the course repository.
base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/documents-with-ids.json'
docs_url = f'{base_url}/{relative_url}?raw=1'
# A timeout keeps the cell from hanging forever on a stalled connection.
docs_response = requests.get(docs_url, timeout=30)
# Fail here with a clear HTTP error instead of a confusing JSON decode error below.
docs_response.raise_for_status()
documents = docs_response.json()

In [2]:
documents[10]

{'text': 'It depends on your background and previous experience with modules. It is expected to require about 5 - 15 hours per week. [source1] [source2]\nYou can also calculate it yourself using this data and then update this answer.',
 'section': 'General course-related questions',
 'question': 'Course - \u200b\u200bHow many hours per week am I expected to spend on this  course?',
 'course': 'data-engineering-zoomcamp',
 'id': 'ea739c65'}

## Load ground truth

In [3]:
import pandas as pd

# Location of the ground-truth CSV in the course repository.
base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/ground-truth-data.csv'
ground_truth_url = '/'.join([base_url, relative_url]) + '?raw=1'

# Load the CSV and keep only the rows for the machine-learning course.
df_ground_truth = (
    pd.read_csv(ground_truth_url)
    .loc[lambda frame: frame['course'] == 'machine-learning-zoomcamp']
)
# One dict per remaining row.
ground_truth = df_ground_truth.to_dict(orient='records')

In [4]:
ground_truth[10]

{'question': 'Are sessions recorded if I miss one?',
 'course': 'machine-learning-zoomcamp',
 'document': '5170565b'}

In [5]:
# Build a lookup from each document's 'id' to the full document dict,
# then display the text of the document the ground-truth record above points to.
doc_idx = {d['id']: d for d in documents}
doc_idx['5170565b']['text']

'Everything is recorded, so you won’t miss anything. You will be able to ask your questions for office hours in advance and we will cover them during the live stream. Also, you can always ask questions in Slack.'

## Index data

In [6]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model used in this notebook to encode documents,
# search queries and answers.
model_name = 'multi-qa-MiniLM-L6-cos-v1'
model = SentenceTransformer(model_name)


In [7]:
from elasticsearch import Elasticsearch

# Client for the Elasticsearch instance expected at localhost:9200.
es_client = Elasticsearch('http://localhost:9200') 

# Index definition: the FAQ fields plus one dense vector per document.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            # `course` is a keyword so it can be used in the exact-match term filter of the kNN query.
            "course": {"type": "keyword"},
            "id": {"type": "keyword"},
            # Embedding of question + text, filled in by the indexing loop.
            # NOTE(review): `dims` presumably has to match the embedding size of the
            # model loaded above (assumed 384) — confirm if the model is changed.
            "question_text_vector": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine"
            },
        }
    }
}

index_name = "course-questions"

# Drop any existing index first so that re-running this cell starts from an empty index.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [8]:
from tqdm.auto import tqdm

# Embed every document (question and answer text joined by a space) and index it.
# Note: the vector is stored on the document dict itself, so the entries of
# `documents` (and therefore `doc_idx`) gain a 'question_text_vector' key.
for doc in tqdm(documents):
    question = doc['question']
    text = doc['text']
    doc['question_text_vector'] = model.encode(question + ' ' + text)

    # One index request per document.
    es_client.index(index=index_name, document=doc)

  0%|          | 0/948 [00:00<?, ?it/s]



## Retrieval

In [9]:
def elastic_search_knn(field, vector, course):
    """Run a course-filtered kNN search and return the matching documents.

    Args:
        field: name of the dense-vector field to search.
        vector: query embedding to compare against `field`.
        course: value the hit's `course` field must equal.

    Returns:
        A list with the `_source` dict of every hit, in the order
        Elasticsearch returned them.
    """
    search_query = {
        "knn": {
            "field": field,
            "query_vector": vector,
            "k": 5,
            "num_candidates": 10000,
            # Restrict candidates to the requested course.
            "filter": {"term": {"course": course}},
        },
        # Only these fields are returned for each hit.
        "_source": ["text", "section", "question", "course", "id"],
    }

    response = es_client.search(index=index_name, body=search_query)
    return [hit['_source'] for hit in response['hits']['hits']]

def question_text_vector_knn(q):
    """Retrieve documents for a ground-truth style record.

    Args:
        q: mapping with a 'question' string and a 'course' name.

    Returns:
        The documents returned by `elastic_search_knn` when the encoded
        question is searched against the 'question_text_vector' field.
    """
    question, course = q['question'], q['course']
    query_vector = model.encode(question)
    return elastic_search_knn('question_text_vector', query_vector, course)

In [10]:
question_text_vector_knn(dict(
    question='Are sessions recorded if I miss one?',
    course='machine-learning-zoomcamp'
))

[{'question': 'What if I miss a session?',
  'course': 'machine-learning-zoomcamp',
  'section': 'General course-related questions',
  'text': 'Everything is recorded, so you won’t miss anything. You will be able to ask your questions for office hours in advance and we will cover them during the live stream. Also, you can always ask questions in Slack.',
  'id': '5170565b'},
 {'question': 'Is it going to be live? When?',
  'course': 'machine-learning-zoomcamp',
  'section': 'General course-related questions',
  'text': 'The course videos are pre-recorded, you can start watching the course right now.\nWe will also occasionally have office hours - live sessions where we will answer your questions. The office hours sessions are recorded too.\nYou can see the office hours as well as the pre-recorded course videos in the course playlist on YouTube.',
  'id': '39fda9f0'},
 {'question': 'The same accuracy on epochs',
  'course': 'machine-learning-zoomcamp',
  'section': '8. Neural Networks an

## The RAG flow

In [11]:
from groq import Groq
import os
from dotenv import load_dotenv

# Load variables from a local .env file (python-dotenv), then create the Groq
# client with the key found in the GROQ_API_KEY environment variable.
load_dotenv()

client = Groq(
    api_key=os.getenv("GROQ_API_KEY")
)

In [12]:
def build_prompt(query, search_results):
    """Assemble the teaching-assistant prompt for one question.

    Args:
        query: the question text inserted after ``QUESTION:``.
        search_results: iterable of dicts with 'section', 'question' and
            'text' keys; each one becomes an entry of the CONTEXT part.

    Returns:
        The formatted prompt with surrounding whitespace stripped.
    """
    template = (
        "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.\n"
        "Use only the facts from the CONTEXT when answering the QUESTION.\n"
        "\n"
        "QUESTION: {question}\n"
        "\n"
        "CONTEXT: \n"
        "{context}"
    )

    # One block per retrieved document, each followed by a blank line.
    entries = [
        f"section: {hit['section']}\nquestion: {hit['question']}\nanswer: {hit['text']}\n\n"
        for hit in search_results
    ]

    return template.format(question=query, context="".join(entries)).strip()

In [13]:
def llm(prompt, model='mixtral-8x7b-32768'):
    """Send `prompt` as a single user message and return the reply text.

    Args:
        prompt: text placed in the user message.
        model: model identifier passed to the chat-completions call.

    Returns:
        The message content of the first choice in the response.
    """
    user_message = {"role": "user", "content": prompt}
    completion = client.chat.completions.create(
        model=model,
        messages=[user_message],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [14]:
# previously: rag(query: str) -> str
def rag(query: dict, model='mixtral-8x7b-32768') -> str:
    """Answer one record: retrieve, build the prompt, ask the LLM.

    Args:
        query: record with 'question' and 'course' keys.
        model: model identifier forwarded to `llm`.

    Returns:
        The answer text produced by `llm`.
    """
    retrieved = question_text_vector_knn(query)
    return llm(build_prompt(query['question'], retrieved), model=model)

In [15]:
ground_truth[10]

{'question': 'Are sessions recorded if I miss one?',
 'course': 'machine-learning-zoomcamp',
 'document': '5170565b'}

In [16]:
rag(ground_truth[10])

'Yes, sessions are recorded and you can watch them later if you miss a session. This includes both the pre-recorded course videos and the office hours sessions where live questions are answered. However, if you miss a midterm project, you can still receive a certificate, as explained in a previous answer. The context does not mention anything about missing data treatment for this question.'

In [17]:
doc_idx['5170565b']['text']

'Everything is recorded, so you won’t miss anything. You will be able to ask your questions for office hours in advance and we will cover them during the live stream. Also, you can always ask questions in Slack.'

## Cosine similarity metric

In [23]:
# Results differ from the course video because we use a different model.

# Answer generated by rag(ground_truth[10]) in the cell above.
answer_llm = 'Yes, sessions are recorded and you can watch them later if you miss a session. This includes both the pre-recorded course videos and the office hours sessions where live questions are answered. However, if you miss a midterm project, you can still receive a certificate, as explained in a previous answer. The context does not mention anything about missing data treatment for this question.'
# Original FAQ answer of document 5170565b.
answer_orig = 'Everything is recorded, so you won’t miss anything. You will be able to ask your questions for office hours in advance and we will cover them during the live stream. Also, you can always ask questions in Slack.'


v_llm = model.encode(answer_llm)
v_orig = model.encode(answer_orig)

# Dot product of the two embeddings.
# NOTE(review): this equals cosine similarity only if the model returns
# unit-length vectors — presumably true for this "-cos-" model; confirm.
v_llm.dot(v_orig)

0.59431124

In [24]:
ground_truth[0]

{'question': 'Where can I sign up for the course?',
 'course': 'machine-learning-zoomcamp',
 'document': '0227b872'}

In [25]:
len(ground_truth)

1830

In [27]:
# Generate an answer with the default model of `rag` for every ground-truth
# record and pair it with the text of the document named by the record's
# 'document' id.
# NOTE(review): `answers` is reset every time this cell runs, so the
# `if i in answers` check below never skips anything; resuming an interrupted
# run would require the initialisation to live in a separate cell.
answers = {}

for i, rec in enumerate(tqdm(ground_truth)):
    if i in answers:
        continue

    answer_llm = rag(rec)
    doc_id = rec['document']
    original_doc = doc_idx[doc_id]
    answer_orig = original_doc["text"]

    # Keyed by position in `ground_truth`.
    answers[i] = {
        "answer_llm": answer_llm,
        "answer_orig": answer_orig,
        "document": doc_id,
        "question": rec['question'],
        "course": rec["course"]
    }

  0%|          | 0/1830 [00:00<?, ?it/s]

In [None]:
# One slot per ground-truth record; slots without an answer stay None.
results_mistral8b = [None for _ in ground_truth]

for i, val in answers.items():
    # Ground-truth fields are merged last, so they win on key clashes.
    results_mistral8b[i] = {**val, **ground_truth[i]}

In [None]:
import pandas as pd

In [None]:
df_mistral8b = pd.DataFrame(results_mistral8b)

In [None]:
# pandas does not create missing parent directories, so make sure 'data/' exists
# before the first file is written into it.
os.makedirs('data', exist_ok=True)
df_mistral8b.to_csv('data/results-mistral8b.csv', index=False)

## Evaluating Llama3 8b

In [None]:
rag(ground_truth[10], model="llama3-8b-8192")

In [None]:
# Same procedure as for the default model, but answers are generated with
# "llama3-8b-8192" and collected in `answers_llama`.
# NOTE(review): `answers_llama` is reset every time this cell runs, so the
# `if i in answers_llama` check below never skips anything.
answers_llama = {}

for i, rec in enumerate(tqdm(ground_truth)):
    if i in answers_llama:
        continue

    answer_llm = rag(rec, model="llama3-8b-8192")
    doc_id = rec['document']
    original_doc = doc_idx[doc_id]
    answer_orig = original_doc["text"]

    # Keyed by position in `ground_truth`.
    answers_llama[i] = {
        "answer_llm": answer_llm,
        "answer_orig": answer_orig,
        "document": doc_id,
        "question": rec['question'],
        "course": rec["course"]
    }

In [None]:
# One slot per ground-truth record; slots without an answer stay None.
results_llama3 = [None] * len(ground_truth)

# Use the Llama answers collected in `answers_llama`; iterating `answers` here
# would silently copy the other model's results into the Llama table.
for i, val in answers_llama.items():
    results_llama3[i] = val.copy()
    results_llama3[i].update(ground_truth[i])

In [None]:
df_llama3 = pd.DataFrame(results_llama3)
df_llama3.to_csv('data/results-llama3.csv', index=False)

## Cosine similarity

### Mixtral 8x7b

In [None]:
results_mistral8b = df_mistral8b.to_dict(orient='records')

In [None]:
def compute_similarity(record):
    """Score how close a generated answer is to the original one.

    Args:
        record: mapping with 'answer_orig' and 'answer_llm' strings.

    Returns:
        The dot product of the two answer embeddings produced by `model`.
    """
    reference, generated = record['answer_orig'], record['answer_llm']
    return model.encode(generated).dot(model.encode(reference))


In [None]:
# Score every record of `results_mistral8b` with compute_similarity,
# keeping the scores in record order.
similarity = []

for record in tqdm(results_mistral8b):
    sim = compute_similarity(record)
    similarity.append(sim)

In [None]:
df_mistral8b['cosine'] = similarity
df_mistral8b['cosine'].describe()

In [None]:
import seaborn as sns

### Llama 3 8b

In [None]:
results_llama3 = df_llama3.to_dict(orient='records')

similarity_llama = []

for record in tqdm(results_llama3):
    sim = compute_similarity(record)
    similarity_llama.append(sim)

In [None]:
# Attach the scores to the Llama results frame. The frame is named `df_llama3`
# everywhere else in the notebook; `df_llama3b` is never defined.
df_llama3['cosine'] = similarity_llama
df_llama3['cosine'].describe()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Overlay the similarity distributions of both models on one set of axes.
sns.histplot(df_mistral8b['cosine'], label='Mistral')
sns.histplot(df_llama3['cosine'], label='Llama 3')

plt.title("RAG LLM performance")
plt.xlabel("A->Q->A' Cosine Similarity")
plt.legend()

## LLM-as-a-judge

In [None]:
# Judge prompt 1: grades the generated answer against the original answer.
# Placeholders: {answer_orig}, {question}, {answer_llm}.
# Doubled braces are literal braces after str.format.
prompt1_template = """
You are an expert evaluator for a Retrieval-Augmented Generation (RAG) system.
Your task is to analyze the relevance of the generated answer compared to the original answer provided.
Based on the relevance and similarity of the generated answer to the original answer, you will classify
it as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Original Answer: {answer_orig}
Generated Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the original
answer and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()

# Judge prompt 2: grades the generated answer against the question only.
# Placeholders: {question}, {answer_llm}.
prompt2_template = """
You are an expert evaluator for a Retrieval-Augmented Generation (RAG) system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()

In [None]:
df_sample = df_llama3.sample(n=150, random_state=1)

In [None]:
samples = df_sample.to_dict(orient='records')

In [None]:
record = samples[0]
record

In [None]:
prompt = prompt1_template.format(**record)
print(prompt)

In [None]:
answer = llm(prompt, model='llama3-8b-8192')

In [None]:
import json

In [None]:
# Ask the judge model to grade every sampled record with prompt 1.
# The raw string replies are collected here and parsed in a later cell.
evaluations = []

for record in tqdm(samples):
    prompt = prompt1_template.format(**record)
    evaluation = llm(prompt, model='llama3-8b-8192')
    evaluations.append(evaluation)

In [None]:
# Parse each raw judge reply as JSON.
# NOTE(review): json.loads raises on any reply that is not valid JSON, which
# stops the loop; `i` then identifies the offending entry in `evaluations`.
json_evaluations = []

for i, str_eval in enumerate(evaluations):
    json_eval = json.loads(str_eval)
    json_evaluations.append(json_eval)

In [None]:
df_evaluations = pd.DataFrame(json_evaluations)

In [None]:
df_evaluations.Relevance.value_counts()

In [None]:
df_evaluations[df_evaluations.Relevance == 'NON_RELEVANT']

In [None]:
# Inspect one sampled record; the list is named `samples` (`sample` is never defined).
samples[4]

In [None]:
prompt = prompt2_template.format(**record)
print(prompt)

In [None]:
evaluation = llm(prompt, model='llama3-8b-8192')
print(evaluation)

In [None]:
# Ask the judge model to grade every sampled record with prompt 2.
# The raw string replies are collected here and parsed in a later cell.
evaluations_2 = []

for record in tqdm(samples):
    prompt = prompt2_template.format(**record)
    evaluation = llm(prompt, model='llama3-8b-8192')
    evaluations_2.append(evaluation)

In [None]:
# Parse each raw judge reply from the second prompt as JSON.
# NOTE(review): json.loads raises on any reply that is not valid JSON, which
# stops the loop; `i` then identifies the offending entry in `evaluations_2`.
json_evaluations_2 = []

for i, str_eval in enumerate(evaluations_2):
    json_eval = json.loads(str_eval)
    json_evaluations_2.append(json_eval)

In [None]:
df_evaluations_2 = pd.DataFrame(json_evaluations_2)

In [None]:
df_evaluations_2[df_evaluations_2.Relevance == 'NON_RELEVANT']

In [None]:
samples[45]

## Saving all the data

In [None]:
df_mistral8b.to_csv('data/results-mistral8b-cosine.csv', index=False)
df_llama3.to_csv('data/results-llama3b-cosine.csv', index=False)

In [None]:
df_evaluations.to_csv('data/evaluations-aqa.csv', index=False)
df_evaluations_2.to_csv('data/evaluations-qa.csv', index=False)