In [2]:
import requests 

docs_url = 'https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/refs/heads/main/03-evaluation/search_evaluation/documents-with-ids.json'

# Download the FAQ records. The timeout keeps a stalled connection from hanging
# the kernel indefinitely, and raise_for_status() surfaces HTTP errors (404, 5xx)
# directly instead of as a confusing JSON decode failure on an error page.
docs_response = requests.get(docs_url, timeout=30)
docs_response.raise_for_status()
documents = docs_response.json()

In [5]:
from minsearch import Index

# Build the search index over the FAQ records: "question", "text" and "section"
# are searched as free text, while "course" is a keyword field used for filtering.
index = Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]
)

# Kept as the last expression of the cell, so the fitted Index repr is displayed.
index.fit(documents)

<minsearch.minsearch.Index at 0x1b7b700af90>

In [9]:
def search(question, course='data-engineering-zoomcamp', num_results=5):
    """Retrieve the FAQ records most relevant to a question.

    Args:
        question: Free-text query to search for.
        course: Value of the ``course`` keyword field that results are
            restricted to. Defaults to ``'data-engineering-zoomcamp'``, which
            was previously hard-coded, so existing calls behave the same.
        num_results: Maximum number of records to return (default 5, the
            previously hard-coded value).

    Returns:
        The result of ``index.search``: a list of matching document dicts.
    """
    return index.search(
        question,
        # Matches in the "question" field are boosted 3x; "section" matches
        # are down-weighted to 0.3.
        boost_dict={'question': 3.0, 'section': 0.3},
        filter_dict={'course': course},
        num_results=num_results
    )

In [12]:
# System-level instructions for the answering model: answer strictly from the
# retrieved FAQ context.
instructions = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.
""".strip()

# User-message template. The {question} and {context} placeholders are filled
# in with str.format (see build_prompt).
prompt_template = """
<QUESTION>
{question}
</QUESTION>

<CONTEXT>
{context}
</CONTEXT>
""".strip()

In [13]:
import json

def build_prompt(question, search_results):
    """Render the user prompt for the LLM.

    The search results are serialised to a JSON string and substituted,
    together with the question, into ``prompt_template``.

    Args:
        question: The user's question.
        search_results: JSON-serialisable search results to use as context.

    Returns:
        The formatted prompt string.
    """
    fields = {
        "question": question,
        "context": json.dumps(search_results),
    }
    return prompt_template.format(**fields)

In [14]:
from openai import OpenAI

openai_client = OpenAI()

def llm(user_prompt, instructions=None, model="gpt-4o-mini"):
    """Send a prompt to the model and return its text output.

    Args:
        user_prompt: Content of the user message.
        instructions: Optional content of a system message; when empty or
            None, no system message is sent.
        model: Name of the model to call.

    Returns:
        The ``output_text`` of the response.
    """
    user_message = {"role": "user", "content": user_prompt}

    if instructions:
        # The system message, when present, goes ahead of the user message.
        system_message = {"role": "system", "content": instructions}
        conversation = [system_message, user_message]
    else:
        conversation = [user_message]

    reply = openai_client.responses.create(model=model, input=conversation)
    return reply.output_text

In [15]:
def rag(question):
    """Answer a question with retrieval-augmented generation.

    Searches the FAQ index for the question, builds the user prompt from the
    hits, and asks the LLM to answer under the module-level ``instructions``.

    Args:
        question: The user's question.

    Returns:
        The model's answer text.
    """
    retrieved = search(question)
    return llm(build_prompt(question, retrieved), instructions=instructions)

In [16]:
rag('can I join the course now?')

"Yes, you can still join the course even if you haven't registered. You are eligible to submit the homeworks, but be aware of deadlines for the final projects."

- logs (Q, retrieval results, final answer)
-  

## Generate data

In [17]:
documents[10]

{'text': 'It depends on your background and previous experience with modules. It is expected to require about 5 - 15 hours per week. [source1] [source2]\nYou can also calculate it yourself using this data and then update this answer.',
 'section': 'General course-related questions',
 'question': 'Course - \u200b\u200bHow many hours per week am I expected to spend on this  course?',
 'course': 'data-engineering-zoomcamp',
 'id': 'ea739c65'}

In [22]:
def llm_structured(instructions, user_prompt, output_type, model="gpt-4o-mini"):
    """Call the model and get back a parsed, structured result.

    Args:
        instructions: Content of the system message.
        user_prompt: Content of the user message.
        output_type: Type passed as ``text_format`` to describe the expected
            output structure (e.g. the ``Questions`` model).
        model: Name of the model to call.

    Returns:
        The ``output_parsed`` attribute of the response.
    """
    system_message = {"role": "system", "content": instructions}
    user_message = {"role": "user", "content": user_prompt}

    parsed_reply = openai_client.responses.parse(
        model=model,
        input=[system_message, user_message],
        text_format=output_type,
    )
    return parsed_reply.output_parsed

In [29]:
# System instructions for synthetic question generation: the model is given one
# FAQ record (as JSON) and asked for 5 questions that the record answers.
data_gen_instructions = """
You emulate a student who's taking our course.
Formulate 5 questions this student might ask based on a FAQ record. The record
should contain the answer to the questions, and the questions should be complete and not too short. 
""".strip()

In [30]:
from pydantic import BaseModel

# Structured-output schema for question generation; passed to llm_structured
# as output_type. Documented with comments rather than a docstring so nothing
# is added to the model definition itself.
class Questions(BaseModel):
    # The generated questions, one string per question.
    questions: list[str]

In [31]:
# Work on the first 100 FAQ records only.
faq_dataset = documents[:100]

In [32]:
llm_structured(
    data_gen_instructions,
    json.dumps(faq_dataset[0]),
    Questions
)

Questions(questions=["What is the exact date and time of the course's commencement, and how will the first session be structured?", 'How can I ensure that I receive all updates and announcements related to the course?', 'Could you clarify the registration process for the course and any specific links I need to use?', 'Is there a particular platform I need to subscribe to in order to keep track of the course schedule?', "What steps do I need to take in order to join the DataTalks.Club's Slack channel for the course?"])

In [33]:
# `_` is IPython's "last cell output" variable: this captures the Questions
# object displayed by the previous cell, so it only works when run right after
# that cell (it is fragile under Restart & Run All or out-of-order execution).
qs = _

In [34]:
qs.questions

["What is the exact date and time of the course's commencement, and how will the first session be structured?",
 'How can I ensure that I receive all updates and announcements related to the course?',
 'Could you clarify the registration process for the course and any specific links I need to use?',
 'Is there a particular platform I need to subscribe to in order to keep track of the course schedule?',
 "What steps do I need to take in order to join the DataTalks.Club's Slack channel for the course?"]

In [35]:
faq_dataset[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp',
 'id': 'c02e79ef'}

In [36]:
search('How can I ensure that I receive all updates and announcements related to the course?')

[{'text': 'Star the repo! Share it with friends if you find it useful ❣️\nCreate a PR if you see you can improve the text or the structure of the repository.',
  'section': 'General course-related questions',
  'question': 'How can we contribute to the course?',
  'course': 'data-engineering-zoomcamp',
  'id': '2f19301f'},
 {'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
  'section': 'General course-related questions',
  'question': 'Course - When will the course start?',
  'course': 'data-engineering-zoomcamp',
  'id': 'c02e79ef'},
 {'text': 'Yes, we will keep all

In [37]:
from tqdm.auto import tqdm
from concurrent.futures import ThreadPoolExecutor

def map_progress(pool, seq, f):
    """Map function f over seq using the provided executor pool while
    displaying a tqdm progress bar. Returns a list of results in submission order.

    Args:
        pool: An executor exposing ``submit`` (e.g. a ThreadPoolExecutor).
        seq: Any iterable of inputs. It is materialised into a list first, so
            generators and other iterables without ``len()`` are accepted.
        f: Callable applied to each element of ``seq``.

    Returns:
        A list with ``f(el)`` for every element, in the order of ``seq``.

    Raises:
        Whatever ``f`` raises: ``Future.result()`` re-raises the exception of
        the first failed task encountered while collecting results.
    """
    # Materialise once: the progress bar needs a total, and plain iterables
    # (generators, map objects, ...) do not support len().
    items = list(seq)

    with tqdm(total=len(items)) as progress:
        futures = []

        for el in items:
            future = pool.submit(f, el)
            # Advance the bar as each task completes, regardless of order.
            future.add_done_callback(lambda _done: progress.update())
            futures.append(future)

        # Collect in submission order; result() blocks until the task is done.
        return [future.result() for future in futures]

In [41]:
def process(doc):
    """Generate ground-truth questions for a single FAQ record.

    The record is sent to the LLM as JSON and the structured ``Questions``
    reply is turned into one entry per generated question.

    Args:
        doc: FAQ record dict; its 'course' and 'id' values are copied into
            every returned entry.

    Returns:
        A list of dicts with keys 'question', 'course' and 'id'.
    """
    generated = llm_structured(
        data_gen_instructions,
        json.dumps(doc),
        Questions
    )

    return [
        {'question': text, 'course': doc['course'], 'id': doc['id']}
        for text in generated.questions
    ]

In [42]:
process(faq_dataset[10])

[{'course': 'data-engineering-zoomcamp',
  'id': 'ea739c65',
  'question': 'What factors should I consider when determining how many hours per week I will need to dedicate to the course?'},
 {'course': 'data-engineering-zoomcamp',
  'id': 'ea739c65',
  'question': 'Can you provide more details on how my previous experience might influence the time I need to spend each week?'},
 {'course': 'data-engineering-zoomcamp',
  'id': 'ea739c65',
  'question': 'Is there a method or calculation I can use to estimate my weekly hours based on my background?'},
 {'course': 'data-engineering-zoomcamp',
  'id': 'ea739c65',
  'question': 'Are there any additional resources or recommendations for managing my time effectively while taking this course?'},
 {'course': 'data-engineering-zoomcamp',
  'id': 'ea739c65',
  'question': 'What is the expected time commitment for students who may not have any background in data engineering?'}]

In [43]:
# Generate questions for every record in faq_dataset using a pool of 6 worker
# threads. Each element of ground_truth is the list returned by process() for
# one record.
with ThreadPoolExecutor(max_workers=6) as pool:
    ground_truth = map_progress(pool, faq_dataset, process)

  0%|          | 0/100 [00:00<?, ?it/s]

LLM as a Judge

In [46]:
# Flatten the per-record lists into a single list of question entries.
ground_truth_flat = [item for sublist in ground_truth for item in sublist]

In [48]:
import pandas as pd

In [49]:
# Tabular view of the ground truth: one row per generated question.
df_ground_truth = pd.DataFrame(ground_truth_flat)

In [54]:
# Pick a single ground-truth entry to try retrieval on by hand.
q = ground_truth_flat[10]

In [56]:
# Run retrieval for the selected question and keep only the ids of the hits,
# in ranked order, so they can be compared with q['id'].
search_results = search(question=q['question'])
search_results_id = [d['id'] for d in search_results]

In [67]:
# Retrieval evaluation over all ground-truth questions:
#   cnt        - number of questions whose source document was retrieved
#   total_rank - sum of reciprocal ranks (1 / position of the first hit)
# Dividing each by len(ground_truth_flat) gives hit rate and MRR respectively.
cnt = 0
total_rank = 0.0

for q in tqdm(ground_truth_flat):
    search_results = search(question=q['question'])
    search_results_id = [d['id'] for d in search_results]
    # Relevance flags: line[k] is True when the k-th result is the document
    # the question was generated from.
    line = [q['id'] == item for item in search_results_id]

    if True in line:
        # Only the first (best-ranked) hit contributes; ranks are 1-based.
        rank = line.index(True) + 1
        total_rank += 1 / rank
        cnt += 1

  0%|          | 0/501 [00:00<?, ?it/s]

In [66]:
cnt / len(ground_truth_flat)

0.6786427145708582

In [68]:
total_rank / len(ground_truth_flat)

0.5243512974051897

In [63]:
search_results_id

['63394d91', '29d3d343', '8dc77677', '1f6520ca', '06021091']

In [64]:
q['id']

'acf42bb8'

MRR

In [None]:
# Worked MRR example. Each row is one query and each column a result position
# (1-5); True marks a relevant result. The trailing comment on a row is its
# reciprocal rank: 1 / (position of the first True), or 0 when there is no hit.
example = [
    # 1     2       3     4        5
    [True, True, False, False, False],  # 1 / 1 => 1
    [False, False, False, False, False], # 0
    [False, False, False, False, False], # 0
    [False, False, False, False, False], # 0
    [False, False, False, False, False], # 0
    [False, True, True, False, False],  # 1 / 2
    [False, False, True, False, False],  # 1 / 3
    [True, False, False, False, False],  # 1
    [True, False, False, False, False],  # 1
    [True, False, False, False, False],  # 1
    [False, False, True, False, False],  # 1 / 3
    [False, False, False, False, False], # 0
]