## Generate ground truth file for evaluation
Purpose of this code is to generate ground_truth_data.csv

In [6]:
from openai import OpenAI
from tqdm.auto import tqdm
from urllib.request import urlopen
import cloudpickle as cp
import json
#client = OpenAI()

In [2]:
import requests 

docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

## Generate unique IDs for Documents
To accurately track relevant documents, each document is assigned a unique ID. By maintaining consistent IDs, we can manage changes and updates to the document set without affecting the evaluation process. This helps know which answer goes with which question.

In [3]:
import hashlib

def generate_document_id(doc):
    # combined = f"{doc['course']}-{doc['question']}"
    combined = f"{doc['course']}-{doc['question']}-{doc['text'][:10]}" # Concatenate Document Attributes
    hash_object = hashlib.md5(combined.encode()) # Use the MD5 hashing algorithm to create a hash from the concatenated string. 
    hash_hex = hash_object.hexdigest()
    document_id = hash_hex[:8] # Extract a Substring of the Hash
    return document_id #Assign the ID to the Document using the Hash

In [4]:
for doc in documents:
    doc['id'] = generate_document_id(doc)

In [7]:
with open('documents-with-ids.json', 'wt') as f_out:
    json.dump(documents, f_out, indent=2)

In [8]:
!head documents-with-ids.json

[
  {
    "text": "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  \u201cOffice Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon\u2019t forget to register in DataTalks.Club's Slack and join the channel.",
    "section": "General course-related questions",
    "question": "Course - When will the course start?",
    "course": "data-engineering-zoomcamp",
    "id": "c02e79ef"
  },
  {
    "text": "GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites",


## Use LLM to generate user queries based on documents
This acts as an alternative to human annotation, which is the better approach to generate ground truth data.

In [14]:
prompt_template = """
You emulate a student who's taking our course.
Formulate 5 questions this student might ask based on a FAQ record. The record
should contain the answer to the questions, and the questions should be complete and not too short.
If possible, use as fewer words as possible from the record. 

The record:

section: {section}
question: {question}
answer: {text}

Provide the output in parsable JSON without using code blocks:

["question1", "question2", ..., "question5"]
""".strip()

In [15]:
def generate_questions(doc):
    prompt = prompt_template.format(**doc)

    response = client.chat.completions.create(
        model='gpt-4o',
        messages=[{"role": "user", "content": prompt}]
    )

    json_response = response.choices[0].message.content
    return json_response

In [16]:
#for each doc, we generate queries and save the doc-question paires
for doc in tqdm(documents): 
    doc_id = doc['id']
    if doc_id in results:
        continue

    questions = generate_questions(doc)
    results[doc_id] = questions


  0%|          | 0/948 [00:00<?, ?it/s]

KeyboardInterrupt: 

To save some quota, we directly load the pre generated doc-queries pairs :)

In [20]:
!pip install cloudpickle

Collecting cloudpickle
  Downloading cloudpickle-3.0.0-py3-none-any.whl.metadata (7.0 kB)
Downloading cloudpickle-3.0.0-py3-none-any.whl (20 kB)
Installing collected packages: cloudpickle
Successfully installed cloudpickle-3.0.0


In [34]:
gen_qs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/raw/main/03-vector-search/eval/results.bin'

In [28]:
loaded_pickle_object = cp.load(urlopen(gen_qs_url))

In [32]:
#sample questions
loaded_pickle_object['1f6520ca']

'["Where can I find the prerequisites for this course?", "How do I check the prerequisites for this course?", "Where are the course prerequisites listed?", "What are the requirements for joining this course?", "Where is the list of prerequisites for the course?"]'

In [36]:
parsed_resulst = {}

for doc_id, json_questions in results.items():
    parsed_resulst[doc_id] = json.loads(json_questions)

In [37]:
parsed_resulst

{'c02e79ef': ['When exactly does the course start?',
  "How can I subscribe to the course's public Google Calendar?",
  'What do I need to do before the course starts?',
  'Where can I join the course announcements channel?',
  'Which Slack workspace and channel should I join for the course?'],
 '1f6520ca': ['What prerequisites are required for this course?',
  'Where can I find the prerequisite information?',
  'Does the course have any prerequisites?',
  'Where are the details about the prerequisites listed?',
  'Can you tell me the prerequisite link for this course?'],
 '7842b56a': ['Can I submit homework if I join after the course starts?',
  'If I miss the registration deadline, can I still submit homework?',
  'Is it possible to join the course after it has started?',
  'Are there deadlines for submitting final projects if I join late?',
  'Can I register for the course after the start date and still participate fully?'],
 '0bbf41ec': ["I registered for the Data Engineering Bootc

In [39]:
doc_index = {d['id']: d for d in documents}

final_results = []

for doc_id, questions in parsed_resulst.items():
    course = doc_index[doc_id]['course']
    for q in questions:
        final_results.append((q, course, doc_id))

In [40]:
import pandas as pd

In [None]:
df.to_csv('ground-truth-data.csv', index=False) #save ground truth to a file