# Generating ground truth for evaluation    

For each record in the FAQ, generate 5 questions with an LLM.

In [1]:
import json

In [2]:
# Load the raw FAQ dump. The encoding is pinned to UTF-8 (the standard JSON
# encoding) instead of relying on the platform default, which on some systems
# cannot decode the non-ASCII punctuation (e.g. curly quotes) in the answers.
with open('documents.json', 'rt', encoding='utf-8') as f:
    docs_raw = json.load(f)

In [3]:
# Flatten the per-course FAQ lists into a single list. Each record is tagged
# in place with the name of the course it belongs to.
documents = []
for course in docs_raw:
    course_name = course['course']
    for doc in course['documents']:
        doc['course'] = course_name
    documents.extend(course['documents'])

Adding an ID to each document

In [6]:
import hashlib

def generate_document_id(doc):
    """Return a short, deterministic ID for an FAQ record.

    The ID is the first 8 hex characters of the MD5 digest of the string
    "<course>-<question>-<first 10 characters of text>", so it depends only
    on those three fields of ``doc``.
    """
    key = "{}-{}-{}".format(doc['course'], doc['question'], doc['text'][:10])
    digest = hashlib.md5(key.encode()).hexdigest()
    return digest[:8]

In [7]:
# Stamp every record with its content-derived ID under the 'id' key.
for doc in documents:
    doc['id'] = generate_document_id(doc)

In [11]:
from collections import defaultdict

# Group records by their generated ID; a bucket holding more than one record
# means several records share the same ID.
hashes = defaultdict(list)

for doc in documents:
    bucket = hashes[doc['id']]
    bucket.append(doc)

In [13]:
# Persist the ID-tagged records so later steps can reuse the same IDs.
with open('documents_with_id.json', 'wt') as out_file:
    json.dump(documents, out_file)

# Using an LLM to generate questions

In [14]:
# Prompt used to generate evaluation questions from a single FAQ record.
# Placeholders filled via str.format: {section}, {question}, {text}.
# The model is asked to reply with a bare JSON array of 5 question strings.
prompt_template = """
You are a student taking our course. Ask 5 questions a student might ask based on a FAQ record. The record should contain the answer to the questions,
and the questions should be complete, keep it short if possible. If possible, use as few words as possible from the record.

The record:

section: {section}
question: {question}
answer: {text}

Provide the output in parsable JSON without using code blocks:

["question1", "question2", ..., "question5"]
""".strip()

In [15]:
import ollama

In [16]:
# Pull the 'phi3:mini' model through the ollama client; this is the model the
# chat helper in this notebook requests. The recorded run reported
# {'status': 'success'}.
ollama.pull('phi3:mini')

{'status': 'success'}

In [17]:
def phi3(query):
    """Send ``query`` as a single user message to the 'phi3:mini' model.

    Returns the value found at ``['message']['content']`` in the reply from
    ``ollama.chat``.
    """
    user_message = {'role': 'user', 'content': query}
    reply = ollama.chat(model='phi3:mini', messages=[user_message])
    return reply['message']['content']

In [22]:
# Build a prompt from the first FAQ record to try the template on one example.
# Keys of the record that the template does not reference are simply ignored.
doc = documents[0]
prompt = prompt_template.format_map(doc)

In [23]:
# Try the prompt on one record. Note that the recorded reply is wrapped in a
# ```json fence even though the prompt asks for no code blocks, so the raw
# string is not directly parsable as JSON.
phi3(prompt)

'```json\n[\n  "Can you provide more details about how to access course materials? Specifically regarding accessing during office hours and any other times.",\n  "I\'m interested in the detailed syllabus for this course. Can it be provided or linked somewhere?",\n   "What are some of the prerequisites needed before enrolling into your Data Science course, if there are any mentioned? If not stated explicitly on how to find them.",\n  "Could you elaborate more about what will happen during these \'Office Hours\'\' live and maybe suggest when exactly they occur in relation with class hours or other activities scheduled throughout the day?",\n   "What would be some tips for navigating through this course as a complete beginner, especially regarding handling office hours interaction?"\n]\n```'

In [24]:
# Show the source record so the generated questions can be compared with it.
doc

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp',
 'id': 'c02e79ef'}