In [4]:
import requests 

docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

print(documents)



In [7]:
# import hashlib

# def generate_document_id(doc):
#     # combined = f"{doc['course']}-{doc['question']}"
#     combined = f"{doc['course']}-{doc['question']}-{doc['text'][:10]}"
#     hash_object = hashlib.md5(combined.encode())
#     hash_hex = hash_object.hexdigest()
#     document_id = hash_hex[:8]
#     return document_id

# for doc in documents:
#     doc['id'] = generate_document_id(doc)


from rag.data.loader import DocumentLoader

loader = DocumentLoader()

for doc in documents:
    # Generate and add unique ID
    doc_id = loader.generate_id(doc)
    doc["doc_id"] = doc_id

print(documents)



In [8]:
documents[3]

{'text': "You don't need it. You're accepted. You can also just start learning and submitting homework without registering. It is not checked against any registered list. Registration is just to gauge interest before the start date.",
 'section': 'General course-related questions',
 'question': 'Course - I have registered for the Data Engineering Bootcamp. When can I expect to receive the confirmation email?',
 'course': 'data-engineering-zoomcamp',
 'doc_id': '386dcf67c83b203e5b424f2ba7489370'}

In [9]:
from collections import defaultdict

In [10]:
hashes = defaultdict(list)

for doc in documents:
    doc_id = doc['doc_id']
    hashes[doc_id].append(doc)

In [11]:
len(hashes), len(documents)

(948, 948)

In [12]:
for k, values in hashes.items():
    if len(values) > 1:
        print(k, len(values))

In [7]:
import json

In [16]:
with open('documents-with-ids.json', 'wt') as f_out:
    json.dump(documents, f_out, indent=2)

In [17]:
!head documents-with-ids.json

[
  {
    "text": "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  \u201cOffice Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon\u2019t forget to register in DataTalks.Club's Slack and join the channel.",
    "section": "General course-related questions",
    "question": "Course - When will the course start?",
    "course": "data-engineering-zoomcamp",
    "doc_id": "bae7a31e6abaddb52b4061dcf238fc61"
  },
  {
    "text": "GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites",


In [18]:
prompt_template = """
You emulate a student who's taking our course.
Formulate 5 questions this student might ask based on a FAQ record. The record
should contain the answer to the questions, and the questions should be complete and not too short.
If possible, use as fewer words as possible from the record. 

The record:

section: {section}
question: {question}
answer: {text}

Provide the output in parsable JSON without using code blocks:

["question1", "question2", ..., "question5"]
""".strip()

In [26]:
from openai import OpenAI
from rag.llm.openai_client import OpenAIClient

custom_client = OpenAIClient(load_env=True)
key = custom_client.api_key

client = OpenAI()
client = OpenAI(api_key=key)


In [27]:
def generate_questions(doc):
    prompt = prompt_template.format(**doc)

    response = client.chat.completions.create(
        model='gpt-4o',
        messages=[{"role": "user", "content": prompt}]
    )

    json_response = response.choices[0].message.content
    return json_response

In [28]:
from tqdm.auto import tqdm

In [23]:
results = {}

In [29]:
for doc in tqdm(documents): 
    doc_id = doc['doc_id']
    if doc_id in results:
        continue

    questions = generate_questions(doc)
    results[doc_id] = questions

  0%|          | 0/948 [00:00<?, ?it/s]

In [1]:
import pickle

In [31]:
# Write to binary file
with open('results.pkl', 'wb') as f_out:
    pickle.dump(results, f_out)

In [2]:
with open('results.pkl', 'rb') as f_in:
    results = pickle.load(f_in)

In [10]:
results['09505439c01f8a62ddf2d893d6ffc433']

'[\n  "What alteration is necessary if Docker throws an invalid mode error related to the path?",\n  "How can I resolve a Docker daemon error involving \\Program Files\\Git\\var\\lib\\postgresql\\data?",\n  "What\'s an appropriate mounting path to use instead of \\Program Files\\Git\\var\\lib\\postgresql\\data in Docker?",\n  "Which mounting path adjustments can fix a Docker invalid mode error regarding PostgreSQL data?",\n  "What changes should I make to the mounting path when encountering an invalid mode error with Docker?"\n]'

In [11]:
print(results)



In [None]:
parsed_resulst = {}

for doc_id, json_questions in results.items():
    parsed_resulst[doc_id] = json.loads(json_questions)

bae7a31e6abaddb52b4061dcf238fc61
3e5d4959603c68a1e154fa2a6bd9d1e8
60a31bbef930b3d6b127405fcd0b618e
386dcf67c83b203e5b424f2ba7489370
6e3550ba00f652ce2fa74706751c4983
f8323339d264dc9d40d9dad5a34c06b5
d10eed624489c36f17500750ff21c868
cb86516adcdcafa29f0758ae6ca28a0b
fa5c1523945f27f6bb5d9a04f2146a7a
a5737de4f33219a4fcfe02f2c746d5a3
c3b5714cc4d5a6db4fd5912404f30c31
c47f302e889f61f7fd3cee025ef939b6
6a439fc15426567b38e84acc6142c461
093b06705cc2cc1141f667e241e14a06
1e1c6528694d3abbfc7488bb3abea5e2
3a90237d0692b97f3f9b322a855cc4fd
9a0107663b3bab95ee3787a55673cde6
b666ae0b62d6a6dfc38396ba510cdc6b
d418a703e60b8555c41dc80863f589dc
2a0887600c276fd6f9d5a5ead914df0b
821ffc7eef9760e690cf6f6f3d01f191
c4138ffefdd3d00b85496250ace2c836
13cc1cfada6c696dab0c83e781774b93
3cc081b0deeb66dac1f6e6f80b859bc3
5a76ede2a7f68e863f6da7276d0ce106
905a9a5de263c002c6b8b311a20f4f03
6cb80ff3a8a0c09a2273b073933b7311
e5a35da13d8561565696b031a6a2f7be
6777d05ae24b183acd0ce819e589662c
cd540c158b2253368bde0deecc09cfea
66ed5eeaca

JSONDecodeError: Invalid \escape: line 3 column 54 (char 149)

In [None]:
'[\n  "What alteration is necessary if Docker throws an invalid mode error related to the path?",\n  "How can I resolve a Docker daemon error involving \\Program Files\\Git\\var\\lib\\postgresql\\data?",\n  "What\'s an appropriate mounting path to use instead of \\Program Files\\Git\\var\\lib\\postgresql\\data in Docker?",\n  "Which mounting path adjustments can fix a Docker invalid mode error regarding PostgreSQL data?",\n  "What changes should I make to the mounting path when encountering an invalid mode error with Docker?"\n]'

In [None]:
json_questions = [
    "What alteration is necessary if Docker throws an invalid mode error related to the path?",
    r"How can I resolve a Docker daemon error involving \Program Files\Git\var\lib\postgresql\data?",
    r"What's an appropriate mounting path to use instead of \Program Files\Git\var\lib\postgresql\data in Docker?",
    "Which mounting path adjustments can fix a Docker invalid mode error regarding PostgreSQL data?",
    "What changes should I make to the mounting path when encountering an invalid mode error with Docker?"
]

parsed_resulst = {}

[
  "What alteration is necessary if Docker throws an invalid mode error related to the path?",
  "How can I resolve a Docker daemon error involving \Program Files\Git\var\lib\postgresql\data?",
  "What's an appropriate mounting path to use instead of \Program Files\Git\var\lib\postgresql\data in Docker?",
  "Which mounting path adjustments can fix a Docker invalid mode error regarding PostgreSQL data?",
  "What changes should I make to the mounting path when encountering an invalid mode error with Docker?"
]


In [67]:
doc_index = {d['id']: d for d in documents}

In [69]:
final_results = []

for doc_id, questions in parsed_resulst.items():
    course = doc_index[doc_id]['course']
    for q in questions:
        final_results.append((q, course, doc_id))

In [71]:
import pandas as pd

In [75]:
df = pd.DataFrame(final_results, columns=['question', 'course', 'document'])

In [77]:
df.to_csv('ground-truth-data.csv', index=False)

In [78]:
!head ground-truth-data.csv

question,course,document
When does the course begin?,data-engineering-zoomcamp,c02e79ef
How can I get the course schedule?,data-engineering-zoomcamp,c02e79ef
What is the link for course registration?,data-engineering-zoomcamp,c02e79ef
How can I receive course announcements?,data-engineering-zoomcamp,c02e79ef
Where do I join the Slack channel?,data-engineering-zoomcamp,c02e79ef
Where can I find the prerequisites for this course?,data-engineering-zoomcamp,1f6520ca
How do I check the prerequisites for this course?,data-engineering-zoomcamp,1f6520ca
Where are the course prerequisites listed?,data-engineering-zoomcamp,1f6520ca
What are the requirements for joining this course?,data-engineering-zoomcamp,1f6520ca
