In [1]:
import json
import hashlib
from collections import defaultdict

# Read the raw course documents. JSON text is UTF-8 by specification, so the
# encoding is passed explicitly instead of relying on the platform's locale default.
with open('documents.json', 'rt', encoding='utf-8') as f_in:
    docs_raw = json.load(f_in)

In [2]:
# Flatten the nested course -> documents structure into one list,
# stamping every record with the name of the course it belongs to.
documents = []

for course_dict in docs_raw:
    for doc in course_dict['documents']:
        doc['course'] = course_dict['course']
    documents.extend(course_dict['documents'])

documents[1]

{'text': 'GitHub - DataTalksClub/data-engineering-zoomcamp#prerequisites',
 'section': 'General course-related questions',
 'question': 'Course - What are the prerequisites for this course?',
 'course': 'data-engineering-zoomcamp'}

If we don’t include the answer text in the hash input, we can see that the generated ids are not unique.

In [3]:
def generate_document_id_reduced(doc):
    """Build a short id from a document's course and question only.

    The two fields are joined with a dash, hashed with MD5, and the first
    8 hexadecimal characters of the digest are returned. The answer text is
    not part of the hash input, so documents that share both course and
    question receive the same id.
    """
    key = "{}-{}".format(doc['course'], doc['question'])
    # md5 works on bytes, hence the encode(); hexdigest() yields a hex string.
    digest = hashlib.md5(key.encode()).hexdigest()
    return digest[:8]

# Assign the reduced id to every document and, in the same pass,
# bucket the documents by that id so shared ids become visible.
hashes = defaultdict(list)

for doc in documents:
    doc_id = generate_document_id_reduced(doc)
    doc['id'] = doc_id
    hashes[doc_id].append(doc)


In [4]:
# Inspect one record to confirm that the 'id' field has been added.
documents[1]

{'text': 'GitHub - DataTalksClub/data-engineering-zoomcamp#prerequisites',
 'section': 'General course-related questions',
 'question': 'Course - What are the prerequisites for this course?',
 'course': 'data-engineering-zoomcamp',
 'id': '58fa8869'}

In [5]:
# The ids are unique only if there are as many distinct ids (buckets) as documents.
# Compare against the number of documents, not the length of a single id string.
len(documents) == len(hashes)

False

In [6]:

# Report every id shared by more than one document, together with the count.
for k, values in hashes.items():
    n_docs = len(values)
    if n_docs > 1:
        print(k, n_docs)

ca3dc12d 2
960fb254 2
67d2f21c 2
297f443c 2


We have collisions because some questions appear more than once within the same course.

In [7]:
# Look at the documents that ended up sharing one of the colliding ids.
hashes["ca3dc12d"]

[{'text': 'This error comes up on the Spark video 5.3.1 - First Look at Spark/PySpark,\nbecause as at the creation of the video, 2021 data was the most recent which utilised csv files but as at now its parquet.\nSo when you run the command spark.createDataFrame(df1_pandas).show(),\nYou get the Attribute error. This is caused by the pandas version 2.0.0 which seems incompatible with Spark 3.3.2, so to fix it you have to downgrade pandas to 1.5.3 using the command pip install -U pandas==1.5.3\nAnother option is adding the following after importing pandas, if one does not want to downgrade pandas version (source) :\npd.DataFrame.iteritems = pd.DataFrame.items\nNote that this problem is solved with Spark versions from 3.4.1',
  'section': 'Module 5: pyspark',
  'question': "AttributeError: 'DataFrame' object has no attribute 'iteritems'",
  'course': 'data-engineering-zoomcamp',
  'id': 'ca3dc12d'},
 {'text': 'Another alternative is to install pandas 2.0.1 (it worked well as at the time of

### Save the indexed docs 

In [8]:
# Create content based ids 

def generate_document_id(doc):
    """Build a short content-based id for a document.

    The course, the question and the first 10 characters of the answer text
    are joined with dashes and hashed with MD5; the first 8 hexadecimal
    characters of the digest are returned.
    """
    key = "{}-{}-{}".format(doc['course'], doc['question'], doc['text'][:10])
    # md5 works on bytes, hence the encode(); hexdigest() yields a hex string.
    digest = hashlib.md5(key.encode()).hexdigest()
    return digest[:8]


In [9]:
# Replace each document's id with the one that also depends on the answer text.
for doc in documents:
    content_id = generate_document_id(doc)
    doc['id'] = content_id

In [10]:
# Persist the documents, now carrying their ids, as indented JSON.
with open('documents-with-ids.json', 'wt') as f_out:
    f_out.write(json.dumps(documents, indent=2))

Based on these documents, we will use an LLM to generate a synthetic ground-truth dataset by generating 5 questions for each document.

In [11]:
# Prompt sent to the LLM for each FAQ record. The {section}, {question} and {text}
# placeholders are filled in with str.format from a document dict.
# The model is asked to answer with a bare JSON array of 5 questions.
# NOTE(review): "use as fewer words as possible" presumably means "as few words as
# possible"; the wording is left untouched because it is the text sent to the model.
prompt_template = """
You emulate a student who's taking our course.
Formulate 5 questions this student might ask based on a FAQ record. The record
should contain the answer to the questions, and the questions should be complete and not too short.
If possible, use as fewer words as possible from the record. 

The record:

section: {section}
question: {question}
answer: {text}

Provide the output in parsable JSON without using code blocks:

["question1", "question2", ..., "question5"]
""".strip()

In [12]:
from openai import OpenAI

In [13]:
# client = OpenAI(base_url="https://api.perplexity.ai")

In [14]:
def generate_questions(doc):
    """Ask the chat model for questions about one FAQ document.

    The document's fields are substituted into ``prompt_template`` and sent as
    a single user message. The raw message content of the first choice is
    returned as-is; it is not parsed or validated here.

    Relies on a module-level ``client`` exposing ``chat.completions.create``
    being defined before the call.
    """
    user_message = {"role": "user", "content": prompt_template.format(**doc)}

    completion = client.chat.completions.create(
        model='llama-3-sonar-small-32k-online',
        messages=[user_message]
    )

    return completion.choices[0].message.content

In [15]:
from tqdm.auto import tqdm

In [16]:
# Running this loop costs about $4

# results = {}

# for doc in tqdm(documents): 
#     doc_id = doc['id']
#     if doc_id in results:
#         continue

#     questions = generate_questions(doc)
#     results[doc_id] = questions

In [17]:
# Load the stored questions from disk rather than re-running the paid generation loop
# (presumably results.bin was written by an earlier run of that loop — confirm).
# NOTE(review): pickle.load can execute arbitrary code; only load a file you created yourself.
import pickle

with open('results.bin', 'rb') as f_in:
    results = pickle.load(f_in)



In [18]:
# Spot-check one stored entry: the value is still a JSON-encoded string, not a list.
results['1f6520ca']

'["Where can I find the prerequisites for this course?", "How do I check the prerequisites for this course?", "Where are the course prerequisites listed?", "What are the requirements for joining this course?", "Where is the list of prerequisites for the course?"]'

In [19]:
# Decode every stored answer from its JSON string into a list of questions.
parsed_results = {}
unparsable_ids = []  # ids whose stored answer is not valid JSON

for doc_id, json_questions in results.items():
    try:
        parsed_results[doc_id] = json.loads(json_questions)
    except json.JSONDecodeError as err:
        # Report the failure and keep going so the notebook still runs top to
        # bottom; the affected ids are collected in `unparsable_ids` for repair.
        unparsable_ids.append(doc_id)
        print(f"{type(err).__name__}: {err}")

JSONDecodeError: Invalid \escape: line 6 column 59 (char 414)

In [20]:
# Hand-written replacement for the one entry whose stored JSON could not be parsed.
# The Windows path contains backslashes; the raw strings (r"...") keep them literal,
# so they can be escaped correctly when the list is serialized to JSON again.
json_question = [
r"How can I resolve the Docker error 'invalid mode: \Program Files\Git\var\lib\postgresql\data'?",
"What should I do if I encounter an invalid mode error in Docker on Windows?",
"What is the correct mounting path to use in Docker for PostgreSQL data on Windows?",
"Can you provide an example of a correct Docker mounting path for PostgreSQL data?",
r"How do I correct the mounting path error in Docker for \Program Files\Git\var\lib\postgresql\data'?"
]
json_question

["How can I resolve the Docker error 'invalid mode: \\Program Files\\Git\\var\\lib\\postgresql\\data'?",
 'What should I do if I encounter an invalid mode error in Docker on Windows?',
 'What is the correct mounting path to use in Docker for PostgreSQL data on Windows?',
 'Can you provide an example of a correct Docker mounting path for PostgreSQL data?',
 "How do I correct the mounting path error in Docker for \\Program Files\\Git\\var\\lib\\postgresql\\data'?"]

In [21]:
# Overwrite the broken entry with the cleaned question list, re-encoded as valid JSON.
# NOTE(review): "58c9f99f" is presumably the doc_id that failed to parse — confirm.
results["58c9f99f"]= json.dumps(json_question)

In [22]:
# Decode every stored answer from its JSON string into a list of questions.
parsed_results = {
    doc_id: json.loads(json_questions)
    for doc_id, json_questions in results.items()
}

### Save as CSV file

In [23]:
# Look-up table from document id to the document record itself.
doc_index = {document['id']: document for document in documents}

In [27]:
# Build one (question, course, document id) row per generated question.
# Ids that are not present in doc_index are skipped.
final_results = []

for doc_id, questions in parsed_results.items():
    document = doc_index.get(doc_id)
    if document is None:
        continue
    course = document['course']
    final_results.extend((q, course, doc_id) for q in questions)

In [28]:
import pandas as pd

In [29]:
# One row per generated question; 'document' holds the id of the source document.
df = pd.DataFrame(final_results, columns=['question', 'course', 'document'])

In [30]:
# Display the assembled ground-truth table.
df

Unnamed: 0,question,course,document
0,When does the course begin?,data-engineering-zoomcamp,c02e79ef
1,How can I get the course schedule?,data-engineering-zoomcamp,c02e79ef
2,What is the link for course registration?,data-engineering-zoomcamp,c02e79ef
3,How can I receive course announcements?,data-engineering-zoomcamp,c02e79ef
4,Where do I join the Slack channel?,data-engineering-zoomcamp,c02e79ef
...,...,...,...
4545,How should I destroy infrastructure created us...,mlops-zoomcamp,886d1617
4546,What is the first step to destroy AWS infrastr...,mlops-zoomcamp,886d1617
4547,Can I destroy infrastructure created with GitH...,mlops-zoomcamp,886d1617
4548,What command initializes Terraform with specif...,mlops-zoomcamp,886d1617


In [31]:
# Write the ground-truth table to CSV; index=False leaves the row index out of the file.
df.to_csv('ground-truth-data.csv', index=False)