# Evaluation metrics for retrieval
* https://youtu.be/APMrUnC_dy0?list=PL3MmuxUbc_hIB4fSqLy_0AfTjVLpgjV3R
* https://youtu.be/bpxi6fKcyLw?list=PL3MmuxUbc_hIB4fSqLy_0AfTjVLpgjV3R
* Evaluation metrics: https://github.com/DataTalksClub/llm-zoomcamp/blob/main/03-vector-search/eval/evaluation-metrics.md

In [32]:
# Load Python libraries
import requests
import hashlib
import json
import os
from openai import OpenAI
import pickle

In [3]:
docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'
docs_response = requests.get(docs_url)
docs_raw = docs_response.json()

In [4]:
documents = []

for course in docs_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [6]:
documents[1]

{'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites',
 'section': 'General course-related questions',
 'question': 'Course - What are the prerequisites for this course?',
 'course': 'data-engineering-zoomcamp'}

In [8]:
# Create ID's in the documents
# https://youtu.be/bpxi6fKcyLw?list=PL3MmuxUbc_hIB4fSqLy_0AfTjVLpgjV3R&t=422
def generate_document_id(doc):
    combined = f"{doc['course']}--{doc['question']}--{doc['text'][:10]}"
    hash_object = hashlib.md5(combined.encode())
    hash_hex = hash_object.hexdigest()
    doc_id = hash_hex[:8]
    return doc_id

In [9]:
generate_document_id(documents[1])

'325dc7a4'

In [10]:
# Generate ID's for all records in documents
for doc in documents:
    doc['id'] = generate_document_id(doc)

In [11]:
documents[1:3]

[{'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites',
  'section': 'General course-related questions',
  'question': 'Course - What are the prerequisites for this course?',
  'course': 'data-engineering-zoomcamp',
  'id': '325dc7a4'},
 {'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp',
  'id': '0ba05b5a'}]

In [14]:
# Save document with ID's as new JSON file
with open('docs_with_ids.json', 'wt') as fout:
    json.dump(documents, fout, indent=2)

In [15]:
!ls -la

total 1384
drwxrwxrwx+ 3 codespace codespace   4096 Jul 15 19:10 .
drwxrwxrwx+ 7 codespace root        4096 Jul 15 13:10 ..
drwxrwxrwx+ 2 codespace codespace   4096 Jul 15 18:16 .ipynb_checkpoints
-rw-rw-rw-  1 codespace codespace 693170 Jul 15 18:33 ElasticSearch_example.ipynb
-rw-rw-rw-  1 codespace codespace   5104 Jul 15 19:08 Retrieval_Eval_Metrics.ipynb
-rw-rw-rw-  1 codespace codespace 699257 Jul 15 19:10 docs_with_ids.json


In [16]:
!head docs_with_ids.json

[
  {
    "text": "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  \u201cOffice Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon\u2019t forget to register in DataTalks.Club's Slack and join the channel.",
    "section": "General course-related questions",
    "question": "Course - When will the course start?",
    "course": "data-engineering-zoomcamp",
    "id": "bf582b8c"
  },
  {
    "text": "GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites",


In [17]:
prompt_template = """
You emulate a student who's taking our course.
Formulate 5 questions this student might ask based on a FAQ record. The record
should contain the answer to the questions, and the questions should be complete and not too short.
If possible, use as fewer words as possible from the record. 

The record:

section: {section}
question: {question}
answer: {text}

Provide the output in parsable JSON without using code blocks:

["question1", "question2", ..., "question5"]
""".strip()

In [None]:
# https://youtu.be/bpxi6fKcyLw?list=PL3MmuxUbc_hIB4fSqLy_0AfTjVLpgjV3R&t=964

# Set OPENAI_API_KEY
os.environ['OPENAI_API_KEY'] = 'API_KEY'

# ChatGPT client
client = OpenAI()

In [27]:
# Create a prompt
doc = documents[2]
doc

{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
 'section': 'General course-related questions',
 'question': 'Course - Can I still join the course after the start date?',
 'course': 'data-engineering-zoomcamp',
 'id': '0ba05b5a'}

In [28]:
prompt = prompt_template.format(**doc)
print(prompt)

You emulate a student who's taking our course.
Formulate 5 questions this student might ask based on a FAQ record. The record
should contain the answer to the questions, and the questions should be complete and not too short.
If possible, use as fewer words as possible from the record. 

The record:

section: General course-related questions
question: Course - Can I still join the course after the start date?
answer: Yes, even if you don't register, you're still eligible to submit the homeworks.
Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.

Provide the output in parsable JSON without using code blocks:

["question1", "question2", ..., "question5"]


In [29]:
def generate_questions(doc):
    prompt = prompt_template.format(**doc)

    response = client.chat.completions.create(
        model='gpt-4o',
        messages=[{"role": "user", "content": prompt}]
    )

    json_response = response.choices[0].message.content
    return json_response

In [None]:
# https://youtu.be/bpxi6fKcyLw?list=PL3MmuxUbc_hIB4fSqLy_0AfTjVLpgjV3R&t=1129
# This will cost ~ 4 USD
results = {}
for doc in documents: 
    doc_id = doc['id']
    if doc_id in results:
        continue

    questions = generate_questions(doc)
    results[doc_id] = questions

In [30]:
!wget https://github.com/DataTalksClub/llm-zoomcamp/blob/main/03-vector-search/eval/results.bin

--2024-07-15 19:46:56--  https://github.com/DataTalksClub/llm-zoomcamp/blob/main/03-vector-search/eval/results.bin
Resolving github.com (github.com)... 140.82.121.4
Connecting to github.com (github.com)|140.82.121.4|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/html]
Saving to: ‘results.bin’

results.bin             [ <=>                ] 284.73K  --.-KB/s    in 0.03s   

2024-07-15 19:46:56 (8.75 MB/s) - ‘results.bin’ saved [291566]



In [49]:
# Already processed file
with open('results.bin', 'rb') as f_in:
    results = pickle.load(f_in)

UnpicklingError: invalid load key, '\x0a'.

In [47]:
os.listdir(os.getcwd())
os.getcwd()

'/workspaces/LLM-zoomcamp/03-vector-search'

In [42]:
!ls -la

total 1680
drwxrwxrwx+ 3 codespace codespace   4096 Jul 15 19:57 .
drwxrwxrwx+ 7 codespace root        4096 Jul 15 13:10 ..
drwxrwxrwx+ 2 codespace codespace   4096 Jul 15 18:16 .ipynb_checkpoints
-rw-rw-rw-  1 codespace codespace 693170 Jul 15 18:33 ElasticSearch_example.ipynb
-rw-rw-rw-  1 codespace codespace  15141 Jul 15 19:57 Retrieval_Eval_Metrics.ipynb
-rw-rw-rw-  1 codespace codespace 699257 Jul 15 19:10 docs_with_ids.json
-rw-rw-rw-  1 codespace codespace 291566 Jul 15 19:46 results.bin
