In [1]:
import groq

In [2]:
from elasticsearch import Elasticsearch

# Address of the Elasticsearch node this notebook talks to.
ES_URL = "http://localhost:9200"

# Create the client and display the node info as a quick connectivity check.
es = Elasticsearch(ES_URL)
es.info()

ObjectApiResponse({'name': '9b1256e32511', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'X40R5Gw2RQag9GkQLmpAdg', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [8]:
!wget https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json

--2024-11-26 09:04:44--  https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json
Resolving github.com (github.com)... 140.82.121.3
Connecting to github.com (github.com)|140.82.121.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/alexeygrigorev/llm-rag-workshop/main/notebooks/documents.json [following]
--2024-11-26 09:04:44--  https://raw.githubusercontent.com/alexeygrigorev/llm-rag-workshop/main/notebooks/documents.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 658332 (643K) [text/plain]
Saving to: ‘documents.json’


2024-11-26 09:04:44 (67.0 MB/s) - ‘documents.json’ saved [658332/658332]



In [6]:
!head documents.json

[
  {
    "course": "data-engineering-zoomcamp",
    "documents": [
      {
        "text": "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  \u201cOffice Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon\u2019t forget to register in DataTalks.Club's Slack and join the channel.",
        "section": "General course-related questions",
        "question": "Course - When will the course start?"
      },
      {


In [3]:
import json

# Parse the downloaded FAQ dump; the file handle is closed when the `with` block exits.
with open('./documents.json', 'rt') as json_file:
    documents_file = json.loads(json_file.read())

In [4]:
# Flatten the per-course structure into one list of FAQ records.
# Each record is tagged in place with the name of the course it belongs to.
documents = []

for course_entry in documents_file:
    for faq in course_entry['documents']:
        faq['course'] = course_entry['course']
    documents.extend(course_entry['documents'])

In [15]:
import hashlib

def generate_document_id(doc):
    """Derive a short identifier for an FAQ record from its content.

    The course name, the question and the first 10 characters of the answer
    text are joined with "-" and hashed with MD5. The first 8 hex characters
    of the digest are returned.

    Args:
        doc: mapping with the keys 'course', 'question' and 'text'.

    Returns:
        An 8-character lowercase hexadecimal string.
    """
    # The text prefix is part of the fingerprint so that records sharing a
    # course and question but differing in their answer get different ids.
    fingerprint = "{}-{}-{}".format(doc['course'], doc['question'], doc['text'][:10])
    return hashlib.md5(fingerprint.encode()).hexdigest()[:8]

In [16]:
# Attach the content-derived identifier to every FAQ record (in place).
for doc in documents:
    doc['doc_id'] = generate_document_id(doc)

In [17]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp',
 'doc_id': 'c02e79ef'}

In [21]:
# Single-shard, no-replica index for the FAQ records. The free-text fields are
# analysed as "text"; course and doc_id are "keyword" fields so they can be
# matched exactly (the retrieval filter uses a term query on "course").
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
            "doc_id": {"type": "keyword"}
        }
    }
}

index_name = "course-questions"

# Start from an empty index on every run. ignore_unavailable=True turns the
# delete into a no-op when the index does not exist yet, so this cell also
# works on a fresh cluster instead of raising NotFoundError.
es.indices.delete(index=index_name, ignore_unavailable=True)
response = es.indices.create(index=index_name, body=index_settings)

response

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [19]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp',
 'doc_id': 'c02e79ef'}

In [22]:
from tqdm.auto import tqdm

# Send every FAQ record to the index as its own document, with a progress bar.
for faq_doc in tqdm(documents):
    es.index(index=index_name, document=faq_doc)

  0%|          | 0/948 [00:00<?, ?it/s]

In [23]:
documents[2]

{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
 'section': 'General course-related questions',
 'question': 'Course - Can I still join the course after the start date?',
 'course': 'data-engineering-zoomcamp',
 'doc_id': '7842b56a'}

In [60]:
def retrieve_documents(query,
           index_name="course-questions",
           max_results=5,
           course="data-engineering-zoomcamp"):
    """Search the FAQ index and return the stored records of the top hits.

    Args:
        query: free-text user question.
        index_name: name of the Elasticsearch index to search.
        max_results: maximum number of hits to return.
        course: only records whose "course" field equals this value are considered.

    Returns:
        A list with the `_source` dict of each hit, in the order Elasticsearch
        returned them.
    """
    # Full-text part: match against question and text, with text boosted 3x.
    # (An alternative weighting tried earlier: ["question^3", "text", "section"].)
    text_match = {
        "multi_match": {
            "query": query,
            "fields": ["question", "text^3"],
            "type": "best_fields"
        }
    }
    # Exact-match restriction to a single course.
    course_filter = {"term": {"course": course}}

    search_query = {
        "size": max_results,
        "query": {"bool": {"must": text_match, "filter": course_filter}}
    }

    hits = es.search(index=index_name, body=search_query)['hits']['hits']
    return [hit['_source'] for hit in hits]

In [None]:
# Sample question; later cells reuse `user_question` when building the prompt.
user_question = "How do I join the course after it has started?"

# Show what the retriever returns for it with the default course and result count.
retrieve_documents(query=user_question)

In [45]:
import os

# Do not keep the key in the notebook: read it from the environment instead.
# The value is None when GROQ_API_KEY is not set.
api_key = os.environ.get("GROQ_API_KEY")

In [68]:
from groq import Groq
#client = Groq(api_key=api_key)
client = Groq()

In [69]:
# Baseline: ask the model the question directly, without any FAQ context.
baseline_messages = [
    {"role": "user", "content": "The course already started. Can I still join?"}
]
response = client.chat.completions.create(
    messages=baseline_messages,
    model="llama-3.1-70b-versatile",
)
print(response.choices[0].message.content)

I'd be happy to help you with your inquiry.  However, I don't have information about a specific course. Can you please provide more details about the course, such as:

* Where is the course being offered (e.g. online, in-person, university)?
* Who is offering the course (e.g. instructor, organization)?
* What type of course is it (e.g. academic, professional, hobby)?

This will help me better understand your situation and provide a more informed answer.

That being said, in general, it's possible to join a course after it has started, but it depends on various factors, such as:

* Course format: Online courses might be more flexible, while in-person courses might be more difficult to join mid-semester.
* Instructor's discretion: The instructor may allow late registration, but may also have specific requirements or conditions for joining late.
* Availability of materials: You might need to catch up on missed materials, assignments, and discussions.

Please provide more information, and 

In [30]:
# How a single retrieved FAQ record is rendered inside the prompt context.
context_template = """
Section: {section}
Question: {question}
Answer: {text}
""".strip()

context_docs = retrieve_documents(user_question)

# Every rendered record is preceded by a blank line; the surrounding whitespace
# is trimmed afterwards to obtain the final context.
context_result = "".join(
    "\n\n" + context_template.format(**doc) for doc in context_docs
)

context = context_result.strip()
print(context)

Section: General course-related questions
Question: Course - Can I still join the course after the start date?
Answer: Yes, even if you don't register, you're still eligible to submit the homeworks.
Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.

Section: General course-related questions
Question: Course - Can I follow the course after it finishes?
Answer: Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.
You can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.

Section: General course-related questions
Question: Course - What can I do before the course starts?
Answer: You can start by installing and setting up all the dependencies and requirements:
Google cloud account
Google Cloud SDK
Python 3 (installed with Anaconda)
Terrafo

In [38]:
# RAG prompt for the sample question: tells the model to answer only from the
# retrieved FAQ context and to return "NONE" otherwise. This is an f-string, so
# the current values of `user_question` and `context` are substituted when the
# cell runs; re-run the cell after changing either of them.
prompt = f"""
You're a course teaching assistant. Answer the user QUESTION based on CONTEXT - the documents retrieved from our FAQ database. 
Only use the facts from the CONTEXT. If the CONTEXT doesn't contan the answer, return "NONE"

QUESTION: {user_question}

CONTEXT:

{context}
""".strip()

In [39]:
print(prompt)

You're a course teaching assistant. Answer the user QUESTION based on CONTEXT - the documents retrieved from our FAQ database. 
Only use the facts from the CONTEXT. If the CONTEXT doesn't contan the answer, return "NONE"

QUESTION: How do I join the course after it has started?

CONTEXT:

Section: General course-related questions
Question: Course - Can I still join the course after the start date?
Answer: Yes, even if you don't register, you're still eligible to submit the homeworks.
Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.

Section: General course-related questions
Question: Course - Can I follow the course after it finishes?
Answer: Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.
You can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final 

In [40]:
# Same model as the baseline call, but now with the context-augmented prompt.
rag_messages = [{"role": "user", "content": prompt}]
response = client.chat.completions.create(
    messages=rag_messages,
    model="llama-3.1-70b-versatile",
)
print(response.choices[0].message.content)

You can still join the course after it has started as you're still eligible to submit the homeworks, but be aware of the deadlines for turning in the final projects.


In [66]:
# How a single retrieved FAQ record is rendered inside the prompt context.
context_template = (
    "Section: {section}\n"
    "Question: {question}\n"
    "Answer: {text}"
)

# Full RAG prompt. The two spaces after "CONTEXT." are part of the template text.
prompt_template = (
    "You're a course teaching assistant.\n"
    "Answer the user QUESTION based on CONTEXT - the documents retrieved from our FAQ database.\n"
    "Don't use other information outside of the provided CONTEXT.  \n"
    "\n"
    "QUESTION: {user_question}\n"
    "\n"
    "CONTEXT:\n"
    "\n"
    "{context}"
)


def build_context(documents):
    """Render the retrieved records into one context string.

    Args:
        documents: iterable of dicts providing at least the keys 'section',
            'question' and 'text' (extra keys are ignored).

    Returns:
        The rendered records separated by a blank line, with leading and
        trailing whitespace removed. Empty string when there are no records.
    """
    rendered = (context_template.format(**doc) for doc in documents)
    return "\n\n".join(rendered).strip()


def build_prompt(user_question, documents):
    """Fill the prompt template with the question and the rendered context.

    Args:
        user_question: the question as asked by the user.
        documents: retrieved records, passed on to `build_context`.

    Returns:
        The complete prompt text to send to the model.
    """
    return prompt_template.format(
        context=build_context(documents),
        user_question=user_question,
    )

def ask_groq(prompt, model="llama-3.1-70b-versatile"):
    """Send `prompt` as a single user message and return the model's reply.

    Args:
        prompt: full text of the user message.
        model: identifier of the chat model to use.

    Returns:
        The message content of the first choice in the completion.
    """
    completion = client.chat.completions.create(
        messages=[{"role": "user", "content": prompt}],
        model=model,
    )
    return completion.choices[0].message.content

# use ask_groq and model="llama3-8b-8192" if using groq

def qa_bot(user_question, course="data-engineering-zoomcamp"):
    """Answer a question with retrieval-augmented generation.

    Retrieves the FAQ records for `user_question` within `course`, builds the
    prompt from them and returns the model's answer.
    """
    retrieved = retrieve_documents(user_question, course=course)
    return ask_groq(build_prompt(user_question, retrieved))

In [44]:
qa_bot("How can I get a certificate if I do the course in the self-paced mode?")

'Unfortunately, no, you cannot get a certificate if you take the course in the self-paced mode. According to our course policy, a certificate is only awarded to students who finish the course with a "live" cohort. This is because the course requires peer-reviewing capstones after submitting a project, which can only be done when the course is running.'

In [25]:
import pandas as pd

In [26]:
df_eval = pd.read_csv('ground-truth-data.csv')

In [28]:
df_eval.course.value_counts()

course
data-engineering-zoomcamp    2123
machine-learning-zoomcamp    1830
mlops-zoomcamp                674
Name: count, dtype: int64

In [29]:
df_mlops = df_eval[df_eval.course == 'mlops-zoomcamp']

In [31]:
queries = df_mlops.to_dict(orient='records')

In [33]:
q = queries[0]

In [34]:
q

{'question': 'Where can I find the structure for questions in this course?',
 'course': 'mlops-zoomcamp',
 'document': '0560e827'}

In [61]:
# One row per evaluation query: a list of booleans, in rank order, telling
# whether each retrieved record is the ground-truth document of that query.
results = []

for q in tqdm(queries):
    retrieved = retrieve_documents(query=q['question'], course=q['course'])
    results.append([hit['doc_id'] == q['document'] for hit in retrieved])

  0%|          | 0/674 [00:00<?, ?it/s]

In [55]:
results[:10]

[[False, False, False, True, False],
 [False, True, False, False, False],
 [True, False, False, False, False],
 [False, False, False, False, False],
 [False, False, True, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [False, True, False, False, False],
 [False, True, False, False, False],
 [False, False, False, False, True]]

Hitrate
    
    [[False, False, False, True, False], 1
     [False, True, False, False, False], 1
     [True, False, False, False, False], 1
     [False, False, False, False, False],  0
     [False, False, True, False, False], 1
     [True, False, False, False, False], 1
     [True, False, False, False, False], 1
     [False, True, False, False, False], 1
     [False, True, False, False, False], 1
     [False, False, False, False, True]] 1

     hitrate = 90%


MRR 
      1      2      3      4     5    
    [[False, False, False, True, False], 0.25
     [False, True, False, False, False], 0.5
     [True, False, False, False, False], 1
     [False, False, False, False, False],  0
     [False, False, True, False, False], 0.33
     [True, False, False, False, False], 1
     [True, False, False, False, False], 1
     [False, True, False, False, False], 0.5
     [False, True, False, False, False], 0.5
     [False, False, False, False, True]] 0.2

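Reciprocal rank of a query = 1 / (rank + 1), where rank is the 0-based position of the first relevant document (the score is 0 when no relevant document is retrieved). MRR is the mean of these values over all queries; for the ten rows above it is 5.28 / 10 ≈ 0.53.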

In [42]:
def hit_rate(relevance_total):
    """Fraction of queries that have at least one relevant document retrieved.

    Args:
        relevance_total: sequence with one entry per query; each entry is a
            sequence of booleans, one per retrieved document.

    Returns:
        A float between 0 and 1. Returns 0.0 when there are no queries,
        instead of failing with a division by zero.
    """
    if len(relevance_total) == 0:
        return 0.0

    hits = sum(1 for line in relevance_total if any(line))
    return hits / len(relevance_total)

def mrr(relevance_total):
    """Mean reciprocal rank of the first relevant document per query.

    Args:
        relevance_total: sequence with one entry per query; each entry is a
            sequence of booleans, one per retrieved document, in rank order.

    Returns:
        The mean over all queries of 1 / (rank + 1), where rank is the 0-based
        position of the first relevant document; a query with no relevant
        document contributes 0. Returns 0.0 when there are no queries.
    """
    if len(relevance_total) == 0:
        return 0.0

    total_score = 0.0

    for line in relevance_total:
        for rank, is_relevant in enumerate(line):
            if is_relevant:
                total_score += 1 / (rank + 1)
                # Only the first relevant hit counts. Without this, a query whose
                # results contain the target id more than once (e.g. duplicated
                # records) would score above 1.
                break

    return total_score / len(relevance_total)

In [62]:
hit_rate(results), mrr(results)

(0.9109792284866469, 0.8394411473788325)

In [63]:
hit_rate(results), mrr(results)

(0.9109792284866469, 0.8394411473788325)

LLM as a Judge

In [80]:
# Prompt for the LLM judge. It is filled with str.format using the placeholders
# {answer_orig}, {question} and {answer_llm}. The doubled braces {{ }} around the
# JSON example are escapes that render as literal braces after formatting.
prompt1_template = """
You are an expert evaluator for a Retrieval-Augmented Generation (RAG) system.
Your task is to analyze the relevance of the generated answer compared to the original answer provided.
Based on the relevance and similarity of the generated answer to the original answer, you will classify
it as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Original Answer: {answer_orig}
Generated Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to question form the user
also taking into the account the original
answer and provide your evaluation in parsable JSON without using code blocks:

{{
  "Explanation": "[Provide a brief explanation for your evaluation]",
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT"
}}
""".strip()

In [74]:
doc_index = {d['doc_id']: d for d in documents}

In [65]:
q

{'question': 'How do you specify the variable file when running terraform destroy?',
 'course': 'mlops-zoomcamp',
 'document': '886d1617'}

In [71]:
generated_answer = qa_bot(user_question=q['question'], course=q['course'])

In [72]:
print(generated_answer)

To specify the variable file when running terraform destroy, you would use the following command:

```bash
terraform destroy --var-file vars/prod.tfvars
```


In [121]:
def llm_as_a_judge(q):
    """Generate an answer for an evaluation query and have the LLM grade it.

    Args:
        q: evaluation record with the keys 'question', 'course' and 'document'
            (the id of the ground-truth FAQ record).

    Returns:
        A tuple (generated_answer, relevance). `relevance` is the raw text
        returned by the judge model; it is not parsed here.
    """
    generated_answer = qa_bot(user_question=q['question'], course=q['course'])

    # The ground-truth answer is the text of the record the question was generated from.
    reference_text = doc_index[q['document']]['text']

    judge_prompt = prompt1_template.format(
        answer_llm=generated_answer,
        question=q['question'],
        answer_orig=reference_text,
    )
    return generated_answer, ask_groq(judge_prompt)

In [122]:
q

{'question': 'What is the purpose of this FAQ document?',
 'course': 'mlops-zoomcamp',
 'document': '0560e827',
 'judgement': {'Explanation': 'The generated answer directly quotes the original answer in response to a user question that inquires about the purpose of the FAQ document. The generated answer accurately captures the essence of the original answer, providing a precise and clear response to the question.',
  'Relevance': 'RELEVANT'}}

In [123]:
# Judge the first few evaluation queries and collect one flat record per query.
relevance_judgements = []

for q in tqdm(queries[:5]):
    try:
        answer, judgement = llm_as_a_judge(q)
        # The judge is asked for JSON, but the reply is free text and may not parse.
        parsed = json.loads(judgement)

        result = q.copy()
        result['answer'] = answer
        result['explanation'] = parsed['Explanation']
        result['relevance'] = parsed['Relevance']
        result['original_doc'] = doc_index[q['document']]['text']
        relevance_judgements.append(result)
    except Exception as err:
        # Best effort: skip this query and keep going, but report the cause.
        # Catching Exception (not a bare except) lets KeyboardInterrupt stop the loop.
        print(f"problems with parsing answer for {q}: {err!r}")

  0%|          | 0/5 [00:00<?, ?it/s]

In [124]:
df_rel = pd.DataFrame(relevance_judgements)

In [125]:
df_rel.relevance.value_counts()

relevance
RELEVANT           4
PARTLY_RELEVANT    1
Name: count, dtype: int64

In [126]:
df_rel[df_rel.relevance == 'PARTLY_RELEVANT'].to_dict(orient='records')

[{'question': 'What should be included in the problem title for my questions?',
  'course': 'mlops-zoomcamp',
  'document': '0560e827',
  'answer': 'According to our FAQ database, for your problem title, you should follow the format: [Problem title]. This should be a clear and concise description of the issue or problem you are experiencing.\n\nExample of a properly formatted question:\nProblem: I cloned the public repo, made edits, committed and pushed them to my own repo. Now I want to get the recent commits from the public repo without overwriting my own changes to my own repo.',
  'explanation': 'The generated answer provides guidance on how to structure a problem title in an FAQ, which is slightly related to the original answer. However, the original answer is more focused on providing a template for problem and solution descriptions, and it also references another FAQ document for inspiration. The generated answer only addresses a minor aspect of the original answer and does not 