In [None]:
RAG - Retrieval Augmented Generation // Generowanie Rozszerzonego Pobierania

#### LLM Zoomcamp 1.3 - Retrieval and Search

In [None]:
!wget https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py

In [27]:
!/llm-zoomcamp-1.png

/bin/bash: /llm-zoomcamp-1.png: No such file or directory


In [25]:
# file/llm-zoomcamp-1.png
!workspaces/llm-zoomcamp/file/llm-zoomcamp-1.png

/bin/bash: workspaces/llm-zoomcamp/file/llm-zoomcamp-1.png: No such file or directory


In [1]:
import minsearch

In [None]:
!wget https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/01-intro/documents.json

In [2]:
import json

In [3]:
with open('documents.json', 'rt') as f_in:
          docs_raw = json.load(f_in)

In [4]:
documents = []

for course_dict in docs_raw:
    for doc in course_dict['documents']:
        doc['course'] = course_dict['course']
        documents.append(doc)

In [5]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [6]:
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]
)

In [None]:
SELECT * WHERE course = 'data-engineering-zoomcamp';

In [9]:
q = 'the course has already started, can I still enroll?'

In [7]:
index.fit(documents)

<minsearch.Index at 0x73518bcaad10>

In [10]:
boost = {'questions': 3.0, 'section': 0.5}

results = index.search(
    query=q,
    filter_dict={'course' : 'data-engineering-zoomcamp'},
    boost_dict=boost,
    num_results=5
)

In [17]:
results

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
  'section': 'General course-related questions',
  'question': 'Course - When will the course start?',
  'course': 'data-engineerin

#### LLM Zoomcamp 1.4 - Generating Answers with OpenAI GPT

In [11]:
from openai import OpenAI

In [12]:
client = OpenAI()q

In [20]:
q

'the course has already started, can I still enroll?'

In [29]:
response = client.chat.completions.create(
    model='gpt-3.5-turbo-16k',
    # messages=[{"role": "user", "content": "is it too late to join the course?"}]
    messages=[{"role": "user", "content": q}]
)

In [30]:
response

ChatCompletion(id='chatcmpl-9ctSOcG5NolTgIvvXPoMWWch3aieA', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content="I'm sorry, but I am an AI language model and do not have the specific information about courses and enrollments. You would need to contact the institution or organization offering the course to inquire about enrolling after the course has already started. They will be able to provide you with the most accurate information regarding enrollment.", role='assistant', function_call=None, tool_calls=None))], created=1719056668, model='gpt-3.5-turbo-16k-0613', object='chat.completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=63, prompt_tokens=18, total_tokens=81))

In [31]:
response.choices[0].message.content

"I'm sorry, but I am an AI language model and do not have the specific information about courses and enrollments. You would need to contact the institution or organization offering the course to inquire about enrolling after the course has already started. They will be able to provide you with the most accurate information regarding enrollment."

In [13]:
prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database
Use only the facts from the CONTEXT when answering the QUESTION.
If the CONTEXT  doesn't contain the answer, output NONE

QUESTION: {question}

CONTEXT:
{context}
""".strip()

In [14]:
context = ""

for doc in results:
    context = context + f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"

In [32]:
prompt = prompt_template.format(question=q, context=context).strip()

In [33]:
print(prompt)

You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database
Use only the facts from the CONTEXT when answering the QUESTION.
If the CONTEXT  doesn't contain the answer, output NONE

QUESTION: the course has already started, can I still enroll?

CONTEXT:
section: General course-related questions
question: Course - Can I still join the course after the start date?
answer: Yes, even if you don't register, you're still eligible to submit the homeworks.
Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.

section: General course-related questions
question: Course - When will the course start?
answer: The purpose of this document is to capture frequently asked technical questions
The exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1
Subscribe to course public Google Calendar (it works from Desktop only

In [34]:
response = client.chat.completions.create(
    model='gpt-3.5-turbo',
    # messages=[{"role": "user", "content": "is it too late to join the course?"}]
    messages=[{"role": "user", "content": prompt}]
)

response.choices[0].message.content

"Yes, you can still join the course after the start date. Even if you don't register, you're still eligible to submit the homeworks. Just be aware of deadlines for turning in the final projects."

#### LLM Zoomcamp 1.4.2 - Exploring Alternatives to OpenAI

https://mistral.ai/

#### LLM Zoomcamp 1.5 - The RAG Flow Cleaning and Modularizing Code

In [36]:
def search(query):
    boost = {'questions': 3.0, 'section': 0.5}

    results = index.search(
        query=query,
        filter_dict={'course' : 'data-engineering-zoomcamp'},
        boost_dict=boost,
        num_results=5
    )

    return results
    

In [37]:
search('how do I run kafaka')

[{'text': "Answer: To run the provided code, ensure that the 'dlt[duckdb]' package is installed. You can do this by executing the provided installation command: !pip install dlt[duckdb]. If you’re doing it locally, be sure to also have duckdb pip installed (even before the duckdb package is loaded).",
  'section': 'Workshop 1 - dlthub',
  'question': 'How do I install the necessary dependencies to run the code?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'After you create a GitHub account, you should clone the course repo to your local machine using the process outlined in this video: Git for Everybody: How to Clone a Repository from GitHub\nHaving this local repository on your computer will make it easy for you to access the instructors’ code and make pull requests (if you want to add your own notes or make changes to the course content).\nYou will probably also create your own repositories that host your notes, versions of your file, to do this. Here is a great tutorial tha

In [38]:
query = ('how do I run kafaka')
search_results = search(query)

In [43]:
def build_prompt(query, search_results):
    prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database
Use only the facts from the CONTEXT when answering the QUESTION.
If the CONTEXT  doesn't contain the answer, output NONE

QUESTION: {question}

CONTEXT:
{context}
""".strip()

    context = ""

    for doc in search_results:
        context = context + f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"

    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

In [44]:
build_prompt(query, search_results)
# prompt = build_prompt(query, search_results)

"You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database\nUse only the facts from the CONTEXT when answering the QUESTION.\nIf the CONTEXT  doesn't contain the answer, output NONE\n\nQUESTION: how do I run kafaka\n\nCONTEXT:\nsection: Workshop 1 - dlthub\nquestion: How do I install the necessary dependencies to run the code?\nanswer: Answer: To run the provided code, ensure that the 'dlt[duckdb]' package is installed. You can do this by executing the provided installation command: !pip install dlt[duckdb]. If you’re doing it locally, be sure to also have duckdb pip installed (even before the duckdb package is loaded).\n\nsection: General course-related questions\nquestion: How do I use Git / GitHub for this course?\nanswer: After you create a GitHub account, you should clone the course repo to your local machine using the process outlined in this video: Git for Everybody: How to Clone a Repository from GitHub\nHaving this local repository on y

In [49]:
def llm(prompt):
    response = client.chat.completions.create(
        model='gpt-3.5-turbo',
        messages=[{"role": "user", "content": prompt}]
    )

    return response.choices[0].message.content

In [50]:
query = 'how do I run kafka?'
search_result = search(query)
prompt = build_prompt(query, search_result)
answer = llm(prompt)

In [51]:
answer

'To run Kafka, you need to create a virtual environment and run the requirements.txt and python files in that environment by following the steps provided in the answer.'

In [52]:
query = 'how do I run kafka?'

def rag(query):
    search_result = search(query)
    prompt = build_prompt(query, search_result)
    answer = llm(prompt)
    return answer

In [53]:
rag(query)

'To run Kafka, you need to create a virtual environment and install the necessary requirements using the provided steps.'

In [54]:
rag('the course has already started, can I still enroll?')

'Yes, even if the course has already started, you can still enroll. Remember that there will be deadlines for turning in the final projects, so make sure not to leave everything for the last minute.'

#### LLM Zoomcamp 1.6 - Search with Elasticsearch

In [57]:
!wget https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/elastic-search.md

--2024-06-22 12:32:32--  https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/elastic-search.md
Resolving github.com (github.com)... 140.82.121.4
Connecting to github.com (github.com)|140.82.121.4|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/html]
Saving to: ‘elastic-search.md’

elastic-search.md       [ <=>                ] 174.46K  --.-KB/s    in 0.03s   

2024-06-22 12:32:32 (4.89 MB/s) - ‘elastic-search.md’ saved [178650]

