In [1]:
import openai

In [2]:
!wget https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json

--2024-06-25 08:21:46--  https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json
Resolving github.com (github.com)... 140.82.121.4
Connecting to github.com (github.com)|140.82.121.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/alexeygrigorev/llm-rag-workshop/main/notebooks/documents.json [following]
--2024-06-25 08:21:47--  https://raw.githubusercontent.com/alexeygrigorev/llm-rag-workshop/main/notebooks/documents.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 658332 (643K) [text/plain]
Saving to: ‘documents.json’


2024-06-25 08:21:47 (98.0 MB/s) - ‘documents.json’ saved [658332/658332]



In [3]:
import json

In [4]:
# Read the downloaded FAQ dump. The encoding is passed explicitly because the
# documents contain non-ASCII characters (e.g. curly quotes) and the platform
# default encoding is not guaranteed to be UTF-8.
with open('./documents.json', 'rt', encoding='utf-8') as f_in:
    documents_all = json.load(f_in)

In [6]:
documents_all[0]['documents'][0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?'}

In [8]:
# Flatten the per-course groups into a single list of documents, tagging each
# document with the course it belongs to. A new dict is built for every
# document so the entries inside documents_all are left unmodified and the
# cell produces the same result no matter how often it is re-run.
documents = [
    {**doc, 'course': course['course']}
    for course in documents_all
    for doc in course['documents']
]

In [9]:
len(documents)

948

In [None]:
# Note (SQL analogy, not executable Python): restricting results to one course is like
#   SELECT * FROM ... WHERE course = 'data-engineering-zoomcamp'

In [1]:
import elasticsearch

In [2]:
from elasticsearch import Elasticsearch

In [3]:
es_client = Elasticsearch(hosts=['http://localhost:9200'])

In [4]:
es_client.info()

ObjectApiResponse({'name': 'c567b10143a1', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'oi30js35T9mreF-n1i-MKg', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [9]:
index_name = "course-questions"


In [17]:
# Index definition: a single shard with no replicas, and one mapping entry per
# document field.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            # keyword rather than text: course is matched exactly by a term filter
            "course": {"type": "keyword"}
        }
    }
}

# Drop any index left over from a previous run so this cell can be re-executed:
# creating an index that already exists raises an error. ignore_unavailable
# makes the delete a no-op when the index is not there yet.
es_client.indices.delete(index=index_name, ignore_unavailable=True)

response = es_client.indices.create(index=index_name, body=index_settings)

response

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [18]:
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [19]:
# Send the documents to Elasticsearch one request at a time; tqdm wraps the
# iteration to show progress.
for faq_entry in tqdm(documents):
    es_client.index(document=faq_entry, index=index_name)

100%|██████████████████████████████████| 948/948 [00:29<00:00, 32.08it/s]


In [5]:
query = "I just discovered the course. Can I still join?"

In [6]:
# NOTE(review): this needs the `documents` list built in the flattening cell.
# The NameError recorded below presumably comes from a kernel that was
# restarted without re-running that cell -- confirm with Restart & Run All.
documents[2]

NameError: name 'documents' is not defined

In [58]:
def retrieve(query, course="data-engineering-zoomcamp", size=5):
    """Return the indexed documents that best match ``query`` for one course.

    Args:
        query: Free-text question to search for.
        course: Exact value of the ``course`` field to restrict results to.
            Defaults to the course that was previously hard-coded.
        size: Maximum number of hits to return. Defaults to 5.

    Returns:
        A list with the ``_source`` dict of every hit, in the order
        Elasticsearch returned them.
    """
    search_query = {
        "size": size,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        # "^3" boosts matches in the question field
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                # filter clause: restricts to one course
                "filter": {
                    "term": {
                        "course": course
                    }
                }
            }
        }
    }
    response = es_client.search(index=index_name, body=search_query)

    return [hit['_source'] for hit in response['hits']['hits']]

In [14]:
from openai import OpenAI

In [16]:
openai_client = OpenAI()

In [51]:
# Instruction wrapper sent to the LLM; {query} and {context} are filled in by
# build_prompt.
prompt_template = """
You're a course teaching assistant. You need to answer a QUESTION from students based on
the provided CONTEXT. If the provided CONTEXT doesn't contain the answer, say "I don't know"

QUESTION: {query}

CONTEXT:

{context}
""".strip()

# Rendering of a single retrieved document inside the CONTEXT section.
context_template = """
section: {section}
question: {question}
answer: {text}
""".strip()

def build_prompt(query, context_documents):
    """Assemble the full LLM prompt for ``query``.

    Each document in ``context_documents`` is rendered with
    ``context_template`` (it must provide the ``section``, ``question`` and
    ``text`` keys; extra keys are ignored). The rendered documents are
    separated by a blank line, surrounding whitespace is trimmed, and the
    result is inserted into ``prompt_template`` together with the query.

    Returns:
        The formatted prompt string.
    """
    rendered_docs = (context_template.format(**entry) for entry in context_documents)
    context = "\n\n".join(rendered_docs).strip()
    return prompt_template.format(context=context, query=query)

In [46]:
def llm(prompt, model="gpt-4o"):
    """Send ``prompt`` as a single user message and return the reply text.

    Args:
        prompt: Full prompt text to send.
        model: Name of the chat model to use. Defaults to "gpt-4o", the
            model that was previously hard-coded.

    Returns:
        The message content of the first choice in the response.
    """
    oai_response = openai_client.chat.completions.create(
        model=model,
        messages=[
            {"role": "user", "content": prompt},
        ],
    )

    return oai_response.choices[0].message.content

In [47]:
def rag(query):
    """Answer ``query`` end to end: retrieve documents, build the prompt, ask the LLM.

    Returns whatever ``llm`` returns for the prompt built from the documents
    that ``retrieve`` found for the query.
    """
    prompt = build_prompt(query, retrieve(query))
    return llm(prompt)

In [62]:
query = "Can I attend the program if I'm a few days late?"

In [63]:
context_documents = retrieve(query)
prompt = build_prompt(query, context_documents)

In [64]:
print(prompt)

You're a course teaching assistant. You need to answer a QUESTION from students based on
the provided CONTEXT. If the provided CONTEXT doesn't contain the answer, say "I don't know"

QUESTION: Can I attend the program if I'm a few days late?

CONTEXT:

section: Module 4: analytics engineering with dbt
question: Build - Why do my Fact_trips only contain a few days of data?
answer: Make sure you use:
dbt run --var ‘is_test_run: false’ or
dbt build --var ‘is_test_run: false’
(watch out for formatted text from this document: re-type the single quotes). If that does not work, use --vars '{'is_test_run': 'false'}' with each phrase separately quoted.

section: General course-related questions
question: Course - Can I get support if I take the course in the self-paced mode?
answer: Yes, the slack channel remains open and you can ask questions there. But always sDocker containers exit code w search the channel first and second, check the FAQ (this document), most likely all your questions are a

In [65]:
rag("how can I install kafka?")

"I don't know."

In [50]:
# NOTE(review): `_` is IPython's last-output variable, so what this prints
# depends on which cell produced output most recently; it is not reproducible
# under Restart & Run All. Consider assigning the rag(...) result to a name
# and printing that instead.
print(_)

It looks like installing Kafka isn't directly addressed in the provided context. However, I'll offer guidance based on general Kafka installation steps. 

To install Apache Kafka, you can follow these steps:

1. **Download Kafka**: 
   - Visit the [Apache Kafka downloads page](https://kafka.apache.org/downloads).
   - Download the latest stable binary release.

2. **Extract the Archive**:
   - Unzip or tar the downloaded file to your desired directory.

   ```sh
   tar -xzf kafka_2.13-2.8.0.tgz
   cd kafka_2.13-2.8.0
   ```

3. **Start the Kafka Environment**:
   - Kafka requires a running Zookeeper instance. You can find the scripts to start Zookeeper and Kafka in the `bin` directory.

   - Start Zookeeper:
     ```sh
     bin/zookeeper-server-start.sh config/zookeeper.properties
     ```

   - Start Kafka server:
     ```sh
     bin/kafka-server-start.sh config/server.properties
     ```

4. **Create Topics and Start Producing/Consuming Messages**:
   - Create a topic:
     ```sh
   