In [1]:
import sys
import os
import json
from datetime import date
from openai import OpenAI

In [2]:
sys.path.append('..')

In [3]:
client = OpenAI(api_key = os.environ['OPENAI_API_KEY'])

In [4]:
q = 'How do I run Kafka?'

In [5]:
def get_response(client, model, system_context, assistant_context, user_context):
    
    response = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "system", "content": system_context,
                "role": "assistant", "content": assistant_context,
                "role": "user", "content": user_context
            }
        ]
    )
    
    output = response.choices[0].message.content
    return output

In [6]:
print(get_response(client,
             model="gpt-3.5-turbo",
             system_context = "",
             assistant_context = "",
             user_context = q))

To run Kafka, you need to follow these steps:

1. Install Kafka on your system by downloading the Kafka binaries from the Apache Kafka website.
2. Unzip the downloaded file to a directory on your system.
3. Start Zookeeper, which is required for Kafka to run. You can start Zookeeper by running the following command in the Kafka directory:

``` 
bin/zookeeper-server-start.sh config/zookeeper.properties 
```

4. Start the Kafka server by running the following command in the Kafka directory:

```
bin/kafka-server-start.sh config/server.properties
```

5. Create a topic in Kafka by running the following command:

```
bin/kafka-topics.sh --create --topic myTopic --bootstrap-server localhost:9092 --partitions 1 --replication-factor 1
```

6. Produce messages to the Kafka topic by running the following command:

```
bin/kafka-console-producer.sh --topic myTopic --bootstrap-server localhost:9092
```

7. Consume messages from the Kafka topic by running the following command:

```
bin/kafka-cons

In [7]:
import minsearch

In [8]:
# Preprocessing 
with open('../documents.json', 'rt') as f_in:
    raw_documents = json.load(f_in)
    docs = []
    for course_dict in raw_documents:
        for doc in course_dict['documents']:
            doc["course"] = course_dict['course']
            docs.append(doc)

In [9]:
index = minsearch.Index(
    text_fields = ["text", "section", "question"],
    keyword_fields = ["course"]
)
index.fit(docs)

<minsearch.Index at 0x70637344e8f0>

In [10]:
def search(q, index):

    boost = {'question': 3.0, 'text': 1.0}
    
    docs_retrieved = index.search(
        query = q,
        filter_dict={'course': 'data-engineering-zoomcamp'},
        boost_dict = boost,
        num_results = 30
    )

    return docs_retrieved

In [11]:
print(search(q, index))

[{'text': 'In the project directory, run:\njava -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java', 'section': 'Module 6: streaming with kafka', 'question': 'Java Kafka: How to run producer/consumer/kstreams/etc in terminal', 'course': 'data-engineering-zoomcamp'}, {'text': "Solution from Alexey: create a virtual environment and run requirements.txt and the python files in that environment.\nTo create a virtual env and install packages (run only once)\npython -m venv env\nsource env/bin/activate\npip install -r ../requirements.txt\nTo activate it (you'll need to run it every time you need the virtual env):\nsource env/bin/activate\nTo deactivate it:\ndeactivate\nThis works on MacOS, Linux and Windows - but for Windows the path is slightly different (it's env/Scripts/activate)\nAlso the virtual environment should be created only to run the python file. Docker images should first all be up and running.", 'section': 'Module 6: streaming with kafka'

In [12]:
def create_context(docs_retrieved):

    context = ""
    for doc_retrieved in docs_retrieved:
        context = context + f"Section: {doc_retrieved['section']}\nQuestion: {doc_retrieved['question']}\nAnswer: {doc_retrieved['text']}\n\n"

    return context

In [13]:
print(create_context(search(q, index)))

Section: Module 6: streaming with kafka
Question: Java Kafka: How to run producer/consumer/kstreams/etc in terminal
Answer: In the project directory, run:
java -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java

Section: Module 6: streaming with kafka
Question: Module “kafka” not found when trying to run producer.py
Answer: Solution from Alexey: create a virtual environment and run requirements.txt and the python files in that environment.
To create a virtual env and install packages (run only once)
python -m venv env
source env/bin/activate
pip install -r ../requirements.txt
To activate it (you'll need to run it every time you need the virtual env):
source env/bin/activate
To deactivate it:
deactivate
This works on MacOS, Linux and Windows - but for Windows the path is slightly different (it's env/Scripts/activate)
Also the virtual environment should be created only to run the python file. Docker images should first all be up and running.

Secti

In [14]:
def create_prompt(q, docs_retrieved):

    prompt_template="""Answer the QUESTION based on the CONTEXT from the FAQ database.
    Use only the facts from the CONTEXT when answering the QUESTION.
    
    QUESTION: 
    {question}
    
    CONTEXT: 
    {context}

    """.strip()

    context = create_context(docs_retrieved)

    prompt = prompt_template.format(question=q, context=context, date = date.today()).strip()

    return prompt

In [15]:
print(create_prompt(q, search(q, index)))

Answer the QUESTION based on the CONTEXT from the FAQ database.
    Use only the facts from the CONTEXT when answering the QUESTION.
    
    QUESTION: 
    How do I run Kafka?
    
    CONTEXT: 
    Section: Module 6: streaming with kafka
Question: Java Kafka: How to run producer/consumer/kstreams/etc in terminal
Answer: In the project directory, run:
java -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java

Section: Module 6: streaming with kafka
Question: Module “kafka” not found when trying to run producer.py
Answer: Solution from Alexey: create a virtual environment and run requirements.txt and the python files in that environment.
To create a virtual env and install packages (run only once)
python -m venv env
source env/bin/activate
pip install -r ../requirements.txt
To activate it (you'll need to run it every time you need the virtual env):
source env/bin/activate
To deactivate it:
deactivate
This works on MacOS, Linux and Windows - but for

In [16]:
def get_response_llm_context(client, index, q, model):

    docs_retrieved = search(q, index)
    prompt = create_prompt(q, docs_retrieved)

    response = get_response(client = client, model = model, system_context="", assistant_context="", user_context=prompt)
    return response

In [17]:
print(get_response_llm_context(client, index, q, 'gpt-3.5-turbo'))

To run Kafka, for a Java application, in the project directory, the command to be used is:
java -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java 

For a Python Kafka application, you will need to create a virtual environment and run the Python files inside that environment. The steps involved are:
1. To create a virtual env and install packages, run these commands just once:
```
python -m venv env
source env/bin/activate
pip install -r ../requirements.txt
```
2. For activating the environment, you need to run this every time:
```
source env/bin/activate
```
3. To deactivate the environment, run:
```
deactivate
```

For a Python Kafka application, before running the streaming.py script, you may need to ensure you have the necessary dependencies installed. To install the dependencies needed for Python3 script 06-streaming/python/avro_example/producer.py, run the following commands:
```
confluent-kafka: pip install confluent-kafka or conda install

In [18]:
from elasticsearch import Elasticsearch

In [19]:
es_client = Elasticsearch("http://localhost:9200")

In [20]:
index_anatomy = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} 
        }
    }
}

In [21]:
index_name = "course-questions"

In [22]:
es_client.indices.create(index=index_name,
                         body=index_anatomy)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [23]:
from tqdm.auto import tqdm

for doc in tqdm(docs):
    es_client.index(index=index_name, document=doc)

  from .autonotebook import tqdm as notebook_tqdm
100%|████████████████████████████████████████████████████████████| 948/948 [00:29<00:00, 31.62it/s]


In [36]:
query = "Can I still enroll it?"

In [39]:
def search(q, es_client):

    search_query = {
    "size": 5,
    "query": {
        "bool": {
            "must": {
                "multi_match": {
                    "query": q,
                    "fields": ["question^3", "text", "section"],
                    "type": "best_fields"
                }
            },
            "filter": {
                "term": {
                    "course": "data-engineering-zoomcamp"
                    }
                }
            }
        }
    }

    docs_retrieved = []
    for hit in es_client.search(index='course-questions', body=search_query)['hits']['hits']:
        docs_retrieved.append(hit['_source'])

    return docs_retrieved

In [40]:
search(q=query, es_client=es_client)

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.',
  'section': 'General course-related questions',
  'question': 'Course - Can I follow the course after it finishes?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'Create a new branch for development, then you can merge it to the main branch\nCreate a new branch and switch to this branch. It allows yo

In [41]:
def get_response_llm_context(client, es_client, q, model):

    docs_retrieved = search(q, es_client)
    prompt = create_prompt(q, docs_retrieved)

    response = get_response(client = client, model = model, system_context="", assistant_context="", user_context=prompt)
    return response

In [45]:
print(get_response_llm_context(client, es_client, q=query, model="gpt-3.5-turbo"))

No, you cannot enroll after the start date. However, you can still submit homeworks and access course materials at your own pace after the course finishes.
