In [None]:
!wget https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json

In [4]:
import json

with open('./documents.json', 'rt') as f_in:
    documents_file = json.load(f_in)

documents = []

for course in documents_file:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [5]:
from elasticsearch import Elasticsearch

es = Elasticsearch("http://localhost:9200")
es.info()

ObjectApiResponse({'name': 'c7ba5b8ed9fd', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'kOPrPb1hTzOtCPqkjBgPNA', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [6]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} 
        }
    }
}

index_name = "course-questions"
# es.indices.delete(index=index_name)
response = es.indices.create(index=index_name, body=index_settings)

response

BadRequestError: BadRequestError(400, 'resource_already_exists_exception', 'index [course-questions/_oHRFIFCRIOM_pQRpxUTpQ] already exists')

In [7]:
from tqdm.auto import tqdm

for doc in tqdm(documents):
    es.index(index=index_name, document=doc)

print('done')

  0%|          | 0/948 [00:00<?, ?it/s]

done


In [8]:
user_question = "How do I join the course after it has started?"

search_query = {
    "size": 5,
    "query": {
        "bool": {
            "must": {
                "multi_match": {
                    "query": user_question,
                    "fields": ["question^3", "text", "section"],
                    "type": "best_fields"
                }
            },
            "filter": {
                "term": {
                    "course": "data-engineering-zoomcamp"
                }
            }
        }
    }
}

In [9]:
response = es.search(index=index_name, body=search_query)

for hit in response['hits']['hits']:
    doc = hit['_source']
    print(f"Section: {doc['section']}")
    print(f"Question: {doc['question']}")
    print(f"Answer: {doc['text'][:60]}...\n")

Section: General course-related questions
Question: Course - Can I still join the course after the start date?
Answer: Yes, even if you don't register, you're still eligible to su...

Section: General course-related questions
Question: Course - Can I follow the course after it finishes?
Answer: Yes, we will keep all the materials after the course finishe...

Section: General course-related questions
Question: Course - What can I do before the course starts?
Answer: You can start by installing and setting up all the dependenc...

Section: General course-related questions
Question: How do I use Git / GitHub for this course?
Answer: After you create a GitHub account, you should clone the cour...

Section: Workshop 1 - dlthub
Question: How do I install the necessary dependencies to run the code?
Answer: Answer: To run the provided code, ensure that the 'dlt[duckd...



In [17]:
def retrieve_documents(query, index_name="course-questions", max_results=5):
    es = Elasticsearch("http://localhost:9200")
    
    search_query = {
        "size": max_results,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": "data-engineering-zoomcamp"
                    }
                }
            }
        }
    }
    
    response = es.search(index=index_name, body=search_query)
    documents = [hit['_source'] for hit in response['hits']['hits']]
    return documents

In [18]:
user_question = "How do I join the course after it has started?"

response = retrieve_documents(user_question)

for doc in response:
    print(f"Section: {doc['section']}")
    print(f"Question: {doc['question']}")
    print(f"Answer: {doc['text'][:60]}...\n")

Section: General course-related questions
Question: Course - Can I still join the course after the start date?
Answer: Yes, even if you don't register, you're still eligible to su...

Section: General course-related questions
Question: Course - Can I follow the course after it finishes?
Answer: Yes, we will keep all the materials after the course finishe...

Section: General course-related questions
Question: Course - What can I do before the course starts?
Answer: You can start by installing and setting up all the dependenc...

Section: General course-related questions
Question: How do I use Git / GitHub for this course?
Answer: After you create a GitHub account, you should clone the cour...

Section: Workshop 1 - dlthub
Question: How do I install the necessary dependencies to run the code?
Answer: Answer: To run the provided code, ensure that the 'dlt[duckd...



In [20]:
from openai import OpenAI

client = OpenAI()

response = client.chat.completions.create(
    model="gpt-4o",
    messages=[{"role": "user", "content": "The course already started. Can I still join?"}]
)
print(response.choices[0].message.content)

Yes, it’s often possible to join a course after it has already started, although this largely depends on the specific policies of the institution or organization offering the course. Here are a few steps you can take to increase your chances of being able to join:

1. **Contact the Instructor or Administration**: Reach out directly to the course instructor or the administration office. They may be able to make an exception for you or provide guidance on how to catch up.

2. **Check the Enrollment Policies**: Look at the course catalog or the institution's website for information on late enrollment policies. Some institutions have formal procedures for adding a course late and can even have waitlists for popular classes.

3. **Self-Study**: If you’re allowed to join late, be prepared to put in extra effort to catch up. Ask for any materials or assignments you’ve missed and see if you can get notes from classmates.

4. **Auditing**: If the course is closed for new enrollments, inquire wh

In [24]:
context_template = """
Section: {section}
Question: {question}
Answer: {text}
""".strip()

context_docs = retrieve_documents(user_question)

context_result = ""

for doc in context_docs:
    doc_str = context_template.format(**doc)
    context_result += ("\n\n" + doc_str)

context = context_result.strip()
print(context)

Section: General course-related questions
Question: Course - Can I still join the course after the start date?
Answer: Yes, even if you don't register, you're still eligible to submit the homeworks.
Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.

Section: General course-related questions
Question: Course - Can I follow the course after it finishes?
Answer: Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.
You can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.

Section: General course-related questions
Question: Course - What can I do before the course starts?
Answer: You can start by installing and setting up all the dependencies and requirements:
Google cloud account
Google Cloud SDK
Python 3 (installed with Anaconda)
Terrafo

In [26]:
prompt = f"""
You're a course teaching assistant. Answer the user QUESTION based on CONTEXT - the documents retrieved from our FAQ database. 
Only use the facts from the CONTEXT. If the CONTEXT doesn't contan the answer, return "NONE"

QUESTION: {user_question}

CONTEXT:

{context}
""".strip()

In [27]:
print(prompt)

You're a course teaching assistant. Answer the user QUESTION based on CONTEXT - the documents retrieved from our FAQ database. 
Only use the facts from the CONTEXT. If the CONTEXT doesn't contan the answer, return "NONE"

QUESTION: How do I join the course after it has started?

CONTEXT:

Section: General course-related questions
Question: Course - Can I still join the course after the start date?
Answer: Yes, even if you don't register, you're still eligible to submit the homeworks.
Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.

Section: General course-related questions
Question: Course - Can I follow the course after it finishes?
Answer: Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.
You can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final 

In [28]:
response = client.chat.completions.create(
    model="gpt-4o",
    messages=[{"role": "user", "content": prompt}]
)
answer = response.choices[0].message.content
answer

"Yes, even if you don't register, you're still eligible to submit the homeworks. Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute."

In [29]:
context_template = """
Section: {section}
Question: {question}
Answer: {text}
""".strip()

def build_context(documents):
    context_result = ""

    for doc in documents:
        doc_str = context_template.format(**doc)
        context_result += ("\n\n" + doc_str)
    
    return context_result.strip()

In [36]:
prompt_template = """
You're a course teaching assistant.
Answer the user QUESTION based on CONTEXT - the documents retrieved from our FAQ database.
Don't use other information outside of the provided CONTEXT.  

QUESTION: {user_question}

CONTEXT:

{context}
""".strip()

def build_prompt(user_question, documents):
    context = build_context(documents)
    prompt = prompt_template.format(
        user_question=user_question,
        context=context
    )
    return prompt

In [38]:
def ask_openai(prompt, model="gpt-4o"):
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )
    answer = response.choices[0].message.content
    return answer

In [39]:
def qa_bot(user_question):
    context_docs = retrieve_documents(user_question)
    prompt = build_prompt(user_question, context_docs)
    answer = ask_openai(prompt)
    return answer

In [41]:
qa_bot("I'm getting invalid reference format: repository name must be lowercase")

'If you\'re encountering the error "invalid reference format: repository name must be lowercase" when working with Docker on Windows, this issue might be related to how you\'re specifying volume paths. Windows paths can sometimes cause issues due to casing or spaces.\n\nHere are some steps you can try to resolve the issue:\n\n1. **Move Data to a Folder Without Spaces:**\n   Move your data to a folder path that does not contain spaces. For instance, move `"C:/Users/Alexey Grigorev/git/..."` to `"C:/git/..."`.\n\n2. **Replace the `-v` Part in Your Command:**\n   Try different ways of specifying the volume mapping in your Docker command. Here are several examples:\n   - `-v /c:/some/path/ny_taxi_postgres_data:/var/lib/postgresql/data`\n   - `-v //c:/some/path/ny_taxi_postgres_data:/var/lib/postgresql/data`\n   - `-v /c/some/path/ny_taxi_postgres_data:/var/lib/postgresql/data`\n   - `-v //c/some/path/ny_taxi_postgres_data:/var/lib/postgresql/data`\n   - `--volume //driveletter/path/ny_taxi

In [42]:
qa_bot("I can't connect to postgres port 5432, my password doesn't work")

'It seems the issue you\'re facing is related to the port and user authentication for PostgreSQL. Here are some potential solutions you can try:\n\n1. **Port Issue**:\n    - The default port 5432 might be taken by another instance of PostgreSQL on your local machine. Try using a different port, such as 5431.\n    - You can set the port when you create your Docker container. For example:\n      ```sh\n      docker run -p 5431:5432 ...\n      ```\n    - Then, modify your connection string to use the new port:\n      ```python\n      engine = create_engine(\'postgresql://root:root@localhost:5431/ny_taxi\')\n      ```\n\n2. **Password Authentication Issue**:\n    - Ensure that the username and password in your connection string are correct.\n    - If you have a PostgreSQL service running on your system that you are not connecting to, try stopping it to eliminate conflicts.\n\n3. **User Authentication Issue**:\n    - Verify whether the user "root" exists and has the correct permissions.\n  

In [43]:
qa_bot("how can I run kafka?")

'In order to run Kafka, depending on the specific task you need to perform (e.g., running a Kafka producer/consumer), here are the steps you can follow based on the provided CONTEXT:\n\n1. **Run Kafka Java Producer/Consumer/Kstreams:**\n   - Navigate to your project directory.\n   - Use the following command to run your Java producer, consumer, or KStreams application in the terminal:\n     ```bash\n     java -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java\n     ```\n   - Replace `<jar_name>` with the actual name of your jar file.\n\n2. **Run Kafka Python Producer:**\n   - Create a virtual environment and install the necessary dependencies:\n     ```bash\n     python -m venv env\n     source env/bin/activate\n     pip install -r requirements.txt\n     ```\n   - To activate the virtual environment, use:\n     ```bash\n     source env/bin/activate\n     ```\n   - To deactivate it, use:\n     ```bash\n     deactivate\n     ```\n   - Please note t