### Calling OpenAI API

In [2]:
import os
from dotenv import load_dotenv
from openai import OpenAI

In [3]:
load_dotenv()
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

In [None]:
openai_client = OpenAI(
  api_key=OPENAI_API_KEY
)

completion = client.chat.completions.create(
  model="gpt-4o-mini",
  store=True,
  messages=[
    {"role": "user", "content": "How can i use .env files with python example"}
  ]
)

print(completion.choices[0].message.content);

Using `.env` files in Python applications is a common practice for managing environment variables and configuration settings without hardcoding them in your source code. The most popular way to do this is by using the `python-dotenv` library, which simplifies the process of loading environment variables from a `.env` file.

Here's a step-by-step guide along with an example:

### Step 1: Install `python-dotenv`

If you don't have `python-dotenv` installed yet, you can install it using pip:

```bash
pip install python-dotenv
```

### Step 2: Create a `.env` File

Create a file named `.env` in your project directory. This file should contain your environment variables in the format `KEY=VALUE`. For example:

```plaintext
# .env
API_KEY=your_api_key_here
DATABASE_URL=sqlite:///my_database.db
DEBUG=True
```

### Step 3: Load the Environment Variables in Your Python Code

You can now load the environment variables from the `.env` file in your Python script. Here’s an example:

```python
impo

### Retrieval

In [4]:
# search-engine in the search-engine folder installed via uv add.
import minsearch
import json

In [5]:
documents = []

with open('documents.json', 'rt') as f_in:
    docs_raw = json.load(f_in)

for course_dict in docs_raw:
    for doc in course_dict['documents']:
        doc['course'] = course_dict['course']
        documents.append(doc)

In [24]:
# creating index
index = minsearch.Index(
            text_fields=["question", "text", "section"],
            keyword_fields=["course"]
)

index.fit(documents)

# Performing a search
q="the course has already started, can i still enroll?"

boost = {"question": 3.0,
         "section": 0.5,
        }

results = index.search(
    query=q,
    filter_dict= {'course': 'data-engineering-zoomcamp'},
    boost_dict=boost,
    num_results=5
)

In [26]:
results[0]

{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
 'section': 'General course-related questions',
 'question': 'Course - Can I still join the course after the start date?',
 'course': 'data-engineering-zoomcamp'}

### Generation with OpenAI

In [27]:
from openai import OpenAI

In [31]:
client = OpenAI()

response = client.chat.completions.create(
                model = "gpt-4o-mini",
                messages=[{"role":"user", "content":q}]
                )

response.choices[0].message.content

"Whether you can still enroll in a course that has already started typically depends on the institution or program's policies. Here are a few steps you can take:\n\n1. **Check the Enrollment Policy:** Look at the course or program's official website to see if they allow late enrollment.\n\n2. **Contact the Instructor or Admissions Office:** Reach out directly to the course instructor or the admissions office for clarification. They can provide specific guidance based on the program's policies.\n\n3. **Consider a Waitlist:** Some programs may have a waitlist for students who want to enroll after the course has begun.\n\n4. **Look for Similar Courses:** If enrollment is not possible, check if there are similar courses offered in the future.\n\n5. **Online Options:** If it’s an online course, you may have a better chance of enrolling late, as many online programs are more flexible.\n\nBe proactive in your inquiry, and you may find a solution!"

### Cleaned RAG Flow

In [None]:
def search(query):
    boost = {'question': 3.0, 'section': 0.5}

    results = index.search(
        query=query,
        filter_dict={'course': 'data-engineering-zoomcamp'},
        boost_dict=boost,
        num_results=5
    )

    return results

def elastic_search(query):

    """
    """

    search_query = {
        "size": 5,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "most_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": "data-engineering-zoomcamp"
                    }
                }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)

    # extracting the results
    result_docs = []

    for hit in response['hits']['hits']:
        result_docs.append(hit['_source'])

    return result_docs

def prompt_builder(query: str, search_results: list)->str:

    """
    """
    
    prompt_template= """You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
    Use only the facts from the CONTEXT when answering the QUESTION.

    QUESTION: {question}

    CONTEXT: 
    {context}""".strip()

    context = ""


    for doc in search_results:
        context = context+f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"

    prompt = prompt_template.format(question=query, context=context).strip()

    return prompt

def llm(prompt:str)->str:

    """
    """
    
    client = OpenAI()
    response = client.chat.completions.create(
                model = "gpt-4o-mini",
                messages=[{"role":"user", "content":prompt}]
                )

    return response.choices[0].message.content


# The RAG
query = 'how do I run kafka?'
def rag(query):
    search_results = elastic_search(query)
    prompt = prompt_builder(query, search_results)
    answer = llm(prompt)
    return answer



In [14]:
q="How to run Kafka"
rag(query=q)

  response = es_client.search(index=index_name, body=search_query)


"Running Apache Kafka involves several steps, including installation, configuration, and starting the necessary services. Below is a guide to help you get started with running Kafka:\n\n### Prerequisites\n\n1. **Java Installation**: Kafka runs on the Java Virtual Machine (JVM), so you need to have Java installed. Kafka requires Java 8 or higher:\n   - You can check if Java is installed by running `java -version` in your terminal.\n   - If it's not installed, download and install the JDK from the official [Oracle](https://www.oracle.com/java/technologies/javase-jdk11-downloads.html) or [OpenJDK](https://openjdk.java.net/) websites.\n\n2. **zookeeper**: Kafka depends on ZooKeeper for managing distributed brokers. However, recent versions of Kafka (from 2.8.0) allow you to run Kafka without ZooKeeper, so check your version.\n\n### Step 1: Download Kafka\n\n1. Download Kafka from the official [Apache Kafka website](https://kafka.apache.org/downloads).\n\n2. Extract the downloaded archive:\

### ElasticSearch

In [8]:
from elasticsearch import Elasticsearch
from tqdm.auto import tqdm

In [9]:
es_client = Elasticsearch('http://localhost:9200')
es_client.info()

ObjectApiResponse({'name': '52935299c7d8', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'FrWtgshEQPC7d6Athokahw', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [13]:
index_settings={
                "settings": {
                    "number_of_shards": 1,
                    "number_of_replicas": 0
                },
                "mappings": {
                    "properties": {
                        "text": {"type": "text"},
                        "section": {"type": "text"},
                        "question": {"type": "text"},
                        "course": {"type": "keyword"} 
                        }
                    }
                }

index_name = "course-question"

#es_client.indices.create(index=index_name, body=index_settings)

In [12]:
# Indexing process
for doc in documents:
    es_client.index(index=index_name, document=doc)

In [13]:
query="I just discovered the course. Can I still join it?"
search_query = {
    "size": 5,
    "query": {
        "bool": {
            "must": {
                "multi_match": {
                    "query": query,
                    "fields": ["question^3", "text", "section"],
                    "type": "best_fields"
                }
            },
            "filter": {
                "term": {
                    "course": "data-engineering-zoomcamp"
                }
            }
        }
    }
}

response = es_client.search(index=index_name, body=search_query)

  response = es_client.search(index=index_name, body=search_query)


In [14]:
# extracting the results
result_docs = []

for hit in response['hits']['hits']:
    result_docs.append(hit['_source'])

result_docs

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.',
  'section': 'General course-related questions',
  'question': 'Course - Can I follow the course after it finishes?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'You can start by installing and setting up all the dependencies and requirements:\nGoogle cloud account\nGoogle Cloud SDK\nPython 3 (insta

In [15]:
def elastic_search(query):

    """
    """

    search_query = {
        "size": 5,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": "data-engineering-zoomcamp"
                    }
                }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)

    # extracting the results
    result_docs = []

    for hit in response['hits']['hits']:
        result_docs.append(hit['_score'])

    return result_docs

In [16]:
elastic_search(query=query)

  response = es_client.search(index=index_name, body=search_query)


[30.49997, 30.49997, 30.49997, 26.447298, 26.447298]