!git clone https://github.com/alexeygrigorev/minsearch.git

In [1]:
!pip install minsearch

Collecting minsearch
  Downloading minsearch-0.0.3-py3-none-any.whl.metadata (6.1 kB)
Downloading minsearch-0.0.3-py3-none-any.whl (9.3 kB)
Installing collected packages: minsearch
Successfully installed minsearch-0.0.3


In [2]:
import minsearch

In [3]:
import json

In [4]:
# Read the FAQ dump as UTF-8 explicitly: the texts contain non-ASCII
# punctuation (curly quotes), and the platform default encoding is not
# guaranteed to be UTF-8.
with open('documents.json', 'rt', encoding='utf-8') as f_in:
    docs_raw = json.load(f_in)

In [5]:
# Flatten the per-course structure into a single list of FAQ entries.
# Every entry is tagged (in place) with the course it came from.
documents = []

for course_entry in docs_raw:
    entries = course_entry['documents']
    for entry in entries:
        entry['course'] = course_entry['course']
    documents.extend(entries)

In [6]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [7]:
# In-memory search index over the FAQ entries: `question`, `text` and
# `section` are the free-text fields, `course` is the keyword field that
# the search helper later filters on.
index = minsearch.Index(
    keyword_fields=["course"],
    text_fields=["question", "text", "section"],
)

Keyword fields act like an exact-match SQL filter, e.g. `SELECT * FROM documents WHERE course = 'data-engineering-zoomcamp';`

In [8]:
q = 'the course has already started, can I still enroll?'

In [9]:
index.fit(documents)

<minsearch.minsearch.Index at 0x77e4b502fda0>

## OpenAI

In [None]:
# from openai import OpenAI

In [None]:
# client = OpenAI()

In [None]:
# response = client.chat.completions.create(
#     model='gpt-4o',
#     messages=[{"role": "user", "content": q}]
# )

# response.choices[0].message.content

"It's not uncommon for courses to accept enrollments even after they have started, but policies can vary widely depending on the institution or provider offering the course. Here are a few steps you can take to find out if you can still enroll:\n\n1. **Check the Course Platform**: If the course is offered online, visit the course's webpage for information about late enrollment policies.\n\n2. **Contact the Instructor**: Reach out to the course instructor or lead facilitator. They may be willing to make an exception or provide you with the necessary information.\n\n3. **Reach Out to Administrative Offices**: Contact the academic or administrative office responsible for course enrollments. This might be the registrar's office, student services, or a similar department.\n\n4. **Review Deadlines and Policies**: Look for any publicly available documentation outlining the deadlines and policies regarding late enrollments.\n\n5. **Consider Catching Up**: Be prepared to quickly catch up on any

## hf

In [12]:
import os
from dotenv import load_dotenv, find_dotenv

# Pull variables from a local .env file into the process environment.
load_dotenv()

# Hugging Face access token; None when HUGGINGFACE_TOKEN is not set.
token = os.environ.get("HUGGINGFACE_TOKEN")

In [13]:
from huggingface_hub import login

# Log in to the Hugging Face Hub with the token read from the environment.
# NOTE(review): `token` is None when HUGGINGFACE_TOKEN is unset — confirm how login() behaves in that case.
login(token=token)

# print("✅ Successfully logged in to Hugging Face Hub.")

In [18]:
# from transformers import pipeline

# pipe = pipeline("text-generation", model="HuggingFaceH4/zephyr-7b-beta", device_map="auto", token=token)

# # Chat-style input
# messages = [
#     {
#         "role": "user",
#         "content": "Who are you?",
#     },
# ]

# # Get model response
# response = pipe(messages)

# print(response[0]["generated_text"])

In [19]:
import os
from huggingface_hub import InferenceClient
# from openai import OpenAI

# Hugging Face inference client used by the rest of the notebook.
# (An OpenAI-compatible client pointed at the Hugging Face router was tried
# as an alternative and is not used here.)
# Raises KeyError when HUGGINGFACE_TOKEN is missing from the environment.
client = InferenceClient(provider="hf-inference", api_key=os.environ["HUGGINGFACE_TOKEN"])

# Smoke test: one single-turn chat completion against the hosted model.
completion = client.chat.completions.create(
    messages=[{"role": "user", "content": "What is the capital of France?"}],
    model="HuggingFaceH4/zephyr-7b-beta",
)

print(completion.choices[0].message)

ChatCompletionOutputMessage(role='assistant', content='The capital of France is Paris (French: Paris). Other major cities in France include Marseille, Lyon, and Toulouse.', tool_call_id=None, tool_calls=None)


## llm-rag

In [20]:
def search(query, course='data-engineering-zoomcamp', num_results=5):
    """Look up FAQ entries for ``query`` in the in-memory ``index``.

    Args:
        query: Free-text user question.
        course: Value the ``course`` keyword field must equal. Defaults to
            the data engineering course, as before.
        num_results: Maximum number of entries requested from the index.

    Returns:
        The value returned by ``index.search`` for these arguments.
    """
    # Matches in the question are weighted up, matches in the section name
    # are weighted down; the answer text keeps the index's default weight.
    boost = {'question': 3.0, 'section': 0.5}

    return index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=num_results,
    )

In [21]:
def build_prompt(query, search_results):
    """Render the RAG prompt for ``query`` from the retrieved FAQ entries.

    Each item of ``search_results`` must provide the keys ``section``,
    ``question`` and ``text``. Entries are listed in the CONTEXT part in the
    order given, and surrounding whitespace is stripped from the final prompt.
    """
    template = (
        "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.\n"
        "Use only the facts from the CONTEXT when answering the QUESTION.\n"
        "\n"
        "QUESTION: {question}\n"
        "\n"
        "CONTEXT: \n"
        "{context}"
    )

    entries = [
        f"section: {hit['section']}\nquestion: {hit['question']}\nanswer: {hit['text']}\n\n"
        for hit in search_results
    ]

    return template.format(question=query, context="".join(entries)).strip()

In [22]:
def llm(prompt, model="HuggingFaceH4/zephyr-7b-beta"):
    """Send ``prompt`` as a single user message and return the reply text.

    Args:
        prompt: Full prompt text to send to the chat model.
        model: Model identifier passed to the chat-completions endpoint of
            the module-level ``client``. Defaults to the model used
            throughout this notebook.

    Returns:
        The ``content`` of the first choice's message.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )

    return response.choices[0].message.content

In [23]:
query = 'how do I run kafka?'

def rag(query):
    """Answer ``query``: retrieve with ``search``, build the prompt, ask the LLM."""
    context_docs = search(query)
    return llm(build_prompt(query, context_docs))

In [24]:
rag(query)

'To run Kafka in Java:\n1. Navigate to the project directory.\n2. Run the following command in the terminal:\n   ```\n   java -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java\n   ```\n\nTo run Kafka in Python:\n1. Create a virtual environment and activate it:\n   ```\n   python -m venv env\n   source env/bin/activate\n   ```\n2. Run the following command to install the necessary dependencies:\n   ```\n   pip install -r ../requirements.txt\n   ```\n3. Run the Python file in the virtual environment:\n   ```\n   python <python_file>.py\n   ```\n\nNote: If you\'re using Windows, the activation command for the virtual environment is slightly different:\n   ```\n   source env/Scripts/activate\n   ```\n\nTo install the necessary dependencies for the provided code:\n1. Run the following command:\n   ```\n   !pip install dlt[duckdb]\n   ```\n\nNote: Be sure to have duckdb pip installed before running the command.\n\nTo run the build script in the Spark 

In [25]:
rag('the course has already started, can I still enroll?')

'Based on the context provided, the answer to the question "the course has already started, can I still enroll?" is yes, according to the section "General course-related questions" in the FAQ database. The exact statement that supports this answer is "yes, even if you don\'t register, you\'re still eligible to submit the homeworks." However, it is important to be aware that there will be deadlines for turning in the final projects, so don\'t leave everything for the last minute.'

In [26]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

## Elasticsearch

In [27]:
from elasticsearch import Elasticsearch

In [28]:
es_client = Elasticsearch('http://localhost:9200') 

In [None]:
# Name of the Elasticsearch index that holds the FAQ entries.
index_name = "course-questions"

# One shard, no replicas. The three free-text fields are mapped as `text`;
# `course` is mapped as `keyword`.
index_settings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {
        "properties": {
            **{name: {"type": "text"} for name in ("text", "section", "question")},
            "course": {"type": "keyword"},
        }
    },
}

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [38]:
# Create the index with the settings above unless one with this name exists.
if es_client.indices.exists(index=index_name):
    print(f"Index '{index_name}' already exists, skipping creation.")
else:
    es_client.indices.create(index=index_name, body=index_settings)
    print(f"Index '{index_name}' created.")

Index 'course-questions' already exists, skipping creation.


In [30]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [31]:
from tqdm.auto import tqdm

In [32]:
import hashlib

# Index every FAQ entry under a deterministic id derived from its content.
# The index is reused when it already exists, so without an explicit id each
# re-run of this cell would add another copy of every document, and the
# duplicates would then take up slots in the top-5 search results.
for doc in tqdm(documents):
    doc_id = hashlib.sha1(json.dumps(doc, sort_keys=True).encode("utf-8")).hexdigest()
    es_client.index(index=index_name, id=doc_id, document=doc)

  0%|          | 0/948 [00:00<?, ?it/s]

In [33]:
query = 'I just disovered the course. Can I still join it?'

In [34]:
def elastic_search(query, course="data-engineering-zoomcamp", num_results=5):
    """Look up FAQ entries for ``query`` in the Elasticsearch index.

    Args:
        query: Free-text user question.
        course: Value the ``course`` field must equal (term filter).
            Defaults to the data engineering course, as before.
        num_results: Value sent as the request's ``size``.

    Returns:
        A list with the ``_source`` of every hit, in the order the hits
        appear in the response.
    """
    search_query = {
        "size": num_results,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        # `^3` boosts the question field over the other two.
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields",
                    }
                },
                "filter": {"term": {"course": course}},
            }
        },
    }

    response = es_client.search(index=index_name, body=search_query)

    return [hit['_source'] for hit in response['hits']['hits']]

In [35]:
def rag(query):
    """Answer ``query`` using Elasticsearch retrieval.

    Rebinds the name ``rag``: from this cell on, retrieval goes through
    ``elastic_search``.
    """
    context_docs = elastic_search(query)
    return llm(build_prompt(query, context_docs))

In [36]:
rag(query)

"Based on the given context, here's the answer to your question:\n\nYes, you can still join the course even if you discover it after the start date. While you won't be registering, you're still eligible to submit the homeworks, but keep in mind that there will be deadlines for turning in the final projects, so it's best not to leave everything for the last minute. The materials will also be kept after the course finishes, so you can follow the course at your own pace after it ends or continue preparing for the next cohort. \n\nAs for what you can do before the course starts, you can install and set up all the dependencies and requirements, review the prerequisites and syllabus, and ensure that you're comfortable with the subjects. Additionally, you can ask questions in the Slack channel, but it's recommended to search the channel and the FAQ document first before reaching out for support. Lastly, you don't need a confirmation email after registering since registration is just to gauge 