In [1]:
import json
import os

import minsearch

# Search engine

In [2]:
# Load the raw FAQ dump. The encoding is pinned to UTF-8 because the file
# contains non-ASCII punctuation (curly quotes/apostrophes), and the default
# text encoding of open() depends on the platform locale.
with open('documents.json', 'rt', encoding='utf-8') as f:
    docs_raw = json.load(f)

In [3]:
# Flatten the per-course structure into one list of FAQ records, tagging
# every record (in place) with the course it belongs to.
documents = []
for course in docs_raw:
    course_name = course['course']
    for doc in course['documents']:
        doc['course'] = course_name
    documents.extend(course['documents'])

In [4]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [5]:
# In-memory search index: 'question', 'text' and 'section' are registered as
# text fields, 'course' as a keyword field (used for filtering below).
index = minsearch.Index(
    text_fields=['question', 'text', 'section'],
    keyword_fields=['course'],
)

In [6]:
# Sample user question; reused by the search and LLM cells further down.
q = 'The course has already started, can I still enroll?'
# Fit the index on the flattened FAQ records. Kept as the last expression, so
# the cell output is whatever fit() returns (the index object, per the output).
index.fit(documents)

<minsearch.Index at 0x7fc72d793340>

In [7]:
# Per-field boost weights passed to the search call.
boost = {'question': 3, 'section': 0.5}

# Query the index for the sample question, restricted to a single course.
results = index.search(
    query=q,
    filter_dict={'course': 'data-engineering-zoomcamp'},
    boost_dict=boost,
)

# OpenAI API

In [8]:
from openai import OpenAI

In [9]:
# SECURITY: the API key must never be committed with the notebook. It is read
# from the OPENAI_API_KEY environment variable instead. Any key that was
# previously pasted into this cell should be treated as leaked and revoked.
api_key = os.environ.get("OPENAI_API_KEY")
if not api_key:
    raise RuntimeError(
        "Set the OPENAI_API_KEY environment variable before running this cell."
    )
client = OpenAI(api_key=api_key)

In [10]:
# Baseline: send the bare question to the model with no retrieved context.
completion = client.chat.completions.create(
    messages=[{"role": "user", "content": q}],
    model="gpt-4o",
)

In [11]:
completion.choices[0].message.content

"Whether you can still enroll in a course after it has started depends on several factors, including the policies of the institution offering the course, the flexibility of the instructor, and how far along the course is. Here are some steps you can take to find out:\n\n1. **Check the Enrollment Deadline**: Look at the course description or the academic calendar to see if there is a specific cut-off date for enrollment.\n\n2. **Contact the Instructor**: Reach out to the course instructor directly. They may allow late enrollment, especially if you have a strong reason or background that would help you catch up quickly.\n\n3. **Speak with the Registrar or Academic Advisor**: The registrar’s office or your academic advisor can provide information on the institution's policies about late enrollment and guide you through the process if it's possible.\n\n4. **Review Course Materials**: Determine how much content you have missed and assess your ability to catch up. If it's feasible, make a pl

In [12]:
# Prompt skeleton with {question} and {context} placeholders. Written as
# adjacent literals so the exact whitespace of the final string is visible.
promp_template = (
    "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT. "
    "Use only the facts from the CONTEXT when answering the question. "
    "If the CONTEXT doesn't contain the answer, output NONE.\n"
    "\n"
    "QUESTION: {question}\n"
    "\n"
    "CONTEXT: {context}"
)

In [13]:
# Render retrieved FAQ entries into the text block placed under CONTEXT.
# NOTE(review): the loop exits after the first hit, so only the top result
# ends up in the context; build_prompt() below uses every hit. Confirm this
# is intended.
context = ""
for doc in results:
    context += (
        f"section: {doc['section']}\n"
        f"question: {doc['question']}\n"
        f"answer: {doc['text']}\n\n"
    )
    break

In [14]:
# Fill the template with the question and rendered context, then trim the
# trailing blank lines left by the last context entry.
prompt = promp_template.format(
    context=context,
    question=q,
).strip()

In [15]:
# Same model as the baseline call, but with the context-augmented prompt.
response = client.chat.completions.create(
    messages=[{"role": "user", "content": prompt}],
    model="gpt-4o",
)
# Show the text of the first (and only requested) choice.
response.choices[0].message.content

'Yes, even if the course has started, you are still eligible to enroll and submit the homeworks. Be aware of the deadlines for turning in the final projects.'

# Modularizing the code

In [16]:
def search(query, course='data-engineering-zoomcamp', num_results=5):
    """Search the module-level minsearch ``index`` for FAQ records.

    Args:
        query: Free-text question to search for.
        course: Value the ``course`` keyword field is filtered on. Defaults
            to the course that was previously hard-coded.
        num_results: Maximum number of records requested from the index.
            Defaults to the previously hard-coded 5.

    Returns:
        Whatever ``index.search`` returns for these arguments (the records
        are iterated as dicts by ``build_prompt``).
    """
    # Boost weights handed to the index for the 'question' and 'section' fields.
    boost = {'question': 3, 'section': 0.5}
    return index.search(
        query=query,
        boost_dict=boost,
        filter_dict={'course': course},
        num_results=num_results,
    )

In [17]:
def build_prompt(query, search_results):
    """Render the LLM prompt for ``query`` from retrieved FAQ records.

    Args:
        query: The user's question; substituted for ``{question}``.
        search_results: Iterable of dicts with ``section``, ``question`` and
            ``text`` keys; each one becomes an entry under CONTEXT.

    Returns:
        The filled-in prompt with surrounding whitespace stripped.
    """
    # Adjacent literals keep the exact whitespace of the template visible,
    # including the space before the newline that precedes QUESTION.
    template = (
        "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT. "
        "Use only the facts from the CONTEXT when answering the question. \n"
        "QUESTION: {question}\n"
        "\n"
        "CONTEXT: {context}"
    )
    entries = [
        f"section: {hit['section']}\nquestion: {hit['question']}\nanswer: {hit['text']}\n\n"
        for hit in search_results
    ]
    return template.format(question=query, context="".join(entries)).strip()

In [18]:
def llm_search(prompt):
    """Send ``prompt`` as a single user message to gpt-4o and return the reply text.

    Uses the module-level OpenAI ``client``; the content of the first choice
    in the response is returned.
    """
    chat = [{"role": "user", "content": prompt}]
    completion = client.chat.completions.create(model="gpt-4o", messages=chat)
    first_choice = completion.choices[0]
    return first_choice.message.content

In [19]:
def rag(query):
    """Answer ``query`` with the minsearch-backed RAG pipeline.

    Steps: retrieve FAQ records with ``search``, render them into a prompt
    with ``build_prompt``, and return the text produced by ``llm_search``.

    Args:
        query: The user's question.

    Returns:
        The answer string returned by ``llm_search``.
    """
    # The caller's query is used as-is (a leftover hard-coded test question
    # used to overwrite it here).
    search_results = search(query)
    prompt = build_prompt(query=query, search_results=search_results)
    # llm_search is the OpenAI helper defined above; the name `llm` is not a
    # prompt->text function in this notebook.
    answer = llm_search(prompt)
    return answer

# Comparing with ElasticSearch

In [20]:
from elasticsearch import Elasticsearch

In [21]:
# Client for the Elasticsearch node at http://localhost:9200.
# Assumes a local instance is already running there — TODO confirm setup steps.
es_client = Elasticsearch('http://localhost:9200')

In [22]:
# Index definition: one shard, no replicas; the three free-text fields are
# mapped as `text`, and `course` as `keyword` so it can be used in a term filter.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"}
        }
    }
}

index_name = 'course-questions'

# Make the cell re-runnable: creating an index that already exists raises an
# error, so drop any copy left over from a previous run first. This discards
# the old index contents; the indexing cell below repopulates it.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [23]:
from tqdm.auto import tqdm

In [24]:
# Send every flattened FAQ record to the Elasticsearch index, one request per
# record, with a tqdm progress bar.
# NOTE(review): no explicit id is passed, so re-running this cell presumably
# adds duplicate copies of each record — confirm before re-indexing.
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

  0%|          | 0/948 [00:00<?, ?it/s]

In [25]:
query = 'Can I join the course half-way through'

# Full-text clause: match the query against three fields, with the
# `question` field carrying a ^3 boost.
multi_match_clause = {
    "multi_match": {
        "query": query,
        "fields": ["question^3", "text", "section"],
        "type": "best_fields",
    }
}

# Term filter restricting hits to one course.
course_filter = {"term": {"course": "data-engineering-zoomcamp"}}

# Request body: top 5 hits satisfying both clauses.
search_query = {
    "size": 5,
    "query": {"bool": {"must": multi_match_clause, "filter": course_filter}},
}

In [26]:
# Run the query body built above against the index.
# NOTE(review): this rebinds `response`, which earlier held the OpenAI chat
# completion result.
response = es_client.search(index=index_name, body=search_query)

In [27]:
response

ObjectApiResponse({'took': 43, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 398, 'relation': 'eq'}, 'max_score': 52.235485, 'hits': [{'_index': 'course-questions', '_id': 'Ofvt15EBjgNIBg5Cr7F1', '_score': 52.235485, '_source': {'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.", 'section': 'General course-related questions', 'question': 'Course - Can I still join the course after the start date?', 'course': 'data-engineering-zoomcamp'}}, {'_index': 'course-questions', '_id': 'O_vt15EBjgNIBg5Cr7GO', '_score': 35.915157, '_source': {'text': 'You can start by installing and setting up all the dependencies and requirements:\nGoogle cloud account\nGoogle Cloud SDK\nPython 3 (installed with Anaconda)\nTerraform\nGit\nLook over the prerequisites and sylla

In [28]:
# Keep only the stored document of each hit, dropping the Elasticsearch
# metadata (_index, _id, _score) that wraps it.
result_docs = [hit['_source'] for hit in response['hits']['hits']]

In [29]:
def elastic_search(query, course='data-engineering-zoomcamp', num_results=3,
                   index_name='course-questions'):
    """Retrieve FAQ records for ``query`` from Elasticsearch.

    Args:
        query: Free-text question to match.
        course: Value for the ``course`` term filter. Defaults to the course
            that was previously hard-coded.
        num_results: Value sent as the request ``size``. Defaults to the
            previously hard-coded 3.
        index_name: Index to search. Defaults to the previously hard-coded
            ``'course-questions'``.

    Returns:
        A list with the ``_source`` dict of every hit, in response order.
    """
    search_query = {
        "size": num_results,
        "query": {
            "bool": {
                # Full-text match over three fields; `question` carries a ^3 boost.
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                # Term filter on the course keyword field.
                "filter": {
                    "term": {
                        "course": course
                    }
                }
            }
        }
    }
    response = es_client.search(index=index_name, body=search_query)
    # Strip the per-hit metadata and keep only the stored documents.
    return [hit['_source'] for hit in response['hits']['hits']]

In [30]:
result_docs

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'You can start by installing and setting up all the dependencies and requirements:\nGoogle cloud account\nGoogle Cloud SDK\nPython 3 (installed with Anaconda)\nTerraform\nGit\nLook over the prerequisites and syllabus to see if you are comfortable with these subjects.',
  'section': 'General course-related questions',
  'question': 'Course - What can I do before the course starts?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at

In [31]:
def rag_elastic(query):
    """Answer ``query`` using Elasticsearch retrieval and the OpenAI helper.

    Retrieves records with ``elastic_search``, renders them with
    ``build_prompt`` and returns the text produced by ``llm_search``.
    """
    retrieved = elastic_search(query)
    return llm_search(build_prompt(query=query, search_results=retrieved))

In [32]:
rag_elastic('How do I run Kafka?')

'To run Kafka, you need to follow the appropriate commands based on the context provided about Java Kafka. Specifically, if you want to run a producer, consumer, or KStreams application using Java in the terminal, you would navigate to your project directory and execute the following command:\n\n```java\njava -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java\n```\n\nReplace `<jar_name>` with the actual name of your compiled jar file.\n\nThis command runs the `JsonProducer.java` file, and similar commands can be used for other components by specifying the corresponding Java file.'

# Replacing the OpenAI API with local models (llama.cpp and Ollama)

In [33]:
from llama_cpp import Llama
import ollama

In [34]:
# Load the quantized Llama 3.1 8B Instruct GGUF file from the given repo.
# NOTE(review): -1 for n_gpu_layers / n_threads presumably means "use all /
# auto" — confirm against the llama-cpp-python docs. verbose=True is what
# produces the long loader log below.
llm = Llama.from_pretrained(
    repo_id="lmstudio-community/Meta-Llama-3.1-8B-Instruct-GGUF",
    filename="Meta-Llama-3.1-8B-Instruct-IQ4_XS.gguf",
    n_gpu_layers=-1,
    n_threads=-1,
    verbose=True,
)

llama_model_loader: loaded meta data with 33 key-value pairs and 292 tensors from /home/bach/.cache/huggingface/hub/models--lmstudio-community--Meta-Llama-3.1-8B-Instruct-GGUF/snapshots/8601e6db71269a2b12255ebdf09ab75becf22cc8/./Meta-Llama-3.1-8B-Instruct-IQ4_XS.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Meta Llama 3.1 8B Instruct
llama_model_loader: - kv   3:                           general.finetune str              = Instruct
llama_model_loader: - kv   4:                           general.basename str              = Meta-Llama-3.1
llama_model_loader: - kv   5:                         general.size_label str        

In [35]:
def llama(query):
    """Send ``query`` as a single user message to the loaded ``llm`` model.

    Returns the message content of the first choice in the completion dict.
    """
    chat = [{"role": "user", "content": query}]
    completion = llm.create_chat_completion(messages=chat)
    first_choice = completion['choices'][0]
    return first_choice['message']['content']


In [36]:
ollama.pull('phi3:mini')

{'status': 'success'}

In [45]:
def phi3(query):
    """Send ``query`` as a single user message to the ``phi3:mini`` Ollama model.

    Returns the content of the message in the chat response.
    """
    chat = [{'role': 'user', 'content': query}]
    reply = ollama.chat(model='phi3:mini', messages=chat)
    return reply['message']['content']



In [46]:
def rag_elastic_phi3(query):
    """Answer ``query`` using Elasticsearch retrieval and the phi3 helper.

    Same flow as ``rag_elastic``, but the prompt is answered by ``phi3``
    instead of the OpenAI-backed ``llm_search``.
    """
    retrieved = elastic_search(query)
    return phi3(build_prompt(query=query, search_results=retrieved))

In [47]:
# Smoke-test the Ollama model with a trivial question. The reply is stored
# in `response` but not displayed by this cell.
response = ollama.chat(
    model='phi3:mini',
    messages=[{'role': 'user', 'content': 'who are you'}],
)

In [48]:
rag_elastic_phi3('What is the aim of the course?')

'The aim of this course is not explicitly mentioned in your provided context, but it can be inferred that participants will learn data engineering skills through practical tasks such as submitting homework for grading on a points system based on the amount and type of questions answered correctly within FAQs or tutorials shared publicly. The prerequisites suggest an expected competency level in areas like cloud computing, Python programming using Anaconda environment with Terraform toolkits, Git usage along with understanding concepts related to data engineering such as handling datasets (Homeworks), frequently asked questions regarding the course content/tools (FAQs), and best practices shared through Learning in Public.'