In [11]:
import minsearch
import json

# Search engine

In [12]:
# Load the raw FAQ dump. The encoding is pinned to UTF-8 because the documents
# contain non-ASCII punctuation (curly quotes), which a platform-default codec
# can fail to decode or silently corrupt.
with open('documents.json', 'rt', encoding='utf-8') as f:
    docs_raw = json.load(f)

In [13]:
# Flatten the per-course groups into one list. Every document dict is tagged
# in place with the name of the course it belongs to.
documents = []
for course in docs_raw:
    for doc in course['documents']:
        doc.update(course=course['course'])
    documents.extend(course['documents'])

In [14]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [15]:
index = minsearch.Index(text_fields=['question', 'text', 'section'],
                keyword_fields=['course'])

In [16]:
q = 'The course has already started, can I still enroll?'
index.fit(documents)

<minsearch.Index at 0x7ff31005fdc0>

In [17]:
boost = {'question': 3, 'section':0.5}
results = index.search(query=q,
             boost_dict=boost,
             filter_dict={'course': 'data-engineering-zoomcamp'})

# OpenAI API

In [18]:
from openai import OpenAI

In [19]:
client = OpenAI()

In [20]:
completion = client.chat.completions.create(
    model="gpt-4o",
    messages=[
        {"role": "user", "content": q}
    ]
)

In [21]:
completion.choices[0].message.content

"It depends on the specific policies of the course and the institution offering it. Here are a few steps you can take:\n\n1. **Check the Course Website:** Sometimes the course description or FAQ section will mention deadlines for enrollment and whether late enrollment is possible.\n\n2. **Contact the Instructor:** Reach out to the course instructor directly. They often have the discretion to allow late enrollments.\n\n3. **Consult the Admissions Office:** If it’s a college or university course, the admissions office or the registrar can provide information on whether you can still join the course.\n\n4. **Consider Auditing:** Some courses allow you to audit, meaning you can attend and participate without receiving credit. This could be an option if formal enrollment is no longer possible.\n\n5. **Online Notification:** For online courses, platforms like Coursera, edX, or Udacity might offer rolling admissions, meaning you can start the course at any time, though you should check the sp

In [22]:
promp_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT. Use only the facts from the CONTEXT when answering the question. If the CONTEXT doesn't contain the answer, output NONE.

QUESTION: {question}

CONTEXT: {context}
""".strip()

In [23]:
# Build the CONTEXT from the first search result only: the loop exits after
# its first pass.
context = ""
for doc in results:
    context += (
        f"section: {doc['section']}\n"
        f"question: {doc['question']}\n"
        f"answer: {doc['text']}\n\n"
    )
    break

In [24]:
prompt = promp_template.format(question=q, context=context).strip()

In [25]:
response = client.chat.completions.create(
    model="gpt-4o",
    messages=[
        {"role": "user", "content": prompt}
    ]
)
response.choices[0].message.content

"Yes, even if you don't register, you're still eligible to submit the homeworks. Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute."

Modularizing the code:

In [26]:
def search(query, course='data-engineering-zoomcamp', num_results=5):
    """Query the in-memory minsearch index.

    Args:
        query: Free-text question to search for.
        course: Value the ``course`` keyword field must match. Defaults to
            the course this notebook was originally hard-wired to.
        num_results: Maximum number of documents to return.

    Returns:
        Whatever ``index.search`` returns for the query.
    """
    boost = {'question': 3, 'section': 0.5}
    return index.search(
        query=query,
        boost_dict=boost,
        filter_dict={'course': course},
        num_results=num_results,
    )

In [27]:
def build_prompt(query, search_results):
    """Render the teaching-assistant prompt for ``query`` from retrieved entries.

    Args:
        query: The user's question; substituted for ``{question}``.
        search_results: Iterable of dicts with ``section``, ``question`` and
            ``text`` keys. Each one becomes a "section / question / answer"
            entry of the CONTEXT, in iteration order.

    Returns:
        The formatted prompt with leading and trailing whitespace stripped.
    """
    # The instruction line deliberately keeps its trailing space before the newline.
    header = (
        "You're a course teaching assistant. "
        "Answer the QUESTION based on the CONTEXT. "
        "Use only the facts from the CONTEXT when answering the question. "
    )
    layout = header + "\nQUESTION: {question}\n\nCONTEXT: {context}"
    entries = [
        f"section: {hit['section']}\nquestion: {hit['question']}\nanswer: {hit['text']}\n\n"
        for hit in search_results
    ]
    return layout.format(question=query, context="".join(entries)).strip()

In [28]:
def llm_search(prompt):
    """Send ``prompt`` as a single user message to gpt-4o and return the reply text.

    Uses the module-level OpenAI ``client``; only the first choice is read.
    """
    user_message = {"role": "user", "content": prompt}
    completion = client.chat.completions.create(
        model="gpt-4o",
        messages=[user_message],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [29]:
def rag(query):
    """Answer ``query`` with minsearch retrieval followed by the OpenAI model.

    Args:
        query: The user's question. It is used both for retrieval and inside
            the prompt.

    Returns:
        The text returned by ``llm_search`` for the built prompt.
    """
    # Use the caller's query as given; it must not be replaced by a fixed string.
    search_results = search(query)
    prompt = build_prompt(query=query, search_results=search_results)
    # `llm_search` is the OpenAI helper defined in this notebook.
    return llm_search(prompt)

# Comparing with Elasticsearch

In [30]:
from elasticsearch import Elasticsearch

docker run -it \
    --rm \
    --name elasticsearch \
    -m 4GB \
    -p 9200:9200 \
    -p 9300:9300 \
    -e "discovery.type=single-node" \
    -e "xpack.security.enabled=false" \
    docker.elastic.co/elasticsearch/elasticsearch:8.4.3

In [31]:
es_client = Elasticsearch('http://localhost:9200')

In [33]:
# Single-node setup: one shard, no replicas. The free-text fields are analysed
# as `text`; `course` is a `keyword` so it can be used in exact-match filters.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"}
        }
    }
}

index_name = 'course-questions'

# Drop any index left over from a previous run so this cell can be re-executed.
# Creating an index that already exists raises an error, and a stale index
# would accumulate duplicate documents when the indexing loop runs again.
# NOTE: this deletes all data currently stored under `index_name`.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [34]:
from tqdm.auto import tqdm

In [35]:
# Index the documents one request at a time, with a tqdm progress bar.
# No explicit id is passed, so re-running this cell against an existing index
# presumably stores every document again under a new id — verify before re-running.
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

  0%|          | 0/948 [00:00<?, ?it/s]

In [36]:
query = 'Can I join the course half-way through'
search_query = {
    "size": 5,
    "query": {
        "bool": {
            "must": {
                "multi_match": {
                    "query": query,
                    "fields": ["question^3", "text", "section"],
                    "type": "best_fields"
                }
            },
            "filter": {
                "term": {
                    "course": "data-engineering-zoomcamp"
                }
            }
        }
    }
}

In [37]:
response = es_client.search(index=index_name, body=search_query)

In [38]:
response

ObjectApiResponse({'took': 43, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 398, 'relation': 'eq'}, 'max_score': 52.950016, 'hits': [{'_index': 'course-questions', '_id': '816r35EBdZm3Il4Asr8j', '_score': 52.950016, '_source': {'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.", 'section': 'General course-related questions', 'question': 'Course - Can I still join the course after the start date?', 'course': 'data-engineering-zoomcamp'}}, {'_index': 'course-questions', '_id': '9V6r35EBdZm3Il4Asr82', '_score': 36.514423, '_source': {'text': 'You can start by installing and setting up all the dependencies and requirements:\nGoogle cloud account\nGoogle Cloud SDK\nPython 3 (installed with Anaconda)\nTerraform\nGit\nLook over the prerequisites and sylla

In [39]:
# Keep only the stored document (`_source`) of each hit, in ranking order.
result_docs = [hit['_source'] for hit in response['hits']['hits']]

In [40]:
def elastic_search(query, course='data-engineering-zoomcamp', num_results=3,
                   index_name='course-questions'):
    """Retrieve FAQ documents for ``query`` from Elasticsearch.

    Args:
        query: Free-text question, matched against ``question`` (boosted 3x
            via ``question^3``), ``text`` and ``section``.
        course: Exact value the ``course`` keyword field must equal.
        num_results: Value sent as the request ``size``.
        index_name: Name of the index to search.

    Returns:
        A list with the ``_source`` dict of every hit, in the order
        Elasticsearch returned them.
    """
    search_query = {
        "size": num_results,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": course
                    }
                }
            }
        }
    }
    response = es_client.search(index=index_name, body=search_query)
    return [hit['_source'] for hit in response['hits']['hits']]

In [41]:
result_docs

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'You can start by installing and setting up all the dependencies and requirements:\nGoogle cloud account\nGoogle Cloud SDK\nPython 3 (installed with Anaconda)\nTerraform\nGit\nLook over the prerequisites and syllabus to see if you are comfortable with these subjects.',
  'section': 'General course-related questions',
  'question': 'Course - What can I do before the course starts?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at

In [42]:
def rag_elastic(query):
    """Answer ``query`` using Elasticsearch retrieval and the OpenAI model.

    Retrieves documents with ``elastic_search``, turns them into a prompt via
    ``build_prompt`` and returns the reply produced by ``llm_search``.
    """
    hits = elastic_search(query)
    return llm_search(build_prompt(query=query, search_results=hits))

In [43]:
rag_elastic('How do I run Kafka?')

'To run Kafka, specifically for running producer, consumer, kstreams, etc., in the terminal using Java, follow these steps:\n\n1. Navigate to the project directory.\n2. Run the following command in the terminal:\n   ```\n   java -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java\n   ```\n   \nMake sure to replace `<jar_name>` with the appropriate name of your JAR file.'

# Replacing the OpenAI API with a local LLM (llama.cpp / Ollama)

In [44]:
from llama_cpp import Llama
import ollama

In [45]:
# llm = Llama.from_pretrained(
# 	repo_id="lmstudio-community/Meta-Llama-3.1-8B-Instruct-GGUF",
# 	filename="Meta-Llama-3.1-8B-Instruct-IQ4_XS.gguf",
#     n_gpu_layers=-1,
#     n_threads=-1,
#     verbose=True
# )

llama_model_loader: loaded meta data with 33 key-value pairs and 292 tensors from /home/bach/.cache/huggingface/hub/models--lmstudio-community--Meta-Llama-3.1-8B-Instruct-GGUF/snapshots/8601e6db71269a2b12255ebdf09ab75becf22cc8/./Meta-Llama-3.1-8B-Instruct-IQ4_XS.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Meta Llama 3.1 8B Instruct
llama_model_loader: - kv   3:                           general.finetune str              = Instruct
llama_model_loader: - kv   4:                           general.basename str              = Meta-Llama-3.1
llama_model_loader: - kv   5:                         general.size_label str        

In [35]:
# def llama(query):
#     response = llm.create_chat_completion(
#         messages=[
#             {"role": "user", "content": query}
#         ])
#     return response['choices'][0]['message']['content']


In [47]:
ollama.pull('phi3:mini')

{'status': 'success'}

In [48]:
def phi3(query):
    """Send ``query`` as a single user message to the ``phi3:mini`` Ollama model.

    Returns:
        The ``content`` of the ``message`` in the chat response.
    """
    messages = [{'role': 'user', 'content': query}]
    reply = ollama.chat(model='phi3:mini', messages=messages)
    return reply['message']['content']



In [49]:
def rag_elastic_phi3(query):
    """Answer ``query`` using Elasticsearch retrieval and the ``phi3`` helper.

    Retrieves documents with ``elastic_search``, turns them into a prompt via
    ``build_prompt`` and returns whatever ``phi3`` replies.
    """
    hits = elastic_search(query)
    return phi3(build_prompt(query=query, search_results=hits))

In [51]:
rag_elastic_phi3('What is the aim of the course?')

'The aim of the course is not explicitly mentioned in the provided context. However, based on the details given about preparations required before starting such as setting up Google Cloud account, installing dependencies like Anaconda Python3, Terraform and Git knowledge from prerequisites reviewing, it can be inferred that the purpose of this DataTalksClub data-engineering course is to teach students how to manage their work on GitHub through hands-on experience with coding assignments (Homeworks), FAQs contributions for peer learning, open-source projects (Learning in Public) and possibly leveraging cloud technologies. This suggests a comprehensive approach towards building practical data engineering skills combined with community engagement activities around the same platform where they are being learned - GitHub DataTalksClub.'

# Streamlit UI for our RAG

In [53]:
!conda install streamlit -y

Channels:
 - defaults
 - conda-forge
Platform: linux-64
Collecting package metadata (repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /home/bach/anaconda3/envs/llm_engineering

  added / updated specs:
    - streamlit


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    altair-5.0.1               |  py310h06a4308_0         550 KB
    arrow-cpp-16.1.0           |       hc1eb8f0_0        12.1 MB
    blinker-1.6.2              |  py310h06a4308_0          28 KB
    cachetools-5.3.3           |  py310h06a4308_0          23 KB
    gitpython-3.1.43           |  py310h06a4308_0         285 KB
    libabseil-20240116.2       | cxx17_h6a678d5_0         1.3 MB
    libcurl-8.9.1              |       h251f7ec_0         439 KB
    libgrpc-1.62.2             |       h2d74bed_0         8.0 MB
    libprotobuf-4.25.3         |       he621ea3_0         2.8 MB
    orc-