In [1]:
# -nc (no-clobber): skip the download when minsearch.py already exists, so
# re-running this cell does not leave a stray minsearch.py.1 next to it.
!wget -nc https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/01-intro/minsearch.py

--2024-06-17 13:09:09--  https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/01-intro/minsearch.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3832 (3.7K) [text/plain]
Saving to: ‘minsearch.py’


2024-06-17 13:09:10 (12.8 MB/s) - ‘minsearch.py’ saved [3832/3832]



In [2]:
!ls -l

total 12
-rw-rw-rw- 1 codespace codespace 3005 Jun 17 12:24 homework.ipynb
-rw-rw-rw- 1 codespace codespace 3832 Jun 17 13:09 minsearch.py
-rw-rw-rw- 1 codespace codespace 1695 Jun 17 13:09 rag-intro.ipynb


In [4]:
# -nc (no-clobber): skip the download when documents.json already exists, so
# re-running this cell does not leave a stray documents.json.1 next to it.
!wget -nc https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/01-intro/documents.json

--2024-06-17 13:22:06--  https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/01-intro/documents.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.110.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 658332 (643K) [text/plain]
Saving to: ‘documents.json’


2024-06-17 13:22:06 (3.20 MB/s) - ‘documents.json’ saved [658332/658332]



In [5]:
!tree

[34;42m.[00m
├── [34;42m__pycache__[00m
│   └── minsearch.cpython-310.pyc
├── documents.json
├── homework.ipynb
├── minsearch.py
└── rag-intro.ipynb

1 directory, 5 files


In [1]:
import os
from dotenv import load_dotenv
import minsearch
import json
from openai import OpenAI
from elasticsearch import Elasticsearch
from tqdm.auto import tqdm

load_dotenv()


  from .autonotebook import tqdm as notebook_tqdm


True

In [2]:
# documents.json contains non-ASCII punctuation (curly quotes), so decode it
# explicitly as UTF-8 instead of relying on the platform's default encoding.
with open("documents.json", "rt", encoding="utf-8") as f_in:
    docs_raw = json.load(f_in)

In [3]:
# Flatten the per-course FAQ lists into a single list of documents, tagging
# each document with the course it belongs to. Each entry is a new dict, so
# the dicts inside docs_raw stay exactly as loaded and the cell is safe to
# re-run.
documents = [
    {**doc, "course": course_dict["course"]}
    for course_dict in docs_raw
    for doc in course_dict["documents"]
]

In [4]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [5]:
# In-memory minsearch index over the FAQ documents. "question", "text" and
# "section" are registered as text fields; "course" is registered as a keyword
# field, which search() later uses in its filter_dict.
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]
)

In [6]:
index.fit(documents)

<minsearch.Index at 0x749c883067a0>

In [7]:
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

In [8]:
client = OpenAI(api_key=OPENAI_API_KEY)

In [9]:
def search(query, course="data-engineering-zoomcamp", num_results=5):
    """
    Search the minsearch index for documents matching a query within one course.

    Args:
        query (str): The search query.
        course (str): Value the "course" keyword field must equal. Defaults to
            "data-engineering-zoomcamp", the value that used to be hard-coded.
        num_results (int): Maximum number of results to request. Defaults to 5.

    Returns:
        list: A list of search results, each being a dictionary containing
              relevant document fields.
    """

    # Matches in the question count more than matches in the section title;
    # "text" is not listed and so keeps whatever default weight minsearch uses.
    boost = {"question": 3.0, "section": 0.5}

    return index.search(
        query=query,
        filter_dict={"course": course},
        boost_dict=boost,
        num_results=num_results,
    )

In [10]:
def build_prompt(query, search_results):
    """
    Build a prompt for the language model based on the search query and search results.

    The template is assembled from explicit lines so that the function's own
    source indentation does not end up inside the prompt: every instruction
    line and every context line starts at column 0.

    Args:
        query (str): The search query.
        search_results (list): A list of dictionaries containing the search results.
            Each one must provide the keys "section", "question" and "text".

    Returns:
        str: A formatted prompt string for the language model, with leading and
             trailing whitespace stripped.

    Raises:
        KeyError: If a search result lacks one of the required keys.
    """

    prompt_template = (
        "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.\n"
        "Use only the facts from the CONTEXT when answering the QUESTION.\n"
        "If the CONTEXT doesn't contain the answer, output NONE.\n"
        "\n"
        "QUESTION: {question}\n"
        "\n"
        "CONTEXT:\n"
        "{context}"
    )

    # One "section / question / answer" entry per retrieved document, each
    # followed by a blank line; join avoids repeated string concatenation.
    context = "".join(
        f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
        for doc in search_results
    )

    # str.format only parses the template, so braces inside the query or the
    # documents are inserted verbatim. The final strip drops the trailing
    # blank line left by the last context entry.
    return prompt_template.format(question=query, context=context).strip()

In [11]:
def llm(prompt, model="gpt-4o"):
    """
    Generate a response from the language model based on the provided prompt.

    Args:
        prompt (str): The prompt string to be sent to the language model.
        model (str): Name of the chat model to request. Defaults to "gpt-4o",
            the value that used to be hard-coded.

    Returns:
        str: The response generated by the language model.
    """

    # The whole prompt is sent as a single user message.
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )

    # Only the first returned choice is used.
    return response.choices[0].message.content
    

In [12]:
def rag(query):
    """
    Answer a query with Retrieval-Augmented Generation (RAG).

    Retrieves documents with search(), turns them into a prompt with
    build_prompt(), and sends that prompt to llm().

    Args:
        query (str): The query to be answered.

    Returns:
        str: The answer generated by the language model.
    """

    retrieved_docs = search(query)
    return llm(build_prompt(query, retrieved_docs))
    

In [13]:
rag("The course has started, can I still join?")

"Yes, even if the course has started, you can still join and submit the homeworks. Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute."

#### Running Elasticsearch:

```bash
docker run -it \
    --rm \
    --name elasticsearch \
    -p 9200:9200 \
    -p 9300:9300 \
    -e "discovery.type=single-node" \
    -e "xpack.security.enabled=false" \
    docker.elastic.co/elasticsearch/elasticsearch:8.4.3
```

In [14]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [15]:
es_client = Elasticsearch("http://localhost:9200")

In [16]:
es_client.info()

ObjectApiResponse({'name': '070402397dd9', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'KLR0QMTlTsqp3EfL7bVUMQ', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [17]:
# Single-node development setup: one shard, no replicas. The three free-text
# fields are mapped as "text"; "course" is mapped as "keyword" because it is
# only ever matched exactly (see the term filter in elastic_search).
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"}
        }
    }
}

index_name = "course-questions"

# Drop any index left over from a previous run: creating an index that already
# exists raises an error, which would break re-running this cell.
# ignore_unavailable makes the delete a no-op on the first run.
es_client.indices.delete(index=index_name, ignore_unavailable=True)

es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [18]:
# Use each document's position in `documents` as its _id. Without an explicit
# id a new one is generated on every call, so re-running this cell would index
# a second copy of every document; with a fixed id a re-run overwrites instead.
for doc_id, doc in enumerate(tqdm(documents)):
    es_client.index(index=index_name, id=str(doc_id), document=doc)

100%|██████████| 948/948 [00:37<00:00, 25.26it/s]


In [19]:
query = "I just discovered the course, can I still join it?"

In [20]:
def elastic_search(query, course="data-engineering-zoomcamp", num_results=5):
    """
    Perform a search on the Elasticsearch index with a specified query.

    Builds a bool query: a "best_fields" multi_match over the "question",
    "text" and "section" fields (with "question" boosted by a factor of 3)
    that must match, plus a term filter restricting results to one course.

    Args:
        query (str): The search query string to match against the index.
        course (str): Value the "course" field must equal. Defaults to
            "data-engineering-zoomcamp", the value that used to be hard-coded.
        num_results (int): Maximum number of hits to request. Defaults to 5.

    Returns:
        list: A list of documents (dicts) that match the search query. Each document is
              the "_source" of one search hit.
    """

    search_query = {
        "size": num_results,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": course
                    }
                }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)

    # Keep only the stored document of each hit; scores and ids are dropped.
    return [hit["_source"] for hit in response["hits"]["hits"]]
    

In [22]:
def rag(query, search_fn=None):
    """
    Perform a Retrieval-Augmented Generation (RAG) process to answer a query.

    This definition replaces the earlier minsearch-backed rag() in this
    notebook. The retrieval step is pluggable so a single definition can serve
    both backends, e.g. rag(query, search_fn=search).

    Args:
        query (str): The query to be answered.
        search_fn (callable, optional): Function that takes the query and
            returns the list of documents used as context. Defaults to
            elastic_search.

    Returns:
        str: The answer generated by the language model.
    """

    # Resolve the default at call time rather than in the signature, so a
    # re-defined elastic_search is picked up without re-running this cell.
    if search_fn is None:
        search_fn = elastic_search

    search_results = search_fn(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)

    return answer
    

In [25]:
rag(query)

"Yes, even if you don't register, you're still eligible to submit the homeworks. Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute."