In [4]:
!wget https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json

--2024-06-25 11:51:19--  https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json
Resolving github.com (github.com)... 140.82.121.3
Connecting to github.com (github.com)|140.82.121.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/alexeygrigorev/llm-rag-workshop/main/notebooks/documents.json [following]
--2024-06-25 11:51:19--  https://raw.githubusercontent.com/alexeygrigorev/llm-rag-workshop/main/notebooks/documents.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 658332 (643K) [text/plain]
Saving to: ‘documents.json’


2024-06-25 11:51:19 (90.3 MB/s) - ‘documents.json’ saved [658332/658332]



In [5]:
import json

In [6]:
# Decode the FAQ dump as UTF-8 explicitly instead of relying on the platform
# default encoding: the parsed text contains non-ASCII punctuation (curly
# quotes), which a non-UTF-8 default would garble or reject.
with open('./documents.json', 'rt', encoding='utf-8') as f_in:
    documents_all = json.load(f_in)

In [7]:
documents_all[0]['documents'][0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?'}

In [8]:
# Flatten the per-course groups into one list and tag every FAQ entry with the
# course it came from. Each entry is copied into a new dict so that
# `documents_all` is not modified and values displayed from it earlier stay
# accurate; re-running the cell always produces the same result.
documents = [
    {**doc, 'course': course['course']}
    for course in documents_all
    for doc in course['documents']
]

In [9]:
len(documents)

948

In [None]:
# SQL analogy for the course filter used in the search below: SELECT * FROM ... WHERE course = 'data-engineering-zoomcamp'

In [10]:
import elasticsearch

In [11]:
from elasticsearch import Elasticsearch

In [12]:
# Client for the Elasticsearch node expected at http://localhost:9200.
es_client = Elasticsearch(hosts=['http://localhost:9200'])

In [13]:
es_client.info()

ObjectApiResponse({'name': '4e4404153c04', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'kVrnIZ0KRluFteNHxw3sYA', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [14]:
index_name = "course-questions"


In [15]:
# Index layout: "text", "section" and "question" are full-text fields, while
# "course" is a keyword field so it can be matched exactly in a filter.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"}
        }
    }
}

# Drop any index left over from a previous run so this cell can be executed
# again (Restart & Run All) without failing because the index already exists.
# `ignore_unavailable=True` makes the delete a no-op on the very first run.
es_client.indices.delete(index=index_name, ignore_unavailable=True)

response = es_client.indices.create(index=index_name, body=index_settings)

response

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [16]:
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [17]:
# Send the documents to the index one request at a time; tqdm shows progress.
# NOTE(review): no explicit id is passed, so re-running this cell against an
# already populated index presumably stores a second copy of every document —
# confirm before re-executing.
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

100%|███████████████████████████████████████████| 948/948 [00:24<00:00, 39.22it/s]


In [18]:
query = "I just discovered the course. Can I still join?"

In [19]:
def retrieve(query, course="data-engineering-zoomcamp", size=5):
    """Fetch the FAQ entries that best match ``query`` from Elasticsearch.

    Args:
        query: Free-text question to search for.
        course: Value the ``course`` field must equal exactly. Defaults to
            "data-engineering-zoomcamp", the value that used to be hard-coded.
        size: Maximum number of hits to request. Defaults to 5.

    Returns:
        list[dict]: The ``_source`` dict of every hit, in the order the
        search response lists them.
    """
    search_query = {
        "size": size,
        "query": {
            "bool": {
                # Text relevance is computed over these three fields.
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                # Restrict the candidates to a single course.
                "filter": {
                    "term": {
                        "course": course
                    }
                }
            }
        }
    }
    response = es_client.search(index=index_name, body=search_query)

    return [hit['_source'] for hit in response['hits']['hits']]

In [20]:
from openai import OpenAI

In [22]:
# OpenAI-compatible client pointed at a server on localhost rather than the
# hosted API (port 11434 — presumably a local Ollama instance; confirm).
# The api_key is a placeholder string, not a real credential.
openai_client = OpenAI(
    api_key="doesn't matter",
    base_url='http://localhost:11434/v1/',
)

In [39]:
# Outer prompt: the student's question followed by the retrieved context.
prompt_template = """
You're a course teaching assistant. You need to answer a QUESTION from students based on
the provided CONTEXT.

QUESTION: {query}

CONTEXT:

{context}
""".strip()

# Rendering of a single retrieved FAQ entry inside the CONTEXT section.
context_template = """
section: {section}
question: {question}
answer: {text}
""".strip()


def build_prompt(query, context_documents):
    """Render the final LLM prompt for ``query``.

    Args:
        query: The student's question, inserted after "QUESTION:".
        context_documents: Iterable of dicts providing at least the
            ``section``, ``question`` and ``text`` keys; extra keys are ignored.

    Returns:
        str: ``prompt_template`` filled in with the question and with all
        entries rendered through ``context_template``, separated by a blank
        line and stripped of surrounding whitespace.
    """
    rendered_entries = (
        context_template.format(**entry) for entry in context_documents
    )
    context_block = '\n\n'.join(rendered_entries).strip()

    return prompt_template.format(query=query, context=context_block)

In [40]:
def llm(prompt, model="phi3"):
    """Send ``prompt`` as a single user message and return the reply text.

    Args:
        prompt: Full prompt text to submit.
        model: Name of the model to request from the server. Defaults to
            "phi3", the value that used to be hard-coded.

    Returns:
        The ``content`` of the first choice in the chat completion response.
    """
    oai_response = openai_client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )

    return oai_response.choices[0].message.content

In [41]:
def rag(query):
    """Answer ``query`` by chaining retrieval, prompt building and the LLM call.

    The documents returned by ``retrieve(query)`` are passed with the query to
    ``build_prompt``, and the resulting prompt is handed to ``llm``.

    Returns:
        Whatever ``llm`` returns for the constructed prompt.
    """
    return llm(build_prompt(query, retrieve(query)))

In [42]:
query = "I just found out about the course. Can I still join it?"

In [43]:
rag(query)

" While specific course start dates vary, generally, you cannot join a course after its official start date unless there are exceptions provided by the instructor or institution. However, based on the context provided:\n\n- You can still participate in submitting homework assignments even if you haven't registered officially before the start date. There will be deadlines for final projects though, so it's best to plan accordingly and not leave everything until the last moment.\n- The course materials will remain available after the course ends, allowing self-paced learning at any time thereafter. This means you can continue working on assignments or prepare for future cohorts even if you join late. \n- It's advisable to register as soon as possible to ensure a smooth experience and access to all resources provided by the course.\n\nFor specific guidance about joining after the start date, please consult your instructor directly."