In [39]:
from openai import OpenAI
from dotenv import load_dotenv,find_dotenv
import instructor
import os
import min_search
import json
from pydantic import BaseModel, Field

with open('documents.json', 'r') as file:
    documents = json.load(file)
    
load_dotenv(find_dotenv())

client = instructor.patch(OpenAI(), mode=instructor.Mode.TOOLS)

In [20]:
documents[2]['documents']

[{'text': 'MLOps Zoomcamp FAQ\nThe purpose of this document is to capture frequently asked technical questions.\nWe did this for our data engineering course, and it worked quite well. Check this document for inspiration on how to structure your questions and answers:\nData Engineering Zoomcamp FAQ\n[Problem description]\n[Solution description]\n(optional) Added by Name',
  'section': '+-General course questions',
  'question': 'Format for questions: [Problem title]'},
 {'text': 'Approximately 3 months. For each module, about 1 week with possible deadline extensions (in total 6~9 weeks), 2 weeks for working on the capstone project and 1 week for peer review.',
  'section': '+-General course questions',
  'question': 'What is the expected duration of this course or that for each module?'},
 {'text': 'The difference is the Orchestration and Monitoring modules. Those videos will be re-recorded. The rest should mostly be the same.\nAlso all of the homeworks will be changed for the 2023 coho

In [22]:
docs = [dict(doc_dict, course=doc_dicts['course']) for doc_dicts in documents for doc_dict in doc_dicts['documents']]

In [75]:
index = min_search.Index(

    text_fields= ['question', 'text','section'],
    keyword_fields = ['course']
)

In [76]:
index.fit(docs)

<min_search.Index at 0x7fb8df4fd1e0>

In [77]:
boost = { 
    'question': 3.0, 
    'section': 0.5
    }

index.search(
    filter_dict={'course':'data-engineering-zoomcamp'},
    query = 'Can I still join the course after the start date?',
    boost_dict=boost,
    num_results = 3
)

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
  'section': 'General course-related questions',
  'question': 'Course - When will the course start?',
  'course': 'data-engineerin

In [36]:
response = client.chat.completions.create(
    model = 'gpt-4o',
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "The course has already started?"}
    ]
)

In [37]:
response.choices[0].message.content

'To help you better, could you please specify which course you are referring to? If you mention the name of the course or the institution offering it, I can provide more accurate information.'

In [43]:
class ResponseModel(BaseModel):
    question: str = Field(..., description='The question to answer')
    answer: str = Field(..., description='The answer to the question')
    section: str = Field(..., description='The information information you took to answer the question')
    confidence: str = Field(..., description='The confidence of the answer in %')


In [41]:
response = client.chat.completions.create(
    model = 'gpt-4o',
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "The course has already started?"}
    ], 
    response_model = ResponseModel
)

In [60]:
prompt_template = """
Role: You are an AI course assistant programmed to answer questions based solely on provided reference material.

Input:
Question: {question}
Reference Material: {context}

Instructions:
1. Analyze the question and reference material thoroughly.
2. Extract only relevant facts from the reference material that directly address the question.
3. Construct a clear, concise answer using these facts.
4. Identify the specific section or part of the reference material used for the answer.
5. Assess your confidence in the answer on a scale of 0 to 100.
6. If confidence is below 60 or critical information is missing, set answer to "Insufficient information to provide a confident response."
7. Double-check that your response uses NO external knowledge or assumptions.

Output your response in this exact JSON format:
{{
    "question": "{question}",
    "answer": "Your response based strictly on the reference material",
    "section": "Specific section or part of the reference material used",
    "confidence": Numerical value between 0 and 100 based on the information provided on the material, if the question is not possible to be infered from the material the confidence should be 0 other wise increase given the information provided
}}

Validation:
- Ensure all JSON fields are present and correctly formatted.
- Verify that the confidence value is a number between 0 and 100.
- Confirm that the answer field contains no external information.

"""

In [66]:
def get_course_response( question: str, *, index: min_search.Index = index, course: str = 'data-engineering-zoomcamp', boost: dict = boost) -> ResponseModel:
    context = ""

    for doc in index.search(filter_dict={'course': course}, query=question, boost_dict=boost):
        context += f"Section: {doc['section']}\nQuestion: {doc['question']}\nAnswer: {doc['text']}\n"
    response = client.chat.completions.create(
        model='gpt-4o',
        messages=[
            {"role": "system", "content": "You are a helpful assistant that answers questions about courses"},
            {"role": "user", "content": prompt_template.format(
                question=question,
                context=context
            )
                }
        ],
        response_model=ResponseModel
    )
    return response

# Example usage:
# response = get_course_response(client, index, "Can I still join the course after the start date?", "data-engineering-zoomcamp", boost)


In [67]:
response = get_course_response("Are you a pussy?")
response

ResponseModel(question='Are you a pussy?', answer='Insufficient information to provide a confident response.', section='General course-related questions', confidence='0')

In [72]:
class ResponseModel(BaseModel):
    question: str = Field(..., description='The question to answer')
    answer: str = Field(..., description='The answer to the question')
    section: str = Field(..., description='The information information you took to answer the question')
    confidence: str|int = Field(..., description='The confidence of the answer in %')


def search( question: str, *,
            index: min_search.Index = index, 
            course: str = 'data-engineering-zoomcamp', 
            boost: dict = { 
                'question': 3.0, 
                'section': 0.5
    }
    
    ) -> ResponseModel:
    index.search(
    filter_dict={'course':course},
    query = question,
    boost_dict=boost,
    num_results = 3
    )

    context = ""

    for doc in index.search(filter_dict={'course': course}, query=question, boost_dict=boost):
        context += f"Section: {doc['section']}\nQuestion: {doc['question']}\nAnswer: {doc['text']}\n"
    response = client.chat.completions.create(
        model='gpt-4o',
        messages=[
            {"role": "system", "content": "You are a helpful assistant that answers questions about courses"},
            {"role": "user", "content": prompt_template.format(
                question=question,
                context=context
            )
                }
        ],
        response_model=ResponseModel
    )
    return response

In [73]:
search("How do I run kafka?")

ResponseModel(question='How do I run kafka?', answer='Insufficient information to provide a confident response.', section='None', confidence=0)

In [78]:
index.search("How do I run kafka?")

[{'text': "Solution from Alexey: create a virtual environment and run requirements.txt and the python files in that environment.\nTo create a virtual env and install packages (run only once)\npython -m venv env\nsource env/bin/activate\npip install -r ../requirements.txt\nTo activate it (you'll need to run it every time you need the virtual env):\nsource env/bin/activate\nTo deactivate it:\ndeactivate\nThis works on MacOS, Linux and Windows - but for Windows the path is slightly different (it's env/Scripts/activate)\nAlso the virtual environment should be created only to run the python file. Docker images should first all be up and running.",
  'section': 'Module 6: streaming with kafka',
  'question': 'Module “kafka” not found when trying to run producer.py',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'If you get an error while running the command python3 stream.py worker\nRun pip uninstall kafka-python\nThen run pip install kafka-python==1.4.6\nWhat is the use of  Redpanda ?

In [79]:
# Creating a total modular approach
def search_results(docs:list[dict],*,
                   text_fields:list[str]=['question', 'text','section'],
                   keyword_fields:list[str]=['course'],
                   **kwargs):
    index = min_search.Index(

    text_fields= text_fields,
    keyword_fields = keyword_fields
)
    index.fit(docs)
    def search(query: str):
        return index.search(query,**kwargs)
    return search


boost = { 
    'question': 3.0, 
    'section': 0.5
    }

index = min_search.Index(

    text_fields= ['question', 'text','section'],
    keyword_fields = ['course']
)
index.fit(docs)

boost = { 
    'question': 3.0, 
    'section': 0.5
    }

index.search(
    filter_dict={'course':'data-engineering-zoomcamp'},
    query = 'Can I still join the course after the start date?',
    boost_dict=boost,
    num_results = 3
)


In [85]:

s = search_results(docs,boost_dict=boost,filter_dict={'course':'data-engineering-zoomcamp'},num_results=3, text_fields=['question', 'text','section'],keyword_fields=['course'])



In [None]:
index = min_search.Index(

    text_fields= ['question', 'text','section'],
    keyword_fields = ['course']
)
index.fit(docs)

boost = { 
    'question': 3.0, 
    'section': 0.5
    }

index.search(
    filter_dict={'course':'data-engineering-zoomcamp'},
    query = 'Can I still join the course after the start date?',
    boost_dict=boost,
    num_results = 3
)

In [86]:
s('Can i kill a dog?')

[{'text': 'Star the repo! Share it with friends if you find it useful ❣️\nCreate a PR if you see you can improve the text or the structure of the repository.',
  'section': 'General course-related questions',
  'question': 'How can we contribute to the course?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'Create a new branch for development, then you can merge it to the main branch\nCreate a new branch and switch to this branch. It allows you to make changes. Then you can commit and push the changes to the “main” branch.',
  'section': 'Module 4: analytics engineering with dbt',
  'question': "Dbt+git - It appears that I can't edit the files because I'm in read-only mode. Does anyone know how I can change that?",
  'course': 'data-engineering-zoomcamp'},
 {'text': 'When a CSV file is compressed using Gzip, it is saved with a ".csv.gz" file extension. This file type is also known as a Gzip compressed CSV file. When you want to read a Gzip compressed CSV file using Pandas, you c

In [89]:
class ResponseModel(BaseModel):
    question: str = Field(..., description='The question to answer')
    answer: str = Field(..., description='The answer to the question')
    section: str = Field(..., description='The information information you took to answer the question')
    confidence: str|int = Field(..., description='The confidence of the answer in %')

def build_prompt(question:str,context:str) -> str:
    prompt_template = """
Role: You are an AI course assistant programmed to answer questions based solely on provided reference material.

Input:
Question: {question}
Reference Material: {context}

Instructions:
1. Analyze the question and reference material thoroughly.
2. Extract only relevant facts from the reference material that directly address the question.
3. Construct a clear, concise answer using these facts.
4. Identify the specific section or part of the reference material used for the answer.
5. Assess your confidence in the answer on a scale of 0 to 100.
6. If confidence is below 60 or critical information is missing, set answer to "Insufficient information to provide a confident response."
7. Double-check that your response uses NO external knowledge or assumptions.

Output your response in this exact JSON format:
{{
    "question": "{question}",
    "answer": "Your response based strictly on the reference material",
    "section": "Specific section or part of the reference material used",
    "confidence": Numerical value between 0 and 100 based on the information provided on the material, if the question is not possible to be infered from the material the confidence should be 0 other wise increase given the information provided
}}

Validation:
- Ensure all JSON fields are present and correctly formatted.
- Verify that the confidence value is a number between 0 and 100.
- Confirm that the answer field contains no external information.

"""
    return prompt_template.format(
                question=question,
                context=context
            )


def llm_answer(prompt:str) -> dict[str,str]:
    response = client.chat.completions.create(
        model='gpt-4o',
        messages=[
            {"role": "system", "content": "You are a helpful assistant that answers questions about courses"},
            {"role": "user", "content": prompt}
        ],
        response_model=ResponseModel
    )
    return response.model_dump()

def search( question: str, *,
            index: min_search.Index = index, 
            course: str = 'data-engineering-zoomcamp', 
            boost: dict = { 
                'question': 3.0, 
                'section': 0.5
    }
    
    ) -> ResponseModel:
    index.search(
    filter_dict={'course':course},
    query = question,
    boost_dict=boost,
    num_results = 3
    )

    context = ""

    for doc in index.search(filter_dict={'course': course}, query=question, boost_dict=boost):
        context += f"Section: {doc['section']}\nQuestion: {doc['question']}\nAnswer: {doc['text']}\n"
    response = client.chat.completions.create(
        model='gpt-4o',
        messages=[
            {"role": "system", "content": "You are a helpful assistant that answers questions about courses"},
            {"role": "user", "content": prompt_template.format(
                question=question,
                context=context
            )
                }
        ],
        response_model=ResponseModel
    )
    return response

In [90]:
search = search_results(docs,boost_dict=boost,filter_dict={'course':'data-engineering-zoomcamp'},num_results=3, text_fields=['question', 'text','section'],keyword_fields=['course'])
def rag(query:str)-> dict:
    s = search(query)
    prompt = build_prompt(query,s)
    return llm_answer(prompt)
    


In [91]:
rag("Can I still join the course after the start date?")

{'question': 'Can I still join the course after the start date?',
 'answer': "Yes, even if you don't register, you're still eligible to submit the homeworks.",
 'section': 'General course-related questions',
 'confidence': 100}

In [93]:
from elasticsearch import Elasticsearch

In [117]:
es_client = Elasticsearch('http://localhost:9200')

In [118]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} 
        }
    }
}

index_name = "course-questions"

es_client.indices.create(index=index_name, body=index_settings)

BadRequestError: BadRequestError(400, 'resource_already_exists_exception', 'index [course-questions/fhVdWuq4TGqznnoqfgjfZA] already exists')

In [97]:
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [130]:
pd.DataFrame(documents).sample(n=10).head()

Unnamed: 0,text,section,question,course
282,Seed files loaded from directory with name ‘se...,Module 4: analytics engineering with dbt,When loading github repo raise exception that ...,data-engineering-zoomcamp
531,Instead use the method “.get_feature_names_out...,3. Machine Learning for Classification,FutureWarning: Function get_feature_names is d...,machine-learning-zoomcamp
790,Problem description: I have one column day_of_...,Miscellaneous,Getting day of the year from day and month column,machine-learning-zoomcamp
360,"Use both repartition and coalesce, like so:\nd...",Module 5: pyspark,Repartition the Dataframe to 6 partitions usin...,data-engineering-zoomcamp
231,Answer: The 2022 NYC taxi data parquet files a...,error: Error while reading table: trips_data_a...,"Question: for homework 3 , we need all 12 parq...",data-engineering-zoomcamp


In [132]:
pd.DataFrame(docs).head()

Unnamed: 0,text,section,question,course
0,The purpose of this document is to capture fre...,General course-related questions,Course - When will the course start?,data-engineering-zoomcamp
1,GitHub - DataTalksClub data-engineering-zoomca...,General course-related questions,Course - What are the prerequisites for this c...,data-engineering-zoomcamp
2,"Yes, even if you don't register, you're still ...",General course-related questions,Course - Can I still join the course after the...,data-engineering-zoomcamp
3,You don't need it. You're accepted. You can al...,General course-related questions,Course - I have registered for the Data Engine...,data-engineering-zoomcamp
4,You can start by installing and setting up all...,General course-related questions,Course - What can I do before the course starts?,data-engineering-zoomcamp


In [None]:
for doc in tqdm(documents):
    es_client.index(index=index_name,document=documents )

In [119]:
query = 'I just discovered the course. Can I still join?'

In [183]:
def elastic_search(query:str):
    search_query = {
        "size": 3,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^4", "text"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": "machine-learning-zoomcamp"
                    }
                }
            }
        }
    }
    response = es_client.search(index=index_name,body=search_query)
    return response['hits']['hits']



In [193]:
query = "How do I execute a command in a running docker container?"

In [194]:
abc = elastic_search(query)

In [195]:
abc

[{'_index': 'course-questions',
  '_id': 'YgnVbpABYTodIUsMPGRb',
  '_score': 84.67765,
  '_source': {'text': 'Launch the container image in interactive mode and overriding the entrypoint, so that it starts a bash command.\ndocker run -it --entrypoint bash <image>\nIf the container is already running, execute a command in the specific container:\ndocker ps (find the container-id)\ndocker exec -it <container-id> bash\n(Marcos MJD)',
   'section': '5. Deploying Machine Learning Models',
   'question': 'How do I debug a docker container?',
   'course': 'machine-learning-zoomcamp'}},
 {'_index': 'course-questions',
  '_id': 'gQnVbpABYTodIUsMPWQv',
  '_score': 51.6054,
  '_source': {'text': "You can copy files from your local machine into a Docker container using the docker cp command. Here's how to do it:\nTo copy a file or directory from your local machine into a running Docker container, you can use the `docker cp command`. The basic syntax is as follows:\ndocker cp /path/to/local/file_or

In [196]:
context = ""

for retrieval in abc:
    source = retrieval['_source']
    context_template = """
Q: {question}
A: {text}
""".strip()
    context += context_template.format(question=source['question'], text=source['text']).strip()




In [202]:
prompt = prompt_template.format(question=query, context=context).strip()
len(prompt)


1458

In [201]:
prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT:
{context}
""".strip()

In [203]:
encoding = tiktoken.encoding_for_model("gpt-4o")

NameError: name 'tiktoken' is not defined

In [109]:
def rag(query):
    s = elastic_search(query)
    prompt = build_prompt(query, s)
    return llm_answer(prompt)



In [110]:
rag(query)

{'question': 'I just discovered the course. Can I still join?',
 'answer': "Yes, even if you don't register, you're still eligible to submit the homeworks. Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
 'section': 'General course-related questions',
 'confidence': 100}