In [7]:
import sys
import os
import json
from openai import OpenAI

In [2]:
# Add the parent directory to the module search path so that modules stored one
# level up can be imported (presumably minsearch lives there - TODO confirm).
sys.path.append('..')

In [3]:
# Build the OpenAI client. The key is read from the environment rather than
# hard-coded, so this raises KeyError when OPENAI_API_KEY is not set.
client = OpenAI(api_key = os.environ['OPENAI_API_KEY'])

In [4]:
def get_response(client, model, system_context, assistant_context, user_context):
    """Send one chat-completion request and return the text of the first choice.

    Args:
        client: Object exposing ``chat.completions.create(model=..., messages=...)``.
        model: Model name forwarded unchanged to the API.
        system_context: Content of the system message; omitted when empty or None.
        assistant_context: Content of the assistant message; omitted when empty or None.
        user_context: Content of the user message; always sent.

    Returns:
        ``response.choices[0].message.content`` of the completion.
    """
    # Each role needs its own dict. Repeating the "role"/"content" keys inside a
    # single dict literal keeps only the last pair, which would silently drop the
    # system and assistant messages.
    messages = []
    if system_context:
        messages.append({"role": "system", "content": system_context})
    if assistant_context:
        messages.append({"role": "assistant", "content": assistant_context})
    messages.append({"role": "user", "content": user_context})

    response = client.chat.completions.create(
        model=model,
        messages=messages,
    )

    return response.choices[0].message.content

In [5]:
# Quick sanity check of the helper: a general data-science question with no
# assistant context.
get_response(
    client,
    model="gpt-3.5-turbo",
    system_context="You 're an agent expert in data science.",
    assistant_context="",
    user_context="What is overfitting?",
)

'Overfitting is a term used in machine learning to describe a model that performs well on training data but poorly on new, unseen data. This occurs when a model is too complex and learns the noise and fluctuations in the training data, rather than the underlying patterns or relationships. As a result, the model fails to generalize well to new data and makes incorrect predictions. Overfitting can be mitigated by techniques such as cross-validation, regularization, and early stopping.'

In [6]:
import minsearch

In [14]:
# Preprocessing: flatten the per-course FAQ file into a single list of documents.
# JSON text is UTF-8, so decode it explicitly instead of relying on the platform's
# locale-dependent default encoding.
with open('../documents.json', 'rt', encoding='utf-8') as f_in:
    raw_documents = json.load(f_in)

# The file is fully parsed at this point, so it is closed before reshaping the records.
docs = []
for course_dict in raw_documents:
    for doc in course_dict['documents']:
        # Tag each entry (in place) with the course it belongs to.
        doc["course"] = course_dict['course']
        docs.append(doc)

In [15]:
# Display the value left in the preprocessing loop variable `doc`, i.e. the last
# document that was appended to `docs`.
doc

{'text': 'Problem description\nInfrastructure created in AWS with CD-Deploy Action needs to be destroyed\nSolution description\nFrom local:\nterraform init -backend-config="key=mlops-zoomcamp-prod.tfstate" --reconfigure\nterraform destroy --var-file vars/prod.tfvars\nAdded by Erick Calderin',
 'section': 'Module 6: Best practices',
 'question': 'How to destroy infrastructure created via GitHub Actions',
 'course': 'mlops-zoomcamp'}

In [16]:
# Create the search index. "course" is registered as a keyword field and the other
# three as text fields - presumably keyword fields are matched exactly for filtering
# while text fields are scored against the query; confirm against minsearch.
index = minsearch.Index(
    text_fields = ["text", "section", "question"],
    keyword_fields = ["course"]
)

In [17]:
# Sample user question reused by the search and prompting cells.
q = 'Is the course already started? Can I still enroll?'

In [18]:
# Fit the index on the flattened list of documents.
index.fit(docs)

<minsearch.Index at 0x79d61882fdf0>

In [35]:
# Per-field weights handed to the search as `boost_dict`.
boost = {
    'text': 3.0,
    'section': 0.5,
    'question': 2.0,
}

# Search for the sample question, restricted to one course and asking for 10 results.
docs_retrieved = index.search(
    query=q,
    filter_dict={'course': 'data-engineering-zoomcamp'},
    boost_dict=boost,
    num_results=10,
)

In [29]:
# Baseline: send the bare question with no retrieved context.
get_response(
    client=client,
    model="gpt-4o",
    system_context="",
    assistant_context="",
    user_context=q,
)

"Whether you can still enroll in a course that has already started typically depends on several factors, including the institution's policies, the type of course, and how far along the course is. Here are some general steps you can take to find out:\n\n1. **Check the Institution's Website:** Look for enrollment deadlines or policies regarding late enrollment.\n   \n2. **Contact the Admissions Office:** Reach out to the admissions or registrar's office of the institution offering the course. They can provide specific information and possible accommodations.\n\n3. **Speak with the Instructor:** Sometimes, instructors have the discretion to allow late enrollments, especially if you can catch up on missed work.\n\n4. **Consider the Course Format:** For some online courses, especially those that are self-paced or asynchronous, late enrollment might be more feasible.\n\nWould you like to share more details about the course and institution you're interested in?"

In [48]:
# Prompt skeleton with {question} and {context} placeholders for str.format().
# It instructs the model to answer only from the supplied context and to output
# NONE otherwise. .strip() removes the leading/trailing newlines introduced by
# the triple-quoted literal.
prompt_template="""
You are a teaching assistant. Answer the QUESTION based on the CONTEXT from the faq database.
Use only the facts from the CONTEXT when answering the QUESTION.
If the CONTEXT doesn't contain the answer, output NONE

QUESTION: 
{question}

CONTEXT: 
{context}
""".strip()

In [49]:
# Render every retrieved document as a "Section / Question / Answer" entry and
# concatenate the entries into one context string.
context_entries = []
for doc_retrieved in docs_retrieved:
    context_entries.append(
        f"Section: {doc_retrieved['section']}\nQuestion: {doc_retrieved['question']}\nAnswer: {doc_retrieved['text']}\n\n"
    )
context = "".join(context_entries)

In [52]:
# Fill the template with the question and the retrieved context; the final
# .strip() removes the trailing blank lines contributed by the last context entry.
prompt = prompt_template.format(question=q, context=context).strip()

In [53]:
# Use print() so the newlines render as line breaks instead of "\n" escapes.
print(prompt)

You are a teaching assistant. Answer the QUESTION based on the CONTEXT from the faq database.
Use only the facts from the CONTEXT when answering the QUESTION.
If the CONTEXT doesn't contain the answer, output NONE

QUESTION: 
Is the course already started? Can I still enroll?

CONTEXT: 
Section: General course-related questions
Question: Course - Can I still join the course after the start date?
Answer: Yes, even if you don't register, you're still eligible to submit the homeworks.
Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.

Section: General course-related questions
Question: Course - When will the course start?
Answer: The purpose of this document is to capture frequently asked technical questions
The exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1
Subscribe to course public Google Calendar (it works from Desktop only).
Re

In [54]:
# Ask the same question again, this time passing the context-grounded prompt.
get_response(
    client=client,
    model="gpt-4o",
    system_context="",
    assistant_context="",
    user_context=prompt,
)

"Yes, the course has already started. However, you can still enroll and participate in the course. Even if you didn't register before it started, you're eligible to submit the homework. Keep in mind that there will be deadlines for turning in the final projects, so avoid leaving everything until the last minute."