In [1]:
import pandas as pd
import json
from elasticsearch import Elasticsearch



In [1]:
json

NameError: name 'json' is not defined

In [5]:
with open('documents-with-ids.json', 'rt') as f_in:
    documents = json.load(f_in)

In [11]:
from sentence_transformers import SentenceTransformer

In [2]:
df_ground_truth = pd.read_csv('../module-3/ground-truth-data.csv')

In [3]:
df_ground_truth = df_ground_truth[df_ground_truth.course == 'machine-learning-zoomcamp']
ground_truth = df_ground_truth.to_dict(orient='records')

In [7]:
ground_truth[10]

{'question': 'Are sessions recorded if I miss one?',
 'course': 'machine-learning-zoomcamp',
 'document': '5170565b'}

In [6]:
doc_idx = {d['id']: d for d in documents}
doc_idx['5170565b']['text']

'Everything is recorded, so you won’t miss anything. You will be able to ask your questions for office hours in advance and we will cover them during the live stream. Also, you can always ask questions in Slack.'

In [12]:
model_name = 'multi-qa-MiniLM-L6-cos-v1'
model = SentenceTransformer(model_name)

In [13]:
from elasticsearch import Elasticsearch

es_client = Elasticsearch('http://localhost:9200') 

index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
            "id": {"type": "keyword"},
            "question_text_vector": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine"
            },
        }
    }
}

index_name = "course-questions"

es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [14]:
from tqdm.auto import tqdm

for doc in tqdm(documents):
    question = doc['question']
    text = doc['text']
    doc['question_text_vector'] = model.encode(question + ' ' + text)

    es_client.index(index=index_name, document=doc)

  0%|          | 0/948 [00:00<?, ?it/s]

In [15]:
def elastic_search_knn(field, vector, course):
    knn = {
        "field": field,
        "query_vector": vector,
        "k": 5,
        "num_candidates": 10000,
        "filter": {
            "term": {
                "course": course
            }
        }
    }

    search_query = {
        "knn": knn,
        "_source": ["text", "section", "question", "course", "id"]
    }

    es_results = es_client.search(
        index=index_name,
        body=search_query
    )
    
    result_docs = []
    
    for hit in es_results['hits']['hits']:
        result_docs.append(hit['_source'])

    return result_docs

def question_text_vector_knn(q):
    question = q['question']
    course = q['course']

    v_q = model.encode(question)

    return elastic_search_knn('question_text_vector', v_q, course)

In [12]:
question_text_vector_knn(dict(
    question='Are sessions recorded if I miss one?',
    course='machine-learning-zoomcamp'
))

[{'question': 'What if I miss a session?',
  'course': 'machine-learning-zoomcamp',
  'section': 'General course-related questions',
  'text': 'Everything is recorded, so you won’t miss anything. You will be able to ask your questions for office hours in advance and we will cover them during the live stream. Also, you can always ask questions in Slack.',
  'id': '5170565b'},
 {'question': 'Is it going to be live? When?',
  'course': 'machine-learning-zoomcamp',
  'section': 'General course-related questions',
  'text': 'The course videos are pre-recorded, you can start watching the course right now.\nWe will also occasionally have office hours - live sessions where we will answer your questions. The office hours sessions are recorded too.\nYou can see the office hours as well as the pre-recorded course videos in the course playlist on YouTube.',
  'id': '39fda9f0'},
 {'question': 'The same accuracy on epochs',
  'course': 'machine-learning-zoomcamp',
  'section': '8. Neural Networks an

In [16]:
def build_prompt(query, search_results):
    prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT: 
{context}
""".strip()

    context = ""
    
    for doc in search_results:
        context = context + f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
    
    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

In [17]:
from huggingface_hub import InferenceClient
import os

In [18]:
HF_API_TOKEN = os.getenv('HF_API_TOKEN')
client = InferenceClient("HuggingFaceH4/zephyr-7b-beta", token=HF_API_TOKEN)

In [19]:
def llm(prompt):
    messages = [{"role": "user", "content": prompt}]
    response = client.chat_completion(messages, max_tokens=1000)
    
    return response.choices[0].message.content

In [20]:
# previously: rag(query: str) -> str
def rag(query: dict) -> str:
    search_results = question_text_vector_knn(query)
    prompt = build_prompt(query['question'], search_results)
    answer = llm(prompt)
    return answer

In [21]:
rag(ground_truth[10])

"QUESTION: Are sessions recorded if I miss one?\n\nANSWER: Yes, everything is recorded in the course, so you won't miss anything if you have to miss a session. You can ask your questions for office hours in advance, and they will be covered during the live stream. Alternatively, you can ask questions in Slack. Some sessions may also be live, but they are also recorded and can be found in the course playlist on YouTube. However, if you are having an issue specifically with a homework problem, such as a problem with the accuracy during training, you should refer to the solution description, which may involve setting class_mode='binary' while reading the data or choosing the correct optimizer, batch size, or learning rate. If you are looking for a useful resource for missing data treatment, a guide can be found on Kaggle. And as for obtaining a certificate, if you miss the midterm project, it's still possible. Please refer to the previous answer for more information."

In [22]:
print(_)

QUESTION: Are sessions recorded if I miss one?

ANSWER: Yes, everything is recorded in the course, so you won't miss anything if you're unable to attend a session. You can ask your questions for office hours in advance, and they will be covered during the live stream. Additionally, you can always ask questions in Slack. The office hours sessions are also recorded for your convenience. (General course-related questions section)


Note: In case you're looking to solve a specific issue related to a missing data treatment, there's a useful resource available on Kaggle: https://www.kaggle.com/code/parulpandey/a-guide-to-handling-missing-values-in-python/notebook. (2. Machine Learning for Regression section)


If you have any doubts regarding the midterm project, and you're concerned about the certificate, I can assure you that as long as you complete all the assignments and meet the course requirements, you'll receive a certificate at the end of the course, regardless of whether you miss a 

In [23]:
ground_truth[10]

{'question': 'Are sessions recorded if I miss one?',
 'course': 'machine-learning-zoomcamp',
 'document': '5170565b'}

In [24]:
doc_idx['5170565b']['text']

'Everything is recorded, so you won’t miss anything. You will be able to ask your questions for office hours in advance and we will cover them during the live stream. Also, you can always ask questions in Slack.'

In [25]:
answer_orig = doc_idx['5170565b']['text']
answer_llm = "Yes, everything is recorded in the course, so you won't miss anything if you're unable to attend a session. You can ask your questions for office hours in advance, and they will be covered during the live stream. Additionally, you can always ask questions in Slack. The office hours sessions are also recorded for your convenience."

In [26]:
v_llm = model.encode(answer_llm)
v_orig = model.encode(answer_orig)

In [27]:
v_llm.dot(v_orig)

0.82825035

In [50]:
client = InferenceClient("deepset/roberta-base-squad2", token=HF_API_TOKEN)

In [23]:
prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT: 
{context}
""".strip()

In [51]:
search_results = question_text_vector_knn(ground_truth[10])

In [52]:
prompt = build_prompt(ground_truth[10]['question'], search_results)

In [33]:
prompt

"You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.\nUse only the facts from the CONTEXT when answering the QUESTION.\n\nQUESTION: Are sessions recorded if I miss one?\n\nCONTEXT: \nsection: General course-related questions\nquestion: What if I miss a session?\nanswer: Everything is recorded, so you won’t miss anything. You will be able to ask your questions for office hours in advance and we will cover them during the live stream. Also, you can always ask questions in Slack.\n\nsection: General course-related questions\nquestion: Is it going to be live? When?\nanswer: The course videos are pre-recorded, you can start watching the course right now.\nWe will also occasionally have office hours - live sessions where we will answer your questions. The office hours sessions are recorded too.\nYou can see the office hours as well as the pre-recorded course videos in the course playlist on YouTube.\n\nsection: 8. Neural Networks and Deep Learn

In [53]:
messages = [{"role": "user", "content": prompt}]
response = client.chat_completion(messages)

Server https://api-inference.huggingface.co/models/deepset/roberta-base-squad2/v1/chat/completions does not seem to support chat completion. Falling back to text generation. Error:  (Request ID: UIQvSL_rTRALr4JE6kWPO)

Bad request:
Error in `inputs.question`: field required
Error in `inputs.context`: field required


BadRequestError:  (Request ID: bT--5RQnlP5wZgz3Gwvzy)

Bad request:
Error in `inputs.question`: field required
Error in `inputs.context`: field required

In [34]:
import requests
headers = {"Authorization": f"Bearer {HF_API_TOKEN}"}

In [46]:
API_URL = "https://api-inference.huggingface.co/models/deepset/roberta-base-squad2"

In [47]:
def query(payload):
    response = requests.post(API_URL, headers=headers, json=payload)
    return response.json()

In [48]:
data = query({"inputs": {
        'question': ground_truth[10]['question'],
        'context': prompt
    } 
})

In [49]:
print(data)

{'score': 0.015489970333874226, 'start': 165, 'end': 174, 'answer': '\nQUESTION'}
