In [1]:
import json

with open('documents-with-ids.json', 'rt') as f_in:
    documents = json.load(f_in)

In [2]:
documents[10]

{'text': 'It depends on your background and previous experience with modules. It is expected to require about 5 - 15 hours per week. [source1] [source2]\nYou can also calculate it yourself using this data and then update this answer.',
 'section': 'General course-related questions',
 'question': 'Course - \u200b\u200bHow many hours per week am I expected to spend on this  course?',
 'course': 'data-engineering-zoomcamp',
 'id': 'ea739c65'}

In [3]:
import pandas as pd

In [4]:
df_ground_truth = pd.read_csv('ground-truth-data.csv')

In [5]:
df_ground_truth = df_ground_truth[df_ground_truth.course == 'machine-learning-zoomcamp']
ground_truth = df_ground_truth.to_dict(orient='records')

In [6]:
ground_truth[10]

{'question': 'Are sessions recorded if I miss one?',
 'course': 'machine-learning-zoomcamp',
 'document': '5170565b'}

In [7]:
# to fetch by id
doc_idx = {d['id']: d for d in documents}

doc_idx['5170565b']['text']

'Everything is recorded, so you won’t miss anything. You will be able to ask your questions for office hours in advance and we will cover them during the live stream. Also, you can always ask questions in Slack.'

In [8]:
# use elsastic search and index the documents

from sentence_transformers import SentenceTransformer

In [9]:
model_name = 'multi-qa-MiniLM-L6-cos-v1'
model = SentenceTransformer(model_name)

In [10]:
from elasticsearch import Elasticsearch

es_client = Elasticsearch('http://localhost:9200')

In [11]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
            "id": {"type": "keyword"},
            "question_text_vector": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine"
            },
        }
    }
}

index_name = "course-questions"

In [12]:
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [13]:
from tqdm.auto import tqdm

In [14]:
for doc in tqdm(documents):
    question = doc['question']
    text = doc['text']
    
    doc['question_text_vector'] = model.encode(question + ' ' + text)

    es_client.index(index=index_name, document=doc)

  0%|          | 0/948 [00:00<?, ?it/s]

In [15]:
# Retrieval of results

def elastic_search_knn(field, vector, course):
    knn = {
        "field": field,
        "query_vector": vector,
        "k": 5,
        "num_candidates": 10000,
        "filter": {
            "term": {
                "course": course
            }
        }
    }

    search_query = {
        "knn": knn,
        "_source": ["text", "section", "question", "course", "id"]
    }

    es_results = es_client.search(
        index=index_name,
        body=search_query
    )
    
    result_docs = []
    
    for hit in es_results['hits']['hits']:
        result_docs.append(hit['_source'])

    return result_docs

In [16]:
def question_text_vector_knn(q):
    question = q['question']
    course = q['course']

    v_q = model.encode(question)

    return elastic_search_knn('question_text_vector', v_q, course)

In [17]:
question_text_vector_knn(dict(
    question='Are sessions recorded if I miss one?',
    course='machine-learning-zoomcamp'
))

[{'question': 'What if I miss a session?',
  'course': 'machine-learning-zoomcamp',
  'section': 'General course-related questions',
  'text': 'Everything is recorded, so you won’t miss anything. You will be able to ask your questions for office hours in advance and we will cover them during the live stream. Also, you can always ask questions in Slack.',
  'id': '5170565b'},
 {'question': 'Is it going to be live? When?',
  'course': 'machine-learning-zoomcamp',
  'section': 'General course-related questions',
  'text': 'The course videos are pre-recorded, you can start watching the course right now.\nWe will also occasionally have office hours - live sessions where we will answer your questions. The office hours sessions are recorded too.\nYou can see the office hours as well as the pre-recorded course videos in the course playlist on YouTube.',
  'id': '39fda9f0'},
 {'question': 'The same accuracy on epochs',
  'course': 'machine-learning-zoomcamp',
  'section': '8. Neural Networks an

## The RAG flow

In [18]:
def build_prompt(query, search_results):
    prompt_template = """
        You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
        Use only the facts from the CONTEXT when answering the QUESTION.
        
        QUESTION: {question}
        
        CONTEXT: 
        {context}
    """.strip()

    context = ""
    
    for doc in search_results:
        context = context + f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
    
    prompt = prompt_template.format(question=query, context=context).strip()
    
    return prompt

In [19]:
import anthropic

In [20]:
client = anthropic.Anthropic()

#### Use cheaper model => claude-3-haiku-20240307

In [41]:
def llm(prompt, model="claude-3-5-sonnet-20240620"):
    response = client.messages.create(
        # model="claude-3-haiku-20240307",
        model = model,
        max_tokens = 1024,
        messages = [
            {"role": "user", "content": prompt}
        ]
    )

    # print usage
    print(response.usage)
    
    return response.content

In [42]:
# previously: rag(query: str) -> str
def rag(query: dict, model="claude-3-5-sonnet-20240620") -> str:
    search_results = question_text_vector_knn(query)
    prompt = build_prompt(query['question'], search_results)
    answer = llm(prompt, model=model)
    
    return answer

In [33]:
rag(ground_truth[10])

Usage(input_tokens=424, output_tokens=118)


[TextBlock(text='Based on the provided context, I can answer your question as follows:\n\nYes, sessions are recorded if you miss one. The course FAQ states that "Everything is recorded, so you won\'t miss anything." This applies to both the pre-recorded course videos and the occasional live office hours sessions. If you miss a session, you will be able to watch the recording later. Additionally, you can ask questions in advance for office hours, which will be covered during the live stream, and these sessions are also recorded. You can always ask questions in Slack as well.', type='text')]

In [24]:
doc_idx['5170565b']['text']

'Everything is recorded, so you won’t miss anything. You will be able to ask your questions for office hours in advance and we will cover them during the live stream. Also, you can always ask questions in Slack.'

## Cosine Similarity Metric

In [34]:
# ==== haiku model ===
# answer_orig = """
# Yes, the sessions are recorded, so you won't miss anything if you miss a live session. The course videos are pre-recorded,
# and the occasional live office hours sessions are also recorded. You can access all the recordings in the course playlist on
# YouTube.
# """

answer_orig = """
Yes, sessions are recorded if you miss one. The course FAQ states that "Everything is recorded, so you won't miss anything."
This applies to both the pre-recorded course videos and the occasional live office hours sessions. If you miss a session, you
will be able to watch the recording later. Additionally, you can ask questions in advance for office hours, which will be
covered during the live stream, and these sessions are also recorded. You can always ask questions in Slack as well.
"""

answer_llm = """
Everything is recorded, so you won’t miss anything. You will be able to ask your questions for office hours in advance and
we will cover them during the live stream. Also, you can always ask questions in Slack."
"""

In [35]:
v_llm = model.encode(answer_llm)
v_orig = model.encode(answer_orig)

In [36]:
# compute cosine similarity between "v_llm" and "v_orig"
v_llm.dot(v_orig) # diff due to openAI being used in vid

# Has a high degree of similarity

# haiku mode result is 0.5698763

0.71898925

In [37]:
ground_truth[0]

{'question': 'Where can I sign up for the course?',
 'course': 'machine-learning-zoomcamp',
 'document': '0227b872'}

In [38]:
len(ground_truth)

1830

In [39]:
answers = {}

didn't run due to API costs --- used output from video

```python
for i, rec in enumerate(tqdm(ground_truth)):
    if i in answers:
        continue

    answer_llm = rag(rec)
    doc_id = rec['document']
    original_doc = doc_idx[doc_id]
    answer_orig = original_doc['text']

    answers[i] = {
        'answer_llm': answer_llm,
        'answer_orig': answer_orig,
        'document': doc_id,
        'question': rec['question'],
        'course': rec['course'],
    }

```

interested in => `answers.values()`

Also didn't run code below --- **All code below uses openAI**

```python
# updates "ground_truth" and outputs results to csv file

results_gpt4o = [None] * len(ground_truth)

for i, val in answers.items():
    results_gpt4o[i] = val.copy()
    results_gpt4o[i].update(ground_truth[i])

df_gpt4o = pd.DataFrame(results_gpt4o)

!mkdir data

df_gpt4o.to_csv('data/results-gpt4o.csv', index=False)
```

## Code below can be used for faster processing as it executes in parallel

*Note:* Best to use if have better PC resources -- costs can vary depending on model...sometimes high

```python
from tqdm.auto import tqdm

from concurrent.futures import ThreadPoolExecutor

pool = ThreadPoolExecutor(max_workers=6)

def map_progress(pool, seq, f):
    results = []

    with tqdm(total=len(seq)) as progress:
        futures = []

        for el in seq:
            future = pool.submit(f, el)
            future.add_done_callback(lambda p: progress.update())
            futures.append(future)

        for future in futures:
            result = future.result()
            results.append(result)

    return results


def process_record(rec):
    model = 'gpt-3.5-turbo'
    answer_llm = rag(rec, model=model)
    
    doc_id = rec['document']
    original_doc = doc_idx[doc_id]
    answer_orig = original_doc['text']

    return {
        'answer_llm': answer_llm,
        'answer_orig': answer_orig,
        'document': doc_id,
        'question': rec['question'],
        'course': rec['course'],
    }


print(process_record(ground_truth[10]))
```

```python

results_gpt35 = map_progress(pool, ground_truth, process_record)

df_gpt35 = pd.DataFrame(results_gpt35)
df_gpt35.to_csv('data/results-gpt35.csv', index=False)

# check results
!head data/results-gpt35.csv
```

# Video 4.3 Ends Here