# Solutions to homework #3

In [1]:
from sentence_transformers import SentenceTransformer
from tqdm.autonotebook import tqdm
import requests
import numpy as np
import pandas as pd
import warnings

  from tqdm.autonotebook import tqdm, trange


In [2]:
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

In [3]:
model_name = 'multi-qa-distilbert-cos-v1'
embedding_model = SentenceTransformer(model_name)

## Solution to question #1

In [4]:
user_question = "I just discovered the course. Can I still join it?"
vectors = embedding_model.encode(user_question)
print(f'Value of first vector: {round(vectors[0], 3):.3f}.')

Value of first vector: 0.078.


## Solution to question #2

In [5]:
base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/documents-with-ids.json'
docs_url = f'{base_url}/{relative_url}?raw=1'

In [6]:
docs_response = requests.get(docs_url)
documents = docs_response.json()

In [7]:
docs = [doc for doc in documents if doc['course'] in ['machine-learning-zoomcamp']]

In [8]:
embeddings = []
for doc in tqdm(docs):
    qa_text = f'''{doc['question']} {doc['text']}'''
    embeddings.append(embedding_model.encode(qa_text))

  0%|          | 0/375 [00:00<?, ?it/s]

In [9]:
X = np.array(embeddings)
print(f'Shape of the embeddings: {X.shape}')

Shape of the embeddings: (375, 768)


## Solution to question #3

In [10]:
v = vectors[0]
scores = X.dot(v)
max_score = scores.max()
print(f'Highest score from the first vector and the embeddings: {round(max_score, 4):.4f}.')

Highest score from the first vector and the embeddings: 0.0127.


In [11]:
class VectorSearchEngine():
    def __init__(self, documents, embeddings):
        self.documents = documents
        self.embeddings = embeddings

    def search(self, v_query, num_results=10):
        scores = self.embeddings.dot(v_query)
        idx = np.argsort(-scores)[:num_results]
        return [self.documents[i] for i in idx]

## Solution to question #4

In [12]:
base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/ground-truth-data.csv'
ground_truth_url = f'{base_url}/{relative_url}?raw=1'

In [13]:
df_ground_truth = pd.read_csv(ground_truth_url)
df_ground_truth = df_ground_truth[df_ground_truth.course == 'machine-learning-zoomcamp']
ground_truth = df_ground_truth.to_dict(orient='records')

In [14]:
len(ground_truth)

1830

In [15]:
ground_truth[0]

{'question': 'Where can I sign up for the course?',
 'course': 'machine-learning-zoomcamp',
 'document': '0227b872'}

In [16]:
gt_embeddings = []
for doc in tqdm(ground_truth):
    q_text = doc['question']
    gt_embeddings.append(embedding_model.encode(q_text))

  0%|          | 0/1830 [00:00<?, ?it/s]

In [17]:
gt_X = np.array(gt_embeddings)

In [18]:
gt_X.shape

(1830, 768)

In [19]:
search_engine = VectorSearchEngine(documents=ground_truth, embeddings=gt_X)
search_engine.search(vectors, num_results=5)

[{'question': 'If I join the course late, can I still participate?',
  'course': 'machine-learning-zoomcamp',
  'document': 'ee58a693'},
 {'question': 'Will I be able to obtain a certificate if I join the course after it has started?',
  'course': 'machine-learning-zoomcamp',
  'document': 'ee58a693'},
 {'question': 'Where can I sign up for the course?',
  'course': 'machine-learning-zoomcamp',
  'document': '0227b872'},
 {'question': 'Can I start the course anytime?',
  'course': 'machine-learning-zoomcamp',
  'document': '636f55d5'},
 {'question': 'What is the initial step after joining the course?',
  'course': 'machine-learning-zoomcamp',
  'document': '0a278fb2'}]

In [20]:
def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

In [21]:
def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

In [22]:
def evaluate(ground_truth, vectors, search_function):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['document']
        results = search_function.search(vectors)
        relevance = [d['document'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [23]:
evaluate(ground_truth, vectors, search_engine)

  0%|          | 0/1830 [00:00<?, ?it/s]

{'hit_rate': 0.01912568306010929, 'mrr': 0.008002645502645499}

## Solution for question #5