# Homework 3

https://github.com/DataTalksClub/llm-zoomcamp/blob/main/cohorts/2024/03-vector-search/homework.md

### Q1:

In [None]:
from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer("multi-qa-distilbert-cos-v1")

In [2]:
user_question = "I just discovered the course. Can I still join it?"
embedding_vector = embedding_model.encode(user_question)
embedding_vector[0]

0.07822264

In [3]:
import requests 

base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/documents-with-ids.json'
docs_url = f'{base_url}/{relative_url}?raw=1'
docs_response = requests.get(docs_url)
documents = docs_response.json()

In [4]:
documents[1]

{'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites',
 'section': 'General course-related questions',
 'question': 'Course - What are the prerequisites for this course?',
 'course': 'data-engineering-zoomcamp',
 'id': '1f6520ca'}

In [5]:
len(documents)

948

In [6]:
filtered_documents = []
course_name = 'machine-learning-zoomcamp'

for doc in documents:
    if doc['course'] == course_name:
        filtered_documents.append(doc)

In [7]:
len(filtered_documents)

375

In [8]:
filtered_documents[1]

{'text': 'The course videos are pre-recorded, you can start watching the course right now.\nWe will also occasionally have office hours - live sessions where we will answer your questions. The office hours sessions are recorded too.\nYou can see the office hours as well as the pre-recorded course videos in the course playlist on YouTube.',
 'section': 'General course-related questions',
 'question': 'Is it going to be live? When?',
 'course': 'machine-learning-zoomcamp',
 'id': '39fda9f0'}

### Q2:

In [9]:
import numpy as np

embeddings = []
operations = []
for doc in filtered_documents:
    qa_text = f"{doc['question']} {doc['text']}"
    embedding = embedding_model.encode(qa_text)
    embeddings.append(embedding)
    doc["qa_vector"] = embedding.tolist()
    operations.append(doc)

X = np.array(embeddings)
X.shape

(375, 768)

### Q3:

In [10]:
dot_product = np.dot(embedding_vector, embedding_vector)
dot_product

0.99999994

In [11]:
scores = X.dot(embedding_vector)
scores.max()

0.6506573

### Q4:

In [12]:
embedding_vector = np.array(embedding_vector, dtype=float)

In [13]:
class VectorSearchEngine():
    def __init__(self, documents, embeddings):
        self.documents = documents
        self.embeddings = embeddings

    def search(self, v_query, num_results=10):
        # No need to convert v_query; it's already a numeric array
        scores = self.embeddings.dot(v_query)
        idx = np.argsort(-scores)[:num_results]
        return [self.documents[i] for i in idx]
search_engine = VectorSearchEngine(documents=filtered_documents, embeddings=X)
search_engine.search(embedding_vector, num_results=5)

[{'text': 'Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.\nIn order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.',
  'section': 'General course-related questions',
  'question': 'The course has already started. Can I still join it?',
  'course': 'machine-learning-zoomcamp',
  'id': 'ee58a693',
  'qa_vector': [0.0806286484003067,
   -0.06663887947797775,
   0.025273090228438377,
   -0.013004394248127937,
   0.07587282359600067,
   -0.05946267768740654,
   -0.021883828565478325,
   0.0029000290669500828,
   0.0007928750710561872,
   -0.005222407169640064,
   -0.03365180641412735,
   -0.02791348658502102,
   0.058116547763347626,
   0.039748337119817734,
   0.054418567568063736,
   -0.03825174272060394,
   0.0630573704838

In [14]:
import pandas as pd

base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/ground-truth-data.csv'
ground_truth_url = f'{base_url}/{relative_url}?raw=1'

df_ground_truth = pd.read_csv(ground_truth_url)
df_ground_truth = df_ground_truth[df_ground_truth.course == 'machine-learning-zoomcamp']
ground_truth = df_ground_truth.to_dict(orient='records')

In [15]:
def calculate_hitrate(search_engine, ground_truth, num_results):
    total_queries = len(ground_truth)
    correct_matches = 0

    for query in ground_truth:
        question = query['question']
        document = query['document']
        embedding_vector = embedding_model.encode(question)
        results = search_engine.search(embedding_vector, num_results=num_results)

        for result in results:
            if result['id'] == document:
                correct_matches += 1
                break

    hitrate = correct_matches / total_queries
    return hitrate

hitrate = calculate_hitrate(search_engine, ground_truth, num_results=5)
print(hitrate)

0.9398907103825137


### Q5:

In [16]:
len(embedding_model.encode("This is a simple sentence"))

768

In [17]:
from elasticsearch import Elasticsearch
es_client = Elasticsearch('http://localhost:9200') 

es_client.info()

ObjectApiResponse({'name': 'bbbf7af350fc', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'l1bLG6PfTSWF2YIvXiuWjA', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [18]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} ,
            "qa_vector": {"type": "dense_vector", "dims": 768, "index": True, "similarity": "cosine"},
        }
    }
}

In [19]:
index_name = "course-questions"

es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [20]:
for doc in operations:
    try:
        es_client.index(index=index_name, body=doc)
    except Exception as e:
        print(e)

In [21]:
search_term = "I just discovered the course. Can I still join it?"
vector_search_term = embedding_model.encode(search_term)

In [35]:
body = {
    "knn": {
        "field": "qa_vector",
        "query_vector": vector_search_term,
        "k": 5,
        "num_candidates": 10000, 
    },
    "size": 5,
}

In [36]:
res = es_client.search(index=index_name, body=body, _source=["text", "section", "question", "course"])
res["hits"]["hits"]

[{'_index': 'course-questions',
  '_id': 'dIbItpABMxSebqwPgLDU',
  '_score': 0.82532895,
  '_source': {'text': 'Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.\nIn order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.',
   'section': 'General course-related questions',
   'question': 'The course has already started. Can I still join it?',
   'course': 'machine-learning-zoomcamp',
   'id': 'ee58a693',
   'qa_vector': [0.0806286484003067,
    -0.06663887947797775,
    0.025273090228438377,
    -0.013004394248127937,
    0.07587282359600067,
    -0.05946267768740654,
    -0.021883828565478325,
    0.0029000290669500828,
    0.0007928750710561872,
    -0.005222407169640064,
    -0.03365180641412735,
    -0.02791348658502102,
 

### Q6:

In [40]:
def calculate_hitrate_es(search_engine, ground_truth, embedding_model: SentenceTransformer, num_results):
    total_queries = len(ground_truth)
    correct_matches = 0

    for query in ground_truth:
        question = query['question']
        document = query['document']
        embedding_vector = embedding_model.encode(question)
        results = search_engine.search(embedding_vector, num_results=num_results)

        for result in results:
            if result['id'] == document:
                correct_matches += 1
                break

    hitrate = correct_matches / total_queries
    return hitrate

hitrate_es = calculate_hitrate_es(search_engine, ground_truth, embedding_model, num_results=5)
print(hitrate_es)

0.9398907103825137
