In [27]:
import json

import numpy as np
import pandas as pd
import requests
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
from sentence_transformers import SentenceTransformer

In [3]:
# Load the sentence-embedding model that encodes every question and document below.
embedding_model = SentenceTransformer('multi-qa-distilbert-cos-v1')

In [4]:
# Sample question that the following cells encode and search with.
user_question = "I just discovered the course. Can I still join it?"

In [6]:
# Encode the sample question into an embedding vector.
embedding_vector = embedding_model.encode(user_question)

# Show only the first component instead of dumping the whole vector.
print(embedding_vector[0])

0.078222625


In [8]:
# Download the FAQ documents (with ids) from the llm-zoomcamp repository.
base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/documents-with-ids.json'
docs_url = f'{base_url}/{relative_url}?raw=1'
# A timeout keeps a stalled connection from hanging the kernel indefinitely.
docs_response = requests.get(docs_url, timeout=30)
# Fail here with the HTTP status rather than with a confusing JSON decode
# error when the server answers with an error page.
docs_response.raise_for_status()
documents = docs_response.json()

In [10]:
# Keep only the documents that belong to the machine-learning-zoomcamp course.
ml_zoomcamp_documents = list(
    filter(lambda entry: entry['course'] == 'machine-learning-zoomcamp', documents)
)

print(f"Number of documents after filtering: {len(ml_zoomcamp_documents)}")

Number of documents after filtering: 375


In [14]:
# Embed each document as "<question> <text>", one encode() call per document,
# keeping the vectors in the same order as ml_zoomcamp_documents.
embeddings = [
    embedding_model.encode(f"{entry['question']} {entry['text']}")
    for entry in ml_zoomcamp_documents
]

# Stack the per-document vectors into one matrix (one row per document).
X = np.array(embeddings)

print(f"Shape of X: '{X.shape}'")

Shape of X: '(375, 768)'


In [16]:
# Score every document row of X against the encoded question in one product.
v = embedding_model.encode(user_question)
scores = X @ v

highest_score = scores.max()
print(f"The highest score in the results is: {highest_score}")

The highest score in the results is: 0.6506572961807251


In [17]:
class VectorSearchEngine():
    """In-memory search that ranks documents by dot product with a query vector.

    ``embeddings`` must support ``.dot(query)`` and yield one score per entry
    of ``documents``, in the same order.
    """

    def __init__(self, documents, embeddings):
        self.documents = documents
        self.embeddings = embeddings

    def search(self, v_query, num_results=10):
        """Return up to ``num_results`` documents, highest score first."""
        # Negating the scores turns argsort's ascending order into descending.
        ranking = np.argsort(-self.embeddings.dot(v_query))
        top_documents = []
        for position in ranking[:num_results]:
            top_documents.append(self.documents[position])
        return top_documents

In [24]:
search_engine = VectorSearchEngine(documents=ml_zoomcamp_documents, embeddings=X)

# Ground-truth rows pair a question with the id of its expected document.
relative_url = '03-vector-search/eval/ground-truth-data.csv'
ground_truth_url = f'{base_url}/{relative_url}?raw=1'
# pandas reads the CSV straight from the URL.
df_ground_truth = pd.read_csv(ground_truth_url)
# Evaluate only on the course whose documents were embedded.
df_ground_truth = df_ground_truth[df_ground_truth.course == 'machine-learning-zoomcamp']
ground_truth = df_ground_truth.to_dict(orient='records')

# Stop here with a clear message if the filter left nothing; otherwise the
# indexing below (and the hit-rate division later) would fail less clearly.
assert ground_truth, "no ground-truth rows found for course 'machine-learning-zoomcamp'"

print("Ground truth keys:", ground_truth[0].keys())

Ground truth keys: dict_keys(['question', 'course', 'document'])


In [25]:
def calculate_hit_rate(search_engine, ground_truth, num_results=5):
    """Return the fraction of ground-truth questions whose expected document
    id appears among the top ``num_results`` results of ``search_engine``.

    Each record's 'question' is encoded with the notebook-level
    ``embedding_model``; its 'document' value is compared with the 'id' of
    every returned result.
    """
    def _is_hit(record):
        query_vector = embedding_model.encode(record['question'])
        retrieved = search_engine.search(query_vector, num_results)
        retrieved_ids = [item['id'] for item in retrieved]
        return record['document'] in retrieved_ids

    matched = sum(1 for record in ground_truth if _is_hit(record))
    return matched / len(ground_truth)

In [26]:
# Evaluate the in-memory engine: share of questions whose expected document is in the top 5.
hit_rate = calculate_hit_rate(search_engine, ground_truth, num_results=5)
print(f"Hit-rate for VectorSearchEngine with num_results=5: {hit_rate}")

Hit-rate for VectorSearchEngine with num_results=5: 0.9398907103825137


In [29]:
# Client for the Elasticsearch node expected at http://localhost:9200.
es = Elasticsearch(hosts=["http://localhost:9200"])

In [31]:
index_name = 'ml-zoomcamp-docs'
# Drop any index left over from a previous run so this cell can be re-executed.
if es.indices.exists(index=index_name):
    es.indices.delete(index=index_name)

# Index definition: the three document fields plus the vector used for scoring.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "course": {"type": "keyword"},
            "question": {"type": "text"},
            "text": {"type": "text"},
            # dims must match the width of the embedding matrix X (768 columns).
            "embedding": {
                "type": "dense_vector",
                "dims": 768
            }
        }
    }
}

# Kept as the cell's last expression so the API response is displayed.
es.indices.create(index=index_name, body=index_settings)

  if es.indices.exists(index=index_name):
  es.indices.create(index=index_name, body=index_settings)


ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'ml-zoomcamp-docs'})

In [32]:
def generate_actions(docs, embeddings, index=None):
    """Yield one bulk-index action per document, paired with its embedding.

    Args:
        docs: Sequence of dicts providing 'id', 'course', 'question' and
            'text'.
        embeddings: Sequence aligned with ``docs``; ``embeddings[i]`` is the
            vector stored for ``docs[i]``.
        index: Name of the target index. When omitted, the notebook-level
            ``index_name`` is used, as before.

    Yields:
        dict: An action with '_index', '_id' and '_source' keys. The
        'embedding' entry is a plain list whenever the input vector offers
        ``tolist()``.
    """
    target_index = index_name if index is None else index
    for i, doc in enumerate(docs):
        vector = embeddings[i]
        # Convert array-like vectors to plain lists so the payload does not
        # depend on the client's serializer understanding NumPy types.
        if hasattr(vector, "tolist"):
            vector = vector.tolist()
        yield {
            "_index": target_index,
            "_id": doc['id'],
            "_source": {
                "course": doc['course'],
                "question": doc['question'],
                "text": doc['text'],
                "embedding": vector
            }
        }

In [33]:
# Index every document together with its embedding.
bulk(es, generate_actions(ml_zoomcamp_documents, embeddings))
# Make the new documents searchable right away, so the searches below do not
# depend on Elasticsearch's refresh timing when the notebook is run end to end.
es.indices.refresh(index=index_name)

user_question = "I just discovered the course. Can I still join it?"
# A plain list, because the vector is embedded in the JSON search request.
v = embedding_model.encode(user_question).tolist()

  bulk(es, generate_actions(ml_zoomcamp_documents, embeddings))


In [34]:
# Score every indexed document by cosine similarity between its stored
# 'embedding' field and the query vector `v`, and ask for the single best hit.
search_query = {
    "query": {
        "script_score": {
            # match_all: no pre-filtering, every document gets scored.
            "query": {
                "match_all": {}
            },
            "script": {
                # NOTE(review): the "+ 1.0" presumably keeps scores non-negative,
                # as script_score requires — confirm against the Elasticsearch docs.
                "source": "cosineSimilarity(params.query_vector, 'embedding') + 1.0",
                "params": {
                    "query_vector": v
                }
            }
        }
    },
    "size": 1
}

In [35]:
# Run the size-1 query and read the id of the first returned hit.
response = es.search(index=index_name, body=search_query)
highest_score_id = response['hits']['hits'][0]['_id']

print(f"The ID of the document with the highest score: {highest_score_id}")

The ID of the document with the highest score: ee58a693


  response = es.search(index=index_name, body=search_query)


In [36]:
def search_elasticsearch(query, es, index_name='ml-zoomcamp-docs', num_results=5):
    """Return the ids of the top ``num_results`` hits for ``query``.

    The query text is encoded with the notebook-level ``embedding_model`` and
    sent to ``es`` as a script_score request that scores every document in
    ``index_name`` by cosine similarity against its 'embedding' field.
    """
    similarity_script = {
        "source": "cosineSimilarity(params.query_vector, 'embedding') + 1.0",
        "params": {"query_vector": embedding_model.encode(query).tolist()},
    }
    request_body = {
        "query": {
            "script_score": {
                "query": {"match_all": {}},
                "script": similarity_script,
            }
        },
        "size": num_results,
    }
    hits = es.search(index=index_name, body=request_body)['hits']['hits']
    return list(map(lambda hit: hit['_id'], hits))

In [37]:
def calculate_hit_rate_elasticsearch(ground_truth, es, num_results=5):
    """Return the fraction of ground-truth questions whose expected document
    id is among the ids returned by ``search_elasticsearch`` for that question.
    """
    def _is_hit(record):
        returned_ids = search_elasticsearch(record['question'], es, num_results=num_results)
        return record['document'] in returned_ids

    matched = sum(1 for record in ground_truth if _is_hit(record))
    return matched / len(ground_truth)

In [38]:
# Same evaluation as for the in-memory engine, but with Elasticsearch doing the ranking.
hit_rate_elastic = calculate_hit_rate_elasticsearch(ground_truth, es, num_results=5)
print(f"Hit-rate for Elasticsearch with num_results=5: {hit_rate_elastic}")

  response = es.search(index=index_name, body=search_query)


Hit-rate for Elasticsearch with num_results=5: 0.9398907103825137
