# Semantic Search

In [19]:
import json
from elasticsearch import Elasticsearch
from tqdm import tqdm

es_client = Elasticsearch("http://localhost:9200")
es_client.info()

ObjectApiResponse({'name': '66e0f9f00ace', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'FJ0AMI9NSC-2-hfgszITsg', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

# Embeddings

In [37]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-mpnet-base-v2")

In [38]:

with open("./documents.json") as f_in:
    docs_raw = json.load(f_in)

documents = []

for course_dict in tqdm(docs_raw):
    for doc in course_dict['documents']:
        doc['course'] = course_dict['course']
        doc["text_embedding"] = model.encode(doc["text"]).tolist()
        documents.append(doc)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [51:15<00:00, 1025.10s/it]


In [39]:
_index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
            "text_embedding": {"type": "dense_vector", "dims": 768, "index" :True, "similarity": "cosine"}
        }
    }
}

index_name = "course-questions"
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=_index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [40]:
for doc in tqdm(documents):
    try:
        es_client.index(index=index_name,document=doc)
    except Exception as e:
        print(e)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 948/948 [17:26<00:00,  1.10s/it]


# Query Test

In [44]:
search_term = "Linux or Mac?"
vec_search_team = model.encode(search_term)

In [45]:
query = {
    "field": "text_embedding",
    "query_vector": vec_search_team,
    "k": 5,
    "num_candidates": 10000
}

result = es_client.search(
    query={
        "match": {
            "course": "data-engineering-zoomcamp"
        }
    },
    index=index_name, knn=query, source=["text", "section", "course", "question"]
)
result["hits"]["hits"]

[{'_index': 'course-questions',
  '_id': 'x9FMapcBpKyLRyp8SYV9',
  '_score': 1.5507109,
  '_source': {'question': 'Environment - Is the course [Windows/mac/Linux/...] friendly?',
   'course': 'data-engineering-zoomcamp',
   'section': 'General course-related questions',
   'text': 'Yes! Linux is ideal but technically it should not matter. Students last year used all 3 OSes successfully'}},
 {'_index': 'course-questions',
  '_id': 'u9FMapcBpKyLRyp8SYUj',
  '_score': 1.3968079,
  '_source': {'question': 'Environment - Do I need both GitHub Codespaces and GCP?',
   'course': 'data-engineering-zoomcamp',
   'section': 'General course-related questions',
   'text': 'Choose the approach that aligns the most with your idea for the end project\nOne of those should suffice. However, BigQuery, which is part of GCP, will be used, so learning that is probably a better option. Or you can set up a local environment for most of this course.'}},
 {'_index': 'course-questions',
  '_id': 'uNFMapcBpKyLRy