In [1]:
from elasticsearch import Elasticsearch, helpers
import json

In [2]:
from indexing.enums.index_names import IndexName
from indexing.mappings.mappings import ESMappings

In [3]:
# Password for the 'elastic' user generated by Elasticsearch
from shared.configs.elastic_search_config import ELASTIC_PASSWORD

# Create the client instance
es = Elasticsearch(
    "https://localhost:9200",
    ca_certs="config/certs/http_ca.crt",
    basic_auth=("elastic", ELASTIC_PASSWORD),
    verify_certs=False
)

  _transport = transport_class(


In [14]:
es.info()



ObjectApiResponse({'name': 'DESKTOP-5628FPD', 'cluster_name': 'elasticsearch', 'cluster_uuid': 'kfVRx5jQRzeDrEWE0UZmog', 'version': {'number': '8.6.1', 'build_flavor': 'default', 'build_type': 'zip', 'build_hash': '180c9830da956993e59e2cd70eb32b5e383ea42c', 'build_date': '2023-01-24T21:35:11.506992272Z', 'build_snapshot': False, 'lucene_version': '9.4.2', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

# Create BM25 index

In [None]:
# create index
index_name = IndexName.UWATERLOO_COURSES_INDEX.value
mapping = ESMappings.get_mapping_from_index_name(index_name)
es.indices.create(index=index_name, body=mapping)

In [None]:
# load data
with open("../scraping/contents/waterloo/output.json", "r") as f:
    data = json.load(f)

In [None]:
def data_gen(data: list):
    for item in data:
        yield {
            "_index": IndexName.UWATERLOO_COURSES_INDEX,
            "_source": item
        }

In [None]:
helpers.bulk(es, data_gen(data))

In [None]:
es.search(index=IndexName.UWATERLOO_COURSES_INDEX.value, body={"from": 0, "size": 1, "query": {"match": {"courseDescription": "Machine Intelligence"}}})

In [16]:
es.indices.create(index=IndexName.UWATERLOO_COURSES_INDEX_DPR.value)



ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'uwaterloo-courses-dpr'})

In [15]:
es.indices.delete(index=IndexName.UWATERLOO_COURSES_INDEX.value)



NotFoundError: NotFoundError(404, 'index_not_found_exception', 'no such index [uwaterloo-courses-dpr]', uwaterloo-courses-dpr, index_or_alias)

In [40]:
from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizer
query = "quantum field theory"
tokenizer = DPRQuestionEncoderTokenizer.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
model = DPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
input_ids = tokenizer(query, return_tensors="pt")["input_ids"]
embeddings = model(input_ids).pooler_output.tolist()[0]


In [41]:
es.search(index=IndexName.UWATERLOO_COURSES_INDEX_DPR.value, body={"from": 0, "size": 5, "fields": ["courseName", "courseDescription"], "knn": {"field": "courseDescEncoding", 
                                                                                                 "query_vector": embeddings,
                                                                                                 "k": 10,
                                                                                                 "num_candidates": 100}})

  es.search(index=IndexName.UWATERLOO_COURSES_INDEX_DPR.value, body={"from": 0, "size": 5, "fields": ["courseName", "courseDescription"], "knn": {"field": "courseDescEncoding",


ObjectApiResponse({'took': 3, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 10, 'relation': 'eq'}, 'max_score': 0.012879057, 'hits': [{'_index': 'uwaterloo-courses-dpr', '_id': '0i6OQoYBVoGyBCeVsX7Y', '_score': 0.012879057, '_source': {'courseCode': 'PHYS 342', 'courseName': 'Electricity and Magnetism 2', 'courseDescription': "Electric and magnetic fields in media, auxiliary fields, Maxwell's equations, electromagnetic waves, electric and magnetic properties of matter. ", 'courseDescEncoding': [0.4018005132675171, 0.05958801880478859, 0.15298646688461304, 0.12181499600410461, 0.022525854408740997, 0.05837227404117584, -0.43410107493400574, -0.15936537086963654, -0.22311516106128693, -0.44660353660583496, -0.39026010036468506, 0.7343267202377319, -0.22529852390289307, 0.4483717381954193, 0.11712152510881424, 0.6333798766136169, 0.2345161736011505, 0.17092835903167725, -0.1487361043691635, -0.39300283789634705, -0.297

In [9]:
es.indices.delete(index=IndexName.UWATERLOO_COURSES_INDEX_DPR.value)



ObjectApiResponse({'acknowledged': True})

In [10]:
es.indices.create(index=IndexName.UWATERLOO_COURSES_INDEX_DPR.value, body=ESMappings.get_mapping_from_index_name(IndexName.UWATERLOO_COURSES_INDEX_DPR.value))

uwaterloo-courses-dpr


  es.indices.create(index=IndexName.UWATERLOO_COURSES_INDEX_DPR.value, body=ESMappings.get_mapping_from_index_name(IndexName.UWATERLOO_COURSES_INDEX_DPR.value))


ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'uwaterloo-courses-dpr'})

In [11]:
print(ESMappings.get_mapping_from_index_name(IndexName.UWATERLOO_COURSES_INDEX.value))

uwaterloo-courses
ESMappings.UWATERLOO_BM25_INDEX_MAPPING


In [12]:
ESMappings.get_mapping_from_index_name('uwaterloo-courses-dpr')

uwaterloo-courses-dpr


<ESMappings.UWATERLOO_DPR_INDEX_MAPPING: {'mappings': {'properties': {'courseCode': {'type': 'text'}, 'courseName': {'type': 'text'}, 'courseDescription': {'type': 'text'}, 'courseDescEncoding': {'type': 'dense_vector', 'dims': 768, 'index': True, 'similarity': 'l2_norm'}}}}>

In [4]:
ESMappings.UWATERLOO_DPR_INDEX_MAPPING

<ESMappings.UWATERLOO_DPR_INDEX_MAPPING: {'mappings': {'properties': {'courseCode': {'type': 'text'}, 'courseName': {'type': 'text'}, 'courseDescription': {'type': 'text'}, 'courseDescEncoding': {'type': 'dense_vector', 'dims': 768, 'index': True, 'similarity': 'l2_norm'}}}}>