In [None]:
from elasticsearch import Elasticsearch, helpers
import json

In [None]:
from indexing.enums.index_names import IndexName
from indexing.mappings.mappings import ESMappings

In [None]:
# Password for the 'elastic' user generated by Elasticsearch
from shared.configs.elastic_search_config import ELASTIC_PASSWORD

# Create the client instance.
# TLS verification stays enabled so the server certificate is actually checked
# against the CA file passed in `ca_certs`; with `verify_certs=False` the CA
# would be ignored and the basic-auth credentials could be sent to any host
# answering on this address.
es = Elasticsearch(
    "https://localhost:9200",
    ca_certs="config/certs/http_ca.crt",
    basic_auth=("elastic", ELASTIC_PASSWORD),
    verify_certs=True
)

In [None]:
# Connectivity check: request basic cluster information through the client
# created above; the response is shown as the cell output.
es.info()

# Create BM25 index

In [None]:
# Create the BM25 index using the mapping registered for its name.
index_name = IndexName.UWATERLOO_COURSES_INDEX.value
mapping = ESMappings.get_mapping_from_index_name(index_name)

# Creating an index that already exists is rejected by Elasticsearch, so only
# create it when it is missing. This keeps the cell safe to re-run
# (e.g. under "Restart Kernel -> Run All").
if not es.indices.exists(index=index_name):
    es.indices.create(index=index_name, body=mapping)

In [None]:
# Load the scraper output that will be indexed.
# The encoding is given explicitly: JSON files are UTF-8, while `open()`
# without `encoding` falls back to the platform default, which can mis-decode
# non-ASCII text on some systems.
with open("../scraping/contents/waterloo/output.json", "r", encoding="utf-8") as f:
    data = json.load(f)

In [None]:
def data_gen(data: list, index_name=None):
    """Yield one bulk-index action per item in ``data``.

    Args:
        data: Documents to index; each item becomes the ``_source`` of one
            action.
        index_name: Name of the target index as a string. When omitted, the
            string value of ``IndexName.UWATERLOO_COURSES_INDEX`` is used.

    Yields:
        dict: ``{"_index": <index name>, "_source": <item>}`` for every item,
        in input order.
    """
    if index_name is None:
        # Use the enum's string value rather than the enum member itself, as
        # every other index reference in this notebook does.
        index_name = IndexName.UWATERLOO_COURSES_INDEX.value
    for item in data:
        yield {
            "_index": index_name,
            "_source": item
        }

In [None]:
# Stream the generated actions to Elasticsearch through the bulk helper.
helpers.bulk(client=es, actions=data_gen(data))

# BM25 Inference

In [None]:
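# Maintenance helper (uncomment to run): create the BM25 index again.
# Note that no explicit mapping is passed in this call.
# es.indices.create(index=IndexName.UWATERLOO_COURSES_INDEX.value)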

In [None]:
# Maintenance helper (uncomment to run): delete the BM25 index so it can be rebuilt.
# es.indices.delete(index=IndexName.UWATERLOO_COURSES_INDEX.value)

In [None]:
# Full-text `match` query on the course description; return only the top hit.
es.search(
    index=IndexName.UWATERLOO_COURSES_INDEX.value,
    body={
        "from": 0,
        "size": 1,
        "query": {
            "match": {"courseDescription": "Machine Intelligence"},
        },
    },
)

# DPR Inference

In [None]:
# Create the DPR index using the mapping registered for its name.
# Creating an index that already exists is rejected by Elasticsearch, so the
# call is skipped when the index is already present; this keeps the cell safe
# to re-run.
dpr_index_name = IndexName.UWATERLOO_COURSES_INDEX_DPR.value
if not es.indices.exists(index=dpr_index_name):
    es.indices.create(
        index=dpr_index_name,
        body=ESMappings.get_mapping_from_index_name(dpr_index_name),
    )

In [None]:
from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizer

# Free-text query to embed with the DPR question encoder.
query = "Artificial Intelligence, Linear algebra, Math"

# Tokenizer and encoder are loaded from the same pretrained checkpoint.
tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(
    "facebook/dpr-question_encoder-single-nq-base"
)
model = DPRQuestionEncoder.from_pretrained(
    "facebook/dpr-question_encoder-single-nq-base"
)

# Encode the single query and keep the first row of the pooled output as a
# plain Python list, ready to be placed in a search request body.
input_ids = tokenizer(query, return_tensors="pt")["input_ids"]
pooled_output = model(input_ids).pooler_output
embeddings = pooled_output[0].tolist()


In [None]:
# kNN search over the DPR index using the query vector computed above;
# only the course name and description are returned for each hit.
es.search(
    index=IndexName.UWATERLOO_COURSES_INDEX_DPR.value,
    body={
        "from": 0,
        "size": 5,
        "_source": ["courseName", "courseDescription"],
        "knn": {
            "field": "courseDescEncoding",
            "query_vector": embeddings,
            "k": 10,
            "num_candidates": 100,
        },
    },
)

# T5 Inference

In [None]:
from sentence_transformers import SentenceTransformer
# Free-text query to embed with the Sentence-T5 model.
# NOTE(review): `query`, `model` and `embeddings` rebind the names assigned in
# the DPR cell above, so after this cell runs the DPR search would use the T5
# vector unless the DPR encoding cell is executed again first.
query = "Artificial Intelligence, Linear algebra, Math"
model = SentenceTransformer("sentence-transformers/sentence-t5-base")
# Encode the query; the result is used as `query_vector` in the T5 kNN search.
embeddings = model.encode(query)

In [None]:
# kNN search over the T5 index using the query vector computed above;
# only the course name and description are returned for each hit.
es.search(
    index=IndexName.UWATERLOO_COURSES_INDEX_T5.value,
    body={
        "from": 0,
        "size": 5,
        "_source": ["courseName", "courseDescription"],
        "knn": {
            "field": "courseDescEncoding",
            "query_vector": embeddings,
            "k": 10,
            "num_candidates": 100,
        },
    },
)