In [17]:
from elasticsearch import Elasticsearch, helpers
import json

In [18]:
from indexing.enums.index_names import IndexName
from indexing.mappings.mappings import ESMappings

In [19]:
# Password for the 'elastic' user generated by Elasticsearch
from shared.configs.elastic_search_config import ELASTIC_PASSWORD

# Create the client instance.
# A CA certificate is supplied through `ca_certs`, so certificate verification
# stays enabled: the client then only accepts a node whose certificate chains
# to that CA. Turning verification off would make `ca_certs` pointless and
# leaves the basic-auth credentials exposed to a man-in-the-middle.
es = Elasticsearch(
    "https://localhost:9200",
    ca_certs="config/certs/http_ca.crt",
    basic_auth=("elastic", ELASTIC_PASSWORD),
    verify_certs=True,
)

  _transport = transport_class(


In [None]:
# Request basic cluster information; a successful response shows that the
# client created above can reach the node and authenticate.
es.info()

# Create BM25 index

In [None]:
# Create the BM25 index using the mapping the project defines for that index name.
index_name = IndexName.UWATERLOO_COURSES_INDEX.value
mapping = ESMappings.get_mapping_from_index_name(index_name)
# Guard the create call so that re-running this cell (e.g. Restart & Run All)
# does not fail because the index already exists from a previous run.
if not es.indices.exists(index=index_name):
    es.indices.create(index=index_name, body=mapping)

In [None]:
# Load the records that will be indexed.
# The encoding is pinned to UTF-8 (the standard JSON encoding) so the file is
# decoded the same way regardless of the platform's default locale encoding.
with open("../scraping/contents/waterloo/output.json", "r", encoding="utf-8") as f:
    data = json.load(f)

In [None]:
def data_gen(data: list, index_name: "str | None" = None):
    """Yield one bulk-index action per record, for use with ``helpers.bulk``.

    Args:
        data: Records to index; each one becomes the ``_source`` of an action.
        index_name: Name of the target index. When omitted, the value of
            ``IndexName.UWATERLOO_COURSES_INDEX`` is used.

    Yields:
        Dicts of the form ``{"_index": <index name>, "_source": <record>}``.
    """
    if index_name is None:
        # Use the enum's string value rather than the enum member itself, as
        # every other cell does: the action metadata must carry the plain
        # index-name string.
        index_name = IndexName.UWATERLOO_COURSES_INDEX.value
    for item in data:
        yield {"_index": index_name, "_source": item}

In [None]:
# Send the actions produced by data_gen(data) to Elasticsearch through the
# client's bulk helper.
helpers.bulk(es, data_gen(data))

# BM25 Inference

In [None]:
# Utility (kept commented out): create the BM25 index without an explicit mapping.
# es.indices.create(index=IndexName.UWATERLOO_COURSES_INDEX.value)

In [None]:
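# Utility (kept commented out): delete the BM25 index. Destructive; the data must be re-indexed afterwards.
# es.indices.delete(index=IndexName.UWATERLOO_COURSES_INDEX.value)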

In [None]:
# Full-text match query on the course description; return only the top hit.
bm25_request = {
    "from": 0,
    "size": 1,
    "query": {
        "match": {"courseDescription": "Machine Intelligence"},
    },
}
es.search(index=IndexName.UWATERLOO_COURSES_INDEX.value, body=bm25_request)

# DPR Inference

In [None]:
# Create the DPR index using the mapping the project defines for that index name.
dpr_index_name = IndexName.UWATERLOO_COURSES_INDEX_DPR.value
# Guard the create call so that re-running this cell does not fail because the
# index already exists from a previous run.
if not es.indices.exists(index=dpr_index_name):
    es.indices.create(
        index=dpr_index_name,
        body=ESMappings.get_mapping_from_index_name(dpr_index_name),
    )

In [26]:
import torch
from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizer

# Encode the query text with the DPR question encoder.
query = "Machine Intelligence"
tokenizer = DPRQuestionEncoderTokenizer.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
model = DPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
input_ids = tokenizer(query, return_tensors="pt")["input_ids"]
# Inference only: disable autograd so no gradient graph is built or kept in
# memory for the forward pass.
with torch.no_grad():
    # A single query was tokenized, so [0] picks its vector out of the batch
    # as a plain Python list of floats.
    embeddings = model(input_ids).pooler_output.tolist()[0]

In [27]:
# kNN search against the description-encoding field of the DPR index, using
# the query vector computed in the previous cell.
dpr_knn = {
    "field": "courseDescEncoding",
    "query_vector": embeddings,
    "k": 10,
    "num_candidates": 100,
}
dpr_request = {
    "from": 0,
    "size": 5,
    "_source": ["courseName", "courseDescription"],
    "knn": dpr_knn,
}
es.search(index=IndexName.UWATERLOO_COURSES_INDEX_DPR.value, body=dpr_request)

  es.search(index=IndexName.UWATERLOO_COURSES_INDEX_DPR.value, body={"from": 0, "size": 5, "_source": ["courseName", "courseDescription"], "knn": {"field": "courseDescEncoding",


ObjectApiResponse({'took': 3, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 10, 'relation': 'eq'}, 'max_score': 0.012241292, 'hits': [{'_index': 'uwaterloo-courses-dpr', '_id': 'CTOvXIYBqrXUtKBqaQZJ', '_score': 0.012241292, '_source': {'courseName': 'Multivariate Statistics', 'courseDescription': 'The theory and application of multivariate statistics, with particular emphasis upon the use of the computer.'}}, {'_index': 'uwaterloo-courses-dpr', '_id': 'SDOxXIYBqrXUtKBqPApV', '_score': 0.012241292, '_source': {'courseName': 'Multivariate Statistics', 'courseDescription': 'The theory and application of multivariate statistics, with particular emphasis upon the use of the computer.'}}, {'_index': 'uwaterloo-courses-dpr', '_id': '7jOzXIYBqrXUtKBqWw4F', '_score': 0.011992848, '_source': {'courseName': 'Digital Computation', 'courseDescription': 'Computer systems, problem solving, data and programs, structured programming

# T5 Inference

In [21]:
from sentence_transformers import SentenceTransformer

# Load the Sentence-T5 checkpoint, then embed the query text with it.
model = SentenceTransformer("sentence-transformers/sentence-t5-base")
query = "Machine Intelligence"
embeddings = model.encode(query)

In [23]:
# kNN search against the description-encoding field of the T5 index, using
# the query vector computed in the previous cell.
t5_knn = {
    "field": "courseDescEncoding",
    "query_vector": embeddings,
    "k": 10,
    "num_candidates": 100,
}
t5_request = {
    "from": 0,
    "size": 5,
    "_source": ["courseCode", "courseName", "courseDescription"],
    "knn": t5_knn,
}
es.search(index=IndexName.UWATERLOO_COURSES_INDEX_T5.value, body=t5_request)

  es.search(index=IndexName.UWATERLOO_COURSES_INDEX_T5.value, body={"from": 0, "size": 5, "_source": ["courseCode", "courseName", "courseDescription"], "knn": {"field": "courseDescEncoding",


ObjectApiResponse({'took': 4, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 10, 'relation': 'eq'}, 'max_score': 0.7780152, 'hits': [{'_index': 'uwaterloo-courses-t5', '_id': 'jTPBXIYBqrXUtKBqixRO', '_score': 0.7780152, '_source': {'courseCode': 'ECE 457B', 'courseName': 'Fundamentals of Computational Intelligence', 'courseDescription': 'Fundamentals and recent advances in computational intelligence. Building accurate models with collected data or rules bases. Model-based prediction and classification. Concepts in machine learning, supervised and unsupervised learning, artificial neural networks, deep learning, feature extraction, feature selection, dimensionality reduction, classification and clustering, support vector machines.  Approximate reasoning based on fuzzy set theory. Performance metrics to assess the validity of produced models. Multiple examples and case studies such as autonomous driving, intelligent ma

In [29]:
from sentence_transformers import CrossEncoder

# Score each (query, passage) pair with a cross-encoder; the result holds one
# score per pair, in the order the passages are listed.
model = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2', max_length=512)
query = "Machine Intelligence"
passages = [
    "Goals and methods of artificial intelligence. Methods of general problem solving. Knowledge representation and reasoning. Planning. Reasoning about uncertainty. Machine learning. Multi-agent systems. Natural language processing. ",
    'This is the second in a two-course sequence that introduces analytical and statistical methods commonly used in business for accounting and finance professionals.',
]
scores = model.predict([(query, passage) for passage in passages])


In [30]:
# Display the cross-encoder scores, one per (query, passage) pair in the order
# the pairs were passed to predict().
scores

array([  2.4125557, -11.298454 ], dtype=float32)