In [42]:
import json

with open('documents-with-ids.json', 'rt') as f_in:
    documents = json.load(f_in)

In [70]:
from elasticsearch import Elasticsearch

es_client = Elasticsearch('http://localhost:9200') 

index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
            "id": {"type": "keyword"},


    }
    }
}

index_name = "course-questions"

es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)


ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [44]:
from tqdm.auto import tqdm

for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

100%|█████████████████████████████████████████████████████████████████████████| 948/948 [00:19<00:00, 47.72it/s]


In [45]:
def elastic_search(query, course):
    search_query = {
        "size": 5,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": course
                    }
                }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)
    
    result_docs = []
    
    for hit in response['hits']['hits']:
        result_docs.append(hit['_source'])
    
    return result_docs


In [46]:
elastic_search(
    query="I just discovered the course. Can I still join?",
    course="data-engineering-zoomcamp"
)

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp',
  'id': '7842b56a'},
 {'text': 'You can start by installing and setting up all the dependencies and requirements:\nGoogle cloud account\nGoogle Cloud SDK\nPython 3 (installed with Anaconda)\nTerraform\nGit\nLook over the prerequisites and syllabus to see if you are comfortable with these subjects.',
  'section': 'General course-related questions',
  'question': 'Course - What can I do before the course starts?',
  'course': 'data-engineering-zoomcamp',
  'id': '63394d91'},
 {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it fin

In [47]:
import pandas as pd

In [48]:
df_ground_truth = pd.read_csv('ground-truth-data.csv')

In [49]:
ground_truth = df_ground_truth.to_dict(orient='records')

In [50]:
ground_truth

[{'question': 'When does the course begin?',
  'course': 'data-engineering-zoomcamp',
  'document': 'c02e79ef'},
 {'question': 'How can I get the course schedule?',
  'course': 'data-engineering-zoomcamp',
  'document': 'c02e79ef'},
 {'question': 'What is the link for course registration?',
  'course': 'data-engineering-zoomcamp',
  'document': 'c02e79ef'},
 {'question': 'How can I receive course announcements?',
  'course': 'data-engineering-zoomcamp',
  'document': 'c02e79ef'},
 {'question': 'Where do I join the Slack channel?',
  'course': 'data-engineering-zoomcamp',
  'document': 'c02e79ef'},
 {'question': 'Where can I find the prerequisites for this course?',
  'course': 'data-engineering-zoomcamp',
  'document': '1f6520ca'},
 {'question': 'How do I check the prerequisites for this course?',
  'course': 'data-engineering-zoomcamp',
  'document': '1f6520ca'},
 {'question': 'Where are the course prerequisites listed?',
  'course': 'data-engineering-zoomcamp',
  'document': '1f6520c

In [51]:
relevance_total = []

for q in tqdm(ground_truth):
    doc_id = q['document']
    results = elastic_search(query=q['question'], course = q['course'])
    relevance = [d['id']== doc_id for d in results]
    relevance_total.append(relevance)

100%|██████████████████████████████████████████████████████████████████████| 5088/5088 [00:11<00:00, 441.12it/s]


In [52]:
relevance_total

[[True, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [False, False, True, False, False],
 [False, False, False, False, False],
 [False, False, False, True, False],
 [False, False, False, False, True],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [],
 [],
 [],
 [],
 [],
 [True, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [False, False, True, False, False],
 [False, True, False, False, False],
 [False, True, False, False, False],
 [True

- hit-rate (recall) only look if the id right id are included in list
- Mean Reciprocal Rank (mrr) look if id are in list and thte potition are right

In [53]:
def hit_rate(relevance_total):
    cnt = 0
    for line in relevance_total:
        if True in line:
            cnt +=1

    return cnt / len(relevance_total)
        

In [54]:
def mrr(relevance_total):
    total_score = 0.0
    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank]== True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)
        

In [55]:
hit_rate(relevance_total), mrr(relevance_total)

(0.6755110062893082, 0.5491778039832289)

In [56]:
import minsearch

index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course", "id"]
)

index.fit(documents)


<minsearch.Index at 0x701fc8457320>

In [57]:
def minsearch_search(query, course):
    boost = {"question": 3.0, "section":0.5}

    results = index.search(
        query = query,
        filter_dict= {'course':course},
        boost_dict = boost,
        num_results=5
    )

    return results

In [58]:
relevance_total = []

for q in tqdm(ground_truth):
    doc_id = q['document']
    results = minsearch_search(query=q['question'], course = q['course'])
    relevance = [d['id']== doc_id for d in results]
    relevance_total.append(relevance)

100%|██████████████████████████████████████████████████████████████████████| 5088/5088 [00:14<00:00, 348.10it/s]


In [63]:
hit_rate(relevance_total), mrr(relevance_total)

(0.7012578616352201, 0.6009171907756817)

compared to using es : (0.6757075471698113, 0.54914177148847)


In [60]:
def evaluate(ground_truth, search_function):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['document']
        results = search_function(q)
        relevance = [d['id']== doc_id for d in results]
        relevance_total.append(relevance)
        
    return {
        'hit_rate':hit_rate(relevance_total),
        'mrr':mrr(relevance_total)
    }

In [61]:
evaluate(ground_truth, lambda q : elastic_search(q['question'], q['course']))

100%|██████████████████████████████████████████████████████████████████████| 5088/5088 [00:08<00:00, 602.06it/s]


{'hit_rate': 0.6755110062893082, 'mrr': 0.5491778039832289}

In [62]:
evaluate(ground_truth, lambda q : minsearch_search(q['question'], q['course']))

100%|██████████████████████████████████████████████████████████████████████| 5088/5088 [00:14<00:00, 353.13it/s]


{'hit_rate': 0.7012578616352201, 'mrr': 0.6009171907756817}