## Evaluate retrieval metrics for text
Using the ChatGPT-generated FAQ questions as ground truth, we evaluate whether each search engine retrieves the document a question was generated from.

In [8]:
import json
from elasticsearch import Elasticsearch
from tqdm.auto import tqdm
import pandas as pd

In [2]:


# Load the FAQ documents (each one carries an 'id' used later for evaluation).
# The encoding is passed explicitly so decoding does not depend on the
# platform's default locale encoding.
with open('documents-with-ids.json', 'rt', encoding='utf-8') as f_in:
    documents = json.load(f_in)

In [3]:

index_name = "course-questions"

# One shard and no replicas; the mapping declares three "text" fields and
# two "keyword" fields.
index_settings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {
        "properties": {
            **{field: {"type": "text"} for field in ("text", "section", "question")},
            **{field: {"type": "keyword"} for field in ("course", "id")},
        }
    },
}

es_client = Elasticsearch('http://localhost:9200')

# Drop any previous copy of the index first so we always start from a fresh
# one, then create it with the settings above.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [5]:
# Send every loaded document to the index, one request per document;
# tqdm renders the progress bar.
for record in tqdm(documents):
    es_client.index(index=index_name, document=record)

  0%|          | 0/948 [00:00<?, ?it/s]

In [6]:
# generate search query
# note that compared to previous functions, this one takes the course type as input
def elastic_search(query, course, num_results=5):
    """Search the Elasticsearch index for ``query`` within a single course.

    Uses the module-level ``es_client`` and ``index_name``.

    Args:
        query: free-text question; matched against the ``question`` (boosted
            3x), ``text`` and ``section`` fields.
        course: value the document's ``course`` field must equal (applied as
            a ``term`` filter).
        num_results: maximum number of hits to request. Defaults to 5, the
            value that was previously hard-coded.

    Returns:
        A list with the ``_source`` dict of every hit, in the order the
        hits appear in the response.
    """
    search_query = {
        "size": num_results,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": course
                    }
                }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)

    # Keep only the stored documents; scores and other hit metadata are dropped.
    return [hit['_source'] for hit in response['hits']['hits']]

In [7]:
# Smoke test: run one example question against a single course and
# display the documents that come back.
elastic_search(
    query="I just discovered the course. Can I still join?",
    course="data-engineering-zoomcamp"
)

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp',
  'id': '7842b56a'},
 {'text': 'You can start by installing and setting up all the dependencies and requirements:\nGoogle cloud account\nGoogle Cloud SDK\nPython 3 (installed with Anaconda)\nTerraform\nGit\nLook over the prerequisites and syllabus to see if you are comfortable with these subjects.',
  'section': 'General course-related questions',
  'question': 'Course - What can I do before the course starts?',
  'course': 'data-engineering-zoomcamp',
  'id': '63394d91'},
 {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it fin

### Iterating through queries in ground truth data

In [10]:
# Load the ground-truth questions and turn each CSV row into a dict.
# Later cells read the 'question', 'course' and 'document' keys of each record.
df_ground_truth = pd.read_csv('ground-truth-data.csv')
ground_truth = df_ground_truth.to_dict(orient='records')

In [28]:
relevance_total = []
for entry in tqdm(ground_truth):
    expected_id = entry['document']
    hits = elastic_search(query=entry['question'], course=entry['course'])
    # one boolean per returned document: does it carry the expected id?
    relevance_total.append([hit['id'] == expected_id for hit in hits])

  0%|          | 0/4627 [00:00<?, ?it/s]

In [29]:
# Peek at the first rows of the relevance list instead of dumping all of it.
# A row is empty when the search returned no documents for that question
# (presumably because ChatGPT produced a placeholder such as "question 1 2 3..."
# instead of a real question -- TODO confirm).

relevance_total[:30]

[[True, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [False, False, True, False, False],
 [False, False, False, False, False],
 [False, False, False, True, False],
 [False, False, False, False, True],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [],
 [],
 [],
 [],
 [],
 [True, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [False, False, True, False, False],
 [False, True, False, False, False],
 [False, True, False, False, False],
 [True

### Evaluate the different metrics

In [30]:
# Small hand-worked example. The trailing comments give each row's
# contribution to the two metrics:       hit rate     MRR
example = [
    [True, False, False, False, False], # 1            1
    [False, False, False, False, False], # 0            0
    [False, False, False, False, False], # 0            0
    [False, False, False, False, False], # 0            0
    [False, False, False, False, False], # 0            0
    [True, False, False, False, False], # 1            1
    [True, False, False, False, False], # 1            1
    [True, False, False, False, False], # 1            1
    [True, False, False, False, False], # 1            1
    [True, False, False, False, False], # 1            1
    [False, False, True, False, False],  # 1            1/3
    [False, False, False, False, False], # 0            0
]
# expected totals: hit rate = 7/12 (0.58), MRR = 6.33/12 (0.53)

### Hit Rate (recall)
Hit rate looks at each query's result list: the query counts as a hit if the relevant document appears anywhere in it. The metric is the fraction of queries that are hits.

In [41]:
def hit_rate(relevance_total):
    """Return the fraction of queries whose results contain a relevant document.

    Args:
        relevance_total: list with one entry per query; each entry is a list
            of booleans, one per returned result, True where the result is
            the expected document. An entry may be empty.

    Returns:
        The number of entries containing at least one True divided by the
        number of entries, or 0.0 when ``relevance_total`` is empty.
    """
    total = len(relevance_total)
    if total == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0

    hits = sum(1 for line in relevance_total if any(line))
    return hits / total

In [42]:
# should match the hand-computed value from the example: 7/12 (0.58)
hit_rate(example)

0.5833333333333334

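### Mean reciprocal rank (MRR)
MRR is similar to hit rate, but each hit is weighted by the reciprocal of the rank at which the relevant document appears (1 for first place, 1/2 for second, and so on). Queries with no hit contribute 0.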

In [43]:
def mrr(relevance_total):
    """Return the mean reciprocal rank over all queries.

    For each query the score is 1 / rank of the FIRST relevant result
    (rank counted from 1), or 0 when no result is relevant.

    Args:
        relevance_total: list with one entry per query; each entry is a list
            of booleans, one per returned result in ranked order, True where
            the result is the expected document. An entry may be empty.

    Returns:
        The sum of the per-query scores divided by the number of queries,
        or 0.0 when ``relevance_total`` is empty.
    """
    total = len(relevance_total)
    if total == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0

    total_score = 0.0
    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant:
                total_score += 1 / rank
                # Only the first relevant result counts; without this a row
                # with several matches would contribute more than 1.
                break

    return total_score / total

In [44]:
# should match the hand-computed value from the example: 6.33/12 (0.53)
mrr(example)

0.5277777777777778

In [45]:
# Compute both metrics on the full ground truth.
# At this point in the notebook relevance_total holds the Elasticsearch results.
m1 = hit_rate(relevance_total)
m2 = mrr(relevance_total)

print(f'hit rate is {m1:.3f} and mrr is {m2:.3f}')

hit rate is 0.740 and mrr is 0.603


## Evaluating the same metrics with minsearch

In [46]:
# Download minsearch.py into the working directory so it can be imported as a
# local module (needs network access and the wget command).
!wget https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py
import minsearch # Alexey's small and fast search engine


--2024-09-20 22:14:25--  https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3832 (3.7K) [text/plain]
Saving to: ‘minsearch.py’


2024-09-20 22:14:26 (10.8 MB/s) - ‘minsearch.py’ saved [3832/3832]



In [47]:
# Initialize the index: declare which fields are free-text searchable
# and which are keyword fields (the search cell filters on 'course').
index = minsearch.Index(
    text_fields=['text','section','question'],
    keyword_fields=['course','id']
)

# Build the index from the loaded documents.
index.fit(docs=documents)

<minsearch.Index at 0x134d34550>

In [48]:
# set up fcn for actually doing a search
def minsearch_search(query, course, num_results=5):
    """Search the minsearch index for ``query`` within a single course.

    Uses the module-level ``index``.

    Args:
        query: free-text question to search for.
        course: value passed as the ``course`` entry of ``filter_dict``.
        num_results: maximum number of results to request. Defaults to 5, the
            value that was previously hard-coded.

    Returns:
        Whatever ``index.search`` returns; callers in this notebook iterate
        over it and read each item's ``'id'``.
    """
    # 'question' is boosted and 'section' is down-weighted; 'text' is not
    # listed in the boost dict.
    # NOTE(review): elastic_search boosts 'question' by 3 as well but does not
    # down-weight 'section', so the two engines are not configured identically.
    boost = {'question': 3.0, 'section': 0.5}

    return index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=num_results
    )

In [49]:
# Same relevance bookkeeping as before, this time querying minsearch.
relevance_total = []
for entry in tqdm(ground_truth):
    expected_id = entry['document']
    hits = minsearch_search(query=entry['question'], course=entry['course'])
    # one boolean per returned document: does it carry the expected id?
    relevance_total.append([hit['id'] == expected_id for hit in hits])

  0%|          | 0/4627 [00:00<?, ?it/s]

In [51]:
# Same two metrics on the full ground truth, now from the minsearch results.
m1 = hit_rate(relevance_total)
m2 = mrr(relevance_total)

print(f'hit rate is {m1:.3f} and mrr is {m2:.3f}') 

# Note: both metrics are slightly better with minsearch, but not significantly.
# For comparison, the Elasticsearch run earlier in this notebook printed:
# hit rate is 0.740 and mrr is 0.603

hit rate is 0.772 and mrr is 0.661


## Create a more generic evaluation

In [52]:
def evaluate(ground_truth, search_function):
    """Score a search function against the ground-truth records.

    Every ground-truth record is handed to ``search_function``; each document
    it returns is marked relevant when its ``'id'`` equals the record's
    ``'document'`` value. The resulting per-record relevance lists are then
    summarized with ``hit_rate`` and ``mrr``.

    Args:
        ground_truth: iterable of dict records, each with a ``'document'`` key.
        search_function: callable taking one record and returning an iterable
            of documents that each have an ``'id'`` key.

    Returns:
        A dict with the keys ``'hit_rate'`` and ``'mrr'``.
    """
    def relevance_for(record):
        # Read the expected id first, then run the search for this record.
        expected_id = record['document']
        return [found['id'] == expected_id for found in search_function(record)]

    relevance_total = [relevance_for(record) for record in tqdm(ground_truth)]

    scores = {}
    scores['hit_rate'] = hit_rate(relevance_total)
    scores['mrr'] = mrr(relevance_total)
    return scores

In [53]:
# Elasticsearch: the lambda adapts elastic_search to the one-argument
# (ground-truth record) interface that evaluate() calls.
evaluate(ground_truth, lambda q: elastic_search(query=q['question'],course=q['course']))

  0%|          | 0/4627 [00:00<?, ?it/s]

{'hit_rate': 0.7395720769397017, 'mrr': 0.6029788920106625}

In [54]:
# minsearch: the lambda adapts minsearch_search to the one-argument
# (ground-truth record) interface that evaluate() calls.
evaluate(ground_truth, lambda q: minsearch_search(query=q['question'],course=q['course']))

  0%|          | 0/4627 [00:00<?, ?it/s]

{'hit_rate': 0.7722066133563864, 'mrr': 0.661454506159499}