In [26]:
import pandas as pd

## Ingest ground truth

In [27]:
df_question = pd.read_csv('../data/ground-truth-data.csv')

In [28]:
df_question

Unnamed: 0,id,question
0,1,In which county is Geirangerfjord located?
1,1,What months are best for visiting Geirangerfjord?
2,1,What activities are available at Geirangerfjor...
3,1,Is Geirangerfjord designated as a UNESCO World...
4,1,What natural features and viewpoints can I exp...
...,...,...
745,150,In which county is Romsdalsfjord located?
746,150,What months are best to visit Romsdalsfjord?
747,150,What outdoor activities are recommended at Rom...
748,150,What natural features and sights make Romsdals...


In [29]:
ground_truth = df_question.to_dict(orient='records')

In [30]:
ground_truth[0]

{'id': 1, 'question': 'In which county is Geirangerfjord located?'}

## Evaluate search results against the ground truth

In [31]:
df = pd.read_csv('../data/data.csv')

In [32]:
documents = df.to_dict(orient='records')

In [33]:
documents[0]

{'id': 1,
 'attraction': 'Geirangerfjord',
 'activity_type': 'Sightseeing / Nature',
 'county': 'Møre og Romsdal',
 'time_to_visit': 'May, June, July, August, September',
 'description': 'One of Norway’s most iconic fjords, surrounded by steep cliffs, waterfalls, and small farms. It is a UNESCO World Heritage Site and offers boat tours, kayaking, and breathtaking viewpoints such as Dalsnibba.'}

In [34]:
import minsearch

In [35]:
# Configure the minsearch index over the attraction records: the five
# descriptive columns are registered as text fields (the ones search() assigns
# boosts to), and 'id' is registered as a keyword field.
# NOTE(review): in minsearch, keyword fields are presumably used for exact-match
# filtering via filter_dict rather than text scoring - confirm against the
# library docs.
index = minsearch.Index(
    text_fields = ['attraction', 'activity_type', 'county', 'time_to_visit',
       'description'],
    keyword_fields=['id']
)

In [36]:
index.fit(documents)

<minsearch.minsearch.Index at 0x7e56bed05370>

In [37]:
def search(query):
    """Query the notebook-level ``index`` with per-field boosts.

    Args:
        query: The free-text question to search for.

    Returns:
        Whatever ``index.search`` returns when asked for 5 results with an
        empty filter and the boost weights below.
    """
    # Relative field weights: the attraction name counts most, then activity
    # type and county, with visiting months and description as the baseline.
    field_weights = dict(
        attraction=3,
        activity_type=2,
        county=2,
        time_to_visit=1,
        description=1,
    )

    return index.search(
        query=query,
        filter_dict={},
        boost_dict=field_weights,
        num_results=5,
    )

In [38]:
search('In which county is Romsdalsfjord located?')

[{'id': 150,
  'attraction': 'Romsdalsfjord',
  'activity_type': 'Sightseeing / Nature',
  'county': 'Møre og Romsdal',
  'time_to_visit': 'May, June, July, August, September',
  'description': 'A stunning fjord known for its steep cliffs, waterfalls, and charming coastal villages. Ideal for boat tours, kayaking, and scenic photography.'},
 {'id': 17,
  'attraction': 'Nidaros Cathedral',
  'activity_type': 'Historical / Religious',
  'county': 'Trøndelag',
  'time_to_visit': 'January, February, March, April, May, June, July, August, September, October, November, December',
  'description': 'Located in Trondheim, this Gothic cathedral is Norway’s most important church. Built over the burial site of Saint Olav, it is the northernmost medieval cathedral in the world.'},
 {'id': 107,
  'attraction': 'Østfold Museum Halden',
  'activity_type': 'Culture / History',
  'county': 'Viken',
  'time_to_visit': 'May, June, July, August, September',
  'description': 'A regional museum with exhibitio

In [39]:
from tqdm.auto import tqdm

In [40]:
def hit_rate(relevance_total):
    """Return the fraction of queries with at least one relevant result.

    Args:
        relevance_total: A sequence with one entry per query; each entry is a
            sequence of booleans, one per returned result, that is True where
            the result is the expected document.

    Returns:
        The share of queries whose entry contains True, as a float in [0, 1].
        Returns 0.0 for an empty ``relevance_total`` instead of raising
        ZeroDivisionError.
    """
    query_count = len(relevance_total)
    if query_count == 0:
        # Nothing was evaluated, so there are no hits to report.
        return 0.0

    hits = sum(1 for line in relevance_total if True in line)
    return hits / query_count

In [41]:
def mrr(relevance_total):
    """Return the Mean Reciprocal Rank (MRR) over all queries.

    For each query, only the first relevant result counts: it contributes
    ``1 / rank`` (rank is 1-based). A query with no relevant result
    contributes 0.

    Args:
        relevance_total: A sequence with one entry per query; each entry is a
            sequence of booleans, one per returned result in ranked order,
            that is True where the result is the expected document.

    Returns:
        The average reciprocal rank as a float in [0, 1]. Returns 0.0 for an
        empty ``relevance_total`` instead of raising ZeroDivisionError.
    """
    query_count = len(relevance_total)
    if query_count == 0:
        # Nothing was evaluated, so there is no rank to average.
        return 0.0

    total_score = 0.0

    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            # Equality with True (not truthiness) mirrors the `True in line`
            # membership test used by hit_rate.
            if is_relevant == True:  # noqa: E712
                total_score += 1 / rank
                # Stop at the first hit: without this, a list with several
                # relevant entries adds multiple reciprocal ranks and the
                # score can exceed 1.
                break

    return total_score / query_count

In [42]:
def evaluate(ground_truth, search_function):
    """Score a search function against the ground-truth questions.

    Args:
        ground_truth: Iterable of records; each record must have an 'id' key
            holding the id of the document the question was written for.
        search_function: Callable that receives the whole ground-truth record
            and returns an iterable of result dicts, each with an 'id' key.

    Returns:
        A dict with the keys 'hit_rate' and 'mrr', computed over the
        per-query relevance lists.
    """
    per_query_relevance = []

    for record in tqdm(ground_truth):
        expected_id = record['id']
        hits = search_function(record)
        # One boolean per returned result: does it match the expected document?
        per_query_relevance.append([hit['id'] == expected_id for hit in hits])

    metrics = {}
    metrics['hit_rate'] = hit_rate(per_query_relevance)
    metrics['mrr'] = mrr(per_query_relevance)
    return metrics

In [43]:
# evaluate() passes each whole ground-truth record (a dict with 'id' and
# 'question') to the search function, so the lambda extracts the question text
# that search() expects.
evaluate(ground_truth, lambda q: search(q['question']))

100%|██████████| 750/750 [00:03<00:00, 241.03it/s]


{'hit_rate': 0.992, 'mrr': 0.9733777777777778}