In [34]:
import json

# Load the document collection that is indexed and searched below.
# JSON text is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform's locale-dependent default encoding.
with open('documents-with-ids.json', 'rt', encoding='utf-8') as f_in:
    documents_with_ids = json.load(f_in)

In [35]:
from elasticsearch import Elasticsearch

# Client for the Elasticsearch node listening on localhost:9200.
es_client = Elasticsearch("http://localhost:9200")

# Name of the index that holds the conversation documents.
index_name = "conversations"

# Index definition: one shard, no replicas; "Context" and "Response" are
# full-text fields, while "ID" is stored as a keyword.
index_mapping = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {
        "properties": {
            "Context": {"type": "text"},
            "Response": {"type": "text"},
            "ID": {"type": "keyword"},
        }
    },
}

# Start from a clean slate: drop the index from any earlier run
# (ignore_unavailable=True is passed so a missing index is tolerated).
es_client.indices.delete(index=index_name, ignore_unavailable=True)

# Recreate the index using the definition above.
es_client.indices.create(index=index_name, body=index_mapping)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'conversations'})

In [36]:
from tqdm.auto import tqdm

# Send every loaded document to the Elasticsearch index, one request per
# document; tqdm renders a progress bar over the collection.
for doc in tqdm(documents_with_ids):
    es_client.index(index=index_name, document=doc)

  0%|          | 0/2051 [00:00<?, ?it/s]

In [None]:
def elastic_search(query):
    """Search the Elasticsearch index and return the matching documents.

    Runs a ``cross_fields`` ``multi_match`` query for ``query`` over the
    "Context" and "Response" fields, limited to 5 hits.

    Args:
        query: Query text to search for.

    Returns:
        A list with the ``_source`` of every returned hit, in the order
        Elasticsearch returned them.
    """
    multi_match = {
        "query": query,
        "fields": ["Context", "Response"],
        "type": "cross_fields",
    }
    search_query = {
        "size": 5,
        "query": {"bool": {"must": {"multi_match": multi_match}}},
    }

    response = es_client.search(index=index_name, body=search_query)

    return [hit['_source'] for hit in response['hits']['hits']]

In [38]:
import pandas as pd

# Ground-truth evaluation set; the evaluation cells below read the "ID" and
# "Context" values of each row.
df_ground_truth = pd.read_csv("ground-truth-data_ContextOnly_new_latest.csv")

# Convert the frame to a list with one dict per CSV row.
ground_truth = df_ground_truth.to_dict(orient="records")

In [39]:
def hit_rate(relevance_total):
    """Fraction of queries whose result list contains a relevant document.

    Args:
        relevance_total: One sequence of relevance flags per query, where
            each flag tells whether the result at that rank is relevant.

    Returns:
        The share of queries with at least one truthy flag, as a float in
        [0, 1]. Returns 0.0 when there are no queries, instead of raising
        ZeroDivisionError.
    """
    total = len(relevance_total)
    if total == 0:
        # Nothing was evaluated, so there is nothing to average over.
        return 0.0

    hits = sum(1 for line in relevance_total if any(line))

    return hits / total

In [40]:
def mrr(relevance_total):
    """Mean reciprocal rank over a set of queries.

    For each query only the first relevant result counts: it contributes
    1 / rank (rank starting at 1). A query with no relevant result
    contributes 0.

    Args:
        relevance_total: One sequence of relevance flags per query, ordered
            by rank, where each flag tells whether that result is relevant.

    Returns:
        The average reciprocal rank as a float in [0, 1]. Returns 0.0 when
        there are no queries, instead of raising ZeroDivisionError.
    """
    total = len(relevance_total)
    if total == 0:
        # Nothing was evaluated, so there is nothing to average over.
        return 0.0

    total_score = 0.0

    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant:
                total_score += 1 / rank
                # Stop at the first relevant hit; adding later hits as well
                # would inflate the score and could push it above 1.
                break

    return total_score / total

In [62]:
import minsearch

# Build the minsearch index over the same documents that were sent to
# Elasticsearch, so both engines are evaluated on identical data.
index = minsearch.Index(
    text_fields=["Context", "Response"],
    keyword_fields=["ID"],  # "ID" is registered as a keyword field
)

# Fit the index on the loaded documents; the call's return value is what the
# cell displays.
index.fit(documents_with_ids)

<minsearch.Index at 0x7fd8d61fb070>

In [42]:
def search_minsearch(query):
    """Search the minsearch index and return its results.

    Both text fields ("Context" and "Response") get a boost of 1.0 and at
    most 5 results are requested.

    Args:
        query: Query text to search for.

    Returns:
        Whatever ``index.search`` returns for the query.
    """
    return index.search(
        query=query,
        boost_dict={"Context": 1.0, "Response": 1.0},
        num_results=5,
    )

In [43]:
def evaluate(ground_truth, search_function):
    """Score a search function against the ground-truth records.

    Args:
        ground_truth: Iterable of records; each record must provide an "ID"
            key naming the document that should be retrieved.
        search_function: Callable that receives a whole ground-truth record
            and returns result documents, each providing an "ID" key.

    Returns:
        A dict with the keys "hit_rate" and "mrr", computed from the
        per-query relevance flags.
    """
    def relevance_flags(record):
        # Flag each returned document by whether it is the expected one.
        expected_id = record["ID"]
        return [found["ID"] == expected_id for found in search_function(record)]

    relevance_total = [relevance_flags(record) for record in tqdm(ground_truth)]

    return {
        "hit_rate": hit_rate(relevance_total),
        "mrr": mrr(relevance_total),
    }

In [77]:
# Evaluate the Elasticsearch retriever: each ground-truth record's "Context"
# value is used as the query text.
evaluate(ground_truth, lambda q: elastic_search(q["Context"]))

  0%|          | 0/10255 [00:00<?, ?it/s]

{'hit_rate': 0.6404680643588493, 'mrr': 0.5075426621160373}

In [45]:
# Evaluate the minsearch retriever: each ground-truth record's "Context"
# value is used as the query text.
evaluate(ground_truth, lambda q: search_minsearch(q["Context"]))

  0%|          | 0/10255 [00:00<?, ?it/s]

{'hit_rate': 0.48132618235007313, 'mrr': 0.34432959531935703}