In [13]:
from elasticsearch import Elasticsearch
from tqdm import tqdm
import os
import json
import pandas as pd
from sentence_transformers import SentenceTransformer
import math

build elasticsearch dcoker image

docker run -it \
    --rm \
    --name elasticsearch \
    -p 9200:9200 \
    -p 9300:9300 \
    -e "discovery.type=single-node" \
    -e "xpack.security.enabled=false" \
    docker.elastic.co/elasticsearch/elasticsearch:8.4.3

In [16]:
ground_truth = pd.read_csv("../data_output\ground-truth-retrieval.csv")
ground_truth = ground_truth.to_dict(orient="records")
ground_truth

[{'id': 'Cityphilia-and-cityphobia--A-multi-scalar-search-for_2024_Journal-of-Urban-M_1_1',
  'question': 'How do you think the concept of cityphobia can be used to inform policy decisions aimed at reducing urban poverty and inequality, particularly in areas with high levels of social exclusion?'},
 {'id': 'Cityphilia-and-cityphobia--A-multi-scalar-search-for_2024_Journal-of-Urban-M_1_1',
  'question': "What role do you believe the 'body' component of city love should play in the development of policies aimed at promoting physical and mental well-being in urban areas?"},
 {'id': 'Cityphilia-and-cityphobia--A-multi-scalar-search-for_2024_Journal-of-Urban-M_1_1',
  'question': "How do you think the'soul' component of city love, encompassing social cohesion and community engagement, can be leveraged to enhance the resilience of urban communities in the face of economic and social challenges?"},
 {'id': 'Cityphilia-and-cityphobia--A-multi-scalar-search-for_2024_Journal-of-Urban-M_1_2',
  '

In [31]:
def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)


def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank]:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)


def ndcg(relevance_total):
    def dcg(relevance):
        return sum((2**rel - 1) / math.log2(i + 2) for i, rel in enumerate(relevance))

    def idcg(relevance):
        return dcg(sorted(relevance, reverse=True))

    scores = []
    for relevance in relevance_total:
        if sum(relevance) == 0:
            scores.append(0.0)
        else:
            scores.append(dcg(relevance) / idcg(relevance))

    return sum(scores) / len(scores)


def evaluate(ground_truth, search_function):
    relevance_total = []

    for row in tqdm(ground_truth):
        results = search_function(row["question"])
        relevance = [d["chunk_id"] == row["id"] for d in results]
        relevance_total.append(relevance)

    return {
        "hit_rate": hit_rate(relevance_total),
        "mrr": mrr(relevance_total),
        "ndsg": ndcg(relevance_total),
    }

In [66]:
def read_json(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    return data


def load_mode(model_name):
    print(f"Loading model: {model_name}")
    return SentenceTransformer(model_name)


def fetch_documents():
    print("Fetching documents...")

    directory_path = "../json_data"

    # List all files in the directory
    files = os.listdir(directory_path)

    documents = []
    for file in files:
        print(f"Reading file: {file}")
        data = read_json(f"{directory_path}/{file}")
        documents.extend(data)
        print(f"Fetched {len(documents)} documents")
    return documents


def setup_elasticsearch(index_name, model, url_es="http://localhost:9200"):
    print("Setting up Elasticsearch...")
    es_client = Elasticsearch(url_es)

    index_settings = {
        "settings": {"number_of_shards": 1, "number_of_replicas": 0},
        "mappings": {
            "properties": {
                "doc_id": {"type": "keyword"},
                "page_num": {"type": "integer"},
                "chunk_id": {"type": "keyword"},
                "text": {"type": "text"},
                "text_vector": {
                    "type": "dense_vector",
                    "dims": model.get_sentence_embedding_dimension(),
                    "index": True,
                    "similarity": "cosine",
                },
            }
        },
    }

    es_client.indices.delete(index=index_name, ignore_unavailable=True)
    es_client.indices.create(index=index_name, body=index_settings)
    print(f"Elasticsearch index '{index_name}' created")
    return es_client


def index_documents(es_client, documents, model, index_name):
    print("Indexing documents...")
    for doc in tqdm(documents):
        doc["text_vector"] = model.encode(doc["text"]).tolist()
        es_client.index(index=index_name, document=doc)
    print(f"Indexed {len(documents)} documents")


def init_elasticsearch(model_name, index_name):
    model = load_mode(model_name)
    documents = fetch_documents()
    es_client = setup_elasticsearch(index_name, model)
    index_documents(es_client, documents, model, index_name)

In [18]:
def elastic_search_knn(
    field,
    vector,
    # course,
    index_name,
):
    es_client = Elasticsearch("http://localhost:9200")

    knn = {
        "field": field,
        "query_vector": vector,
        "k": 5,
        "num_candidates": 10000,
        # "filter": {"term": {"course": course}},
    }

    search_query = {
        "knn": knn,
        "_source": ["doc_id", "page_num", "chunk_id", "text"],
    }

    es_results = es_client.search(index=index_name, body=search_query)

    return [hit["_source"] for hit in es_results["hits"]["hits"]]

In [67]:
init_elasticsearch("all-mpnet-base-v2", "esearchtext_model_all-mpnet-base-v2")

Loading model: all-mpnet-base-v2
Fetching documents...
Reading file: Cityphilia-and-cityphobia--A-multi-scalar-search-for_2024_Journal-of-Urban-M.json
Fetched 60 documents
Reading file: How-do-local-governments-respond-to-central-mandate-in-affo_2024_Journal-of-.json
Fetched 113 documents
Reading file: Inclusive-cities--Less-crime-requires-more-lo_2024_Journal-of-Urban-Manageme.json
Fetched 118 documents
Reading file: sideris_gonzales_ong.json
Fetched 171 documents
Reading file: The_High_Cost_of_Free_Parking.json
Fetched 190 documents
Setting up Elasticsearch...
Elasticsearch index 'esearchtext_model_all-mpnet-base-v2' created
Indexing documents...


100%|██████████| 190/190 [00:11<00:00, 16.85it/s]

Indexed 190 documents





In [21]:
model = load_mode("all-mpnet-base-v2")
index_name = "esearchtext_model_all-mpnet-base-v2"


def elastic_search_knn_to_evaluate(query):
    return elastic_search_knn("text_vector", model.encode(query), index_name)

Loading model: all-mpnet-base-v2


In [32]:
evaluate(ground_truth, elastic_search_knn_to_evaluate)

100%|██████████| 150/150 [00:06<00:00, 23.24it/s]


{'hit_rate': 0.7066666666666667,
 'mrr': 0.5179999999999999,
 'ndsg': 0.5650279641723469}

In [68]:
init_elasticsearch("all-MiniLM-L6-v2", "esearchtext_model_all-minilm-l6-v2")

Loading model: all-MiniLM-L6-v2
Fetching documents...
Reading file: Cityphilia-and-cityphobia--A-multi-scalar-search-for_2024_Journal-of-Urban-M.json
Fetched 60 documents
Reading file: How-do-local-governments-respond-to-central-mandate-in-affo_2024_Journal-of-.json
Fetched 113 documents
Reading file: Inclusive-cities--Less-crime-requires-more-lo_2024_Journal-of-Urban-Manageme.json
Fetched 118 documents
Reading file: sideris_gonzales_ong.json
Fetched 171 documents
Reading file: The_High_Cost_of_Free_Parking.json
Fetched 190 documents
Setting up Elasticsearch...
Elasticsearch index 'esearchtext_model_all-minilm-l6-v2' created
Indexing documents...


100%|██████████| 190/190 [00:06<00:00, 31.19it/s]

Indexed 190 documents





In [69]:
model = load_mode("all-MiniLM-L6-v2")
index_name = "esearchtext_model_all-minilm-l6-v2"


def elastic_search_knn_to_evaluate(query):
    return elastic_search_knn("text_vector", model.encode(query), index_name)

Loading model: all-MiniLM-L6-v2


In [70]:
evaluate(ground_truth, elastic_search_knn_to_evaluate)

100%|██████████| 150/150 [00:03<00:00, 43.30it/s]


{'hit_rate': 0.64, 'mrr': 0.4779999999999999, 'ndsg': 0.5184641925761038}