## Evaluate retrieval metrics for text
From the ChatGPT generated faqs, we are going to evaluate if we can retrieve the relevant documents.

In [6]:
import json
from elasticsearch import Elasticsearch
from tqdm.auto import tqdm
import pandas as pd
from sentence_transformers import SentenceTransformer

## 1. Read the data with all 5 faqs per doc

In [7]:
# read in the 5 faqs w ids for each question
with open('documents-with-ids.json', 'rt') as f_in:
    documents = json.load(f_in)

In [8]:
# different pre-trained models available, we will use cosine similarity
model_name = 'multi-qa-MiniLM-L6-cos-v1'
model = SentenceTransformer(model_name)



In [9]:
# test it
v = model.encode('I just discovered the course, can I still join?')
print(f'length is {len(v)}')

#notes: this vector, when comparing to others, will get a max of 1 during dot prduct. 

length is 384


## 2. setup Elasticsearch for vectorization indexing

In [10]:
# set up elasticsearch
# we want a vector for question, answer and q+a
es_client = Elasticsearch('http://localhost:9200') 

index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
            "id": {"type": "keyword"},
            "question_vector": {"type":"dense_vector","dims":384,"index":True,"similarity":"cosine"},
            "text_vector": {"type":"dense_vector","dims":384,"index":True,"similarity":"cosine"},
            "question_text_vector": {"type":"dense_vector","dims":384,"index":True,"similarity":"cosine"},

        }
    }
}

index_name = "course-questions"

# create index, make sure you have new one
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

## 3. vectorize q, a, q+a and index w/elasticsearch

In [12]:
# vectorize
for doc in tqdm(documents):
    question = doc['question']
    text = doc['text']
    qt = question + ' ' + text
    doc['question_vector'] = model.encode(question)
    doc['text_vector'] = model.encode(text)
    doc['question_text_vector'] = model.encode(qt)

  0%|          | 0/948 [00:00<?, ?it/s]

In [13]:
#index it
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

  0%|          | 0/948 [00:00<?, ?it/s]

## 4. make an actual search query

In [80]:
# create the proper query settings
def elastic_search_knn(field, vector, course):
    knn = {
        "field": field,
        "query_vector": vector,
        "k": 5, #get me 5 nearest neighbors matching this search term
        "num_candidates": 10000, #group of documents that search will look into
        "filter": {
                "term": {
                    "course": course
                }
            }
    }

    search_query = {
            "knn": knn,
            "_source": ["text", "section", "question", "course", "id"]
        }
    # search properly using semantic search
    response = es_client.search(
        index=index_name,
        body =search_query,
        )
       
    #parse results
    result_docs = []
    for hit in response['hits']['hits']:
        result_docs.append(hit['_source'])

    return result_docs

In [81]:
# generate full query from incoming text
def question_vector_knn(q):
    question = q['question']
    course = q['course']

    v_q = model.encode(question)

    return elastic_search_knn('question_vector', v_q, course)

In [82]:
# we try a proper query
query = 'I just discovered the course. Can I still join it?'
course = 'data-engineering-zoomcamp'
q = {'question':query,
     'course':course}

response=question_vector_knn(q)

response

[{'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp',
  'section': 'General course-related questions',
  'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'id': '7842b56a'},
 {'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp',
  'section': 'General course-related questions',
  'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'id': '7842b56a'},
 {'question': 'Course - Can I follow the course after it finishes?',
  'course': 'data-engineering-zoomcamp',
  'section': 'General course-related questions',
  'text': 'Yes, we 

## 5. Evaluate against groound truth

In [83]:
def hit_rate(relevance_total):
    ''' 
    This fcn evaluates hit rate
    '''
    cnt = 0
    # count if any True found on line
    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

def mrr(relevance_total):
    '''
    this fcn evaluates mean reciprocal rank
    '''
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)
                break

    return total_score / len(relevance_total)

def evaluate(ground_truth, search_function):
    ''' 
    This function uses any specific search engine you set up to 
    check out the ground truch, and it evaluates the 2 metrics 
    '''
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['document']
        results = search_function(q)
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [84]:
df_ground_truth = pd.read_csv('ground-truth-data.csv')
ground_truth = df_ground_truth.to_dict(orient='records')

### Question vector

In [85]:
eval_metrics = evaluate(ground_truth, question_vector_knn)
print(eval_metrics)

  0%|          | 0/4627 [00:00<?, ?it/s]

{'hit_rate': 0.7445429003674087, 'mrr': 0.6475542107917298}


NOTE: evaluation metrics take time because we are vectorizing things
These eval metrics do better than the pure text-only one we had done before. As a reference, it was, using minsearch:
{'hit_rate': 0.7722066133563864, 'mrr': 0.661454506159499}


Let's look now at other types of vectorizations and how they fare

### Text vector

In [86]:
# generate full query from incoming text
def text_vector_knn(q):
    question = q['question'] #remember this is the actual question on the faq
    course = q['course']

    v_q = model.encode(question)

    return elastic_search_knn('text_vector', v_q, course)

In [87]:
eval_metrics = evaluate(ground_truth, text_vector_knn)
print(eval_metrics)

  0%|          | 0/4627 [00:00<?, ?it/s]

{'hit_rate': 0.7983574670412794, 'mrr': 0.6850803256249559}


### Question-text vector

In [88]:
# generate full query from incoming text
def question_text_vector_knn(q):
    question = q['question'] #remember this is the actual question on the faq
    course = q['course']

    v_q = model.encode(question)

    return elastic_search_knn('question_text_vector', v_q, course)

In [89]:
eval_metrics = evaluate(ground_truth, question_text_vector_knn)
print(eval_metrics)

  0%|          | 0/4627 [00:00<?, ?it/s]

{'hit_rate': 0.8960449535336071, 'mrr': 0.8064980909156398}


Alexey also looks at combining the vectorization as an actual parameter in elasticsearch.

Have a look at how he did it (it is just using a code generated by ChatGPT):

[jupyter notebook](https://github.com/DataTalksClub/llm-zoomcamp/blob/main/03-vector-search/eval/evaluate-vector.ipynb)

Overall it was a whole lot of effort for a worse performing search engine.

Will not recreate it here for the sake of time, and to keep this notebook clean.
