# Fetch data

In [152]:
import requests
import pandas as pd

url_prefix = 'https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/03-evaluation/'
docs_url = url_prefix + 'search_evaluation/documents-with-ids.json'
documents = requests.get(docs_url).json()

ground_truth_url = url_prefix + 'search_evaluation/ground-truth-data.csv'
df_ground_truth = pd.read_csv(ground_truth_url)
ground_truth = df_ground_truth.to_dict(orient='records')

In [153]:
ground_truth[0] 

{'question': 'When does the course begin?',
 'course': 'data-engineering-zoomcamp',
 'document': 'c02e79ef'}

In [154]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp',
 'id': 'c02e79ef'}

# Evaluating retrieval

In [155]:
from tqdm.auto import tqdm

def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

def evaluate(ground_truth, search_function):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['document']
        results = search_function(q)
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

# Q1. Minsearch text

Now let's evaluate our usual minsearch approach, but tweak the parameters. Let's use the following boosting params:

```
boost = {'question': 1.5, 'section': 0.1}
```
What's the hit rate for this approach?


In [156]:
import minsearch

index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course", "id"]
)

index.fit(documents)

<minsearch.minsearch.Index at 0x7f4cacef46d0>

In [158]:

def minsearch_search(query, course, boost):
    results = index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=5
    )

    return results

metrics = evaluate(ground_truth=ground_truth, search_function=lambda q: minsearch_search(q['question'], q['course'], boost = {'question': 1.5, 'section': 0.1}))
print(metrics)

  0%|          | 0/4627 [00:00<?, ?it/s]

{'hit_rate': 0.848714069591528, 'mrr': 0.7288235717887772}


# Embeddings

---
The latest version of minsearch also supports vector search. We will use it:


```python
from minsearch import VectorSearch
```

We will also use TF-IDF and Singular Value Decomposition to create embeddings from texts. You can refer to our "Create Your Own Search Engine" workshop if you want to know more about it.

```python
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
```
Let's create embeddings for the "question" field:


```python
texts = []

for doc in documents:
    t = doc['question']
    texts.append(t)

pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)
X = pipeline.fit_transform(texts)
```

In [159]:
from minsearch import VectorSearch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline


In [160]:
texts_q2  = []

for doc in documents:
    t = doc['question']
    texts_q2.append(t)

pipeline_q2 = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)
X = pipeline_q2.fit_transform(texts_q2)

# Q2. Vector search for question

---

Now let's index these embeddings with minsearch:


```python
vindex = VectorSearch(keyword_fields={'course'})
vindex.fit(X, documents)

```
Evaluate this seach method. What's MRR for it?



In [161]:
vindex = VectorSearch(keyword_fields={'course'})
vindex.fit(X, documents)

<minsearch.vector.VectorSearch at 0x7f4cfbae3110>

In [164]:

def search_q2(q):
    # 取得查詢向量
    query_vec = pipeline_q2.transform([q['question']]) # (1,128)
    return vindex.search(
        query_vector=query_vec, 
        filter_dict={'course': q['course']},
        num_results=5
    )
# 執行評估
metrics_q2 = evaluate(ground_truth, search_q2)
print(metrics_q2)


  0%|          | 0/4627 [00:00<?, ?it/s]

{'hit_rate': 0.48173762697212014, 'mrr': 0.3571284489590088}


# Q3. Vector search for question and answer

---
We only used question in Q2. We can use both question and answer:


```python
texts = []

for doc in documents:
    t = doc['question'] + ' ' + doc['text']
    texts.append(t)
```

Using the same pipeline (min_df=3 for TF-IDF vectorizer and n_components=128` for SVD), evaluate the performance of this approach

What's the hitrate?

In [19]:
texts_q3  = []

for doc in documents:
    t = doc['question'] + ' ' + doc['text']
    texts_q3.append(t)

pipeline_q3 = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)
X = pipeline_q3.fit_transform(texts_q3)

In [20]:
vindex = VectorSearch(keyword_fields={'course'})
vindex.fit(X, documents)

<minsearch.vector.VectorSearch at 0x7f4dfbba8450>

In [21]:
def search_function(q):
    # 取得查詢向量
    query_vec = pipeline_q3.transform([q['question']])
    return vindex.search(
        query_vector=query_vec, 
        filter_dict={'course': q['course']},
        num_results=5
    )

# 執行評估
metrics = evaluate(ground_truth, search_function)
print(metrics)


  0%|          | 0/4627 [00:00<?, ?it/s]

{'hit_rate': 0.8210503566025502, 'mrr': 0.6717707657949719}


# Q4. Qdrant
---
Now let's evaluate the following settings in Qdrant:
```
text = doc['question'] + ' ' + doc['text']
model_handle = "jinaai/jina-embeddings-v2-small-en"
limit = 5
```
What's the MRR?

In [178]:
from fastembed.embedding import TextEmbedding
from qdrant_client import QdrantClient, models
import numpy as np
from tqdm import tqdm

# 文字嵌入模型初始化
model = TextEmbedding(model_name="jinaai/jina-embeddings-v2-small-en")
model_jina = "jinaai/jina-embeddings-v2-small-en"
# Qdrant 客戶端
client = QdrantClient("http://localhost:6333")


In [None]:
from uuid import uuid4

collection_name = "q4_evaluation"

client.delete_collection(collection_name=collection_name)
client.create_collection(
    collection_name=collection_name,
    vectors_config=models.VectorParams(size=512, distance=models.Distance.COSINE),
)


import uuid

# 建立向量並上傳
points = []
for idx, doc in tqdm(enumerate(documents)):
    

    full_text = doc["question"] + " " + doc["text"]

    points.append(
        models.PointStruct(
            id=idx,
            vector=models.Document(text=full_text, model=model_jina),
            payload={"document": doc["id"], "question": doc["question"], "text": doc["text"]},
        )
    )

client.upsert(collection_name=collection_name, points=points)


1
(512,)


UnexpectedResponse: Unexpected Response: 400 (Bad Request)
Raw response content:
b'{"status":{"error":"Bad request: Empty update request"},"time":0.000032928}'

In [173]:
def search_q4(q, limit: int = 5):
    print(q)
    # query_text = q['question']
    full_text = doc["question"] + " " + doc["text"]
    query_vector = list(model.embed([full_text]))[0]
    
    return client.search(
        collection_name=collection_name,
        query_vector=query_vector,
        limit=limit,
        with_payload=True
    )


    
metrics_q4 = evaluate(ground_truth, search_q4)
print(metrics_q4)


  0%|          | 0/4627 [00:00<?, ?it/s]

{'question': 'When does the course begin?', 'course': 'data-engineering-zoomcamp', 'document': 'c02e79ef'}




TypeError: 'ScoredPoint' object is not subscriptable

In [136]:
evaluate(ground_truth, search_q4)

  0%|          | 0/4627 [00:00<?, ?it/s]

TypeError: TextEncodeInput must be Union[TextInputSequence, Tuple[InputSequence, InputSequence]]