In [1]:
import requests
import pandas as pd

# Base location of the evaluation assets in the course repository.
url_prefix = (
    "https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/03-evaluation/"
)

# FAQ documents as JSON records; later cells read their "id", "course",
# "question", "section" and "text" keys.
docs_url = url_prefix + "search_evaluation/documents-with-ids.json"
# The timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() reports an HTTP error directly instead of letting it
# surface as a JSON decoding failure on an error page.
docs_response = requests.get(docs_url, timeout=30)
docs_response.raise_for_status()
documents = docs_response.json()

# Ground-truth queries, converted to one dict per CSV row; later cells read the
# "question", "course" and "document" keys.
ground_truth_url = url_prefix + "search_evaluation/ground-truth-data.csv"
df_ground_truth = pd.read_csv(ground_truth_url)
ground_truth = df_ground_truth.to_dict(orient="records")

In [3]:
from tqdm.auto import tqdm


def hit_rate(relevance_total):
    """Return the fraction of queries that have at least one relevant result.

    Args:
        relevance_total: Sequence with one entry per query. Each entry is a
            sequence of booleans marking which returned results were relevant.

    Returns:
        The number of entries containing ``True`` divided by the number of
        entries.

    Raises:
        ZeroDivisionError: If ``relevance_total`` is empty.
    """
    hits = sum(1 for flags in relevance_total if True in flags)
    return hits / len(relevance_total)


def mrr(relevance_total):
    """Return the mean reciprocal rank (MRR) over all queries.

    For each query the score is ``1 / rank`` of the first relevant result
    (ranks start at 1), or 0 when no result is relevant. The per-query scores
    are averaged over the number of queries.

    Args:
        relevance_total: Sequence with one entry per query. Each entry is a
            sequence of booleans, ordered by rank, marking which returned
            results were relevant.

    Returns:
        The average reciprocal rank as a float between 0 and 1.

    Raises:
        ZeroDivisionError: If ``relevance_total`` is empty.
    """
    total_score = 0.0

    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant:
                total_score += 1 / rank
                # Only the first relevant result counts. Without this break a
                # list with several relevant hits would add several reciprocal
                # ranks and could push the score above 1.
                break

    return total_score / len(relevance_total)


def evaluate(ground_truth, search_function):
    """Score a search function against the ground-truth queries.

    Args:
        ground_truth: Iterable of query records. Each record is passed whole
            to ``search_function`` and must have a ``"document"`` key holding
            the id of the expected document.
        search_function: Callable taking one query record and returning an
            iterable of results, each with an ``"id"`` key.

    Returns:
        A dict with the keys ``"hit_rate"`` and ``"mrr"``.
    """

    def relevance_for(query):
        # Read the expected id before searching, then flag each result.
        expected_id = query["document"]
        return [hit["id"] == expected_id for hit in search_function(query)]

    relevance_total = [relevance_for(q) for q in tqdm(ground_truth)]

    return {
        "hit_rate": hit_rate(relevance_total),
        "mrr": mrr(relevance_total),
    }

# Q1. Minsearch Text

In [10]:
import minsearch

In [13]:
# Field lists handed to the minsearch Index constructor under the same names.
# "course" is the keyword field that the search function filters on.
text_fields = ["question", "section", "text"]
keyword_fields = ["course", "id"]

index = minsearch.Index(
    text_fields=text_fields, keyword_fields=keyword_fields
)

# Last expression of the cell, so its return value (the Index object) is displayed.
index.fit(documents)

<minsearch.minsearch.Index at 0x24ed6f4ef60>

In [14]:
def minsearch_search(query, course):
    """Run a boosted text search on ``index`` restricted to one course.

    Args:
        query: Query text passed to ``index.search``.
        course: Value the ``"course"`` field is filtered on.

    Returns:
        The value returned by ``index.search`` when asked for 5 results, with
        the "question" field boosted by 1.5 and "section" by 0.1.
    """
    return index.search(
        query=query,
        filter_dict=dict(course=course),
        boost_dict=dict(question=1.5, section=0.1),
        num_results=5,
    )

In [15]:
# One boolean list per ground-truth query: True where a returned document's id
# equals the id of the document the query was generated from.
relevance_total = []

for q in tqdm(ground_truth):
    results = minsearch_search(query=q["question"], course=q["course"])
    doc_id = q["document"]
    relevance = [hit["id"] == doc_id for hit in results]
    relevance_total.append(relevance)


  0%|          | 0/4627 [00:00<?, ?it/s]

100%|██████████| 4627/4627 [00:12<00:00, 360.13it/s]


In [16]:
# Displays the (hit rate, MRR) pair for the minsearch text search.
hit_rate(relevance_total), mrr(relevance_total)

(0.848714069591528, 0.7288235717887772)

# Q2. Vector Search

In [17]:
from minsearch import VectorSearch

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline

In [19]:
# Text to embed for each document: its question only.
texts = [doc["question"] for doc in documents]

# TF-IDF features reduced to 128 components by truncated SVD; random_state is
# fixed so the projection is reproducible.
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1),
)
X = pipeline.fit_transform(texts)

In [20]:
# Vector index with "course" as its keyword field, which searches filter on.
vindex = VectorSearch(keyword_fields={"course"})
# Last expression of the cell, so the returned VectorSearch object is displayed.
vindex.fit(X, documents)

<minsearch.vector.VectorSearch at 0x24ed880a090>

In [32]:
def minsearch_vector_search(vector, course):
    """Search ``vindex`` with an already-embedded query, limited to one course.

    Args:
        vector: Query embedding passed straight to ``vindex.search``.
        course: Value the ``"course"`` field is filtered on.

    Returns:
        The value returned by ``vindex.search`` when asked for 5 results.
        ``vindex`` is looked up when the function is called, so the most
        recently assigned index is used.
    """
    course_filter = {"course": course}
    return vindex.search(vector, filter_dict=course_filter, num_results=5)


def question_text_vector(query, course):
    """Embed a text query with ``pipeline`` and search the vector index.

    Args:
        query: The query text, either a single string or a list of strings.
            Callers in this notebook pass a one-element list.
        course: Value the ``"course"`` field is filtered on.

    Returns:
        The value returned by ``minsearch_vector_search`` for the embedded query.

    Note:
        ``pipeline`` is looked up when the function is called, so the most
        recently assigned pipeline is the one used for embedding.
    """
    # scikit-learn text vectorizers expect an iterable of documents and reject
    # a bare string, so a single string is wrapped as a one-document list.
    if isinstance(query, str):
        query = [query]
    v_q = pipeline.transform(query)
    return minsearch_vector_search(v_q, course)

In [40]:
# One boolean list per ground-truth query, scored against the vector index.
# The question is wrapped in a list because it goes through pipeline.transform.
relevance_total = []

for q in tqdm(ground_truth):
    results = question_text_vector(query=[q["question"]], course=q["course"])
    doc_id = q["document"]
    relevance = [hit["id"] == doc_id for hit in results]
    relevance_total.append(relevance)


100%|██████████| 4627/4627 [00:06<00:00, 681.55it/s]


In [41]:
# Displays the (hit rate, MRR) pair for vector search over question embeddings.
hit_rate(relevance_total), mrr(relevance_total)

(0.48173762697212014, 0.3571284489590088)

# Q3. Vector Search for Q&A

In [42]:
# Text to embed for each document: its question followed by its text,
# separated by a single space.
texts = [doc["question"] + " " + doc["text"] for doc in documents]

In [43]:
# Same pipeline configuration as in the previous section, refit on the
# question + text strings. This rebinds the global `pipeline` that
# question_text_vector reads when it is called.
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3), TruncatedSVD(n_components=128, random_state=1)
)
X = pipeline.fit_transform(texts)

In [44]:
# Rebuild the vector index on the new embeddings. This rebinds the global
# `vindex` that minsearch_vector_search reads when it is called.
vindex = VectorSearch(keyword_fields={"course"})
# Last expression of the cell, so the returned VectorSearch object is displayed.
vindex.fit(X, documents)

<minsearch.vector.VectorSearch at 0x24edd233350>

In [45]:
# Same evaluation loop as for the question-only embeddings, now run against
# the index built from question + text.
relevance_total = []

for q in tqdm(ground_truth):
    results = question_text_vector(query=[q["question"]], course=q["course"])
    doc_id = q["document"]
    relevance = [hit["id"] == doc_id for hit in results]
    relevance_total.append(relevance)

100%|██████████| 4627/4627 [00:10<00:00, 435.95it/s]


In [46]:
# Displays the (hit rate, MRR) pair for vector search over question + text embeddings.
hit_rate(relevance_total), mrr(relevance_total)

(0.8210503566025502, 0.6717707657949719)

# Q4. Qdrant

In [47]:
from qdrant_client import QdrantClient, models

In [60]:
# Embedding model referenced by name in the Document objects built in later cells.
model_handle = "jinaai/jina-embeddings-v2-small-en"
# Vector size used when the collection is created.
# NOTE(review): presumably this model's output dimension; confirm against the model card.
embedding_size = 512

In [61]:
# Client for a Qdrant instance at localhost:6333; the following cells assume
# that server is running.
qd_client = QdrantClient("http://localhost:6333")

In [62]:
collection_name = "zoomcamp-faq"
# Drop any previous copy first so re-running the cell starts from a fresh collection.
# NOTE(review): assumes delete_collection does not raise when the collection is absent; confirm.
qd_client.delete_collection(collection_name=collection_name)

# Vectors of `embedding_size` dimensions compared by cosine distance. As the
# cell's last expression, the call's return value is what the cell displays.
qd_client.create_collection(
    collection_name=collection_name,
    vectors_config=models.VectorParams(
        size=embedding_size, distance=models.Distance.COSINE
    ),
)

True

In [63]:
# Keyword index on the "course" payload field, which the search function filters on.
qd_client.create_payload_index(
    collection_name=collection_name, field_name="course", field_schema="keyword"
)

UpdateResult(operation_id=1, status=<UpdateStatus.COMPLETED: 'completed'>)

In [64]:
# One point per document: the id is the document's position in `documents`,
# the vector is a Document wrapping "question text" for `model_handle`, and
# the whole document dict is stored as the payload.
points = [
    models.PointStruct(
        id=position,
        vector=models.Document(
            text=faq["question"] + " " + faq["text"], model=model_handle
        ),
        payload=faq,
    )
    for position, faq in enumerate(documents)
]

In [65]:
# Send all points to the collection in a single upsert call.
# NOTE(review): presumably the Document vectors are embedded as part of this call; confirm.
qd_client.upsert(collection_name=collection_name, points=points)

UpdateResult(operation_id=2, status=<UpdateStatus.COMPLETED: 'completed'>)

In [72]:
def qdrant_vector_search(question, course):
    """Query the Qdrant collection for a question, restricted to one course.

    Args:
        question: Query text, wrapped in a ``models.Document`` for
            ``model_handle``.
        course: Value the ``"course"`` payload field must match.

    Returns:
        The response of ``qd_client.query_points`` with a limit of 5 and
        payloads included. Callers in this notebook read its ``points``.
    """
    course_filter = models.Filter(
        must=[
            models.FieldCondition(key="course", match=models.MatchValue(value=course))
        ]
    )
    return qd_client.query_points(
        collection_name=collection_name,
        query=models.Document(text=question, model=model_handle),
        query_filter=course_filter,
        limit=5,
        with_payload=True,
    )

In [83]:
# One boolean list per ground-truth query. Qdrant returns a response object,
# so the hits come from `.points` and the document id from each hit's payload.
relevance_total = []

for q in tqdm(ground_truth):
    results = qdrant_vector_search(question=q["question"], course=q["course"])
    doc_id = q["document"]
    relevance = [hit.payload["id"] == doc_id for hit in results.points]
    relevance_total.append(relevance)

100%|██████████| 4627/4627 [01:18<00:00, 59.28it/s]


In [84]:
# Displays the (hit rate, MRR) pair for the Qdrant vector search.
hit_rate(relevance_total), mrr(relevance_total)

(0.9299762264966501, 0.8517722066133576)

# Q5. Cosine Similarity

In [96]:
import numpy as np

In [98]:
def cosine(u, v):
    """Return the cosine similarity of two vectors.

    Args:
        u: First vector; must support ``.dot``. The notebook passes rows of a
            transformed matrix.
        v: Second vector, with the same length as ``u``.

    Returns:
        ``u.v / (|u| * |v|)``, or ``0.0`` when either vector has zero norm.
    """
    u_norm = np.sqrt(u.dot(u))
    v_norm = np.sqrt(v.dot(v))
    denominator = u_norm * v_norm
    # A zero vector has no direction. Dividing would give 0/0 = NaN (with a
    # RuntimeWarning), and one NaN would turn a later mean into NaN.
    if denominator == 0:
        return 0.0
    return u.dot(v) / denominator

In [85]:
# Table of LLM-generated answers to evaluate; later cells read its answer_llm,
# answer_orig and question columns.
results_url = url_prefix + "rag_evaluation/data/results-gpt4o-mini.csv"
df_results = pd.read_csv(results_url)

In [None]:
# Fresh, unfitted pipeline for the answer-similarity section. This rebinds the
# global `pipeline`, so question_text_vector (which reads it at call time)
# would embed with this pipeline if called after this cell.
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3), TruncatedSVD(n_components=128, random_state=1)
)

In [87]:
# Fit on one string per row: LLM answer, original answer and question joined
# by single spaces. As the last expression, the fitted pipeline is displayed.
pipeline.fit(
    df_results.answer_llm + " " + df_results.answer_orig + " " + df_results.question
)

0,1,2
,steps,"[('tfidfvectorizer', ...), ('truncatedsvd', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,n_components,128
,algorithm,'randomized'
,n_iter,5
,n_oversamples,10
,power_iteration_normalizer,'auto'
,random_state,1
,tol,0.0


In [109]:
# Embed both answer columns with the fitted pipeline.
v_llm = pipeline.transform(df_results.answer_llm)
v_orig = pipeline.transform(df_results.answer_orig)

In [110]:
# Cosine similarity between each LLM answer embedding and the original answer
# embedding at the same position.
cosine_scores = [cosine(u, v) for u, v in zip(v_llm, v_orig)]

In [111]:
# Average cosine similarity over all answer pairs.
np.mean(cosine_scores)

np.float64(0.8415841233490403)

# Q6. ROUGE

In [112]:
from rouge import Rouge

rouge_scorer = Rouge()

# Worked example on a single row (positional index 10).
r = df_results.iloc[10]
# The result is indexed with [0] to get the score dict for this one pair.
scores = rouge_scorer.get_scores(r.answer_llm, r.answer_orig)[0]
scores

{'rouge-1': {'r': 0.45454545454545453,
  'p': 0.45454545454545453,
  'f': 0.45454544954545456},
 'rouge-2': {'r': 0.21621621621621623,
  'p': 0.21621621621621623,
  'f': 0.21621621121621637},
 'rouge-l': {'r': 0.3939393939393939,
  'p': 0.3939393939393939,
  'f': 0.393939388939394}}

In [119]:
def get_rogue_1_f1(row):
    """Return the ROUGE-1 F-score between a row's LLM and original answers.

    Args:
        row: Object with ``answer_llm`` and ``answer_orig`` attributes, such
            as a row passed by ``DataFrame.apply(..., axis=1)``.

    Returns:
        The ``"f"`` value of the ``"rouge-1"`` entry in the first score dict
        returned by ``rouge_scorer.get_scores``.
    """
    all_scores = rouge_scorer.get_scores(row.answer_llm, row.answer_orig)
    return all_scores[0]["rouge-1"]["f"]

In [121]:
# ROUGE-1 F-score for every row; axis=1 passes each row to the function.
rogue_1_f1_scores = df_results.apply(get_rogue_1_f1, axis=1)

In [123]:
# Average ROUGE-1 F-score across all rows.
rogue_1_f1_scores.mean()

np.float64(0.3516946452113943)