# End-to-end retrieval evaluation

Fetch Wikipedia Data

In [1]:
import csv
import json
from typing import List
import concurrent.futures

import ir_measures
import numpy as np
from tqdm import tqdm
from ir_measures import *
from openai import OpenAI
from cohere import ClientV2
from pydantic import BaseModel
from dotenv import load_dotenv
from usearch.index import Index
from mediawiki import MediaWiki

# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this supplies the API credentials for the OpenAI and
# Cohere clients created later in the notebook — confirm.
load_dotenv()

sagemaker.config INFO - Not applying SDK defaults from location: C:\ProgramData\sagemaker\sagemaker\config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: C:\Users\Abdulrahman\AppData\Local\sagemaker\sagemaker\config.yaml


True

In [None]:
# MediaWiki client configured with lang="ar" (Arabic).
wikipedia = MediaWiki(lang="ar")

In [2]:
# Search for "الثورة التونسية" ("the Tunisian Revolution"); each result is
# later passed to `wikipedia.page` to fetch the corresponding page.
results = wikipedia.search("الثورة التونسية")

In [3]:
# Resolve every search hit to a page object, preserving the search order.
pages = list(map(wikipedia.page, results))

In [4]:
# Flatten each fetched page into a plain, JSON-serialisable record.
pages_full = []
for page in pages:
    if not page:
        # Falsy entries are dropped rather than serialised.
        continue
    pages_full.append(
        {
            "id": page.pageid,
            "text": page.content,
            "meta": {"title": page.title, "summary": page.summarize(chars=256)},
        }
    )

In [5]:
# Persist the raw page records. The context manager guarantees the handle is
# flushed and closed (the bare `open(...)` argument was never closed explicitly).
with open("data.json", "w", encoding="utf-8") as data_file:
    json.dump(pages_full, data_file, ensure_ascii=False, indent=2)

Chunk 

In [6]:
from chunking import ClusterSemanticChunker
from uuid import uuid4

sagemaker.config INFO - Not applying SDK defaults from location: C:\ProgramData\sagemaker\sagemaker\config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: C:\Users\Abdulrahman\AppData\Local\sagemaker\sagemaker\config.yaml


  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# Chunker used to split each page's text; constructed with no arguments, so it
# runs with whatever defaults ClusterSemanticChunker defines.
text_splitter = ClusterSemanticChunker()

In [8]:
# Attach the chunks produced by the splitter to every page record (in place).
for doc in pages_full:
    page_text = doc["text"]
    doc["chunks"] = text_splitter.split_text(page_text)

Token indices sequence length is longer than the specified maximum sequence length for this model (604 > 512). Running this sequence through the model will result in indexing errors
[ClusterSemanticChunker] Embedding sentences: 100%|██████████| 7/7 [00:15<00:00,  2.24s/it]
[ClusterSemanticChunker] Calculating reward: 100%|██████████| 621/621 [00:00<00:00, 28205.78it/s]
[ClusterSemanticChunker] Embedding sentences: 100%|██████████| 7/7 [00:13<00:00,  1.96s/it]
[ClusterSemanticChunker] Calculating reward: 100%|██████████| 587/587 [00:00<00:00, 32600.09it/s]
[ClusterSemanticChunker] Embedding sentences: 100%|██████████| 19/19 [00:41<00:00,  2.16s/it]
[ClusterSemanticChunker] Calculating reward: 100%|██████████| 1757/1757 [00:00<00:00, 42565.16it/s]
[ClusterSemanticChunker] Embedding sentences: 100%|██████████| 4/4 [00:07<00:00,  1.96s/it]
[ClusterSemanticChunker] Calculating reward: 100%|██████████| 367/367 [00:00<00:00, 45901.58it/s]
[ClusterSemanticChunker] Embedding sentences: 100%|███

In [9]:
# One flat record per chunk: "id" is the id of the page the chunk came from,
# "chunk_id" is a fresh random UUID, and "meta" is the page's metadata dict
# (shared by reference between all chunks of the same page).
chunks = [
    {
        "id": doc["id"],
        "chunk_id": str(uuid4()),
        "text": chunk_text,
        "meta": doc["meta"],
    }
    for doc in pages_full
    for chunk_text in doc["chunks"]
]

In [10]:
# Persist the chunk records; `with` closes the handle deterministically instead
# of leaving an anonymous open file behind.
with open("chunks.json", "w", encoding="utf-8") as chunks_file:
    json.dump(chunks, chunks_file, ensure_ascii=False, indent=2)

Generate Queries

In [13]:
# OpenAI API client, constructed without explicit arguments.
# NOTE(review): presumably picks up its API key from the environment — confirm.
client = OpenAI()

In [18]:
# Structured-output schema passed as `response_format` to the OpenAI parse call
# in `generate_queries`; the reply is parsed into an instance of this model.
# NOTE(review): pydantic typically exports the class docstring as the schema
# description, so rewording it may change what the LLM sees — confirm before editing.
class Query(BaseModel):
    """A query about a passage."""
    queries: List[str]  # the generated query strings for one passage

In [19]:
def generate_queries(passage: dict) -> List[str]:
    """Ask the chat model for Arabic queries about one chunk.

    Args:
        passage: Chunk record. Must provide ``passage["text"]`` as well as
            ``passage["meta"]["title"]`` and ``passage["meta"]["summary"]``,
            which are interpolated into the user prompt.

    Returns:
        The list of query strings parsed from the structured response.

    Raises:
        ValueError: If the response carries no parsed payload (for example
            when the model refuses), instead of an opaque ``AttributeError``
            on ``None``.
    """
    completion = client.beta.chat.completions.parse(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "Generate queries in Arabic about the following passage. Provided are also metadata about the document the passage is from."},
            {"role": "user", "content": f"""
            Title:
            {passage["meta"]["title"]}

            Summary of the document:
            {passage["meta"]["summary"]}
             
            Passage:
            {passage["text"]}
            """},
        ],
        response_format=Query,
    )
    message = completion.choices[0].message
    parsed = message.parsed
    if parsed is None:
        # Surface the refusal text when the SDK exposes one, to ease debugging.
        refusal = getattr(message, "refusal", None)
        raise ValueError(f"No structured queries returned by the model (refusal: {refusal!r})")
    return parsed.queries

In [None]:
# for chunk in tqdm(chunks):
#     chunk["queries"] = generate_queries(chunk)

In [21]:
# Fan the query-generation calls out over a thread pool.
with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = [executor.submit(generate_queries, chunk) for chunk in chunks]
    # Futures were submitted in chunk order, so pair them positionally rather
    # than calling futures.index(...) — a linear scan per item, quadratic overall.
    for chunk, future in tqdm(zip(chunks, futures), total=len(chunks)):
        chunk["queries"] = future.result()

100%|██████████| 1982/1982 [04:56<00:00,  6.68it/s]


In [29]:
# Checkpoint the chunks together with their generated queries; `with` makes sure
# the file is flushed and closed.
with open("chunks_with_queries.json", "w", encoding="utf-8") as chunks_file:
    json.dump(chunks, chunks_file, ensure_ascii=False, indent=2)

In [22]:
# Collect the distinct query strings across all chunks.
unique_queries = set()
for chunk in chunks:
    unique_queries.update(chunk['queries'])
queries = list(unique_queries)

In [23]:
# Give every distinct query string its own random UUID, used as its query id.
queries_mapping = dict((query, str(uuid4())) for query in queries)

In [24]:
# One ground-truth record per (chunk, query) pair: the chunk a query was
# generated from is treated as that query's relevant passage.
ground_truth = [
    {
        "query_id": queries_mapping[query],
        "chunk_id": chunk['chunk_id'],
        "ground_truth": chunk['text'],
        "query": query,
        "meta": chunk['meta']
    }
    for chunk in chunks
    for query in chunk['queries']
]

In [25]:
# Persist the ground-truth records; the context manager closes the handle that
# the bare `open(...)` argument used to leak.
with open('query_with_ground_truth.json', 'w', encoding="utf-8") as ground_truth_file:
    json.dump(ground_truth, ground_truth_file, indent=2, ensure_ascii=False)

Create Qrels

In [28]:
# Write the relevance judgments, one space-separated row per ground-truth pair:
#   <query_id> 0 <chunk_id> 1
# newline="" is required when handing a file to the csv module; without it the
# writer's "\r\n" is translated again on Windows and every row ends in "\r\r\n".
# lineterminator="\n" then yields plain Unix line endings on every platform.
with open("qrels.txt", "w", newline="", encoding="utf-8") as out_file:
    qrels_writer = csv.writer(out_file, delimiter=' ', lineterminator="\n")
    for gt in ground_truth:
        qrels_writer.writerow([gt["query_id"], "0", gt["chunk_id"], "1"])

Embed

In [31]:
# Map each chunk's UUID to its position in `chunks`; that integer is what the
# vector index uses as the key.
chunk_key_mapping = {}
for idx, chunk in enumerate(chunks):
    chunk_key_mapping[chunk["chunk_id"]] = idx

In [32]:
# Persist the chunk_id -> integer key mapping; `with` closes the file handle
# deterministically.
with open("chunk_key_mapping.json", "w") as mapping_file:
    json.dump(chunk_key_mapping, mapping_file)

In [33]:
# Cohere v2 API client, constructed without explicit arguments.
# NOTE(review): presumably picks up its API key from the environment — confirm.
co = ClientV2()

In [34]:
def embed_document(chunk):
    """Embed one chunk, together with its document title and summary.

    The text sent to the Cohere embed endpoint is the Arabic template below
    (labels: document title, document summary, passage) filled in with
    ``chunk["meta"]["title"]``, ``chunk["meta"]["summary"]`` and
    ``chunk["text"]``, using ``input_type="search_document"``.

    Returns the float embedding as a NumPy array with singleton axes removed.
    """
    full_text = chunk["text"]
    embedding_text = f"""
    عنوان المستند:
    {chunk["meta"]["title"]}
    ملخص المستند:
    {chunk["meta"]["summary"]}

    الفقرة:
    {full_text}
    """
    response = co.embed(
        texts=[embedding_text],
        model="embed-multilingual-v3.0",
        input_type="search_document",
        embedding_types=['float'],
    )
    return np.squeeze(np.array(response.embeddings.float_))

In [None]:
# for chunk in tqdm(chunks):
#     chunk["embedding"] = embed_document(chunk).tolist()

In [36]:
# Embed all chunks concurrently, storing each vector on its chunk as a plain
# list as soon as the corresponding call completes.
with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = {}
    for chunk in chunks:
        futures[executor.submit(embed_document, chunk)] = chunk
    completed = concurrent.futures.as_completed(futures)
    for future in tqdm(completed, total=len(chunks)):
        chunk = futures[future]
        chunk["embedding"] = future.result().tolist()

100%|██████████| 1982/1982 [00:40<00:00, 49.28it/s]


In [37]:
# Checkpoint chunks with queries and embeddings; `with` guarantees the (large)
# file is fully flushed and closed.
with open("chunks_with_queries_and_embeddings.json", "w", encoding="utf-8") as chunks_file:
    json.dump(chunks, chunks_file, indent=2, ensure_ascii=False)

In [45]:
# Reload the ground-truth records (one per query/chunk pair) from disk; `with`
# closes the handle that the bare `open(...)` argument used to leak.
with open("query_with_ground_truth.json", "r", encoding="utf-8") as queries_file:
    queries = json.load(queries_file)

In [46]:
def embed_query(query):
    """Embed the text of one query record as a search query.

    Sends ``query["query"]`` to the Cohere embed endpoint with
    ``input_type="search_query"`` and returns the float embedding as a NumPy
    array with singleton axes removed.
    """
    response = co.embed(
        texts=[query["query"]],
        model="embed-multilingual-v3.0",
        input_type="search_query",
        embedding_types=['float'],
    )
    return np.squeeze(np.array(response.embeddings.float_))

In [47]:
# for query in tqdm(queries):
#     query["embedding"] = embed_query(query).tolist()

In [48]:
# Embed all query records concurrently, storing each vector on its record as a
# plain list as soon as the corresponding call completes.
with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = {}
    for query in queries:
        futures[executor.submit(embed_query, query)] = query
    completed = concurrent.futures.as_completed(futures)
    for future in tqdm(completed, total=len(queries)):
        query = futures[future]
        query["embedding"] = future.result().tolist()

100%|██████████| 10529/10529 [03:29<00:00, 50.16it/s]


In [49]:
# Persist the query records with their embeddings; `with` guarantees the file is
# flushed and closed.
with open("query_with_ground_truth_and_embeddings.json", "w", encoding="utf-8") as queries_file:
    json.dump(queries, queries_file, indent=2, ensure_ascii=False)

Index

In [8]:
# Reload the embedded chunks and the chunk_id -> integer key mapping. Context
# managers close both handles, which the bare `open(...)` arguments never did.
with open('chunks_with_queries_and_embeddings.json', encoding="utf-8") as chunks_file:
    chunks = json.load(chunks_file)
with open('chunk_key_mapping.json', encoding="utf-8") as mapping_file:
    chunk_id_to_key = json.load(mapping_file)
# Inverse lookup: integer index key -> chunk_id.
key_to_chunk_id = {v: k for k, v in chunk_id_to_key.items()}

In [9]:
# Vector index over 1024-dimensional vectors using the cosine metric.
# NOTE(review): 1024 presumably matches the embedding model's output size — confirm.
chunks_index = Index(ndim=1024, metric='cos')

In [11]:
# Key each vector by the integer stored in chunk_key_mapping.json instead of its
# position in `chunks`, so the index always agrees with `key_to_chunk_id` (used to
# translate hits back to chunk ids) even if the list is reordered or filtered
# after the mapping was saved. A chunk missing from the mapping raises KeyError.
chunk_keys = [chunk_id_to_key[chunk['chunk_id']] for chunk in chunks]
chunk_vectors = np.array([chunk['embedding'] for chunk in chunks])
chunks_index.add(chunk_keys, chunk_vectors)

array([   0,    1,    2, ..., 1979, 1980, 1981], dtype=uint64)

In [12]:
# Persist the populated index to disk; the benchmark section reloads it from
# this file.
chunks_index.save("chunks.usearch")

Benchmark

In [14]:
# Reload the embedded queries and the key mapping for the benchmark. The query
# file was written as UTF-8 and contains Arabic text, so the encoding must be
# given explicitly — the platform default (e.g. a Windows code page) would fail
# or mis-decode it. Context managers also close both handles.
with open("query_with_ground_truth_and_embeddings.json", encoding="utf-8") as queries_file:
    queries = json.load(queries_file)
with open('chunk_key_mapping.json', encoding="utf-8") as mapping_file:
    chunk_id_to_key = json.load(mapping_file)
# Inverse lookup: integer index key -> chunk_id.
key_to_chunk_id = {v: k for k, v in chunk_id_to_key.items()}

In [15]:
# Recreate an index with the same configuration used when it was built
# (ndim=1024, cosine metric), then load the saved contents from disk.
chunks_index = Index(ndim=1024, metric='cos')
chunks_index.load("chunks.usearch")

In [16]:
def retrieve(query, k=10):
    """Return up to ``k`` nearest chunks for one query as ir_measures run entries.

    ``query`` must provide ``"embedding"`` and ``"query_id"``. Every hit's
    index key is translated back to its chunk id via ``key_to_chunk_id``, and
    the distance ``d`` reported by the index becomes the score ``1 - d``.
    """
    vector = np.array(query["embedding"]).reshape(1, -1)
    hits = chunks_index.search(vector, k)
    chunk_ids = [key_to_chunk_id[key] for key in hits.keys.tolist()]
    distances = hits.distances.tolist()
    return [
        ir_measures.ScoredDoc(str(query["query_id"]), str(chunk_id), 1 - distance)
        for chunk_id, distance in zip(chunk_ids, distances)
    ]

In [19]:
# `queries` holds one record per (query, chunk) pair, so a query text generated
# for several chunks appears more than once under the same query_id. Retrieve
# only once per query_id: repeating the search would just add duplicate
# (query, doc) rows to the run.
results = []
seen_query_ids = set()
for query in tqdm(queries):
    if query["query_id"] in seen_query_ids:
        continue
    seen_query_ids.add(query["query_id"])
    results.extend(retrieve(query))

100%|██████████| 10529/10529 [00:02<00:00, 4458.75it/s]


In [20]:
# Score the run against the qrels: precision and recall at cutoffs 1, 3 and 5.
qrels = ir_measures.read_trec_qrels('qrels.txt')
measures = [P@1, P@3, P@5, R@1, R@3, R@5]
ir_measures.calc_aggregate(measures, qrels, results)

{R@5: 0.40713114018569113,
 P@1: 0.23096242716495882,
 R@3: 0.35321323325392046,
 P@5: 0.0820373719107945,
 R@1: 0.22994704210798242,
 P@3: 0.11837787154242964}