In [1]:
import json
from dotenv import load_dotenv
from openai import OpenAI
from qdrant_client import QdrantClient
from tqdm import tqdm
from os import environ
import sys
import requests
import time
import itertools

# Load variables from a local .env file into the process environment; the
# configuration cell further down reads its URLs and API keys via environ.get().
load_dotenv()

True

In [3]:
# Load the exported Qdrant records that format_records() iterates over.
# The encoding is pinned to UTF-8 so non-ASCII text in the payloads is decoded
# the same way on every platform instead of following the locale default.
with open("../dist/qdrant_records.json", "r", encoding="utf-8") as file:
    qdrant_records = json.load(file)

In [5]:
# Qdrant connection settings. environ.get() returns None when a variable is
# unset, so a missing .env entry only shows up later when the client is used.
QDRANT_URL = environ.get('QDRANT_URL')
QDRANT_API_KEY = environ.get('QDRANT_API_KEY')
COLLECTION_NAME = 'lotr-characters'  # collection queried by search()
# Jina embedding request settings (see create_jina_embedding()).
EMBEDDING_DIMENSION = 512  # sent as "dimensions"
JINA_EMBEDDING_MODEL = "jina-embeddings-v4"  # sent as "model"
JINA_URL = "https://api.jina.ai/v1/embeddings"
JINA_API_KEY = environ.get('JINA_API_KEY')
QUERYING_TASK = "retrieval.query"  # sent as "task"
# OpenAI chat settings used by llm() when generating the golden questions.
OPENAI_MODEL = "gpt-4o-mini"
OPENAI_TEMPERATURE = 0.5

In [6]:
# API clients shared by the cells below.
# NOTE(review): OpenAI() is constructed without arguments, so it presumably
# takes its API key from the environment loaded by load_dotenv() — confirm.
openai_client = OpenAI()
qd_client = QdrantClient(url=QDRANT_URL, api_key=QDRANT_API_KEY)

In [7]:
def format_prompt(payload: dict[str, str]) -> tuple[str, str]:
    """
    Build the prompts used to generate evaluation questions for one character.

    The payload is serialised with json.dumps so the model really receives the
    JSON object announced by the system prompt. Interpolating the dict directly
    would send a Python repr (single quotes, None/True/False) instead.

    Args:
        payload: character fields the questions should be grounded in.
    Returns:
        Tuple of (user_prompt, system_prompt).
    """
    system_prompt = """
You are an assistant that generates evaluation questions to test retrieval quality on character data.
You will receive a JSON object describing a fictional character, which may include fields such as name, race, gender, birth, death, spouse, realm, biography, and others.

Your task:
1. Read and understand the JSON payload carefully.
2. Generate exactly 5 diverse and specific questions that can be answered using the information in the payload.
3. Focus on factual, grounded details — such as relationships, timeline, characteristics, or key events mentioned in the biography.
4. Avoid trivial or repetitive questions.
5. Do not include any reasoning, explanations, or text outside the JSON array.

Output valid JSON only (no code blocks, no extra text):
["Question 1", "Question 2", "Question 3", "Question 4", "Question 5"]

If a field is null or missing, do not ask about it. If there is limited information, create general but relevant questions based on available content.
""".strip()

    # ensure_ascii=False keeps accented names readable for the model;
    # default=str stops an unexpected non-JSON value from aborting a long run.
    payload_json = json.dumps(payload, ensure_ascii=False, indent=2, default=str)
    user_prompt = f"Payload:\n{payload_json}"

    return user_prompt, system_prompt

In [8]:
def format_records ()->list[dict]:
    """
    Reduce every raw Qdrant record to the fields used for question generation.

    Each returned dict starts with the record's "id", followed by the
    whitelisted payload fields that are present with a truthy value.
    """
    wanted_fields = (
        'name', 'race', 'gender', 'realm', 'culture', 'birth',
        'death', 'spouse', 'hair', 'height', 'biography', 'history',
    )

    def _slim(record: dict) -> dict:
        # Start from the id, then copy over only the non-empty whitelisted fields.
        slim = {"id": record["id"]}
        slim.update({
            key: record["payload"][key]
            for key in wanted_fields
            if record["payload"].get(key)
        })
        return slim

    return [_slim(record) for record in tqdm(qdrant_records)]

In [9]:
def llm(user_prompt: str, system_prompt: str) -> str:
    """
    Send one system + user prompt pair to the OpenAI chat model.

    Args:
        user_prompt: content of the "user" message.
        system_prompt: content of the "system" message.
    Returns:
        The raw text of the first choice (a string; the caller parses it).
    Raises:
        ValueError: if the first choice carries no text content.
    """
    res = openai_client.chat.completions.create(
        model=OPENAI_MODEL,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ],
        temperature=OPENAI_TEMPERATURE
    )
    content = res.choices[0].message.content
    if content is None:
        # Fail here with a clear message instead of handing None to json.loads.
        raise ValueError("OpenAI response contained no message content")
    return content

In [10]:
def generate_question(ctx: dict[str,str])->dict:
    """
    Ask the LLM for evaluation questions about one character record.

    Args:
        ctx: formatted character record; must contain "id".
    Returns:
        {"id": <record id>, "questions": [<question>, ...]}
    Raises:
        json.JSONDecodeError: if the model's answer is not valid JSON.
        ValueError: if the answer is valid JSON but not a list of strings.
    """
    user_prompt, system_prompt = format_prompt(ctx)
    raw_answer = llm(user_prompt=user_prompt, system_prompt=system_prompt)

    # Models sometimes wrap the array in a Markdown fence (``` or ```json)
    # despite the instructions; unwrap it before parsing.
    cleaned = (raw_answer or "").strip()
    if cleaned.startswith("```"):
        _, _, cleaned = cleaned.partition("\n")
        cleaned = cleaned.rstrip()
        if cleaned.endswith("```"):
            cleaned = cleaned[:-3]
        cleaned = cleaned.strip()

    try:
        questions = json.loads(cleaned)
    except json.JSONDecodeError as exc:
        # Same exception type as before, but naming the record that failed.
        raise json.JSONDecodeError(
            f"Record {ctx['id']}: model answer is not valid JSON ({exc.msg})",
            exc.doc,
            exc.pos,
        ) from exc

    # Downstream cells iterate over the questions, so reject any other shape.
    if not isinstance(questions, list) or not all(isinstance(q, str) for q in questions):
        raise ValueError(
            f"Record {ctx['id']}: expected a JSON array of strings, got: {raw_answer!r}"
        )

    return {
        "id": ctx['id'],
        "questions": questions
    }

In [11]:
def generate_questions_and_save_json(output_path: str = '../dist/golden_questions.json'):
    """
    Generate questions for every formatted record and write them to disk.

    Args:
        output_path: destination JSON file; defaults to the location the
            later cells read from.

    The file is written as UTF-8 explicitly. json.dump is called with
    ensure_ascii=False, so non-ASCII characters are emitted verbatim and a
    locale-dependent default encoding could fail or corrupt them.
    """
    records = format_records()
    formatted_questions = [generate_question(ctx=record) for record in tqdm(records)]

    with open(output_path, 'w', encoding='utf-8') as file:
        json.dump(formatted_questions, file, indent=2, ensure_ascii=False)

In [82]:
# Generates questions for every record and writes ../dist/golden_questions.json.
# Long-running: the recorded output below shows roughly 24 minutes for 749 records.
generate_questions_and_save_json()

100%|██████████| 749/749 [00:00<00:00, 251947.53it/s]
100%|██████████| 749/749 [24:29<00:00,  1.96s/it]


In [14]:
def open_json_file (file_path: str):
    """
    Read and parse a JSON file.

    Args:
        file_path: path of the JSON file to read.
    Returns:
        The decoded JSON value.
    """
    # Explicit UTF-8: the default text encoding depends on the platform locale.
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

def save_json_file (file_path: str, data):
    """
    Serialise data as pretty-printed JSON and write it to file_path.

    Args:
        file_path: destination path (overwritten if it exists).
        data: any JSON-serialisable value.
    """
    # ensure_ascii=False writes non-ASCII characters verbatim, so the file must
    # be opened as UTF-8 rather than with the locale-dependent default.
    with open(file_path, 'w', encoding='utf-8') as file:
        json.dump(data, file, indent=2, ensure_ascii=False)

In [13]:
def create_jina_embedding(input_text: str)-> list:
    """
    Create an embedding for a single text using the Jina embeddings API.

    Args:
        input_text: text to embed (sent as a one-element "input" list).
    Returns:
        The embedding of that text, taken from data[0].embedding in the response.
    Raises:
        Exception: on a non-200 response ("Jina API error: ...") or when the
            HTTP request itself fails ("Request failed: ..."). In the latter
            case the underlying requests error is attached as __cause__.
    """
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {JINA_API_KEY}",
    }
    request_body = {
        "input": [input_text],
        "model": JINA_EMBEDDING_MODEL,
        "dimensions": EMBEDDING_DIMENSION,
        "task": QUERYING_TASK,
        "late_chunking": True,
    }
    try:
        res = requests.post(url=JINA_URL, headers=headers, json=request_body, timeout=30)
        if res.status_code != 200:
            # Not a RequestException, so it passes through the handler below as is.
            raise Exception(f"Jina API error: {res.status_code} - {res.text}")
        return res.json()["data"][0]["embedding"]
    except requests.RequestException as e:
        # Chain explicitly so the original network error stays in the traceback.
        raise Exception(f"Request failed: {str(e)}") from e

In [16]:
def search(query: str, limit: int = 5, threshold = 0.3):
    """
    Embed the query with Jina and look up the closest points in Qdrant.

    Args:
        query: natural-language question to search for.
        limit: maximum number of points requested.
        threshold: score threshold forwarded to Qdrant.
    Returns:
        A list of dicts holding "id", "score" and the point's payload fields,
        or None (after printing the error) when any step fails.
    """
    try:
        # Embed the query first, then run the vector lookup with that embedding.
        embedded_query = create_jina_embedding(input_text=query)
        response = qd_client.query_points(
            collection_name=COLLECTION_NAME,
            query=embedded_query,
            limit=limit,
            with_payload=True,
            score_threshold=threshold
        )

        hits = []
        for point in response.points:
            # Payload fields are merged last, next to the id and score.
            hits.append({"id": point.id, "score": point.score, **point.payload})
        return hits
    except Exception as e:
        print(f"Error during search: {str(e)}")
        return None

In [29]:
def get_formatted_search_result(golden_questions=None, previous_results=None, start_index=0, requests_per_minute=400):
    """
    Run search() for every golden question, with throttling and resume support.

    Questions are numbered in one flat sequence across all documents (first
    question of the first document is index 0). On a search failure or a
    KeyboardInterrupt the function returns early, so the call can be repeated
    with previous_results / start_index set to the returned values.

    Args:
        golden_questions: list of {"id": ..., "questions": [...]} objects;
            None is treated as an empty list.
        previous_results: results from an earlier, interrupted call. The list
            is copied, not mutated.
        start_index: flat index of the first question to process; earlier
            questions are skipped.
        requests_per_minute: throttle for the search calls; a value <= 0 (or
            None) disables the delay.
    Returns:
        (search_results, next_index), where next_index is the flat index of the
        next question to process (the failing one after an error).
    """
    # Copy so the caller's list is not modified as a side effect.
    search_results = list(previous_results) if previous_results is not None else []

    # Flat position of the question being looked at. It must start at 0, not at
    # start_index, otherwise the skip check below can never be true and a
    # resumed run would re-search (and duplicate) everything from the start.
    current_index = 0

    # Delay between requests to stay under the rate limit.
    if requests_per_minute and requests_per_minute > 0:
        delay_seconds = 60.0 / requests_per_minute
    else:
        delay_seconds = 0.0

    try:
        for obj in tqdm(golden_questions or [], desc="Processing documents"):
            doc_id = obj["id"]
            for q_idx, question in enumerate(obj["questions"]):
                # Skip questions already handled by a previous call.
                if current_index < start_index:
                    current_index += 1
                    continue

                try:
                    results = search(query=question, limit=5, threshold=0.3)
                    if results is None:
                        # search() reports failures by returning None.
                        raise ValueError("Search returned None")
                    search_results.append({
                        "id": doc_id,
                        "question": question,
                        "question_idx": q_idx,
                        "search_results": results
                    })
                    current_index += 1

                    # Add delay to respect rate limit
                    time.sleep(delay_seconds)
                except Exception as e:
                    print(f"\n❌ Error at index {current_index}")
                    print(f"   Document ID: {doc_id}")
                    print(f"   Question {q_idx + 1}/{len(obj['questions'])}: {question}")
                    print(f"   Error: {type(e).__name__}: {str(e)}")
                    print(f"\n💾 Processed {len(search_results)} questions before failure")
                    print(f"   Returning (search_results, {current_index}) for resume")
                    return search_results, current_index

    except KeyboardInterrupt:
        print(f"\n⚠️  Interrupted by user at index {current_index}")
        print(f"💾 Processed {len(search_results)} questions")
        return search_results, current_index

    return search_results, current_index

In [15]:
def hit_rate(relevance_total):
    """
    Fraction of queries whose result list contains at least one relevant hit.

    Args:
        relevance_total: one list of relevance flags per query.
    Returns:
        A float between 0 and 1; 0.0 when there are no queries at all
        (instead of raising ZeroDivisionError).
    """
    if not relevance_total:
        return 0.0
    hits = sum(1 for line in relevance_total if any(line))
    return hits / len(relevance_total)

def mrr(relevance_total):
    """
    Mean reciprocal rank over all queries.

    For each query only the first relevant result counts, contributing
    1 / rank (ranks start at 1). Queries without a relevant result contribute 0.

    Args:
        relevance_total: one list of relevance flags per query, in rank order.
    Returns:
        A float between 0 and 1; 0.0 when there are no queries at all
        (instead of raising ZeroDivisionError).
    """
    if not relevance_total:
        return 0.0
    total_score = 0.0
    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant:
                total_score += 1 / rank
                # Stop at the first hit so a query can never add more than 1.
                break
    return total_score / len(relevance_total)

In [16]:
def evaluate(relevance_total):
    """
    Compute both retrieval metrics for a relevance matrix.

    Returns a dict with the keys "hit_rate" and "mrr".
    """
    hit_rate_value = hit_rate(relevance_total=relevance_total)
    mrr_value = mrr(relevance_total=relevance_total)
    return {"hit_rate": hit_rate_value, "mrr": mrr_value}

In [24]:
# Load the generated golden questions and report how many documents there are.
all_golden_questions = open_json_file(file_path="../dist/golden_questions.json")
total_entries = len(all_golden_questions)
print(f"Total entries: {total_entries}")
# Consecutive [start, end) index windows of at most 100 documents each.
batches = [
    [window_start, min(window_start + 100, total_entries)]
    for window_start in range(0, total_entries, 100)
]

Total entries: 749


In [25]:
# Materialise the first two index windows as lists of golden-question objects.
golden_questions_batch_1 = all_golden_questions[slice(*batches[0])]
golden_questions_batch_2 = all_golden_questions[slice(*batches[1])]

In [30]:
# Run retrieval for every question of batch 1 with the default throttle
# (requests_per_minute=400). The recorded run below took about 10 minutes for
# 100 documents. last_index is the resume position returned by the function.
search_results, last_index = get_formatted_search_result(golden_questions=golden_questions_batch_1)


[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
Processing documents: 100%|██████████| 100/100 [10:07<00:00,  6.07s/it]


In [None]:
# Persist the raw search results so the filtering/metric cells can be re-run
# without repeating the searches.
# NOTE(review): at this point search_results only holds batch 1; batch 2 is
# sliced above but not searched in the cells shown here.
save_json_file(file_path="../dist/retrieval_search_results.json", data=search_results)

In [8]:
def filter_results(data: list[dict], filters: dict):
    """
    Narrow each entry's search results to a top-N cut and a minimum score.

    Args:
        data: result objects, each with a 'search_results' list.
        filters: may hold 'limit' (int) and/or 'threshold' (float); a missing
            or None value disables that filter.
    Returns:
        A new list of shallow-copied entries whose 'search_results' hold only
        the surviving results; the input entries are not modified.
    """
    limit = filters.get('limit')
    threshold = filters.get('threshold')

    def _narrow(results):
        # The top-N cut happens first; the score filter is strict (score > threshold).
        kept = results if limit is None else results[:limit]
        if threshold is None:
            return kept
        return [hit for hit in kept if hit.get('score', 0) > threshold]

    filtered_data = []
    for entry in tqdm(data, desc=f"Filtering results satifying {filters}"):
        narrowed_entry = entry.copy()
        narrowed_entry['search_results'] = _narrow(entry['search_results'])
        filtered_data.append(narrowed_entry)

    return filtered_data

In [9]:
def get_strategies_list():
    """
    Enumerate every (limit, threshold) combination to evaluate.

    Returns a list of {"limit": ..., "threshold": ...} dicts, ordered by limit
    first and threshold second.
    """
    limits = (3, 4, 5)
    thresholds = (0.3, 0.5, 0.7)
    return [
        {"limit": limit, "threshold": threshold}
        for limit, threshold in itertools.product(limits, thresholds)
    ]

In [10]:
def make_relevance_matrix(data: list[dict]):
    """
    Turn search results into per-question relevance flags.

    Args:
        data: result objects, each with an "id" and a "search_results" list.
    Returns:
        One list of booleans per entry; a flag is True when the search
        result's id equals the entry's own id.
    """
    def _flags(entry: dict) -> list:
        expected_id = entry["id"]
        return [hit['id'] == expected_id for hit in entry["search_results"]]

    return [_flags(entry) for entry in tqdm(data)]

In [11]:
def generate_evaluations_per_strategy():
    """
    Score the saved search results under every limit/threshold strategy.

    Reads ../dist/retrieval_search_results.json, filters it once per strategy,
    and returns one dict per strategy combining the strategy's settings with
    its metrics.
    """
    search_results = open_json_file(file_path="../dist/retrieval_search_results.json")

    def _score(strategy: dict) -> dict:
        # filter -> relevance flags -> metrics, merged with the strategy itself
        filtered = filter_results(data=search_results, filters=strategy)
        flags = make_relevance_matrix(data=filtered)
        metrics = evaluate(relevance_total=flags)
        return {**strategy, **metrics}

    return [_score(strategy) for strategy in tqdm(get_strategies_list())]

In [23]:
def format_evaluation_result(results: list[dict]):
    """
    Print the strategy with the best hit rate and the one with the best MRR.

    Args:
        results: per-strategy dicts containing 'hit_rate' and 'mrr'.
    """
    # Both winners are determined before anything is printed.
    report_lines = (
        f"Highest Hit Rate: {max(results, key=lambda row: row['hit_rate'])}",
        f"Highest MRR: {max(results, key=lambda row: row['mrr'])}",
    )
    for line in report_lines:
        print(line)

In [17]:
# Evaluate every limit/threshold strategy against the saved search results
# (read from ../dist/retrieval_search_results.json inside the function).
eval_result = generate_evaluations_per_strategy()

Filtering results satifying {'limit': 3, 'threshold': 0.3}: 100%|██████████| 500/500 [00:00<00:00, 529583.84it/s]
100%|██████████| 500/500 [00:00<00:00, 1008246.15it/s]
Filtering results satifying {'limit': 3, 'threshold': 0.5}: 100%|██████████| 500/500 [00:00<00:00, 1131149.95it/s]
100%|██████████| 500/500 [00:00<00:00, 1923992.66it/s]
Filtering results satifying {'limit': 3, 'threshold': 0.7}: 100%|██████████| 500/500 [00:00<00:00, 883383.32it/s]
100%|██████████| 500/500 [00:00<00:00, 1343467.01it/s]
Filtering results satifying {'limit': 4, 'threshold': 0.3}: 100%|██████████| 500/500 [00:00<00:00, 1225687.90it/s]
100%|██████████| 500/500 [00:00<00:00, 904334.63it/s]
Filtering results satifying {'limit': 4, 'threshold': 0.5}: 100%|██████████| 500/500 [00:00<00:00, 718202.74it/s]
100%|██████████| 500/500 [00:00<00:00, 398698.10it/s]
Filtering results satifying {'limit': 4, 'threshold': 0.7}: 100%|██████████| 500/500 [00:00<00:00, 1091698.07it/s]
100%|██████████| 500/500 [00:00<00:00, 1

In [24]:
# Print the strategies with the highest hit rate and the highest MRR.
format_evaluation_result(results=eval_result)

Highest Hit Rate: {'limit': 5, 'threshold': 0.3, 'hit_rate': 0.898, 'mrr': 0.7617999999999995}
Highest MRR: {'limit': 5, 'threshold': 0.3, 'hit_rate': 0.898, 'mrr': 0.7617999999999995}
