In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/corpus-eng/corpus-engineering.json


In [2]:
# Cell 2: Install Libraries (VERBOSE)
import os
os.environ['PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION'] = 'python'
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True' # <--- ADD THIS LINE
print("Installing libraries with verbose output...")

print("\nInstalling hazm...")
!pip install hazm # For Persian NLP (normalization, tokenization)

print("\nInstalling sentence-transformers...")
!pip install sentence-transformers # For embedding models

print("\nInstalling chromadb...")
!pip install chromadb # For the vector database

print("\nInstalling tqdm...")
!pip install tqdm # For progress bars

#!pip install faiss-gpu

print("\nLibrary installation commands executed.")

print("\\nInstalling Google AI SDK...")
!pip install -q google-generativeai

Installing libraries with verbose output...

Installing hazm...
Collecting hazm
  Downloading hazm-0.10.0-py3-none-any.whl.metadata (11 kB)
Collecting fasttext-wheel<0.10.0,>=0.9.2 (from hazm)
  Downloading fasttext_wheel-0.9.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (16 kB)
Collecting flashtext<3.0,>=2.7 (from hazm)
  Downloading flashtext-2.7.tar.gz (14 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting numpy==1.24.3 (from hazm)
  Downloading numpy-1.24.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)
Collecting python-crfsuite<0.10.0,>=0.9.9 (from hazm)
  Downloading python_crfsuite-0.9.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.3 kB)
Collecting scipy<1.14.0,>=1.7.0 (from gensim<5.0.0,>=4.3.1->hazm)
  Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60

In [3]:
import json
import re
import time
import gc
import numpy as np
import torch
from tqdm.auto import tqdm
from datasets import load_dataset
import chromadb
import google.generativeai as genai
from google.api_core import exceptions
from kaggle_secrets import UserSecretsClient

In [4]:
# Cell 3.5 (New Cell): Gemini API Setup and Embedding Function

import google.generativeai as genai
from google.api_core import exceptions
import time
from kaggle_secrets import UserSecretsClient

# --- Configure the Gemini API ---
try:
    user_secrets = UserSecretsClient()
    api_key = user_secrets.get_secret("GEMINI_API_KEY")
    genai.configure(api_key=api_key)
    print("Gemini API configured successfully.")
except Exception as e:
    print(f"ERROR: Could not configure Gemini API. Make sure the secret 'GEMINI_API_KEY' is set. Details: {e}")
    # You might want to stop execution if the API key is not available
    # exit()

# --- New function to embed texts using the Gemini API ---
def embed_texts_gemini(texts, task_type, batch_size=100, model_id="models/embedding-001"):
    """
    Embeds a list of texts using the Gemini API, with batching and error handling.
    task_type must be one of: "RETRIEVAL_QUERY", "RETRIEVAL_DOCUMENT", "SEMANTIC_SIMILARITY",
                              "CLASSIFICATION", "CLUSTERING"
    """
    print(f"Embedding {len(texts)} texts with Gemini API (task: {task_type}, batch_size: {batch_size})...")
    embeddings = []
    
    for i in tqdm(range(0, len(texts), batch_size), desc=f"Gemini Embedding ({task_type})"):
        batch = texts[i:i+batch_size]
        
        # Retry logic for API calls
        for attempt in range(3): # Retry up to 3 times
            try:
                # Get embeddings for the batch
                result = genai.embed_content(
                    model=model_id,
                    content=batch,
                    task_type=task_type
                )
                # Append the embeddings from the result dictionary
                embeddings.extend(result['embedding'])
                break # Success, exit retry loop
            except (exceptions.ResourceExhausted, exceptions.InternalServerError) as e:
                print(f"API Error: {e}. Retrying in {5 * (attempt + 1)} seconds...")
                time.sleep(5 * (attempt + 1))
            except Exception as e:
                print(f"An unexpected error occurred during embedding: {e}")
                # For other errors, you might not want to retry
                # Add dummy embeddings to maintain list size if needed, or handle differently
                embeddings.extend([[0.0] * 768] * len(batch)) # 768 is embedding dim for gemini-001
                break
    
    return embeddings

Gemini API configured successfully.


In [5]:
import json
import os

def load_and_prepare_documents(filepath):
    """Loads documents from JSON, combines title and abstract."""
    documents = []
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)
        for doc in data:
            doc_id = str(doc.get('id', '')) # Ensure ID is a string for ChromaDB
            title = doc.get('title', '')
            abstract = doc.get('abstract', '')
            # Combine title and abstract, adding a separator for clarity
            combined_text = f"{title}\n\n{abstract}".strip()
            if doc_id and combined_text: # Only add if ID and text exist
                documents.append({
                    'id': doc_id,
                    'text': combined_text,
                    # Keep original fields if needed later, e.g., for metadata
                    'metadata': {
                        'original_title': title,
                        'first_subject': doc.get('FirstSubject', ''),
                        'second_subject': doc.get('SecondSubject', '')
                    }
                })
            else:
                print(f"Warning: Skipping document due to missing ID or text. Original data: {doc}")
        print(f"Loaded {len(documents)} documents from '{filepath}'.")
        return documents
    except FileNotFoundError:
        print(f"Error: File not found at {filepath}")
        return []
    except json.JSONDecodeError:
        print(f"Error: Could not decode JSON from {filepath}")
        return []
    except Exception as e:
        print(f"An unexpected error occurred during loading: {e}")
        return []

# Specify the path to your JSON file in Kaggle's directory structure
json_filename = "/kaggle/input/corpus-eng/corpus-engineering.json"

# Verify the file exists
if not os.path.exists(json_filename):
    raise ValueError(f"File not found: {json_filename}")
    
# Load the data from the file
corpus = load_and_prepare_documents(json_filename)

# Optional: Display the first loaded document to check
if corpus:
    print("\n--- Example Document (First one) ---")
    print(f"ID: {corpus[0]['id']}")
    print(f"Combined Text:\n{corpus[0]['text']}")
    print(f"Metadata: {corpus[0]['metadata']}")

Loaded 10846 documents from '/kaggle/input/corpus-eng/corpus-engineering.json'.

--- Example Document (First one) ---
ID: 802871
Combined Text:
شناسایی عوامل بیماری‌زای قارچی گیاهان دارویی کشت شده در استان کرمان

گیاهان دارویی در طول تاریخ همیشه با انسان قرابت خاصی داشته و آثار دارویی آن بر هیچ کس پوشیده نیست. گیاهان دارویی گیاهانی هستند که یک یا برخی از اندام های آن حاوی ماده موثره است این ماده که کمتر از یک درصد وزن خشک گیاه را تشکیل می دهد دارای خواص دارویی موثر بر موجودات زنده است همچنین کاشت داشت و برداشت این گیاهان به منظور استفاده از ماده موثره ی آنها انجام می گیرد و به رغم سابقه چند هزار ساله علم طب گیاهی و وجود کتب رساله ها و مقالات فراوان در این زمینه تحقیق در مورد بیماری های گیاهان دارویی قدمت زیادی ندارد به طور کلی یکی از مشکلات کشت انبوه گیاهان دارویی بیماری های مختلفی از جمله بیماری های قارچی می باشد که تحقیقات چندانی بروی آنها صورت نگرفته است و با توجه به این که بروز بیماری های گیاهان دارویی تابع شرایط محیطی می باشد ضروری است در هر منطقه تحقیقات جداگانه ای صورت بگیرد و ل

In [6]:
import re
from tqdm import tqdm
import hazm
# Initialize the Hazm Normalizer
normalizer = hazm.Normalizer()

def clean_and_normalize(text):
    """Applies Hazm normalization and basic cleaning."""
    # Apply Hazm normalizer (handles ی, ک, ZWNJ variations, etc.)
    normalized_text = normalizer.normalize(text)
    # Remove excessive whitespace (newlines, tabs, multiple spaces) -> replace with single space
    cleaned_text = re.sub(r'\s+', ' ', normalized_text).strip()
    # Optional: Add any other specific cleaning rules here if needed
    # e.g., removing specific punctuation, converting numbers, etc.
    return cleaned_text

# --- Apply cleaning to all documents ---
print("Cleaning and normalizing documents...")
cleaned_corpus = []
for doc in tqdm(corpus):
    cleaned_text = clean_and_normalize(doc['text'])
    if cleaned_text: # Only keep if text remains after cleaning
        cleaned_corpus.append({
            'id': doc['id'], # Use the original document ID as the chunk ID for now
            'text': cleaned_text,
            'metadata': doc['metadata'] # Carry over metadata
        })
print(f"\nFinished cleaning. Resulting number of documents/chunks: {len(cleaned_corpus)}")

# Optional: Display the cleaned version of the first document
if cleaned_corpus:
    print("\n--- Example Cleaned Document (First one) ---")
    print(f"ID: {cleaned_corpus[0]['id']}")
    print(f"Cleaned Text:\n{cleaned_corpus[0]['text']}")

Cleaning and normalizing documents...


100%|██████████| 10846/10846 [00:23<00:00, 462.19it/s]


Finished cleaning. Resulting number of documents/chunks: 10846

--- Example Cleaned Document (First one) ---
ID: 802871
Cleaned Text:
شناسایی عوامل بیماری‌زای قارچی گیاهان دارویی کشت شده در استان کرمان گیاهان دارویی در طول تاریخ همیشه با انسان قرابت خاصی داشته و آثار دارویی آن بر هیچ‌کس پوشیده نیست. گیاهان دارویی گیاهانی هستند که یک یا برخی از اندام‌های آن حاوی ماده موثره است این ماده که کمتر از یک درصد وزن خشک گیاه را تشکیل می‌دهد دارای خواص دارویی موثر بر موجودات زنده است همچنین کاشت داشت و برداشت این گیاهان به منظور استفاده از ماده موثره‌ی آنها انجام می‌گیرد و به رغم سابقه چند هزار ساله علم طب گیاهی و وجود کتب رساله‌ها و مقالات فراوان در این زمینه تحقیق در مورد بیماری‌های گیاهان دارویی قدمت زیادی ندارد به طور کلی یکی از مشکلات کشت انبوه گیاهان دارویی بیماری‌های مختلفی از جمله بیماری‌های قارچی می‌باشد که تحقیقات چندانی بروی آنها صورت نگرفته است و با توجه به این‌که بروز بیماری‌های گیاهان دارویی تابع شرایط محیطی می‌باشد ضروری است در هر منطقه تحقیقات جداگانه‌ای صورت بگیرد و لزوم تحقیق 




In [7]:

if 'cleaned_corpus' in globals():
    print(f"Variable 'cleaned_corpus' exists.")
    if isinstance(cleaned_corpus, list):
        print(f"It is a list with {len(cleaned_corpus)} items.")
        if len(cleaned_corpus) > 0:
            first_item = cleaned_corpus[0]
            print(f"First item type: {type(first_item)}")
            if isinstance(first_item, dict):
                print(f"First item is a dictionary with keys: {list(first_item.keys())}")
                # --- Check for the CORRECT keys ---
                if 'id' in first_item and 'text' in first_item:
                    print("Structure looks correct (has 'id' and 'text').") # Corrected message
                    print("\nFirst item example:")
                    # Print safely, handle potential missing keys if needed later
                    print(f"  id: {first_item.get('id')}")
                    print(f"  text: {first_item.get('text')[:100]}...") # Print first 100 chars of text
                    if 'metadata' in first_item:
                         print(f"  metadata keys: {list(first_item.get('metadata', {}).keys())}")
                else:
                    # This error should NOT appear now based on your previous output
                    print("Error: First item dictionary is missing 'id' or 'text' key.")
            else:
                print("Error: Items in the list are not dictionaries.")
        else:
            print("Warning: The list is empty.")
    else:
        print(f"Error: 'cleaned_corpus' is not a list, it is a {type(cleaned_corpus)}")
else:
    print("Error: Variable 'cleaned_corpus' does not exist.")

# --- End of corrected check cell ---

Variable 'cleaned_corpus' exists.
It is a list with 10846 items.
First item type: <class 'dict'>
First item is a dictionary with keys: ['id', 'text', 'metadata']
Structure looks correct (has 'id' and 'text').

First item example:
  id: 802871
  text: شناسایی عوامل بیماری‌زای قارچی گیاهان دارویی کشت شده در استان کرمان گیاهان دارویی در طول تاریخ همیشه...
  metadata keys: ['original_title', 'first_subject', 'second_subject']


In [8]:


def embed_texts_gemini(texts, task_type, batch_size=100):
    """Embeds texts using the Gemini API with batching and error handling."""
    model_id = "models/embedding-001"
    print(f"Embedding {len(texts)} texts with Gemini API (task: {task_type}, batch_size: {batch_size})...")
    all_embeddings = []
    for i in tqdm(range(0, len(texts), batch_size), desc=f"Gemini Embedding ({task_type})"):
        batch = texts[i:i + batch_size]
        for attempt in range(3):
            try:
                result = genai.embed_content(model=model_id, content=batch, task_type=task_type)
                all_embeddings.extend(result['embedding'])
                break
            except Exception as e:
                print(f"API Error: {e}. Retry {attempt + 1}/3 in 5s...")
                time.sleep(5)
                if attempt == 2:
                    print(f"FATAL: Failed to embed batch after multiple retries. Aborting.")
                    raise e # Re-raise the exception to stop the process
    return all_embeddings

def load_qa_dataset(dataset_identifier, split):
    """Loads the Question Answering dataset."""
    print(f"\n--- Loading QA Dataset: {dataset_identifier} ---")
    try:
        qa_dataset = load_dataset(dataset_identifier, split=split)
        print(f"Loaded {len(qa_dataset)} QA pairs.")
        return qa_dataset
    except Exception as e:
        print(f"FATAL ERROR loading QA dataset: {e}")
        raise e

def setup_chroma_collection(client, collection_name, embeddings_list, ids_list, documents_list, metadatas_list=None): # Added metadatas_list
    """Creates/gets a ChromaDB collection and adds data in batches."""
    print(f"\n--- Setting up ChromaDB collection: {collection_name} ---")
    try:
        collection = client.get_or_create_collection(name=collection_name, metadata={"hnsw:space": "l2"})
        
        # ChromaDB's max batch size (can be slightly less for safety)
        # The error message said 5461, let's use 5000 for safety
        chroma_batch_size = 5000 
        num_docs = len(ids_list)
        
        print(f"Adding {num_docs} documents to collection in batches of {chroma_batch_size}...")
        for i in tqdm(range(0, num_docs, chroma_batch_size), desc="Adding to Chroma"):
            batch_ids = ids_list[i:i + chroma_batch_size]
            batch_embeddings = embeddings_list[i:i + chroma_batch_size]
            batch_documents = documents_list[i:i + chroma_batch_size]
            batch_metadatas = metadatas_list[i:i + chroma_batch_size] if metadatas_list else None

            if not batch_ids: # Should not happen if num_docs > 0
                continue

            collection.add(
                ids=batch_ids,
                embeddings=batch_embeddings,
                documents=batch_documents,
                metadatas=batch_metadatas # Pass metadatas if available
            )
        print("Finished adding documents to collection.")
        return collection
    except Exception as e:
        print(f"FATAL ERROR setting up ChromaDB: {e}")
        # Attempt to clean up if collection was partially created
        try:
            if collection_name and client: # ensure client exists
                client.delete_collection(collection_name)
                print(f"Attempted cleanup of partially created collection '{collection_name}'.")
        except Exception as e_del:
            print(f"Warning during cleanup of collection {collection_name}: {e_del}")
        return None
# --- Step 4: Main Execution Block ---

def main():
    # --- Configuration ---
    MODEL_TO_TEST = "google/gemini-embedding-001"
    QA_DATASET_IDENTIFIER = "safora/persian-scientific-qa"
    QA_DATASET_SPLIT = "train"
    TOP_K = 10
    
    # 1. Configure API
    print("--- Configuring Gemini API ---")
    try:
        user_secrets = UserSecretsClient()
        api_key = user_secrets.get_secret("GEMINI_API_KEY")
        genai.configure(api_key=api_key)
        print("Gemini API configured successfully.")
    except Exception as e:
        print(f"FATAL ERROR: Could not configure Gemini API. Ensure secret 'GEMINI_API_KEY' is set. Details: {e}")
        return

    # 2. Load and Clean Data
    corpus = load_and_prepare_documents("/kaggle/input/corpus-eng/corpus-engineering.json")
    if not corpus: return

    hazm_normalizer = hazm.Normalizer()
    #cleaned_corpus = [{'id': doc['id'], 'text': clean_and_normalize(doc['text'], hazm_normalizer), 'metadata': doc['metadata']} for doc in corpus]
    qa_dataset = load_qa_dataset(QA_DATASET_IDENTIFIER, QA_DATASET_SPLIT)
    
    # --- Start of timed operations ---
    print(f"\n--- Starting Timed Evaluation for Model: {MODEL_TO_TEST} ---")
    
    # 3. Embed Corpus & Questions
    embedding_start_time = time.time()
    corpus_texts = [doc['text'] for doc in cleaned_corpus]
    corpus_embeddings = embed_texts_gemini(corpus_texts, task_type="RETRIEVAL_DOCUMENT")
    
    all_questions_texts = [item['query'] for item in qa_dataset]
    question_embeddings = embed_texts_gemini(all_questions_texts, task_type="RETRIEVAL_QUERY")
    embedding_time = time.time() - embedding_start_time

    # 4. Build Vector Index
    collection_name = "gemini_final_eval_fixed"
    chroma_client = chromadb.Client()
    
    try: chroma_client.delete_collection(name=collection_name)
    except Exception: pass
        
    print(f"\n--- Building Vector Index: {collection_name} ---")
    corpus_doc_ids = [doc['id'] for doc in cleaned_corpus]
    corpus_doc_texts = [doc['text'] for doc in cleaned_corpus]
    corpus_doc_metadatas = [doc['metadata'] for doc in cleaned_corpus] # Assuming metadata is needed

    collection = setup_chroma_collection( # CALLING THE CORRECTED FUNCTION
        chroma_client,
        collection_name,
        corpus_embeddings, # This is the list of embeddings for the corpus
        corpus_doc_ids,
        corpus_doc_texts,
        corpus_doc_metadatas
    )
  
    del corpus_embeddings, corpus_texts # Free up memory
    gc.collect()

    # 5. Run Search and Calculate Metrics
    search_start_time = time.time()
    print(f"\n--- Evaluating retrieval on {len(qa_dataset)} QA pairs ---")
    
    all_metrics = []
    for i in tqdm(range(len(qa_dataset)), desc="Evaluating QA pairs"):
        true_id = str(qa_dataset[i]['abstract_id'])
        query_embedding = [question_embeddings[i]]
        
        retrieved_ids = collection.query(query_embeddings=query_embedding, n_results=TOP_K)['ids'][0]
        
        # Calculate metrics for this single query
        gains = [1 if doc_id == true_id else 0 for doc_id in retrieved_ids]
        
        try:
            found_rank = gains.index(1) + 1
            mrr = 1.0 / found_rank
        except ValueError:
            found_rank = 0
            mrr = 0.0

        all_metrics.append({
            'mrr': mrr,
            'recall_at_1': 1 if found_rank == 1 else 0,
            'recall_at_3': 1 if 1 <= found_rank <= 3 else 0,
            'recall_at_5': 1 if 1 <= found_rank <= 5 else 0,
            'recall_at_10': 1 if 1 <= found_rank <= 10 else 0,
        })
    search_time = time.time() - search_start_time

    # 5. Run Search and Calculate Metrics
    search_start_time = time.time()
    print(f"\n--- Evaluating retrieval on {len(qa_dataset)} QA pairs ---")
    
    all_mrr = []
    all_recall_at_1 = []
    all_recall_at_3 = []
    all_recall_at_5 = []
    all_recall_at_10 = []
    all_precision_at_1 = []
    all_precision_at_3 = []
    all_precision_at_5 = []
    all_ndcg_at_10 = []

    for i in tqdm(range(len(qa_dataset)), desc="Evaluating QA pairs"):
        true_id = str(qa_dataset[i]['abstract_id'])
        # Ensure query_embedding is a list of lists for ChromaDB
        current_query_embedding = [question_embeddings[i]] 
        
        retrieved_ids = collection.query(query_embeddings=current_query_embedding, n_results=TOP_K)['ids'][0]
        
        # Calculate metrics for this single query
        gains = [1 if doc_id == true_id else 0 for doc_id in retrieved_ids]
        # Pad gains with 0s if fewer than TOP_K results were retrieved
        while len(gains) < TOP_K:
            gains.append(0)

        found_rank = 0
        try:
            found_rank = gains.index(1) + 1 # 1-based rank
            mrr = 1.0 / found_rank
        except ValueError: # Relevant document not found in top K
            mrr = 0.0
        
        all_mrr.append(mrr)
        all_recall_at_1.append(1 if 1 <= found_rank <= 1 else 0)
        all_recall_at_3.append(1 if 1 <= found_rank <= 3 else 0)
        all_recall_at_5.append(1 if 1 <= found_rank <= 5 else 0)
        all_recall_at_10.append(1 if 1 <= found_rank <= TOP_K else 0) # TOP_K instead of hardcoded 10

        # Precision@k
        all_precision_at_1.append(sum(gains[:1]) / 1.0)
        all_precision_at_3.append(sum(gains[:3]) / 3.0)
        all_precision_at_5.append(sum(gains[:5]) / 5.0)

        # nDCG@10
        dcg_at_10 = 0.0
        for k_idx_ndcg in range(TOP_K): # Iterate up to TOP_K
            dcg_at_10 += gains[k_idx_ndcg] / np.log2(k_idx_ndcg + 2) # rank is k_idx_ndcg + 1

        idcg_at_10 = 0.0
        if any(g == 1 for g in gains): # If the relevant document was retrieved at all
            idcg_at_10 = 1.0 / np.log2(1 + 1) # Ideal position for the single relevant document is rank 1
        
        ndcg_at_10_score = dcg_at_10 / idcg_at_10 if idcg_at_10 > 0 else 0.0
        all_ndcg_at_10.append(ndcg_at_10_score)

    search_time = time.time() - search_start_time

    # 6. Aggregate and Print Final Report
    num_queries = len(qa_dataset)
    final_mrr = np.mean(all_mrr) if all_mrr else 0.0
    final_recall_at_1 = np.mean(all_recall_at_1) if all_recall_at_1 else 0.0
    final_recall_at_3 = np.mean(all_recall_at_3) if all_recall_at_3 else 0.0
    final_recall_at_5 = np.mean(all_recall_at_5) if all_recall_at_5 else 0.0
    final_recall_at_10 = np.mean(all_recall_at_10) if all_recall_at_10 else 0.0
    
    final_precision_at_1 = np.mean(all_precision_at_1) if all_precision_at_1 else 0.0
    final_precision_at_3 = np.mean(all_precision_at_3) if all_precision_at_3 else 0.0
    final_precision_at_5 = np.mean(all_precision_at_5) if all_precision_at_5 else 0.0
    final_ndcg_at_10 = np.mean(all_ndcg_at_10) if all_ndcg_at_10 else 0.0


    print("\n" + "="*50)
    print("--- FINAL EVALUATION REPORT ---")
    print(f"Model Tested: {MODEL_TO_TEST}")
    print(f"Embedding Time: {embedding_time:.2f} seconds")
    print(f"Search & Metric Calculation Time: {search_time:.2f} seconds")
    print("--- METRICS ---")
    print(f"MRR@{TOP_K}: {final_mrr:.4f}")
    print(f"Recall@1 (Accuracy): {final_recall_at_1:.4f}")
    print(f"Recall@3: {final_recall_at_3:.4f}")
    print(f"Recall@5: {final_recall_at_5:.4f}")
    print(f"Recall@10: {final_recall_at_10:.4f}")
    print(f"Precision@1: {final_precision_at_1:.4f}")
    print(f"Precision@3: {final_precision_at_3:.4f}")
    print(f"Precision@5: {final_precision_at_5:.4f}")
    print(f"nDCG@10: {final_ndcg_at_10:.4f}") # Using TOP_K which is 10
    print("="*50)
    # 7. Cleanup
    print("\n--- Cleaning up resources ---")
    try:
        chroma_client.delete_collection(name=collection_name)
        print(f"Successfully deleted collection '{collection_name}'.")
    except Exception as e:
        print(f"Warning during cleanup: {e}")

# Run the main function
if __name__ == "__main__":
    main()

--- Configuring Gemini API ---
Gemini API configured successfully.
Loaded 10846 documents from '/kaggle/input/corpus-eng/corpus-engineering.json'.

--- Loading QA Dataset: safora/persian-scientific-qa ---


README.md:   0%|          | 0.00/2.22k [00:00<?, ?B/s]

persian-scientific-qa.json:   0%|          | 0.00/81.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/39883 [00:00<?, ? examples/s]

Loaded 39883 QA pairs.

--- Starting Timed Evaluation for Model: google/gemini-embedding-001 ---
Embedding 10846 texts with Gemini API (task: RETRIEVAL_DOCUMENT, batch_size: 100)...


Gemini Embedding (RETRIEVAL_DOCUMENT): 100%|██████████| 109/109 [02:03<00:00,  1.14s/it]


Embedding 39883 texts with Gemini API (task: RETRIEVAL_QUERY, batch_size: 100)...


Gemini Embedding (RETRIEVAL_QUERY): 100%|██████████| 399/399 [07:04<00:00,  1.06s/it]



--- Building Vector Index: gemini_final_eval_fixed ---

--- Setting up ChromaDB collection: gemini_final_eval_fixed ---
Adding 10846 documents to collection in batches of 5000...


Adding to Chroma: 100%|██████████| 3/3 [00:20<00:00,  6.79s/it]


Finished adding documents to collection.

--- Evaluating retrieval on 39883 QA pairs ---


Evaluating QA pairs: 100%|██████████| 39883/39883 [01:35<00:00, 419.55it/s]



--- Evaluating retrieval on 39883 QA pairs ---


Evaluating QA pairs: 100%|██████████| 39883/39883 [01:36<00:00, 413.67it/s]



--- FINAL EVALUATION REPORT ---
Model Tested: google/gemini-embedding-001
Embedding Time: 549.78 seconds
Search & Metric Calculation Time: 96.41 seconds
--- METRICS ---
MRR@10: 0.0194
Recall@1 (Accuracy): 0.0148
Recall@3: 0.0221
Recall@5: 0.0254
Recall@10: 0.0301
Precision@1: 0.0148
Precision@3: 0.0074
Precision@5: 0.0051
nDCG@10: 0.0219

--- Cleaning up resources ---
Successfully deleted collection 'gemini_final_eval_fixed'.
