In [2]:
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk
# Fetch the NLTK data packages this notebook relies on: the Punkt tokenizer
# data ("punkt" / "punkt_tab") and the English stop-word list ("stopwords").
# The captured output below shows that packages which are already up to date
# are reported as such rather than downloaded again.
nltk.download("punkt")
nltk.download("stopwords")
nltk.download('punkt_tab')

def preprocess(text):
    """Normalize raw text into a single string of stemmed tokens.

    Steps, in order: lowercase the text, delete every character listed in
    ``string.punctuation``, tokenize with ``word_tokenize``, drop English
    stop words, and stem each surviving token with ``PorterStemmer``.

    Args:
        text: The string to normalize.

    Returns:
        The stemmed tokens joined by single spaces.
    """
    punctuation_table = str.maketrans('', '', string.punctuation)
    cleaned = text.lower().translate(punctuation_table)

    candidate_tokens = word_tokenize(cleaned)
    english_stop_words = set(stopwords.words('english'))
    kept_tokens = [tok for tok in candidate_tokens if tok not in english_stop_words]

    stemmer = PorterStemmer()
    return " ".join(stemmer.stem(tok) for tok in kept_tokens)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [3]:
# services/search_classes.py (Content in a Jupyter Notebook cell)

import faiss
import numpy as np
import sqlite3
from sentence_transformers import SentenceTransformer # Also needed for BertSearch
from rank_bm25 import BM25Okapi # Needed for Bm25Search if not already imported
from sklearn.feature_extraction.text import TfidfVectorizer # Needed for TfIdfSearch if not already imported

# Ensure preprocess function is defined in a preceding cell or imported if from a separate .py file
# from services.preprocessing_service import preprocess

class TfIdfSearch:
    """Ranks documents against a query with a pre-fitted TF-IDF model.

    ``data`` must provide three entries: ``"vectorizer"`` (an object with a
    ``transform`` method), ``"matrix"`` and ``"doc_ids"``.
    NOTE(review): presumably each row of the matrix corresponds to the entry
    of ``doc_ids`` at the same position — confirm against the indexing code.
    """

    def __init__(self, data):
        """Store the vectorizer, matrix and document ids found in ``data``."""
        self.vec = data["vectorizer"]
        self.mat = data["matrix"]
        self.doc_ids = data["doc_ids"]

    def execute_search(self, query):
        """Return the ids of up to ten documents, highest score first.

        The query goes through ``preprocess`` (which must be defined in the
        notebook's scope) before being vectorized.
        """
        query_vector = self.vec.transform([preprocess(query)])
        similarity = self.mat @ query_vector.T
        flat_scores = similarity.toarray().flatten()
        # Ascending argsort reversed gives positions from best to worst.
        best_first = flat_scores.argsort()[::-1]
        return [self.doc_ids[position] for position in best_first[:10]]

class Bm25Search:
    """Ranks documents against a query with a pre-built BM25 model.

    ``data`` must provide ``"bm25"`` (an object with a ``get_scores``
    method), ``"doc_ids"`` and ``"tokenized_docs"``.
    """

    def __init__(self, data):
        """Store the BM25 model, document ids and tokenized documents."""
        self.bm25 = data["bm25"]
        self.doc_ids = data["doc_ids"]
        self.tokenized = data["tokenized_docs"]

    def execute_search(self, query):
        """Return the ids of up to ten documents, highest BM25 score first.

        The query goes through ``preprocess`` (which must be defined in the
        notebook's scope) and is then split on whitespace into tokens.
        """
        query_tokens = preprocess(query).split()
        scores = self.bm25.get_scores(query_tokens)
        # Stable sort: documents with equal scores keep their original order.
        ranked = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)
        return [self.doc_ids[position] for position, _score in ranked[:10]]

class BertSearch:
    """Dense retrieval: sentence embeddings searched through a FAISS index.

    The constructor loads the ``all-MiniLM-L6-v2`` SentenceTransformer, the
    FAISS index stored at ``faiss_store/<dataset>/index.faiss`` and the
    ``doc_id``/``doc`` rows of the table named ``dataset`` in
    ``offline/ir_project.db``.

    NOTE(review): FAISS labels are mapped to document ids purely by position
    in ``self.doc_ids``, and the SELECT has no ORDER BY. This assumes the
    table's row order matches the order in which vectors were added to the
    index — confirm against the index-building code.
    """

    def __init__(self, dataset):
        """Load the encoder, the FAISS index and the documents for ``dataset``.

        Args:
            dataset: Dataset name; used both as the FAISS sub-directory and
                as the SQLite table name.
        """
        self.model = SentenceTransformer("all-MiniLM-L6-v2")

        index_path = f"faiss_store/{dataset}/index.faiss"
        self.index = faiss.read_index(index_path)

        if self.index.ntotal > 0:
            print(f"FAISS index loaded for dataset: {dataset} (size: {self.index.ntotal})")

        # NOTE(review): the table name is interpolated into the SQL text
        # (identifiers cannot be bound as parameters), so only trusted
        # dataset names should reach this constructor.
        conn = sqlite3.connect("offline/ir_project.db")
        try:
            cursor = conn.execute(f"SELECT doc_id, doc FROM {dataset}")
            self.docs = {row[0]: row[1] for row in cursor}
        finally:
            # Release the connection even when the query fails.
            conn.close()
        self.doc_ids = list(self.docs.keys())

    def execute_search(self, query):
        """Return the doc_ids of up to ten nearest neighbours of ``query``.

        The query goes through ``preprocess`` (which must be defined in the
        notebook's scope), is embedded as float32 and searched in the index.
        """
        processed = preprocess(query)
        q_emb = self.model.encode([processed]).astype("float32")
        _distances, labels = self.index.search(q_emb, 10)
        return self._ids_from_labels(labels[0])

    def _ids_from_labels(self, labels):
        """Translate FAISS labels to doc_ids, dropping labels out of range.

        FAISS fills missing neighbours with the label -1. A plain upper-bound
        check would let -1 through and Python's negative indexing would
        return the last document, so the lower bound is checked as well.
        """
        known = len(self.doc_ids)
        return [self.doc_ids[label] for label in labels if 0 <= label < known]

class HybridSearch:
    """Combines the BERT, BM25 and TF-IDF rankings into one result list."""

    def __init__(self, dataset):
        """Create the three underlying search services for ``dataset``.

        NOTE(review): the same ``dataset`` value is handed to all three
        services. The TfIdfSearch and Bm25Search classes in this notebook
        index their argument as a mapping (e.g. ``data["bm25"]``), so this
        presumably relies on the imported factory loading that data itself —
        TODO confirm.
        """
        # get_search_service must be importable from services.search_factory.
        from services.search_factory import get_search_service

        self.bert, self.bm25, self.tfidf = (
            get_search_service(kind, dataset) for kind in ("bert", "bm25", "tfidf")
        )

    def execute_search(self, query):
        """Return up to ten distinct doc ids: BERT hits, then BM25, then TF-IDF.

        Duplicates keep the position of their first occurrence.
        """
        per_engine = [
            engine.execute_search(query)
            for engine in (self.bert, self.bm25, self.tfidf)
        ]
        merged = per_engine[0] + per_engine[1] + per_engine[2]
        # dict.fromkeys removes duplicates while preserving insertion order.
        first_occurrences = dict.fromkeys(merged)
        return list(first_occurrences)[:10]

In [4]:
def get_search_service(search_type, dataset):
    """Instantiate the search service registered under ``search_type``.

    Args:
        search_type: One of "tfidf", "bm25", "bert" or "hybrid".
        dataset: Passed unchanged to the selected class's constructor.

    Returns:
        A new TfIdfSearch, Bm25Search, BertSearch or HybridSearch instance.

    Raises:
        ValueError: If ``search_type`` is not one of the four known names.
    """
    if search_type not in ("tfidf", "bm25", "bert", "hybrid"):
        raise ValueError("Invalid search type")

    if search_type == "tfidf":
        return TfIdfSearch(dataset)
    if search_type == "bm25":
        return Bm25Search(dataset)
    if search_type == "bert":
        return BertSearch(dataset)
    return HybridSearch(dataset)

In [5]:
import joblib
def load_data(search_type, dataset):
    """Return what the ``search_type`` service needs for ``dataset``.

    For "bert" and "hybrid" the dataset name itself is returned. For any
    other type the file ``offline_data/<search_type>_<dataset>.joblib`` is
    loaded, the dataset name is stored in it under "dataset", and the loaded
    object is returned.
    """
    if search_type in ("bert", "hybrid"):
        return dataset

    # joblib.load unpickles the file, so only trusted artifacts belong here.
    artifact_path = f"offline_data/{search_type}_{dataset}.joblib"
    payload = joblib.load(artifact_path)
    payload["dataset"] = dataset
    return payload

async def search(
    query: str,
    dataset: str,
    search_type: str
):
    """Run ``query`` against ``dataset`` with the requested search type.

    Loads the data for the search type via ``load_data``, builds the service
    via ``get_search_service`` and returns whatever its ``execute_search``
    returns. Nothing is awaited inside this coroutine.
    """
    # The value passed to load_data is always the search type itself.
    payload = load_data(search_type, dataset)
    engine = get_search_service(search_type, payload)
    results = engine.execute_search(query)
    return results

In [6]:
import os
import asyncio
import json
from collections import defaultdict
import joblib # Ensure joblib is imported if not already

# --- Your load_data function (as provided by you) ---
def load_data(search_type, dataset):
    """Resolve the constructor argument for a search service.

    "bert" and "hybrid" services receive the dataset name as-is. Every other
    search type receives the object stored in
    ``offline_data/<search_type>_<dataset>.joblib``, with the dataset name
    added under the "dataset" key.
    """
    needs_only_name = search_type in ("bert", "hybrid")
    if needs_only_name:
        return dataset

    # joblib.load unpickles the file, so only trusted artifacts belong here.
    loaded = joblib.load(f"offline_data/{search_type}_{dataset}.joblib")
    loaded["dataset"] = dataset
    return loaded

# --- Evaluation Data Loaders (rest of your functions) ---

def load_queries_from_tsv(filepath):
    """Read queries from a tab-separated file of ``query_id<TAB>query_text``.

    Each line is stripped and split at its first tab; lines without a tab
    after stripping are ignored. If the file does not exist a warning is
    printed and an empty dict is returned.

    Returns:
        dict mapping query id (str) to query text (str).
    """
    if not os.path.exists(filepath):
        print(f"Warning: Query file not found at {filepath}")
        return {}

    queries = {}
    with open(filepath, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            query_id, separator, query_text = raw_line.strip().partition('\t')
            if separator:
                queries[query_id] = query_text
    return queries

def load_qrels_from_tsv(filepath, min_relevance=1):
    """Load relevance judgments from a whitespace-separated qrels file.

    Two row layouts are accepted:
      * 4 columns: ``query_id  <ignored>  doc_id  relevance``
      * 3 columns: ``query_id  doc_id  relevance``
    Rows with any other column count, or with a non-integer relevance, are
    skipped. The first line is skipped when it starts with ``query-id``,
    ``query_id`` or ``#`` (a header).

    Args:
        filepath: Path to the qrels file.
        min_relevance: Lowest integer relevance grade that counts as
            relevant. The default of 1 treats every grade >= 1 as relevant.
            NOTE(review): collections with graded labels may define only the
            higher grades as relevant — confirm per dataset.

    Returns:
        defaultdict(set) mapping query id to the set of relevant doc ids.
        Empty (after printing a warning) when the file does not exist.
    """
    qrels = defaultdict(set)  # Maps query_id to a set of relevant doc_ids
    if not os.path.exists(filepath):
        print(f"Warning: Qrels file not found at {filepath}")
        return qrels

    header_prefixes = ('query-id', 'query_id', '#')
    with open(filepath, 'r', encoding='utf-8') as f:
        for line_number, line in enumerate(f):
            if line_number == 0 and line.strip().startswith(header_prefixes):
                continue  # Header row, not a judgment

            parts = line.split()  # Any whitespace: tabs and spaces both work
            if len(parts) == 4:
                query_id, _, doc_id, relevance = parts
            elif len(parts) == 3:
                query_id, doc_id, relevance = parts
            else:
                continue  # Neither of the supported layouts

            # Parse once, for both layouts, so a malformed score skips the
            # row instead of aborting the whole load.
            try:
                grade = int(relevance)
            except ValueError:
                continue

            if grade >= min_relevance:
                qrels[query_id].add(doc_id)
    return qrels

def load_queries_from_jsonl_file(filepath):
    """Read queries from a JSON Lines file (e.g. the Quora query file).

    Every non-blank line is parsed as JSON; the query id comes from the
    ``"_id"`` field and the text from the ``"text"`` field. Records where
    either value is missing or falsy are ignored. Lines that are not valid
    JSON produce a printed warning and are skipped. If the file does not
    exist a warning is printed and an empty dict is returned.

    Returns:
        dict mapping query id to query text.
    """
    if not os.path.exists(filepath):
        print(f"Warning: Query JSONL file not found at {filepath}")
        return {}

    queries = {}
    with open(filepath, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            if not raw_line.strip():
                continue  # Blank line
            try:
                record = json.loads(raw_line)
            except json.JSONDecodeError:
                print(f"Warning: Could not parse JSONL line from {filepath}: {raw_line[:50]}...")
                continue
            query_id, query_text = record.get("_id"), record.get("text")
            if query_id and query_text:
                queries[query_id] = query_text
    return queries

# --- Evaluation Metrics ---

def calculate_precision_at_k(retrieved_docs, relevant_docs, k):
    """Compute Precision@k for one query.

    The retrieved list is de-duplicated (first occurrence wins) and cut to
    its first ``k`` entries; the number of those entries found in
    ``relevant_docs`` is divided by ``k`` itself, even when fewer than ``k``
    documents were retrieved. Returns 0.0 when either input is empty or when
    ``k`` is not positive.
    """
    if not (relevant_docs and retrieved_docs):
        return 0.0

    top_k = list(dict.fromkeys(retrieved_docs))[:k]
    relevant_in_top_k = sum(1 for doc_id in top_k if doc_id in relevant_docs)

    if k > 0:
        return relevant_in_top_k / k
    return 0.0

def calculate_mrr(retrieved_docs, relevant_docs):
    """Return the reciprocal rank of the first relevant document.

    Ranks start at 1. The result is 0.0 when ``relevant_docs`` is empty or
    when no retrieved document is relevant. Averaging this value over
    queries gives the Mean Reciprocal Rank.
    """
    if not relevant_docs:
        return 0.0

    reciprocal_ranks = (
        1.0 / rank
        for rank, doc_id in enumerate(retrieved_docs, start=1)
        if doc_id in relevant_docs
    )
    # Only the first relevant hit matters; default to 0.0 when there is none.
    return next(reciprocal_ranks, 0.0)

def calculate_recall(retrieved_docs, relevant_docs):
    """Compute recall for one query over the whole retrieved list.

    Recall is the fraction of ``relevant_docs`` that appear in
    ``retrieved_docs``. Each retrieved id is counted once, even if it occurs
    several times, so the result always lies in [0.0, 1.0]; this matches the
    de-duplication done by ``calculate_precision_at_k``.

    Returns:
        0.0 when ``relevant_docs`` is empty, otherwise hits / len(relevant_docs).
    """
    if not relevant_docs:
        return 0.0

    # dict.fromkeys drops repeated ids so a duplicate cannot be a second hit.
    distinct_retrieved = dict.fromkeys(retrieved_docs)
    hits = sum(1 for doc_id in distinct_retrieved if doc_id in relevant_docs)
    return hits / len(relevant_docs)

def calculate_average_precision(retrieved_docs, relevant_docs):
    """Compute Average Precision (AP) for one query.

    The retrieved list is de-duplicated first (first occurrence wins), so a
    repeated id cannot be counted as several relevant hits. For each rank
    that holds a relevant document, the precision at that rank is added up;
    the sum is divided by the total number of relevant documents.

    Returns:
        0.0 when either input is empty, otherwise the AP in [0.0, 1.0].
    """
    if not relevant_docs or not retrieved_docs:
        return 0.0

    ranking = list(dict.fromkeys(retrieved_docs))

    precision_sum = 0.0
    relevant_seen = 0
    for rank, doc_id in enumerate(ranking, start=1):
        if doc_id in relevant_docs:
            relevant_seen += 1
            precision_sum += relevant_seen / rank

    # relevant_docs is non-empty here (checked above), so this is safe.
    return precision_sum / len(relevant_docs)


# --- Main Evaluation Logic ---

async def run_full_evaluation(datasets_info, search_types_to_evaluate, k_values=[1, 5, 10]):
    """
    Runs a full evaluation across specified datasets and search types.

    For every dataset and search type, each query that has relevance
    judgments is executed and MRR, Recall, Average Precision and Precision@K
    are averaged over the successfully evaluated queries. The averages are
    printed as they are computed and again in a final summary. The function
    has no return statement, and although it is declared ``async`` its body
    contains no ``await``.

    datasets_info: A dictionary mapping dataset names to their query/qrels file paths.
                   e.g., {"dataset_name": {"queries_path": "path/to/queries.txt", "qrels_path": "path/to/qrels.tsv", "query_format": "tsv"}}
                   "query_format" selects the JSONL loader when it equals "jsonl"; any other
                   value (or a missing key) selects the TSV loader.
    search_types_to_evaluate: Iterable of search type names passed to load_data and get_search_service.
    k_values: List of K values for Precision@K. The default list is only read, never modified.
    """
    # dataset name -> search type -> metric name -> averaged value
    overall_results = defaultdict(lambda: defaultdict(lambda: defaultdict(float)))
    
    for dataset_name, dataset_config in datasets_info.items():
        print(f"\n======== Evaluating Dataset: {dataset_name} ========")
        
        queries = {}
        if dataset_config.get("query_format") == "jsonl":
            queries = load_queries_from_jsonl_file(dataset_config["queries_path"])
        else: # Default to TSV if not specified or different
            queries = load_queries_from_tsv(dataset_config["queries_path"])

        qrels = load_qrels_from_tsv(dataset_config["qrels_path"])

        # A dataset with no queries or no judgments cannot be scored.
        if not queries:
            print(f"Skipping {dataset_name}: No queries loaded from {dataset_config['queries_path']}.")
            continue
        if not qrels:
            print(f"Skipping {dataset_name}: No qrels loaded from {dataset_config['qrels_path']}.")
            continue
        
        for search_type in search_types_to_evaluate:
            print(f"\n--- Search Type: {search_type.upper()} ---")
            
            # Running sums; divided by query_count_for_metrics after the query loop.
            total_mrr = 0.0
            total_recall = 0.0
            total_map = 0.0
            total_precision_at_k = defaultdict(float)
            query_count_for_metrics = 0

            try:
                # load_data returns either the dataset name or the loaded joblib object,
                # which is then handed to the service constructor.
                loaded_search_components = load_data(search_type, dataset_name)
                current_search_service = get_search_service(search_type, loaded_search_components) 
            except Exception as e:
                # Any failure while building the service skips this search type only.
                print(f"Error initializing search service for {search_type} on {dataset_name}: {e}. Skipping.")
                continue

            for query_id, query_text in queries.items():
                if query_id not in qrels:
                    # Queries without relevance judgments are left out of the metrics.
                    continue 

                relevant_docs_for_query = qrels[query_id]
                
                try:
                    # execute_search is a regular (synchronous) method, so it is called without await.
                    retrieved_docs = current_search_service.execute_search(query_text)
                    
                    mrr_score = calculate_mrr(retrieved_docs, relevant_docs_for_query)
                    total_mrr += mrr_score

                    # Recall is measured over everything execute_search returned; no cutoff is applied here.
                    recall_score = calculate_recall(retrieved_docs, relevant_docs_for_query)
                    total_recall += recall_score

                    ap_score = calculate_average_precision(retrieved_docs, relevant_docs_for_query)
                    total_map += ap_score


                    for k in k_values:
                        pk_score = calculate_precision_at_k(retrieved_docs, relevant_docs_for_query, k)
                        total_precision_at_k[k] += pk_score
                    
                    # Counted only after every metric succeeded for this query.
                    query_count_for_metrics += 1

                except Exception as e:
                    # NOTE(review): a failing query is reported and excluded from the
                    # denominator, so the averages cover successfully evaluated queries only.
                    print(f"Error during search for query '{query_id}' ({search_type}) in {dataset_name}: {e}")
            
            if query_count_for_metrics > 0:
                avg_mrr = total_mrr / query_count_for_metrics
                print(f"  Average MRR: {avg_mrr:.4f}")
                overall_results[dataset_name][search_type]['MRR'] = avg_mrr

                avg_recall = total_recall / query_count_for_metrics
                print(f"  Average Recall: {avg_recall:.4f}")
                overall_results[dataset_name][search_type]['Recall'] = avg_recall

                # Mean of the per-query Average Precision values (MAP).
                avg_map = total_map / query_count_for_metrics
                print(f"  Average MAP: {avg_map:.4f}")
                overall_results[dataset_name][search_type]['MAP'] = avg_map


                for k in k_values:
                    avg_pk = total_precision_at_k[k] / query_count_for_metrics
                    print(f"  Average P@{k}: {avg_pk:.4f}")
                    overall_results[dataset_name][search_type][f'P@{k}'] = avg_pk
            else:
                print(f"  No queries with relevance judgments found for {dataset_name} ({search_type}) to calculate metrics.")
            
            print("-" * 40) # Separator for search types
    
    # Final recap of every (dataset, search type) pair that produced metrics.
    print("\n======== Full Evaluation Summary ========")
    for dataset, search_types_results in overall_results.items():
        print(f"Dataset: {dataset}")
        for search_type, metrics in search_types_results.items():
            metric_str = ", ".join([f"{m}: {v:.4f}" for m, v in metrics.items()])
            print(f"  {search_type.upper()}: {metric_str}")
        print("-" * 30)

# --- Configuration for your evaluation ---

# Per-dataset evaluation inputs. Each entry names the query file, the qrels
# file and the query file format ("tsv" or "jsonl") consumed by
# run_full_evaluation. All paths are relative to the working directory.
EVAL_DATASETS_INFO = {
    "antique": { 
        "queries_path": "data/antique/queries.txt",
        "qrels_path": "data/antique/qrels.tsv",
        "query_format": "tsv"
    },
    "quora": { 
        "queries_path": "data/quora/queries.jsonl",
        "qrels_path": "data/quora/qrels/test.tsv", 
        "query_format": "jsonl"
    }
}
# Only the TF-IDF and BERT services are evaluated; 'bm25' and 'hybrid' are
# intentionally left out of this run.
EVAL_SEARCH_TYPES = ["tfidf", "bert"]
# Cutoffs used for Precision@K.
EVAL_K_VALUES = [1, 5, 10]

# --- Execute the full evaluation ---
if __name__ == "__main__":
    # The `await` below sits outside any function. That works in this
    # notebook because IPython/Jupyter cells accept top-level await; in a
    # plain .py script it is a SyntaxError, and the call would have to be
    # wrapped as asyncio.run(run_full_evaluation(...)) instead.
    # If the environment reports event-loop problems, applying nest_asyncio
    # once at the start of the notebook is a possible workaround:
    # import nest_asyncio
    # nest_asyncio.apply()
    
    print("Starting full evaluation...\n")
    # Prerequisites: get_search_service and the search classes (TfIdfSearch,
    # BertSearch, ...) must already be defined in the kernel, and the
    # 'offline_data' directory must contain the .joblib files for every
    # search type that load_data reads from disk (e.g. 'tfidf', 'bm25').
    await run_full_evaluation(EVAL_DATASETS_INFO, EVAL_SEARCH_TYPES, EVAL_K_VALUES)
    print("\nFull evaluation complete!")

Starting full evaluation...



--- Search Type: TFIDF ---
  Average MRR: 0.4110
  Average Recall: 0.0653
  Average MAP: 0.0463
  Average P@1: 0.3250
  Average P@5: 0.2260
  Average P@10: 0.1910
----------------------------------------

--- Search Type: BERT ---
FAISS index loaded for dataset: antique (size: 403666)
  Average MRR: 0.5412
  Average Recall: 0.0971
  Average MAP: 0.0728
  Average P@1: 0.4500
  Average P@5: 0.3450
  Average P@10: 0.2895
----------------------------------------


--- Search Type: TFIDF ---
  Average MRR: 0.3018
  Average Recall: 0.4144
  Average MAP: 0.2787
  Average P@1: 0.2385
  Average P@5: 0.0889
  Average P@10: 0.0541
----------------------------------------

--- Search Type: BERT ---
FAISS index loaded for dataset: quora (size: 522931)
  Average MRR: 0.5520
  Average Recall: 0.6975
  Average MAP: 0.5249
  Average P@1: 0.4648
  Average P@5: 0.1635
  Average P@10: 0.0949
----------------------------------------

Dataset: antique
  TFIDF: MRR: 0.4110, Rec

In [1]:
import os
# Sanity check: confirm that the Quora qrels file configured in
# EVAL_DATASETS_INFO exists relative to the current working directory.
print(os.path.exists("data/quora/qrels/test.tsv"))

True
