In [1]:
# Imports
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import precision_score, recall_score
import ir_datasets
from collections import defaultdict
import warnings
warnings.filterwarnings('ignore')

# Add project root to path
project_root = os.path.dirname(os.path.abspath('.'))
if project_root not in sys.path:
    sys.path.append(project_root)

# Import our services
from services.vectorization.tfidf_vectorizer import load_tfidf_model
from services.vectorization.embedding_vectorizer import load_embedding_model
from services.matcher import Matcher
from services.query_processor import QueryProcessor

# Set up plotting style
plt.style.use('default')

print("✓ All imports completed successfully!")
print(f"✓ Project root: {project_root}")

INFO:services.indexing_service:Loading required NLTK resources...


✓ All imports completed successfully!
✓ Project root: c:\Users\Ahmad\Desktop\ir_system_project


In [2]:
# Load documents from database and queries/qrels from evaluation folder
import sqlite3
import pandas as pd
from collections import defaultdict
import os

dataset_name = "antique/test"
print(f"Loading data for: {dataset_name}")

# 1. Load documents from database
print("Loading documents from database...")
db_path = os.path.join(project_root, "data", "ir_documents.db")
print(f"Database path: {db_path}")

conn = sqlite3.connect(db_path)
cursor = conn.cursor()
cursor.execute("SELECT doc_id, text FROM documents WHERE dataset = ?", (dataset_name,))
rows = cursor.fetchall()
conn.close()

documents = {row[0]: row[1] for row in rows}
print(f"Loaded {len(documents)} documents from database")

# 2. Load queries from evaluation folder
print("Loading queries from evaluation folder...")
queries_path = os.path.join(project_root, "evaluation", "antique_test.queries.tsv")
queries_df = pd.read_csv(queries_path, sep='\t', header=None, names=['query_id', 'query_text'])
queries = dict(zip(queries_df['query_id'], queries_df['query_text']))

# 3. Load qrels from evaluation folder
print("Loading qrels from evaluation folder...")
qrels_path = os.path.join(project_root, "evaluation", "antique_test.qrels")
qrels_df = pd.read_csv(qrels_path, sep=' ', header=None, names=['query_id', 'run_id', 'doc_id', 'relevance'])

# Group by query_id, handle NaN values
qrels = {}
for _, row in qrels_df.iterrows():
    query_id = str(row['query_id'])
    doc_id = str(row['doc_id'])
    
    # Handle NaN values in relevance
    if pd.isna(row['relevance']):
        relevance = 0
    else:
        relevance = int(row['relevance'])
    
    if query_id not in qrels:
        qrels[query_id] = {}
    qrels[query_id][doc_id] = relevance

print(f"✓ Loaded: {len(documents)} documents, {len(queries)} queries, {len(qrels)} query-relevance sets")
print(f"Sample query: {list(queries.items())[0]}")
print(f"Sample qrel: {list(qrels.items())[0]}")

Loading data for: antique/test
Loading documents from database...
Database path: c:\Users\Ahmad\Desktop\ir_system_project\data\ir_documents.db
Loaded 403666 documents from database
Loading queries from evaluation folder...
Loading qrels from evaluation folder...
✓ Loaded: 403666 documents, 200 queries, 200 query-relevance sets
Sample query: (3990512, 'how can we get concentration onsomething?')
Sample qrel: ('1964316', {'1964316_5': 4, '1674088_11': 1, '1218838_13': 2, '1519022_15': 2, '3059341_5': 2, '4126855_1': 2, '2434719_9': 2, '3786452_1': 2, '1964316_3': 4, '1964316_2': 4, '767911_0': 2, '1964316_0': 4, '1964316_1': 3, '1964316_4': 3, '1248144_1': 2, '2768257_0': 2, '1519022_3': 2, '2245059_0': 2, '1013722_5': 2, '650233_14': 2, '2305171_0': 2, '3435824_3': 2, '636973_2': 1, '1724160_7': 2, '3592532_6': 2, '636973_1': 1, '1148987_10': 2, '2787567_1': 2, '647686_0': 2, '369616_4': 4, '1759521_19': 3, '2929011_0': 4, '1810312_4': 2})


In [None]:
Loading Hybrid models and vectors...
❌ Error loading TF-IDF: TF-IDF model not found: antique/test
❌ Error loading Embedding: Model not found: models\antique_test_embedding_model.joblib
❌ Cannot proceed with hybrid evaluation - missing models