In [None]:
import sys
from pathlib import Path

import pandas as pd
from sentence_transformers import SentenceTransformer

# The data files below are read from '../data/...', so this notebook runs from
# a subdirectory of the project. Importing `scripts` from here failed with
# "ModuleNotFoundError: No module named 'scripts'" because only the notebook's
# own directory is on sys.path. Put the parent directory on the path before
# the project imports.
# NOTE(review): assumes `scripts/` and `src/` live directly in the parent of
# the working directory -- confirm against the repository layout.
_PROJECT_ROOT = str(Path.cwd().resolve().parent)
if _PROJECT_ROOT not in sys.path:
    sys.path.insert(0, _PROJECT_ROOT)

from scripts.embed_documents import preprocess_and_embed_documents  # noqa: E402
from scripts.search_query import preprocess_query, perform_search  # noqa: E402
from src.evaluation import evaluate_precision_recall  # noqa: E402

# Step 1: Load the Data
# Read the three CSV inputs in a fixed order: documents, queries, ground truth.
documents_df, queries_df, ground_truth = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/documents.csv',
        '../data/queries.csv',
        '../data/ground_truth.csv',
    )
)

# Pull the raw text columns out as plain Python lists.
queries = queries_df['Query'].tolist()
documents = documents_df['Document'].tolist()

# Step 2: Preprocess and Embed Documents
print("Step 2: Preprocessing and Embedding Documents...")

# Preprocessing and embedding are delegated to the project helper.
# NOTE(review): the result is treated as a sized, indexable collection whose
# items expose `.shape` -- confirm against scripts.embed_documents.
document_embeddings = preprocess_and_embed_documents(documents)

print(f"Number of Document Embeddings: {len(document_embeddings)}")
# Indexing [0] raises when nothing was embedded (e.g. documents.csv has no
# rows), so only report a sample shape when at least one embedding exists.
# len() is used instead of truthiness because array-like objects may not
# support a plain boolean test.
if len(document_embeddings) > 0:
    print(f"Sample Document Embedding Shape: {document_embeddings[0].shape}")
else:
    print("Sample Document Embedding Shape: n/a (no documents were embedded)")

# Step 3: Search and Generate Query Embeddings
print("\nStep 3: Preprocessing Queries and Generating Embeddings...")

# Load the SBERT model used to embed the queries.
# NOTE(review): the similarity search is only meaningful if
# `document_embeddings` was produced with this same model -- verify against
# scripts.embed_documents.
model = SentenceTransformer('all-mpnet-base-v2')

# Maps each query number to whatever perform_search returns for that query.
results = {}

query_ids = queries_df['Query number']
for qid, raw_query in zip(query_ids, queries):
    # Clean the query text, then embed it as a tensor.
    cleaned_query = preprocess_query(raw_query)
    query_vector = model.encode(cleaned_query, convert_to_tensor=True)

    # Search the document embeddings, requesting the top 10 hits.
    results[qid] = perform_search(query_vector, document_embeddings, top_k=10)

# Step 4: Evaluate Precision and Recall
print("\nStep 4: Evaluating Precision and Recall...")

# Cut-off ranks passed to the evaluation helper.
k_values = [1, 3, 5, 10]
precision, recall = evaluate_precision_recall(results, ground_truth, k_values)

# Report both metrics, precision first.
for label, metric in (("Precision@k:", precision), ("Recall@k:", recall)):
    print(label, metric)

# Debugging Outputs
print("\nSample Results:")
# Show the search results for at most the first five queries, in insertion order.
for shown, (query_num, retrieved_docs) in enumerate(results.items()):
    if shown == 5:
        break
    print(f"Query {query_num}: {retrieved_docs}")


ModuleNotFoundError: No module named 'scripts'