In [1]:
# Import required modules
from scripts.embed_documents import preprocess_and_embed_documents
from scripts.search_query import preprocess_and_search_query
from src.evaluation import evaluate_precision_recall
from sentence_transformers import SentenceTransformer
from scipy.spatial.distance import cosine
from collections import defaultdict
import numpy as np
import pandas as pd
import os

ModuleNotFoundError: No module named 'scripts.embed_documents'

In [None]:

# 1. EMBEDDINGS
# Load and embed documents
def load_texts_from_folder(folder_path):
    """Read every ``.txt`` file directly inside a folder and return the contents.

    Files are visited in sorted filename order so that the position of each
    text in the returned list is the same on every run and on every platform.
    ``os.listdir`` on its own yields entries in arbitrary order, which would
    make list positions (and anything stored by position, such as a cached
    embedding matrix) unstable.

    Args:
        folder_path: Directory to scan. Sub-directories are not traversed.

    Returns:
        list[str]: One string per ``.txt`` file, decoded as UTF-8 and ordered
        by filename. Empty when the folder has no ``.txt`` files.
    """
    texts = []
    # sorted() pins a deterministic document order; see docstring.
    for filename in sorted(os.listdir(folder_path)):
        if filename.endswith(".txt"):
            with open(os.path.join(folder_path, filename), "r", encoding="utf-8") as file:
                texts.append(file.read())
    return texts

class EmbeddingModel:
    """Wraps a SentenceTransformer and adds helpers to persist embeddings with NumPy."""

    def __init__(self, model_name='all-mpnet-base-v2'):
        """Instantiate the SentenceTransformer identified by ``model_name``."""
        self.model = SentenceTransformer(model_name)

    def generate_embeddings(self, texts):
        """Encode ``texts`` with the wrapped model and return the result as a NumPy array."""
        encoded = self.model.encode(texts, convert_to_numpy=True)
        return np.array(encoded)

    def save_embeddings(self, embeddings, filepath):
        """Write ``embeddings`` to ``filepath`` via ``np.save``."""
        np.save(filepath, embeddings)

    def load_embeddings(self, filepath):
        """Return the array stored at ``filepath``, read via ``np.load``."""
        stored = np.load(filepath)
        return stored

# Load documents and generate embeddings
data_folder = "../data/full_docs_small"
embeddings_path = "../data/full_docs_embeddings.npy"  # single place that names the cache file
documents = load_texts_from_folder(data_folder)
embedding_model = EmbeddingModel()

# Reuse the cached embeddings when the file exists, but only if it still has
# one row per loaded document. A cache built from a different document set
# would otherwise be used silently and every result index would be misaligned.
document_embeddings = None
if os.path.exists(embeddings_path):
    document_embeddings = embedding_model.load_embeddings(embeddings_path)
    if len(document_embeddings) != len(documents):
        print(
            f"Cached embeddings have {len(document_embeddings)} rows but "
            f"{len(documents)} documents were loaded; regenerating."
        )
        document_embeddings = None

# No usable cache: encode the documents and store the result for the next run.
if document_embeddings is None:
    document_embeddings = embedding_model.generate_embeddings(documents)
    embedding_model.save_embeddings(document_embeddings, embeddings_path)

print(f"Document embeddings shape: {document_embeddings.shape}")

In [None]:

# 2. SEARCHING
class SearchEngine:
    """Ranks stored document embeddings against a query embedding by cosine similarity."""

    def __init__(self, document_embeddings):
        """Keep a reference to the document embeddings that ``search`` iterates over."""
        self.document_embeddings = document_embeddings

    def search(self, query_embedding, top_k=10):
        """Return the ``top_k`` best-matching documents for ``query_embedding``.

        Each result is an ``(index, similarity)`` pair, where ``index`` is the
        document's position in ``self.document_embeddings`` and ``similarity``
        is one minus the SciPy cosine distance. Results are ordered from most
        to least similar.
        """
        scored = []
        for position, doc_vector in enumerate(self.document_embeddings):
            scored.append((position, 1 - cosine(query_embedding, doc_vector)))
        # Stable in-place sort, highest similarity first.
        scored.sort(key=lambda pair: pair[1], reverse=True)
        return scored[:top_k]

# Read the query table and pull the query texts out as a plain list
queries_path = "../data/dev_small_queries - dev_small_queries.csv"
queries_df = pd.read_csv(queries_path)
queries = queries_df['Query'].tolist()

# Smoke test: build the engine, then embed and search the first query only
search_engine = SearchEngine(document_embeddings)
query_embedding = embedding_model.generate_embeddings([queries[0]])[0]
results = search_engine.search(query_embedding)

print("Query:", queries[0])
print("Top Results:", results)


In [None]:
# 3. EVALUATION
def evaluate_precision_recall(results, ground_truth, k_values):
    """Compute mean precision@k and recall@k over all queries in ``results``.

    Args:
        results: Mapping of query id -> ranked list of retrieved items, where
            the first element of each item is the document identifier.
        ground_truth: Mapping of query id -> iterable of relevant document
            identifiers. A query id that is absent is treated as having no
            relevant documents; the mapping is never modified (a
            ``defaultdict`` does not gain new keys).
        k_values: Iterable of cutoffs ``k``.

    Returns:
        tuple[dict, dict]: ``(precision_at_k, recall_at_k)``, each keyed by k
        and holding the average over all queries. When ``results`` is empty
        every value is ``0.0`` instead of raising ``ZeroDivisionError``.
    """
    # Build each query's relevant set once rather than once per k.
    # .get() avoids both KeyError on a plain dict and key insertion on a defaultdict.
    relevant_by_query = {
        query_id: set(ground_truth.get(query_id, ()))
        for query_id in results
    }
    num_queries = len(results)

    precision_at_k = {}
    recall_at_k = {}

    for k in k_values:
        precision_total = 0.0
        recall_total = 0.0
        for query_id, retrieved_docs in results.items():
            relevant_docs = relevant_by_query[query_id]
            retrieved_at_k = {doc[0] for doc in retrieved_docs[:k]}  # Top-k retrieved docs
            hits = len(retrieved_at_k & relevant_docs)

            # max(1, ...) keeps both ratios defined for k == 0 and for
            # queries without any relevant documents.
            precision_total += hits / max(1, k)
            recall_total += hits / max(1, len(relevant_docs))

        # Average across queries; an empty result set scores 0.0.
        precision_at_k[k] = precision_total / num_queries if num_queries else 0.0
        recall_at_k[k] = recall_total / num_queries if num_queries else 0.0

    return precision_at_k, recall_at_k

In [None]:
# Start from an empty mapping of query number -> search results
results = {}

# Debugging: summarise the query table before searching
print(f"Queries DataFrame shape: {queries_df.shape}")
print(f"Number of queries: {len(queries_df)}")
print(f"Sample Query Numbers: {queries_df['Query number'].head(5)}")
print(f"Sample Queries: {queries_df['Query'].head(5)}")

# Run every query through the imported helper and collect its output.
# NOTE(review): preprocess_and_search_query is imported from scripts.search_query
# in the first cell, whose recorded output is a ModuleNotFoundError; confirm
# the helper is importable before relying on this cell.
for query_number, query_text in zip(queries_df['Query number'], queries_df['Query']):
    print(f"Query Number: {query_number}, Query Text: {query_text}")

    # The helper receives the raw query text together with the document embeddings
    search_results = preprocess_and_search_query(query_text, document_embeddings)
    print(f"Search Results: {search_results[:5]}")

    # Key the helper's output by query number
    results[query_number] = search_results

# Debugging: show the first few (query number, results) pairs
print(f"Sample results: {list(results.items())[:5]}")

In [None]:
# Map each query number to the list of document numbers recorded for it.
# The two columns are iterated directly instead of via DataFrame.iterrows():
# iterrows builds a Series per row and does not preserve column dtypes, and the
# old loop also bound `_`, which IPython uses for the last cell output.
# NOTE(review): ground_truth_df is not defined in the cells visible here;
# confirm it is loaded before this cell runs.
# NOTE(review): evaluate_precision_recall compares these doc_number values with
# the first element of each search result; confirm both use the same IDs.
ground_truth = defaultdict(list)
for truth_query_number, truth_doc_number in zip(
    ground_truth_df['Query_number'], ground_truth_df['doc_number']
):
    ground_truth[truth_query_number].append(truth_doc_number)

In [None]:
# Cutoffs k at which precision@k and recall@k are reported
k_values = [1, 3, 5, 10]

# Evaluate Precision and Recall
# `results` (query number -> search results) and `ground_truth`
# (query number -> relevant doc numbers) are built in earlier cells.
print("Step 3: Evaluating Precision and Recall...")
precision, recall = evaluate_precision_recall(results, ground_truth, k_values)

# With the evaluate_precision_recall defined in this notebook, each value is a
# dict keyed by k holding the average over all queries in `results`.
print("Precision@k:", precision)
print("Recall@k:", recall)

In [None]:
# Start from an empty mapping of query number -> ranked search results
results = {}

# Debugging: summarise the query data before searching
print(f"Queries DataFrame shape: {queries_df.shape}")
print(f"Number of queries: {len(queries)}")
print(f"Sample Query Numbers: {queries_df['Query number'].head(5)}")
print(f"Sample Queries: {queries[:5]}")

# Embed and search each query in turn, keyed by its query number
for query_number, query_text in zip(queries_df['Query number'], queries):
    print(f"Query Number: {query_number}, Query Text: {query_text}")

    # generate_embeddings takes a list, so wrap the single query and unpack
    # the single row that comes back
    (query_embedding,) = embedding_model.generate_embeddings([query_text])

    # Rank the documents against this query
    search_results = search_engine.search(query_embedding)
    print(f"Search Results: {search_results}")

    # Store the ranking under the query's number
    results[query_number] = search_results

# Debugging: show the first few (query number, results) pairs
print(f"Sample results: {list(results.items())[:5]}")