In [None]:
# Import required modules
from sentence_transformers import SentenceTransformer
from scipy.spatial.distance import cosine
import numpy as np
import pandas as pd
import os

# 1. EMBEDDINGS
# Load and embed documents
def load_texts_from_folder(folder_path):
    """Read every ``.txt`` file in a folder and return the file contents.

    Files are visited in sorted filename order so the position of each text in
    the returned list is the same on every run and platform. ``os.listdir``
    on its own returns entries in arbitrary order, which would make positional
    indices into the result unstable.

    Args:
        folder_path: Directory to scan (sub-directories are not descended).

    Returns:
        A list holding the UTF-8 decoded content of each ``.txt`` file, in
        sorted filename order. Empty when the folder has no ``.txt`` files.
    """
    texts = []
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".txt"):
            continue
        with open(os.path.join(folder_path, filename), "r", encoding="utf-8") as file:
            texts.append(file.read())
    return texts

class EmbeddingModel:
    """Wrapper around a SentenceTransformer with NumPy save/load helpers."""

    def __init__(self, model_name='all-mpnet-base-v2'):
        # Instantiate the sentence-transformers model identified by model_name.
        self.model = SentenceTransformer(model_name)

    def generate_embeddings(self, texts):
        """Encode texts with the wrapped model and return a NumPy array."""
        encoded = self.model.encode(texts, convert_to_numpy=True)
        return np.array(encoded)

    def save_embeddings(self, embeddings, filepath):
        """Write embeddings to filepath using numpy.save."""
        np.save(filepath, embeddings)

    def load_embeddings(self, filepath):
        """Return the array stored at filepath, read with numpy.load."""
        stored = np.load(filepath)
        return stored

# Read the documents and obtain their embeddings, reusing the cached .npy file
# when one is already on disk.
data_folder = "data/full_docs_small"
documents = load_texts_from_folder(data_folder)
embedding_model = EmbeddingModel()

# NOTE(review): the cache is only checked for existence; it is not validated
# against the current contents of data_folder.
if os.path.exists("data/full_docs_embeddings.npy"):
    # Cache hit: read the stored matrix instead of re-encoding the documents.
    document_embeddings = embedding_model.load_embeddings("data/full_docs_embeddings.npy")
else:
    # Cache miss: encode the documents once and persist the result.
    document_embeddings = embedding_model.generate_embeddings(documents)
    embedding_model.save_embeddings(document_embeddings, "data/full_docs_embeddings.npy")

print(f"Document embeddings shape: {document_embeddings.shape}")

# 2. SEARCHING
class SearchEngine:
    """Exhaustive cosine-similarity ranking over a set of document embeddings."""

    def __init__(self, document_embeddings):
        self.document_embeddings = document_embeddings

    def search(self, query_embedding, top_k=10):
        """Rank the stored documents by cosine similarity to the query.

        Returns the first ``top_k`` ``(document_index, similarity)`` pairs,
        highest similarity first; equal similarities keep index order.
        """
        scores = []
        for doc_vector in self.document_embeddings:
            # scipy's cosine() is a distance, so similarity is 1 - distance.
            scores.append(1 - cosine(query_embedding, doc_vector))

        # Stable sort of the indices by score, best first.
        order = sorted(range(len(scores)), key=scores.__getitem__, reverse=True)
        return [(index, scores[index]) for index in order[:top_k]]

# Read the query strings from the queries CSV
queries_path = "data/dev_small_queries.csv"
queries_df = pd.read_csv(queries_path)
queries = queries_df["query"].tolist()

# Smoke test: rank the documents against the first query only
search_engine = SearchEngine(document_embeddings)
query_embedding = embedding_model.generate_embeddings([queries[0]])[0]
results = search_engine.search(query_embedding)

print("Query:", queries[0])
print("Top Results:", results)

# 3. EVALUATION
def evaluate_precision_recall(results, ground_truth, k_values):
    """Compute mean precision@k and recall@k over a set of queries.

    Args:
        results: Mapping of query id to its ranked retrieval list. Each entry
            of the list is a sequence whose first element is the document id
            (e.g. ``(doc_id, score)``).
        ground_truth: Mapping of query id to an iterable of relevant document
            ids. Ids are matched by equality, so they must use the same type
            as the document ids in ``results``.
        k_values: Iterable of positive rank cutoffs.

    Returns:
        A ``(precision_at_k, recall_at_k)`` pair of dicts keyed by ``k``, each
        holding the metric averaged over all queries in ``results``. A query
        without relevant documents contributes a recall of 0.0, and both
        averages are 0.0 when ``results`` is empty.

    Raises:
        KeyError: If a query id in ``results`` is missing from ``ground_truth``.
    """
    # Build each query's relevant set once instead of once per cutoff.
    relevant_by_query = {
        query_id: set(ground_truth[query_id]) for query_id in results
    }
    query_count = len(results)

    precision_at_k = {}
    recall_at_k = {}

    for k in k_values:
        precision_total = 0.0
        recall_total = 0.0
        for query_id, retrieved_docs in results.items():
            relevant_docs = relevant_by_query[query_id]
            hits = len({doc[0] for doc in retrieved_docs[:k]} & relevant_docs)

            # Precision divides by k even when fewer than k docs were retrieved.
            precision_total += hits / k
            # A query with no relevant documents would be 0/0; count it as 0.0.
            if relevant_docs:
                recall_total += hits / len(relevant_docs)

        # With no queries at all, report 0.0 instead of dividing by zero.
        precision_at_k[k] = precision_total / query_count if query_count else 0.0
        recall_at_k[k] = recall_total / query_count if query_count else 0.0

    return precision_at_k, recall_at_k

# Load ground truth relevance data
ground_truth_path = "data/dev_query_results_small.csv"
ground_truth_df = pd.read_csv(ground_truth_path)
# Strip whitespace and drop empty entries so "1, 2" and "1,2," parse alike.
ground_truth = {
    row['query_id']: [
        doc_id.strip() for doc_id in row['relevant_docs'].split(",") if doc_id.strip()
    ]
    for _, row in ground_truth_df.iterrows()
}

# Evaluate search results
k_values = [1, 3, 5, 10]

# evaluate_precision_recall() iterates `results.items()`, i.e. it expects a
# {query_id: ranked list} mapping. `results` from the smoke test above is the
# ranked list of a single query, so passing it directly raises AttributeError.
# Run the search for every query that has ground truth and key the rankings by
# query id instead.
# NOTE(review): assumes the queries CSV has a `query_id` column using the same
# ids as the ground-truth file — confirm against the data.
# NOTE(review): search() yields integer row indices while the ground-truth ids
# parsed above are strings; they only match once an index-to-document-id
# mapping is applied — TODO confirm how document ids relate to embedding rows.
query_embeddings = embedding_model.generate_embeddings(queries)
results_by_query = {
    query_id: search_engine.search(embedding, top_k=max(k_values))
    for query_id, embedding in zip(queries_df['query_id'], query_embeddings)
    if query_id in ground_truth
}
precision, recall = evaluate_precision_recall(results_by_query, ground_truth, k_values)

print("Precision@k:", precision)
print("Recall@k:", recall)

: 