In [6]:
import os
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
class DocumentDataset(Dataset):
    """In-memory dataset of the ``.txt`` files found directly inside a folder.

    Each item is a ``(text, file_name)`` tuple, where ``text`` is the file
    content with leading/trailing whitespace stripped.

    Files are read in sorted file-name order. ``os.listdir`` returns entries
    in arbitrary order, so sorting is what makes the document order (and the
    row order of anything derived from it) reproducible between runs.
    """

    def __init__(self, folder_path):
        """Load every readable ``.txt`` file from ``folder_path``."""
        self.documents = []      # stripped text of each loaded file
        self.document_ids = []   # file name of each loaded file, same order
        self._load_documents(folder_path)

    def _load_documents(self, folder_path):
        """Fill ``documents`` and ``document_ids`` from ``folder_path``.

        A file that cannot be opened or decoded as UTF-8 is reported on
        stdout and skipped; loading continues with the next file.
        """
        for file_name in sorted(os.listdir(folder_path)):
            if not file_name.endswith(".txt"):
                continue
            file_path = os.path.join(folder_path, file_name)
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    text = f.read().strip()
            except Exception as e:
                print(f"Error loading file {file_name}: {e}")
                continue
            # Append to both lists together so they can never get out of step.
            self.documents.append(text)
            self.document_ids.append(file_name)

    def __len__(self):
        """Number of successfully loaded documents."""
        return len(self.documents)

    def __getitem__(self, idx):
        """Return ``(text, file_name)`` for the document at position ``idx``."""
        return self.documents[idx], self.document_ids[idx]

# Embedding Function
def embed_texts_in_batches(dataset, tokenizer, model, batch_size=32):
    """Embed every document in ``dataset`` using attention-masked mean pooling.

    Padding tokens are excluded from the average. With ``padding=True`` the
    shorter texts of a batch are padded to the longest one, and a plain
    ``mean(dim=1)`` would average those padding positions in, making a
    document's embedding depend on what else happens to be in its batch.

    Args:
        dataset: Dataset whose items are ``(text, doc_id)`` pairs.
        tokenizer: Callable invoked as
            ``tokenizer(texts, padding=True, truncation=True, return_tensors='pt')``;
            its result is unpacked into ``model(**inputs)``.
        model: Callable whose result exposes ``last_hidden_state``, indexed
            here as (batch, tokens, hidden).
        batch_size: Number of documents per forward pass.

    Returns:
        Tuple ``(embeddings, document_ids)``: a NumPy array with one row per
        document and the list of ids in the same order.

    Raises:
        ValueError: If ``dataset`` yields no documents.
    """
    # Identity collate: keep each batch as a plain list of (text, id) tuples.
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False, collate_fn=lambda x: x)
    all_embeddings = []
    document_ids = []
    with torch.no_grad():
        for batch in dataloader:
            texts, ids = zip(*batch)
            inputs = tokenizer(list(texts), padding=True, truncation=True, return_tensors='pt')
            outputs = model(**inputs)
            hidden = outputs.last_hidden_state
            if "attention_mask" in inputs:
                # Zero out padded positions, then divide by the real token count.
                mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
                token_counts = mask.sum(dim=1).clamp(min=1e-9)  # guard against all-pad rows
                embeddings = (hidden * mask).sum(dim=1) / token_counts
            else:
                # No mask available: fall back to the unweighted mean.
                embeddings = hidden.mean(dim=1)
            all_embeddings.append(embeddings)
            document_ids.extend(ids)
    if not all_embeddings:
        raise ValueError("Cannot embed an empty dataset: no documents were provided.")
    return torch.cat(all_embeddings, dim=0).numpy(), document_ids

# Where the raw documents live and where the embedding artefacts are cached
folder_path = "full_docs/"
embeddings_file = "document_embeddings.npy"
ids_file = "document_ids.npy"

# The cache is usable only when BOTH files exist; otherwise rebuild from scratch.
if not (os.path.exists(embeddings_file) and os.path.exists(ids_file)):
    print("Embeddings not found. Creating embeddings...")

    # Tokenizer and encoder used to embed the documents
    model_name = "sentence-transformers/all-MiniLM-L6-v2"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)

    # Read the documents and embed them batch by batch
    dataset = DocumentDataset(folder_path)
    batch_size = 64
    document_embeddings, document_ids = embed_texts_in_batches(
        dataset, tokenizer, model, batch_size
    )

    # Persist both artefacts so later runs can skip the embedding step
    np.save(embeddings_file, document_embeddings)
    np.save(ids_file, np.array(document_ids))
else:
    print("Loading existing embeddings and document IDs...")
    document_embeddings = np.load(embeddings_file)
    document_ids = np.load(ids_file)

print("Embeddings and document IDs are ready!")


Loading existing embeddings and document IDs...
Embeddings and document IDs are ready!


In [8]:
from sklearn.cluster import KMeans
import pickle

# Cache files for the clustering artefacts
labels_file = "cluster_labels.npy"
centroids_file = "cluster_centroids.npy"
mapping_file = "cluster_to_docs.pkl"

# k-means is only skipped when all three artefacts are already on disk.
if not all(os.path.exists(path) for path in (labels_file, centroids_file, mapping_file)):
    print("Clustering results not found. Performing clustering...")

    document_embeddings = np.load("document_embeddings.npy")

    # Number of clusters: square root of the document count, truncated to an int
    num_clusters = int(np.sqrt(len(document_embeddings)))

    # Fit k-means and keep both the per-document labels and the centroids
    kmeans = KMeans(n_clusters=num_clusters, random_state=42, verbose=1)
    cluster_labels = kmeans.fit_predict(document_embeddings)
    cluster_centroids = kmeans.cluster_centers_

    np.save(labels_file, cluster_labels)
    np.save(centroids_file, cluster_centroids)

    # Invert the label array into {cluster id: [document ids]}.
    # `document_ids` comes from the embedding cell and must already be defined.
    cluster_to_docs = {i: [] for i in range(num_clusters)}
    for doc_id, cluster_id in zip(document_ids, cluster_labels):
        cluster_to_docs[cluster_id].append(doc_id)

    with open(mapping_file, "wb") as f:
        pickle.dump(cluster_to_docs, f)

    print(f"Clustering complete! Number of clusters: {num_clusters}")
else:
    print("Loading existing clustering results...")
    cluster_labels = np.load(labels_file)
    cluster_centroids = np.load(centroids_file)
    with open(mapping_file, "rb") as f:
        cluster_to_docs = pickle.load(f)

print("Clustering data is ready!")


Loading existing clustering results...
Clustering data is ready!


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Embed the query using the embedding model
def embed_query(query, tokenizer, model):
    """Return the mean-pooled embedding of a single query string.

    The query is tokenized as a one-element batch, passed through ``model``
    without gradient tracking, and its ``last_hidden_state`` is averaged over
    dimension 1. The result is a NumPy array with one row.
    """
    encoded = tokenizer([query], padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        token_states = model(**encoded).last_hidden_state
        pooled = token_states.mean(dim=1)
        result = pooled.numpy()
    return result

#  Performing the search within clusters
def search_within_clusters(
    query_vector, cluster_to_docs, cluster_centroids, document_embeddings, document_ids, top_k_clusters=5, top_k_docs=10,
    doc_id_to_index=None,
):
    """Two-stage search: pick the closest clusters, then rank their documents.

    Args:
        query_vector: 2-D array holding the query embedding in row 0.
        cluster_to_docs: Mapping from cluster index to the list of document
            ids assigned to that cluster.
        cluster_centroids: 2-D array with one centroid per row.
        document_embeddings: 2-D array with one document embedding per row.
        document_ids: Sequence of document ids; position ``i`` names row
            ``i`` of ``document_embeddings``.
        top_k_clusters: Number of most similar clusters to search.
        top_k_docs: Number of documents to return.
        doc_id_to_index: Optional precomputed ``{doc_id: row}`` mapping.
            When omitted it is built from ``document_ids`` on every call;
            pass it in when issuing many queries against the same corpus.

    Returns:
        List of ``(doc_id, similarity, cluster_index)`` tuples, most similar
        first. Empty when a ``top_k_*`` value is not positive or the selected
        clusters contain no documents.
    """
    # Cosine similarity between the query vector and every cluster centroid
    cluster_similarities = cosine_similarity(query_vector, cluster_centroids)[0]

    # Reverse first, then truncate. `argsort(x)[-k:]` would return EVERY
    # index for k == 0 because `-0` is the start of the array.
    top_cluster_indices = np.argsort(cluster_similarities)[::-1][:max(top_k_clusters, 0)]

    if doc_id_to_index is None:
        doc_id_to_index = {doc_id: idx for idx, doc_id in enumerate(document_ids)}

    # Gather the documents of the selected clusters, remembering where each came from
    candidate_docs = []
    candidate_rows = []
    cluster_info = []
    for cluster_idx in top_cluster_indices:
        cluster_docs = cluster_to_docs[cluster_idx]
        candidate_docs.extend(cluster_docs)
        candidate_rows.extend(doc_id_to_index[doc_id] for doc_id in cluster_docs)
        cluster_info.extend([cluster_idx] * len(cluster_docs))

    # Nothing to rank: avoid handing an empty matrix to cosine_similarity.
    if not candidate_docs or top_k_docs <= 0:
        return []

    # Similarity between the query and each candidate document
    candidate_embeddings = np.asarray(document_embeddings)[candidate_rows]
    doc_similarities = cosine_similarity(query_vector, candidate_embeddings)[0]
    top_doc_indices = np.argsort(doc_similarities)[::-1][:top_k_docs]

    return [(candidate_docs[i], doc_similarities[i], cluster_info[i]) for i in top_doc_indices]

# Run the two-stage search for one example query and print the ranked hits.
query = "how long does it take to get your bsrn if you already have a bachelors degree"  # Example query

# Initialize the tokenizer and model.
# The first cell only creates these when the embedding cache is missing, so
# they are (re)created here; the checkpoint name is the same one used there.
model_name = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Load the artefacts written by the embedding and clustering cells.
cluster_centroids = np.load("cluster_centroids.npy")  
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load a cluster_to_docs.pkl that this notebook produced.
with open("cluster_to_docs.pkl", "rb") as f:
    cluster_to_docs = pickle.load(f)  
document_embeddings = np.load("document_embeddings.npy")
document_ids = np.load("document_ids.npy")

# Generate query vector
query_vector = embed_query(query, tokenizer, model)

# Perform the search within clusters: look in the 10 most similar clusters
# and keep the 10 most similar documents found there.
top_documents = search_within_clusters(
    query_vector,
    cluster_to_docs,
    cluster_centroids,
    document_embeddings,
    document_ids,
    top_k_clusters=10, 
    top_k_docs=10
)

# Display the results, most similar document first
print(f"Top documents for the query '{query}':")
for doc_id, similarity, cluster_id in top_documents:
    print(f"Document ID: {doc_id}, Cluster ID: {cluster_id}, Similarity: {similarity:.4f}")


Top documents for the query 'how long does it take to get your bsrn if you already have a bachelors degree':
Document ID: output_89888.txt, Cluster ID: 424, Similarity: 0.5980
Document ID: output_360137.txt, Cluster ID: 424, Similarity: 0.5771
Document ID: output_292295.txt, Cluster ID: 424, Similarity: 0.5562
Document ID: output_410331.txt, Cluster ID: 424, Similarity: 0.5516
Document ID: output_476462.txt, Cluster ID: 424, Similarity: 0.5509
Document ID: output_374546.txt, Cluster ID: 424, Similarity: 0.5366
Document ID: output_308294.txt, Cluster ID: 424, Similarity: 0.5294
Document ID: output_227058.txt, Cluster ID: 424, Similarity: 0.5261
Document ID: output_494208.txt, Cluster ID: 424, Similarity: 0.5234
Document ID: output_421813.txt, Cluster ID: 424, Similarity: 0.5233


In [13]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Load the queries. The file is named .csv but is parsed as tab-separated.
# (An earlier variant read 'dev_queries.tsv' with the same separator.)
queries_file = 'queries.csv'
queries_df = pd.read_csv(queries_file, sep='\t')

# Keep only the id and text columns; every row is processed (no sampling).
sampled_queries = queries_df[['Query number', 'Query']]

# One record per (query, retrieved document) pair
results_data = []

for row in sampled_queries.itertuples(index=False):
    query_number, query_text = row[0], row[1]

    # Embed the query, then search the 10 closest clusters for its 10 best documents
    query_vector = embed_query(query_text, tokenizer, model)
    top_documents = search_within_clusters(
        query_vector,
        cluster_to_docs,
        cluster_centroids,
        document_embeddings,
        document_ids,
        top_k_clusters=10,
        top_k_docs=10,
    )

    results_data.extend(
        {
            'Query_number': query_number,
            'Document_ID': doc_id,
            'Cluster_ID': cluster_id,
            'Similarity': similarity,
        }
        for doc_id, similarity, cluster_id in top_documents
    )

# Write all records to a CSV file.
# NOTE(review): the evaluation cell reads "top_10_results_per_query.csv", not
# this file - confirm which file name is intended.
output_file = 'result.csv'
results_df = pd.DataFrame(results_data)
results_df.to_csv(output_file, index=False)

print(f"Results saved to {output_file}")

Results saved to result.csv


In [19]:
import pandas as pd
import numpy as np

def precision_at_k(retrieved_docs, relevant_docs, k):
    """Precision@k: share of the top-``k`` retrieved documents that are relevant.

    Args:
        retrieved_docs: Ranked list of retrieved document ids, best first.
        relevant_docs: Collection of relevant document ids.
        k: Cut-off rank; must be a positive integer.

    Returns:
        ``hits / k`` as a float. The denominator is always ``k``, even when
        fewer than ``k`` documents were retrieved. A document id that occurs
        more than once in the top ``k`` is counted once.

    Raises:
        ValueError: If ``k`` is not positive. Without this check ``k == 0``
            fails with a bare ZeroDivisionError and a negative ``k`` silently
            produces a meaningless (negative) value.
    """
    if k <= 0:
        raise ValueError(f"k must be a positive integer, got {k}")
    retrieved_k = retrieved_docs[:k]
    relevant_retrieved = len(set(retrieved_k) & set(relevant_docs))
    return relevant_retrieved / k

def recall_at_k(retrieved_docs, relevant_docs, k):
    """Recall@k: share of the relevant documents found in the top ``k``.

    Args:
        retrieved_docs: Ranked list of retrieved document ids, best first.
        relevant_docs: Collection of relevant document ids; duplicates are
            ignored.
        k: Cut-off rank; must not be negative.

    Returns:
        ``hits / number of distinct relevant documents`` as a float, or
        ``0.0`` when there are no relevant documents.

    Raises:
        ValueError: If ``k`` is negative (a negative slice bound would
            silently drop results from the end of the ranking instead).
    """
    if k < 0:
        raise ValueError(f"k must not be negative, got {k}")
    # De-duplicate once and use the same set for numerator and denominator.
    # Dividing by len(relevant_docs) would deflate recall on repeated ids.
    relevant_set = set(relevant_docs)
    if not relevant_set:
        return 0.0
    relevant_retrieved = len(set(retrieved_docs[:k]) & relevant_set)
    return relevant_retrieved / len(relevant_set)

def evaluate_precision_recall_at_k(all_retrieved_docs, all_relevant_docs, k_values=[1, 3, 5, 10]):
    """Average Precision@k and Recall@k over a set of queries.

    Args:
        all_retrieved_docs: One ranked list of retrieved ids per query.
        all_relevant_docs: One collection of relevant ids per query, aligned
            position by position with ``all_retrieved_docs``.
        k_values: Cut-off ranks to evaluate. The default list is only read,
            never modified.

    Returns:
        Tuple ``(mean_precision, mean_recall)`` of dicts keyed by ``k``.
        With zero queries every mean is ``0.0``.

    Raises:
        ValueError: If the two inputs do not have the same length. ``zip``
            would otherwise drop the surplus entries while the mean was still
            divided by ``len(all_retrieved_docs)``.
    """
    num_queries = len(all_retrieved_docs)
    if num_queries != len(all_relevant_docs):
        raise ValueError(
            "all_retrieved_docs and all_relevant_docs must have the same length "
            f"({num_queries} != {len(all_relevant_docs)})"
        )

    mean_precision = {k: 0.0 for k in k_values}
    mean_recall = {k: 0.0 for k in k_values}

    # No queries: return the zero-filled dicts instead of dividing by zero.
    if num_queries == 0:
        return mean_precision, mean_recall

    for retrieved_docs, relevant_docs in zip(all_retrieved_docs, all_relevant_docs):
        for k in k_values:
            mean_precision[k] += precision_at_k(retrieved_docs, relevant_docs, k)
            mean_recall[k] += recall_at_k(retrieved_docs, relevant_docs, k)

    # Turn the accumulated sums into means
    for k in k_values:
        mean_precision[k] /= num_queries
        mean_recall[k] /= num_queries

    return mean_precision, mean_recall

# Input files: retrieved results and the relevance judgements
retrieved_docs_file = "top_10_results_per_query.csv"
relevant_docs_file = "dev_query_results.csv"

retrieved_df = pd.read_csv(retrieved_docs_file)
relevant_df = pd.read_csv(relevant_docs_file)

# Reduce each retrieved id to the part between the last '_' and the first
# following '.', e.g. "output_89888.txt" -> "89888"
retrieved_df['Document_ID'] = retrieved_df['Document_ID'].apply(
    lambda name: name.rsplit('_', 1)[-1].split('.', 1)[0]
)

# Relevant ids are compared as strings, like the normalized retrieved ids
relevant_df['doc_number'] = relevant_df['doc_number'].astype(str)

# One list of document ids per query
retrieved_grouped = retrieved_df.groupby('Query_number')['Document_ID'].apply(list)
relevant_grouped = relevant_df.groupby('Query_number')['doc_number'].apply(list)

# Keep only queries that have both retrieved results and relevance judgements
shared_queries = [q for q in retrieved_grouped.index if q in relevant_grouped]
all_retrieved_docs = [retrieved_grouped[q] for q in shared_queries]
all_relevant_docs = [relevant_grouped[q] for q in shared_queries]

# Mean Precision@k and Recall@k over the aligned queries
k_values = [1, 3, 5, 10]
mean_precision, mean_recall = evaluate_precision_recall_at_k(
    all_retrieved_docs, all_relevant_docs, k_values
)

# Report one line per cut-off
print("Evaluation Results:")
for k in k_values:
    print(f"Precision@{k}: {mean_precision[k]:.4f}, Recall@{k}: {mean_recall[k]:.4f}")


Evaluation Results:
Precision@1: 0.4760, Recall@1: 0.0315
Precision@3: 0.3590, Recall@3: 0.0710
Precision@5: 0.3010, Recall@5: 0.0994
Precision@10: 0.2146, Recall@10: 0.1403
