In [1]:
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
#Reading all texts files into notebook

import os

folder_path = '/content/drive/My Drive/small_docs/'

# Reading all documents
documents = []
file_names = []

for file_name in sorted(os.listdir(folder_path)):
    if file_name.endswith('.txt'):
        file_names.append(file_name)
        with open(os.path.join(folder_path, file_name), 'r', encoding='utf-8') as f:
            documents.append(f.read())


In [3]:
from transformers import AutoTokenizer, AutoModel
import torch

# Correcting model name for Sentence-BERT
model_name = 'sentence-transformers/all-MiniLM-L6-v2'

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

In [4]:
from torch.utils.data import DataLoader

def embed_texts_in_batches(texts, tokenizer, model, batch_size=32):
    dataloader = DataLoader(texts, batch_size=batch_size, shuffle=False)
    all_embeddings = []
    with torch.no_grad():
        for batch in dataloader:
            inputs = tokenizer(batch, padding=True, truncation=True, return_tensors='pt')
            outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1)
            all_embeddings.append(embeddings)
    return torch.cat(all_embeddings, dim=0)

batch_size = 16
document_embeddings = embed_texts_in_batches(documents, tokenizer, model, batch_size=batch_size)


In [5]:
import numpy as np

# Converting to NumPy and save
document_embeddings_np = document_embeddings.numpy()
np.save('document_embeddings.npy', document_embeddings_np)

# Saveing file names to match embeddings with documents
with open('/content/drive/My Drive/small_docs/file_names.txt', 'w') as f:
    for name in file_names:
        f.write(name + '\n')


In [6]:
#printing embeddings for the first file
first_document = documents[0]
inputs = tokenizer(first_document, padding=True, truncation=True, return_tensors='pt')

with torch.no_grad():
    embeddings = model(**inputs).last_hidden_state.mean(dim=1)

print("Embeddings for the first document:")
print(embeddings)


Embeddings for the first document:
tensor([[-8.4179e-02, -8.2721e-02, -5.2317e-02, -3.5017e-02, -1.7434e-01,
         -1.4647e-01,  5.5992e-02, -2.0399e-02, -1.8544e-01,  1.2042e-01,
          1.2232e-01,  9.0499e-02,  1.1968e-01, -1.7686e-01, -1.8404e-01,
          1.3483e-03, -3.9279e-01,  1.4928e-01, -1.7530e-01, -1.9189e-01,
          2.0677e-02,  2.1093e-01, -1.8323e-02, -2.0375e-01,  9.7382e-02,
          1.8305e-01,  7.6555e-02,  1.2846e-01, -2.5810e-01, -2.1809e-01,
         -1.0664e-01,  1.6551e-01,  8.2901e-02,  8.9456e-03,  7.3162e-02,
         -1.1301e-01,  1.8325e-02, -4.1430e-02,  8.7429e-02,  2.0378e-01,
          1.0946e-01, -1.0846e-01,  1.2455e-01, -1.4075e-01,  1.7200e-01,
         -2.2291e-01, -2.1281e-01, -1.4772e-01,  1.0915e-01, -1.4039e-01,
         -8.8324e-02, -2.4602e-03, -1.4982e-01,  1.9547e-01, -7.8984e-02,
         -3.8072e-01,  9.7403e-02, -7.8756e-02, -1.3045e-02,  1.0633e-01,
         -9.2485e-02,  6.2383e-02, -8.1385e-02, -1.4009e-01, -3.2045e-02,
   

In [24]:
from sklearn.metrics.pairwise import cosine_similarity
# Define embed_texts function
def embed_texts(texts, tokenizer, model):
    # Tokenize and encode
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
    # Generate embeddings using the model
    with torch.no_grad():
        embeddings = model(**inputs).last_hidden_state.mean(dim=1)
    return embeddings
# Extract titles from documents
titles = file_names

def search(query, document_embeddings, documents, titles, tokenizer, model, top_k=10):
    query_embedding = embed_texts([query], tokenizer, model).numpy()

    similarities = cosine_similarity(query_embedding, document_embeddings)

    sorted_indices = similarities.argsort()[0][::-1][:top_k]

    results = [(titles[idx], documents[idx], similarities[0][idx]) for idx in sorted_indices]
    return results

query = "what are the products and by products of photosynthesis?"
results = search(query, document_embeddings_np, documents, titles, tokenizer, model)

print("Top documents for:" + query)
for title, doc, score in results:
    print(f"Score: {score:.4f} {title}")



Top documents for:what are the products and by products of photosynthesis?
Score: 0.5919 output_466.txt
Score: 0.4934 output_407.txt
Score: 0.4357 output_1248.txt
Score: 0.4357 output_90.txt
Score: 0.3451 output_1198.txt
Score: 0.3270 output_481.txt
Score: 0.3086 output_263.txt
Score: 0.2914 output_776.txt
Score: 0.2863 output_505.txt
Score: 0.2712 output_558.txt


In [33]:
import pandas as pd

# Loading queries from the CSV file
queries_file = '/content/drive/My Drive/dev_small_queries.csv'
queries_df = pd.read_csv(queries_file)

# Extracting the Query_number and Query
queries = queries_df[['Query number', 'Query']].values

results_data = []

# Processing each query
for query_number, query_text in queries:
    search_results = search(query_text, document_embeddings_np, documents, titles, tokenizer, model, top_k=10)


    for doc_number, _, _ in search_results:
        results_data.append({
            'Query_number': query_number,
            'Document_number': doc_number
        })

# Saving the results to a CSV file
output_file = '/content/drive/My Drive/small_query_results.csv'
results_df = pd.DataFrame(results_data)
results_df.to_csv(output_file, index=False)

print(f"Results saved to {output_file}")


Results saved to /content/drive/My Drive/small_query_results.csv


In [30]:
import pandas as pd

# Loading the retrieved and relevant document files
retrieved_df = pd.read_csv('/content/drive/My Drive/small_query_results.csv')
relevant_df = pd.read_csv('/content/drive/My Drive/dev_query_results_small.csv')

# Creating a dictionary mapping query_number to the relevant document
relevant_docs_map = relevant_df.set_index('Query_number')['doc_number'].to_dict()

results = []

# Process each unique query
for query_number in retrieved_df['Query_number'].unique():
    top_retrieved_docs = retrieved_df[retrieved_df['Query_number'] == query_number]['Document_number'].tolist()

    relevant_doc = f"output_{relevant_docs_map[query_number]}.txt"

    for k in [1, 3, 5, 10]:
        retrieved_at_k = top_retrieved_docs[:k]
        retrieved_relevant = int(relevant_doc in retrieved_at_k)

        precision = retrieved_relevant / k
        recall = retrieved_relevant / 1

        results.append({
            'Query_number': query_number,
            'k': k,
            'Precision@k': precision,
            'Recall@k': recall
        })

results_df = pd.DataFrame(results)

results_df.to_csv('/content/drive/My Drive/precision_recall_results.csv', index=False)

mean_metrics = results_df.groupby('k')[['Precision@k', 'Recall@k']].mean()
print(mean_metrics)


    Precision@k  Recall@k
k                        
1      0.516129  0.516129
3      0.216398  0.649194
5      0.143548  0.717742
10     0.077016  0.770161
