In [7]:
import os
import numpy as np
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
from typing import List, Tuple

def get_embeddings(text: str, model: SentenceTransformer) -> np.ndarray:
    """Encode a single text with ``model`` and return what ``encode`` produces.

    The text is wrapped in a one-element list before being passed to
    ``model.encode`` (with the progress bar enabled), so the result is the
    encoder's output for a batch of size one.
    """
    single_item_batch = [text]
    embeddings = model.encode(single_item_batch, show_progress_bar=True)
    return embeddings

import re

def get_phrases(text: str) -> List[str]:
    """Split ``text`` into sentence-like phrases.

    Line breaks are treated as spaces. A phrase ends at a run of ``.``,
    ``!`` or ``?`` that is followed by whitespace or the end of the text,
    so punctuation embedded in a token (``3.5``, ``v1.2``, ``file.txt``)
    does not break a phrase apart. The terminating punctuation is dropped.

    Args:
        text: Raw text to split.

    Returns:
        The non-empty phrases, stripped of surrounding whitespace, in their
        original order. An empty or punctuation-only text yields ``[]``.
    """
    # Replace line breaks with spaces so wrapped sentences stay together.
    text = text.replace('\n', ' ')
    # Only split on terminators that actually end a sentence (followed by
    # whitespace or end of text); a bare '[.!?]' would split decimals.
    pieces = re.split(r'[.!?]+(?:\s+|$)', text)
    # Strip once per piece and drop the empty leftovers.
    stripped = (piece.strip() for piece in pieces)
    return [phrase for phrase in stripped if phrase]

def get_most_relevant_files(query: str, folder_path: str, model: SentenceTransformer) -> List[Tuple[str, List[Tuple[str, float]]]]:
    """Rank the phrases of every ``.txt`` file in a folder against a query.

    Args:
        query: Text the phrases are compared against.
        folder_path: Directory whose direct ``.txt`` entries are read as
            UTF-8 text.
        model: Sentence-embedding model used to encode both the query and
            the phrases.

    Returns:
        One ``(filename, ranked_phrases)`` tuple per file that yielded at
        least one phrase. ``ranked_phrases`` is a list of
        ``(phrase, similarity)`` pairs ordered from most to least similar
        to the query. Files appear in ``os.listdir`` order; files without
        any phrase are skipped.
    """
    query_embedding = get_embeddings(query, model)
    file_results = []

    for filename in os.listdir(folder_path):
        if not filename.endswith(".txt"):
            continue

        # Keep the file open only for the read; encoding below can be slow.
        with open(os.path.join(folder_path, filename), "r", encoding="utf-8") as f:
            content = f.read()

        phrases = get_phrases(content)
        if not phrases:
            # Nothing to embed (empty or punctuation-only file). Encoding an
            # empty list and comparing it would fail instead of yielding an
            # empty ranking, so skip the file rather than abort the scan.
            continue

        phrase_embeddings = model.encode(phrases, show_progress_bar=True)
        # Single query row -> take row 0 of the (1, n_phrases) matrix.
        scores = cosine_similarity(query_embedding, phrase_embeddings)[0]
        ranked_indices = np.argsort(scores)[::-1]
        ranked_phrases = [(phrases[i], scores[i]) for i in ranked_indices]
        file_results.append((filename, ranked_phrases))

    return file_results


def main():
    """Rank the ``.txt`` files in ``text_files`` against a fixed query.

    Loads the sentence-embedding model, scores every file's phrases against
    the query, and prints the five files whose best phrase is most similar,
    each with that best phrase and its similarity score.
    """
    # Load the model
    model_name = "sentence-transformers/paraphrase-distilroberta-base-v2"
    model = SentenceTransformer(model_name)

    # Define the query and folder path
    query = "Remote sensing for fertilizer management"
    folder_path = "text_files"

    # Get the most relevant files
    most_relevant_files = get_most_relevant_files(query, folder_path, model)

    def best_score(entry):
        # entry is (filename, ranked_phrases); ranked_phrases is sorted best
        # first. An entry with no phrases has no score, so rank it last
        # instead of raising IndexError.
        ranked_phrases = entry[1]
        return ranked_phrases[0][1] if ranked_phrases else float("-inf")

    # Sort the results by the highest similarity score in each document
    most_relevant_files.sort(key=best_score, reverse=True)

    # Print the top 5 results
    for filename, phrase_similarities in most_relevant_files[:5]:
        print(f"Filename: {filename}")
        for phrase, similarity in phrase_similarities[:1]:  # Print only the top relevant sentence
            print(f"  {phrase}: {similarity}")
        print()

# Run the search only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/28 [00:00<?, ?it/s]

Batches:   0%|          | 0/90 [00:00<?, ?it/s]