In [1]:
import os
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the MiniLM-L6-v2 model
model = SentenceTransformer('all-MiniLM-L6-v2')

In [3]:
def encode_chunks(chunks):
    # Encode each chunk using the Sentence Transformer model
    chunk_embeddings = model.encode(chunks)
    return chunk_embeddings

In [4]:
def cosine_similarity_search(query_embedding, chunk_embeddings, chunk_texts):
    # Calculate cosine similarity between query embedding and chunk embeddings
    similarities = cosine_similarity([query_embedding], chunk_embeddings)[0]
    # Sort the similarities in descending order
    sorted_indices = np.argsort(similarities)[::-1]
    # Return sorted chunk texts, similarities, and corresponding file names
    sorted_chunk_texts = [chunk_texts[i] for i in sorted_indices]
    sorted_similarities = [similarities[i] for i in sorted_indices]
    return sorted_chunk_texts, sorted_similarities, sorted_indices

In [5]:
def search_engine(input_folder, query, num_chunks_per_file, top_k=5):
    # Initialize a list to store the similarity scores for each file
    file_similarities = []

    # Process each file in the input folder
    for filename in os.listdir(input_folder):
        input_file_path = os.path.join(input_folder, filename)
        if os.path.isfile(input_file_path):
            # Read file content
            with open(input_file_path, 'r', encoding='utf-8') as file:
                content = file.read()

            # Find the starting and ending index of the chunks
            start_index = content.find("[")
            end_index = content.rfind("]")

            # Extract the chunk content
            chunk_content = content[start_index + 1:end_index]

            # Split the chunk content into individual chunks
            chunks = [chunk.strip()[1:-1] for chunk in chunk_content.split(",")]

            # Encode the query
            query_embedding = model.encode([query])

            # Calculate cosine similarity for each chunk
            chunk_similarities = []
            for chunk_text in chunks[:num_chunks_per_file]:
                # Encode the chunk
                chunk_embedding = model.encode([chunk_text])
                # Calculate cosine similarity with the query embedding
                similarity = cosine_similarity(query_embedding, chunk_embedding)[0][0]
                chunk_similarities.append(similarity)

            # Average similarity for the specified number of chunks per file
            average_similarity = np.mean(chunk_similarities)

            # Append file name and average similarity to the list
            file_similarities.append((filename, average_similarity))

    # Sort the file similarities based on average similarity in descending order
    sorted_file_similarities = sorted(file_similarities, key=lambda x: x[1], reverse=True)

    # Print top-k results
    print(f"Search results for '{query}':")
    for i in range(min(top_k, len(sorted_file_similarities))):
        filename, average_similarity = sorted_file_similarities[i]
        print(f"File: {filename} - Average similarity: {average_similarity:.4f}")
        print()

In [6]:
# Example usage
input_folder = r"C:\Users\adykh\Desktop\subs_db\subtitles\subtitle_demo\overlapping_chunks"
query = 'The quick brown fox jumps over the lazy dog'  # Change this to your query

**Not Implemented Cleaning Function For Query, Just To See The Raw Results**

In [7]:
num_chunks_per_file = 5  # Number of chunks to take from each file
top_k = 10  # Number of top search results to display

In [8]:
search_engine(input_folder, query, num_chunks_per_file, top_k)

Search results for 'The quick brown fox jumps over the lazy dog':
File: a.fish.swimming.upside.down.(2020).eng.1cd_overlapping_chunk.txt - Average similarity: 0.2102

File: a.herbivorous.dragon.of.5000.years.gets.unfairly.villainized.s01.e01.please.eat.me.up.great.evil.dragon.(2022).eng.1cd_overlapping_chunk.txt - Average similarity: 0.1730

File: a.league.of.their.own.s01.e01.batter.up.(2022).eng.1cd_overlapping_chunk.txt - Average similarity: 0.1627

File: kingslayer.(2022).eng.1cd_overlapping_chunk.txt - Average similarity: 0.1616

File: a.fairly.odd.christmas.(2012).eng.1cd_overlapping_chunk.txt - Average similarity: 0.1363

File: a.e.i.o.u.a.quick.alphabet.of.love.(2022).eng.1cd_overlapping_chunk.txt - Average similarity: 0.1344

File: a.different.world.s06.e05.really.gross.anatomy.(1992).eng.1cd_overlapping_chunk.txt - Average similarity: 0.1250

File: a.million.little.things.s05.e01.the.last.dance.(2023).eng.1cd_overlapping_chunk.txt - Average similarity: 0.0919

File: abunai.de