# Imports

In [8]:
# Third-Party Imports
import spacy
import nltk
import torch
from sentence_transformers import SentenceTransformer, util

# Standard Library Imports
import os
import sys
from math import inf

# Local Imports
from queries import get_text_cli
from get_documents import search

# Utility Functions

In [19]:
def get_text():
    """Ask the user for a search term and return the search result for it.

    The term is obtained from ``get_text_cli`` using the prompt
    'Enter a search term' and handed unchanged to ``search``; whatever
    ``search`` returns is returned to the caller as-is.
    """
    return search(get_text_cli('Enter a search term'))

def load_docs(dirname='corpus'):
    """Read every regular file in a directory into a dict keyed by file name.

    Args:
        dirname: Path of the directory to read. A relative path is resolved
            against the current working directory. (The previous
            ``os.path.dirname('__file__')`` prefix was computed from a string
            literal, so it was always empty; it has been dropped without
            changing which directory is read.)

    Returns:
        A dict mapping each file's name (not its full path) to the file's
        entire text content. Entries are inserted in sorted file-name order.

    Raises:
        OSError: If the directory cannot be listed or a file cannot be read.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    corpus = {}

    # os.listdir() returns entries in arbitrary order; sorting makes the
    # resulting dict order reproducible across runs and platforms.
    for name in sorted(os.listdir(dirname)):
        path = os.path.join(dirname, name)

        # Skip sub-directories and other non-regular entries (for example a
        # notebook's ".ipynb_checkpoints" folder), which open() cannot read.
        if not os.path.isfile(path):
            continue

        # Explicit encoding so decoding does not depend on the platform locale.
        with open(path, 'r', encoding='utf-8') as f:
            corpus[name] = f.read()

    return corpus

def chunk_text(text, chunk_len):
    """Group the sentences of ``text`` into chunks under a token budget.

    Sentences are found with ``nltk.sent_tokenize`` and measured with
    ``nltk.word_tokenize``. Consecutive sentences are packed into the same
    chunk for as long as the chunk's total token count stays below
    ``chunk_len``. Sentences are never split, so a single sentence whose
    token count is already ``chunk_len`` or more becomes a chunk on its own.

    Args:
        text: The text to split.
        chunk_len: Token budget; a chunk is closed as soon as adding the next
            sentence would bring it to ``chunk_len`` tokens or more.

    Returns:
        A list of chunk strings, each made of whole sentences joined by a
        single space, in their original order. No empty chunks are produced,
        so the list is empty when ``text`` contains no sentences.
    """
    chunks = []
    current_sents = []
    current_len = 0

    for sent in nltk.sent_tokenize(text):
        # Count each sentence once and keep a running total instead of
        # re-tokenizing the whole accumulated chunk on every iteration.
        sent_len = len(nltk.word_tokenize(sent))

        # Close the current chunk when this sentence would reach the budget.
        # The "current_sents" guard prevents emitting an empty chunk when the
        # very first sentence of a chunk is itself over the budget.
        if current_sents and current_len + sent_len >= chunk_len:
            chunks.append(" ".join(current_sents))
            current_sents = []
            current_len = 0

        # Always keep the sentence: it either extends the current chunk or
        # starts the next one. (Previously the sentence that triggered a
        # flush was discarded, silently losing text.)
        current_sents.append(sent)
        current_len += sent_len

    if current_sents:
        chunks.append(" ".join(current_sents))

    return chunks

# Name of the model used when the caller does not supply one.
_DEFAULT_MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'

# Holds the default model after its first construction so that repeated calls
# without an explicit ``model`` do not reload it every time.
_default_model_cache = {}


def _get_default_model():
    """Return the default SentenceTransformer, constructing it on first use."""
    if _DEFAULT_MODEL_NAME not in _default_model_cache:
        _default_model_cache[_DEFAULT_MODEL_NAME] = SentenceTransformer(
            _DEFAULT_MODEL_NAME
        )
    return _default_model_cache[_DEFAULT_MODEL_NAME]


def cosine_similarity(text_1, text_2, model=None):
    """Return the cosine similarity between the embeddings of two texts.

    Args:
        text_1: First text to embed.
        text_2: Second text to embed.
        model: Object with an ``encode(text, convert_to_tensor=True)`` method
            used to embed both texts. When ``None``, the
            'sentence-transformers/all-MiniLM-L6-v2' SentenceTransformer is
            used; it is constructed once and reused by later calls.

    Returns:
        The similarity as a Python ``float``.

    Note:
        The result of ``util.pytorch_cos_sim`` is converted with ``float()``,
        which requires a single-element tensor, so each argument is expected
        to be one text rather than a batch.
    """
    # Compare against None explicitly so a caller-supplied model is never
    # replaced just because it happens to evaluate as falsy.
    if model is None:
        model = _get_default_model()

    embedding_1 = model.encode(text_1, convert_to_tensor=True)
    embedding_2 = model.encode(text_2, convert_to_tensor=True)

    return float(util.pytorch_cos_sim(embedding_1, embedding_2))