# Semantic Search TF-IDF Model First Draft

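This Jupyter notebook serves as an introduction to reading GitHub `.md` documentation and analyzing it: the docs are cleaned, preprocessed, and vectorized with TF-IDF, then ranked against a query by cosine similarity.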

In [1]:
import doc_reader as reader

# Load the raw documentation found under "docs/docs" using the project's
# doc_reader helper.
# NOTE(review): presumably returns a dict keyed by file name (later cells call
# .keys()/.values() on the processed result) — confirm against doc_reader.
doc_data = reader.collect_doc_data("docs/docs")

In [2]:
import md_cleaner as cleaner
import md_preprocessor as preprocessor

def clean_and_preproc_data(input_data):
    """
    Helper function to combine cleaning and preprocessing of data.

    Dispatches on the type of `input_data`: a dict is passed through
    `cleaner.clean_doc_data` and then `preprocessor.preprocess_doc_data`;
    a str is passed through `cleaner.clean_str` and then
    `preprocessor.preprocess_str`.

    Parameters:
        input_data (dict | str) : raw documentation data or string

    Returns:
        cp_data : the cleaned and preprocessed result, i.e. whatever the
            matching preprocessor function returns for that input type

    Raises:
        TypeError : if `input_data` is neither a dict nor a str
    """
    if isinstance(input_data, dict):
        cleaned_data = cleaner.clean_doc_data(input_data)
        return preprocessor.preprocess_doc_data(cleaned_data)

    if isinstance(input_data, str):
        cleaned_data = cleaner.clean_str(input_data)
        return preprocessor.preprocess_str(cleaned_data)

    # Fail loudly with a clear message instead of falling through to a
    # `return` of a variable that was never assigned (UnboundLocalError).
    raise TypeError(
        f"input_data must be a dict or str, got {type(input_data).__name__}"
    )

In [3]:
# Clean and preprocess the loaded documentation in one step; the result is the
# corpus that the TF-IDF vectorizer is fitted on in a later cell.
cp_doc_data = clean_and_preproc_data(doc_data)

In [4]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def initialize_vectorizer(docs):
    """
    Fits a TF-IDF vectorizer on the inputted documents.

    Parameters:
        docs (dict) : mapping whose values are each an iterable of string
            tokens; every value is joined with spaces into one document

    Returns:
        (vectorizer, tfidf_matrix) : the fitted TfidfVectorizer and the
            matrix produced by fitting it, with one row per value of `docs`
            in iteration order
    """
    # The vectorizer expects plain strings, so glue each token list back together.
    joined_docs = [' '.join(doc_tokens) for doc_tokens in docs.values()]

    tfidf_model = TfidfVectorizer(sublinear_tf=True, stop_words='english')
    doc_term_matrix = tfidf_model.fit_transform(joined_docs)
    return tfidf_model, doc_term_matrix

# Fit once on the preprocessed corpus; the fitted vectorizer and the matrix
# are both passed to the search call further down.
vectorizer, tfidf_matrix = initialize_vectorizer(cp_doc_data)

In [5]:
def semantic_search(query, docs, vectorizer, tfidf_matrix):
    """
    Ranks every document against a query by TF-IDF cosine similarity.

    The query is spell-corrected with `cleaner.correct_spelling`, run through
    `clean_and_preproc_data`, joined with spaces, and transformed by the
    fitted vectorizer before being compared with `tfidf_matrix`.

    Parameters:
        query (str): The query string
        docs (dict | list[str]): Document names corresponding to the
            tfidf_matrix rows, in row order. For a dict, the keys are used
            as the names.
        vectorizer (TfidfVectorizer): The initialized TF-IDF vectorizer
        tfidf_matrix (scipy.sparse.csr_matrix): The TF-IDF matrix

    Returns:
        results (list): (document name, similarity score) tuples covering
            every document, ordered from highest to lowest score
    """
    corrected_query = cleaner.correct_spelling(query)
    cp_query = clean_and_preproc_data(corrected_query)
    query_vector = vectorizer.transform([" ".join(cp_query)])

    # Single query, so the result has one row holding a score per document.
    similarity_scores = cosine_similarity(query_vector, tfidf_matrix)
    # argsort is ascending; reverse it so the best match comes first.
    sorted_indexes = np.argsort(similarity_scores, axis=1)[0][::-1]

    # list(docs) yields the keys of a dict or the items of a list, so both
    # the documented list form and the dict form are accepted.
    filenames = list(docs)
    similar_docs = [(filenames[i], similarity_scores[0][i]) for i in sorted_indexes]

    return similar_docs

In [6]:
def get_relevant_files(query, docs, vectorizer, tfidf_matrix, top_k=5, include_score=False, verbose=False):
    """
    Returns the 'k' files that best match a query (5 unless overridden).

    Parameters:
        query (str) : question to search PW documentation for
        docs : document collection, forwarded unchanged to semantic_search
        vectorizer : TF-IDF vectorizer, forwarded unchanged to semantic_search
        tfidf_matrix : TF-IDF matrix, forwarded unchanged to semantic_search
        top_k (int) : top 'k' most relevant files to return (default: 5)
        include_score (bool) : if True, includes similarity score of file
        verbose (bool) : if True, prints files in addition to returning

    Returns:
        rel_files (list) : top 'k' most relevant files as names, or as
            (name, score) tuples when include_score is True; an empty list
            when semantic_search raises a TypeError
    """
    try:
        ranked_docs = semantic_search(query, docs, vectorizer, tfidf_matrix)
    except TypeError:
        print("Your query does not match anything in our system.")
        return []

    # Both output modes work from the same leading slice of the ranking.
    top_hits = ranked_docs[:top_k]

    if not include_score:
        rel_files = [filename for filename, _ in top_hits]
        if verbose:
            print(f"Top {top_k} most relevant files to your query:\n")
            for i, file in enumerate(rel_files):
                print(f"{i + 1}. {file}")
        return rel_files

    if verbose:
        print(f"Top {top_k} most relevant files to your query with similarity scores included:\n")
        for i, (file, sim_score) in enumerate(top_hits):
            print(f"{i + 1}. {file}: {sim_score}")
    return top_hits

In [7]:
# Example query run against the fitted model.
query = "Where can I find AWS key buckets?"

# verbose=True prints the ranked list; the trailing ';' keeps the notebook from
# also echoing the returned list as the cell's output.
get_relevant_files(query, cp_doc_data, vectorizer, tfidf_matrix, include_score=True, verbose=True);

Top 5 most relevant files to your query with similarity scores included:

1. creating-storage.md: 0.1989001322520514
2. adding-cloud-accounts.md: 0.19838891885986862
3. transferring-data-aws.md: 0.19001672142716608
4. starting-stopping-clusters.md: 0.13871795200352793
5. configuring-billing-infrastructure.md: 0.10918386509903194
